xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 6e5d6032a09ca918509e7c6f28d6d2e20b8dc832)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
7  *
8  *   Redistribution and use in source and binary forms, with or without
9  *   modification, are permitted provided that the following conditions
10  *   are met:
11  *
12  *     * Redistributions of source code must retain the above copyright
13  *       notice, this list of conditions and the following disclaimer.
14  *     * Redistributions in binary form must reproduce the above copyright
15  *       notice, this list of conditions and the following disclaimer in
16  *       the documentation and/or other materials provided with the
17  *       distribution.
18  *     * Neither the name of Intel Corporation nor the names of its
19  *       contributors may be used to endorse or promote products derived
20  *       from this software without specific prior written permission.
21  *
22  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 #include "spdk/stdinc.h"
36 
37 #include "bdev_nvme.h"
38 
39 #include "spdk/accel_engine.h"
40 #include "spdk/config.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/nvme.h"
45 #include "spdk/nvme_ocssd.h"
46 #include "spdk/nvme_zns.h"
47 #include "spdk/thread.h"
48 #include "spdk/string.h"
49 #include "spdk/util.h"
50 
51 #include "spdk/bdev_module.h"
52 #include "spdk/log.h"
53 
54 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
55 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
56 
57 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
58 
59 struct nvme_bdev_io {
60 	/** Array of iovecs to transfer. */
61 	struct iovec *iovs;
62 
63 	/** Number of iovecs in iovs array. */
64 	int iovcnt;
65 
66 	/** Current iovec position. */
67 	int iovpos;
68 
69 	/** Offset in current iovec. */
70 	uint32_t iov_offset;
71 
72 	/** Array of iovecs for the fused (write) portion of a compare-and-write. */
73 	struct iovec *fused_iovs;
74 
75 	/** Number of iovecs in the fused_iovs array. */
76 	int fused_iovcnt;
77 
78 	/** Current position in the fused_iovs array. */
79 	int fused_iovpos;
80 
81 	/** Offset in the current fused iovec. */
82 	uint32_t fused_iov_offset;
83 
84 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
85 	struct spdk_nvme_cpl cpl;
86 	/** Extended I/O opts passed by the user to the bdev layer and mapped to NVMe format */
87 	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
88 
89 	/** Originating thread */
90 	struct spdk_thread *orig_thread;
91 
92 	/** Tracks whether the first of the fused commands has been submitted. */
93 	bool first_fused_submitted;
94 
95 	/** Temporary pointer to zone report buffer */
96 	struct spdk_nvme_zns_zone_report *zone_report_buf;
97 
98 	/** Keeps track of how many zones have been copied to the spdk_bdev_zone_info structs */
99 	uint64_t handled_zones;
100 };
101 
102 struct nvme_probe_ctx {
103 	size_t count;
104 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
105 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
106 	const char *names[NVME_MAX_CONTROLLERS];
107 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
108 	const char *hostnqn;
109 };
110 
111 struct nvme_probe_skip_entry {
112 	struct spdk_nvme_transport_id		trid;
113 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
114 };
115 /* Controllers deleted by users via RPC are skipped by the hotplug monitor. */
116 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
117 			g_skipped_nvme_ctrlrs);
118 
119 static struct spdk_bdev_nvme_opts g_opts = {
120 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
121 	.timeout_us = 0,
122 	.timeout_admin_us = 0,
123 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
124 	.retry_count = 4,
125 	.arbitration_burst = 0,
126 	.low_priority_weight = 0,
127 	.medium_priority_weight = 0,
128 	.high_priority_weight = 0,
129 	.nvme_adminq_poll_period_us = 10000ULL,
130 	.nvme_ioq_poll_period_us = 0,
131 	.io_queue_requests = 0,
132 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
133 };
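/* Module-wide defaults; these can typically be overridden at runtime via the
 * bdev_nvme_set_options RPC before any controllers are attached.
 */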
134 
135 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
136 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
137 
138 static int g_hot_insert_nvme_controller_index = 0;
139 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
140 static bool g_nvme_hotplug_enabled = false;
141 static struct spdk_thread *g_bdev_nvme_init_thread;
142 static struct spdk_poller *g_hotplug_poller;
143 static struct spdk_poller *g_hotplug_probe_poller;
144 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
145 
146 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
147 		struct nvme_async_probe_ctx *ctx);
148 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
149 		struct nvme_async_probe_ctx *ctx);
150 static int bdev_nvme_library_init(void);
151 static void bdev_nvme_library_fini(void);
152 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
153 			   struct nvme_bdev_io *bio,
154 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
155 			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
156 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
157 				 struct nvme_bdev_io *bio,
158 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
159 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
160 			    struct nvme_bdev_io *bio,
161 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
162 			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
163 static int bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
164 				  struct nvme_bdev_io *bio,
165 				  struct iovec *iov, int iovcnt, void *md, uint64_t lba_count,
166 				  uint64_t zslba, uint32_t flags);
167 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
168 			      struct nvme_bdev_io *bio,
169 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
170 			      uint32_t flags);
171 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
172 		struct spdk_nvme_qpair *qpair,
173 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
174 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
175 		uint32_t flags);
176 static int bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
177 				   struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
178 				   struct spdk_bdev_zone_info *info);
179 static int bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
180 				     struct nvme_bdev_io *bio, uint64_t zone_id,
181 				     enum spdk_bdev_zone_action action);
182 static int bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
183 				    struct nvme_bdev_io *bio,
184 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
185 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
186 				 struct nvme_bdev_io *bio,
187 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
188 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
189 				    struct nvme_bdev_io *bio,
190 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
191 static int bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
192 			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
193 static int bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
194 static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
195 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
196 
197 struct spdk_nvme_qpair *
198 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
199 {
200 	struct nvme_ctrlr_channel *ctrlr_ch;
201 
202 	assert(ctrlr_io_ch != NULL);
203 
204 	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
205 
206 	return ctrlr_ch->qpair;
207 }
208 
209 static int
210 bdev_nvme_get_ctx_size(void)
211 {
212 	return sizeof(struct nvme_bdev_io);
213 }
214 
215 static struct spdk_bdev_module nvme_if = {
216 	.name = "nvme",
217 	.async_fini = true,
218 	.module_init = bdev_nvme_library_init,
219 	.module_fini = bdev_nvme_library_fini,
220 	.config_json = bdev_nvme_config_json,
221 	.get_ctx_size = bdev_nvme_get_ctx_size,
222 
223 };
224 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
225 
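/* Resolve the namespace and qpair to use for an I/O on this bdev channel.
 * Returns false while the qpair is torn down (e.g. during a controller reset),
 * in which case callers fail the I/O with -ENXIO.
 */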
226 static inline bool
227 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch,
228 		       struct spdk_nvme_ns **_ns, struct spdk_nvme_qpair **_qpair)
229 {
230 	if (spdk_unlikely(nbdev_ch->ctrlr_ch->qpair == NULL)) {
231 		/* The device is currently resetting. */
232 		return false;
233 	}
234 
235 	*_ns = nbdev_ch->nvme_ns->ns;
236 	*_qpair = nbdev_ch->ctrlr_ch->qpair;
237 	return true;
238 }
239 
240 static inline bool
241 bdev_nvme_find_admin_path(struct nvme_bdev_channel *nbdev_ch,
242 			  struct nvme_ctrlr **_nvme_ctrlr)
243 {
244 	*_nvme_ctrlr = nbdev_ch->ctrlr_ch->ctrlr;
245 	return true;
246 }
247 
248 static inline void
249 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
250 				  const struct spdk_nvme_cpl *cpl)
251 {
252 	spdk_bdev_io_complete_nvme_status(spdk_bdev_io_from_ctx(bio), cpl->cdw0,
253 					  cpl->status.sct, cpl->status.sc);
254 }
255 
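/* Map a negative errno to a bdev I/O status. -ENOMEM becomes
 * SPDK_BDEV_IO_STATUS_NOMEM so the bdev layer can retry the I/O once
 * resources free up; any other nonzero rc fails the I/O.
 */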
256 static inline void
257 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
258 {
259 	enum spdk_bdev_io_status io_status;
260 
261 	if (rc == 0) {
262 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
263 	} else if (rc == -ENOMEM) {
264 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
265 	} else {
266 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
267 	}
268 
269 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
270 }
271 
272 static void
273 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
274 {
275 	int rc;
276 
277 	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
278 	/*
279 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
280 	 * reconnect a qpair and we will stop getting a callback for this one.
281 	 */
282 	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
283 	if (rc != 0) {
284 		SPDK_DEBUGLOG(bdev_nvme, "Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
285 	}
286 }
287 
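/* I/O poller for a poll group: processes completions on every qpair in the
 * group and, when spin-stat collection is enabled (SPDK_CONFIG_VTUNE),
 * accounts the ticks spent polling without reaping any completions.
 */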
288 static int
289 bdev_nvme_poll(void *arg)
290 {
291 	struct nvme_poll_group *group = arg;
292 	int64_t num_completions;
293 
294 	if (group->collect_spin_stat && group->start_ticks == 0) {
295 		group->start_ticks = spdk_get_ticks();
296 	}
297 
298 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
299 			  bdev_nvme_disconnected_qpair_cb);
300 	if (group->collect_spin_stat) {
301 		if (num_completions > 0) {
302 			if (group->end_ticks != 0) {
303 				group->spin_ticks += (group->end_ticks - group->start_ticks);
304 				group->end_ticks = 0;
305 			}
306 			group->start_ticks = 0;
307 		} else {
308 			group->end_ticks = spdk_get_ticks();
309 		}
310 	}
311 
312 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
313 }
314 
315 static int
316 bdev_nvme_poll_adminq(void *arg)
317 {
318 	int32_t rc;
319 	struct nvme_ctrlr *nvme_ctrlr = arg;
320 
321 	assert(nvme_ctrlr != NULL);
322 
323 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
324 	if (rc < 0) {
325 		bdev_nvme_failover(nvme_ctrlr, false);
326 	}
327 
328 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
329 }
330 
331 static void
332 _bdev_nvme_unregister_dev_cb(void *io_device)
333 {
334 	struct nvme_bdev *nvme_disk = io_device;
335 
336 	free(nvme_disk->disk.name);
337 	free(nvme_disk);
338 }
339 
340 static int
341 bdev_nvme_destruct(void *ctx)
342 {
343 	struct nvme_bdev *nvme_disk = ctx;
344 	struct nvme_ns *nvme_ns = nvme_disk->nvme_ns;
345 
346 	pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
347 
348 	nvme_ns->bdev = NULL;
349 
350 	if (!nvme_ns->populated) {
351 		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
352 
353 		nvme_ctrlr_release(nvme_ns->ctrlr);
354 	} else {
355 		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
356 	}
357 
358 	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);
359 
360 	return 0;
361 }
362 
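/* Note: the flush request is completed immediately without issuing an NVMe
 * Flush command to the device.
 */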
363 static int
364 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
365 		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
366 {
367 	bdev_nvme_io_complete(bio, 0);
368 
369 	return 0;
370 }
371 
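/* Allocate and connect an I/O qpair for this controller channel. The qpair is
 * created with create_only and async_mode set so that it can be added to the
 * channel's poll group before the connect is initiated.
 */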
372 static int
373 bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
374 {
375 	struct spdk_nvme_ctrlr *ctrlr = ctrlr_ch->ctrlr->ctrlr;
376 	struct spdk_nvme_io_qpair_opts opts;
377 	struct spdk_nvme_qpair *qpair;
378 	int rc;
379 
380 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
381 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
382 	opts.create_only = true;
383 	opts.async_mode = true;
384 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
385 	g_opts.io_queue_requests = opts.io_queue_requests;
386 
387 	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
388 	if (qpair == NULL) {
389 		return -1;
390 	}
391 
392 	assert(ctrlr_ch->group != NULL);
393 
394 	rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair);
395 	if (rc != 0) {
396 		SPDK_ERRLOG("Unable to begin polling on NVMe channel.\n");
397 		goto err;
398 	}
399 
400 	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
401 	if (rc != 0) {
402 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
403 		goto err;
404 	}
405 
406 	ctrlr_ch->qpair = qpair;
407 
408 	return 0;
409 
410 err:
411 	spdk_nvme_ctrlr_free_io_qpair(qpair);
412 
413 	return rc;
414 }
415 
416 static void
417 bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
418 {
419 	if (ctrlr_ch->qpair != NULL) {
420 		spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
421 		ctrlr_ch->qpair = NULL;
422 	}
423 }
424 
425 static void
426 _bdev_nvme_check_pending_destruct(struct nvme_ctrlr *nvme_ctrlr)
427 {
428 	pthread_mutex_lock(&nvme_ctrlr->mutex);
429 	if (nvme_ctrlr->destruct_after_reset) {
430 		assert(nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct);
431 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
432 
433 		spdk_thread_send_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister,
434 				     nvme_ctrlr);
435 	} else {
436 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
437 	}
438 }
439 
440 static void
441 bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
442 {
443 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
444 
445 	_bdev_nvme_check_pending_destruct(nvme_ctrlr);
446 }
447 
448 static void
449 _bdev_nvme_complete_pending_resets(struct nvme_ctrlr_channel *ctrlr_ch,
450 				   enum spdk_bdev_io_status status)
451 {
452 	struct spdk_bdev_io *bdev_io;
453 
454 	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
455 		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
456 		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
457 		spdk_bdev_io_complete(bdev_io, status);
458 	}
459 }
460 
461 static void
462 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
463 {
464 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
465 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
466 
467 	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_SUCCESS);
468 
469 	spdk_for_each_channel_continue(i, 0);
470 }
471 
472 static void
473 bdev_nvme_abort_pending_resets(struct spdk_io_channel_iter *i)
474 {
475 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
476 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
477 
478 	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_FAILED);
479 
480 	spdk_for_each_channel_continue(i, 0);
481 }
482 
483 static void
484 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, int rc)
485 {
486 	struct nvme_ctrlr_trid *curr_trid;
487 	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
488 	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;
489 
490 	nvme_ctrlr->reset_cb_fn = NULL;
491 	nvme_ctrlr->reset_cb_arg = NULL;
492 
493 	if (rc) {
494 		SPDK_ERRLOG("Resetting controller failed.\n");
495 	} else {
496 		SPDK_NOTICELOG("Controller reset was successful.\n");
497 	}
498 
499 	pthread_mutex_lock(&nvme_ctrlr->mutex);
500 	nvme_ctrlr->resetting = false;
501 	nvme_ctrlr->failover_in_progress = false;
502 
503 	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
504 	assert(curr_trid != NULL);
505 	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);
506 
507 	curr_trid->is_failed = (rc != 0);
508 
509 	if (nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct) {
510 		/* Destruct ctrlr after clearing pending resets. */
511 		nvme_ctrlr->destruct_after_reset = true;
512 	}
513 
514 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
515 
516 	if (reset_cb_fn) {
517 		reset_cb_fn(reset_cb_arg, rc);
518 	}
519 
520 	/* Make sure we clear any pending resets before returning. */
521 	spdk_for_each_channel(nvme_ctrlr,
522 			      rc == 0 ? bdev_nvme_complete_pending_resets :
523 			      bdev_nvme_abort_pending_resets,
524 			      NULL,
525 			      bdev_nvme_check_pending_destruct);
526 }
527 
528 static void
529 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
530 {
531 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
532 
533 	bdev_nvme_reset_complete(nvme_ctrlr, status);
534 }
535 
536 static void
537 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
538 {
539 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
540 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
541 	int rc;
542 
543 	rc = bdev_nvme_create_qpair(ctrlr_ch);
544 
545 	spdk_for_each_channel_continue(i, rc);
546 }
547 
548 static int
549 bdev_nvme_ctrlr_reset_poll(void *arg)
550 {
551 	struct nvme_ctrlr *nvme_ctrlr = arg;
552 	int rc;
553 
554 	rc = spdk_nvme_ctrlr_reset_poll_async(nvme_ctrlr->reset_ctx);
555 	if (rc == -EAGAIN) {
556 		return SPDK_POLLER_BUSY;
557 	}
558 
559 	spdk_poller_unregister(&nvme_ctrlr->reset_poller);
560 	if (rc == 0) {
561 		/* Recreate all of the I/O queue pairs */
562 		spdk_for_each_channel(nvme_ctrlr,
563 				      bdev_nvme_reset_create_qpair,
564 				      NULL,
565 				      bdev_nvme_reset_create_qpairs_done);
566 	} else {
567 		bdev_nvme_reset_complete(nvme_ctrlr, rc);
568 	}
569 	return SPDK_POLLER_BUSY;
570 }
571 
572 static void
573 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
574 {
575 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
576 	int rc;
577 
578 	if (status) {
579 		rc = status;
580 		goto err;
581 	}
582 
583 	rc = spdk_nvme_ctrlr_reset_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->reset_ctx);
584 	if (rc != 0) {
585 		SPDK_ERRLOG("Failed to create controller reset context\n");
586 		goto err;
587 	}
588 	assert(nvme_ctrlr->reset_poller == NULL);
589 	nvme_ctrlr->reset_poller = SPDK_POLLER_REGISTER(bdev_nvme_ctrlr_reset_poll,
590 				   nvme_ctrlr, 0);
591 
592 	return;
593 
594 err:
595 	bdev_nvme_reset_complete(nvme_ctrlr, rc);
596 }
597 
598 static void
599 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
600 {
601 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
602 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
603 
604 	bdev_nvme_destroy_qpair(ctrlr_ch);
605 	spdk_for_each_channel_continue(i, 0);
606 }
607 
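/* Start a controller reset. The sequence is: destroy the I/O qpairs on every
 * channel, reset the controller asynchronously (polled by
 * bdev_nvme_ctrlr_reset_poll), recreate the qpairs, and finally complete any
 * queued resets in bdev_nvme_reset_complete.
 */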
608 static int
609 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
610 {
611 	pthread_mutex_lock(&nvme_ctrlr->mutex);
612 	if (nvme_ctrlr->destruct) {
613 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
614 		return -EBUSY;
615 	}
616 
617 	if (nvme_ctrlr->resetting) {
618 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
619 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
620 		return -EAGAIN;
621 	}
622 
623 	nvme_ctrlr->resetting = true;
624 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
625 	spdk_nvme_ctrlr_prepare_for_reset(nvme_ctrlr->ctrlr);
626 
627 	/* First, delete all NVMe I/O queue pairs. */
628 	spdk_for_each_channel(nvme_ctrlr,
629 			      bdev_nvme_reset_destroy_qpair,
630 			      NULL,
631 			      bdev_nvme_reset_ctrlr);
632 
633 	return 0;
634 }
635 
636 int
637 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
638 {
639 	int rc;
640 
641 	rc = bdev_nvme_reset(nvme_ctrlr);
642 	if (rc == 0) {
643 		nvme_ctrlr->reset_cb_fn = cb_fn;
644 		nvme_ctrlr->reset_cb_arg = cb_arg;
645 	}
646 	return rc;
647 }
648 
649 static void
650 bdev_nvme_reset_io_complete(void *cb_arg, int rc)
651 {
652 	struct nvme_bdev_io *bio = cb_arg;
653 
654 	bdev_nvme_io_complete(bio, rc);
655 }
656 
657 static int
658 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
659 {
660 	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
661 	struct spdk_bdev_io *bdev_io;
662 	int rc;
663 
664 	rc = bdev_nvme_reset(ctrlr_ch->ctrlr);
665 	if (rc == 0) {
666 		assert(ctrlr_ch->ctrlr->reset_cb_fn == NULL);
667 		assert(ctrlr_ch->ctrlr->reset_cb_arg == NULL);
668 		ctrlr_ch->ctrlr->reset_cb_fn = bdev_nvme_reset_io_complete;
669 		ctrlr_ch->ctrlr->reset_cb_arg = bio;
670 	} else if (rc == -EAGAIN) {
671 		/*
672 		 * The reset call is queued only if it comes from the app framework. This is deliberate:
673 		 * we defer to the app framework's reset strategy rather than interfering with it. If the
674 		 * upper layer is in the middle of a reset, we do not try to schedule another one.
675 		 */
676 		bdev_io = spdk_bdev_io_from_ctx(bio);
677 		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
678 	} else {
679 		return rc;
680 	}
681 
682 	return 0;
683 }
684 
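/* Begin a failover: mark the currently connected trid as failed and, if an
 * alternate trid is available, switch the controller over to it. The old trid
 * is either recycled to the tail of the list (for round-robin rotation) or
 * freed when it is being removed.
 */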
685 static int
686 bdev_nvme_failover_start(struct nvme_ctrlr *nvme_ctrlr, bool remove)
687 {
688 	struct nvme_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
689 	int rc;
690 
691 	pthread_mutex_lock(&nvme_ctrlr->mutex);
692 	if (nvme_ctrlr->destruct) {
693 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
694 		/* Don't bother resetting if the controller is in the process of being destructed. */
695 		return -EBUSY;
696 	}
697 
698 	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
699 	assert(curr_trid);
700 	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);
701 	next_trid = TAILQ_NEXT(curr_trid, link);
702 
703 	if (nvme_ctrlr->resetting) {
704 		if (next_trid && !nvme_ctrlr->failover_in_progress) {
705 			rc = -EAGAIN;
706 		} else {
707 			rc = -EBUSY;
708 		}
709 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
710 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
711 		return rc;
712 	}
713 
714 	nvme_ctrlr->resetting = true;
715 	curr_trid->is_failed = true;
716 
717 	if (next_trid) {
718 		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
719 
720 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
721 			       curr_trid->trid.trsvcid,	next_trid->trid.traddr, next_trid->trid.trsvcid);
722 
723 		nvme_ctrlr->failover_in_progress = true;
724 		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
725 		nvme_ctrlr->connected_trid = &next_trid->trid;
726 		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_trid->trid);
727 		assert(rc == 0);
728 		TAILQ_REMOVE(&nvme_ctrlr->trids, curr_trid, link);
729 		if (!remove) {
730 			/* Move the old trid to the end of the list and use the new one.
731 			 * This allows round-robin rotation through multiple connections.
732 			 */
733 			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, curr_trid, link);
734 		} else {
735 			free(curr_trid);
736 		}
737 	}
738 
739 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
740 	return 0;
741 }
742 
743 static int
744 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
745 {
746 	int rc;
747 
748 	rc = bdev_nvme_failover_start(nvme_ctrlr, remove);
749 	if (rc == 0) {
750 		/* First, delete all NVMe I/O queue pairs. */
751 		spdk_for_each_channel(nvme_ctrlr,
752 				      bdev_nvme_reset_destroy_qpair,
753 				      NULL,
754 				      bdev_nvme_reset_ctrlr);
755 	} else if (rc != -EBUSY) {
756 		return rc;
757 	}
758 
759 	return 0;
760 }
761 
762 static int
763 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
764 		struct nvme_bdev_io *bio,
765 		uint64_t offset_blocks,
766 		uint64_t num_blocks);
767 
768 static int
769 bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
770 		       struct nvme_bdev_io *bio,
771 		       uint64_t offset_blocks,
772 		       uint64_t num_blocks);
773 
774 static void
775 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
776 		     bool success)
777 {
778 	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
779 	struct spdk_bdev *bdev = bdev_io->bdev;
780 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
781 	struct spdk_nvme_ns *ns;
782 	struct spdk_nvme_qpair *qpair;
783 	int ret;
784 
785 	if (!success) {
786 		ret = -EINVAL;
787 		goto exit;
788 	}
789 
790 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
791 		ret = -ENXIO;
792 		goto exit;
793 	}
794 
795 	ret = bdev_nvme_readv(ns,
796 			      qpair,
797 			      bio,
798 			      bdev_io->u.bdev.iovs,
799 			      bdev_io->u.bdev.iovcnt,
800 			      bdev_io->u.bdev.md_buf,
801 			      bdev_io->u.bdev.num_blocks,
802 			      bdev_io->u.bdev.offset_blocks,
803 			      bdev->dif_check_flags,
804 			      bdev_io->internal.ext_opts);
805 
806 exit:
807 	if (spdk_unlikely(ret != 0)) {
808 		bdev_nvme_io_complete(bio, ret);
809 	}
810 }
811 
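/* Dispatch a bdev I/O to the matching NVMe command path based on its type.
 * Any nonzero rc is funneled through bdev_nvme_io_complete, so -ENOMEM from
 * the driver surfaces as a NOMEM status and the I/O is retried later.
 */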
812 static void
813 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
814 {
815 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
816 	struct spdk_bdev *bdev = bdev_io->bdev;
817 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
818 	struct nvme_bdev_io *nbdev_io_to_abort;
819 	struct spdk_nvme_ns *ns;
820 	struct spdk_nvme_qpair *qpair;
821 	int rc = 0;
822 
823 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
824 		rc = -ENXIO;
825 		goto exit;
826 	}
827 
828 	switch (bdev_io->type) {
829 	case SPDK_BDEV_IO_TYPE_READ:
830 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
831 			rc = bdev_nvme_readv(ns,
832 					     qpair,
833 					     nbdev_io,
834 					     bdev_io->u.bdev.iovs,
835 					     bdev_io->u.bdev.iovcnt,
836 					     bdev_io->u.bdev.md_buf,
837 					     bdev_io->u.bdev.num_blocks,
838 					     bdev_io->u.bdev.offset_blocks,
839 					     bdev->dif_check_flags,
840 					     bdev_io->internal.ext_opts);
841 		} else {
842 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
843 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
844 			rc = 0;
845 		}
846 		break;
847 	case SPDK_BDEV_IO_TYPE_WRITE:
848 		rc = bdev_nvme_writev(ns,
849 				      qpair,
850 				      nbdev_io,
851 				      bdev_io->u.bdev.iovs,
852 				      bdev_io->u.bdev.iovcnt,
853 				      bdev_io->u.bdev.md_buf,
854 				      bdev_io->u.bdev.num_blocks,
855 				      bdev_io->u.bdev.offset_blocks,
856 				      bdev->dif_check_flags,
857 				      bdev_io->internal.ext_opts);
858 		break;
859 	case SPDK_BDEV_IO_TYPE_COMPARE:
860 		rc = bdev_nvme_comparev(ns,
861 					qpair,
862 					nbdev_io,
863 					bdev_io->u.bdev.iovs,
864 					bdev_io->u.bdev.iovcnt,
865 					bdev_io->u.bdev.md_buf,
866 					bdev_io->u.bdev.num_blocks,
867 					bdev_io->u.bdev.offset_blocks,
868 					bdev->dif_check_flags);
869 		break;
870 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
871 		rc = bdev_nvme_comparev_and_writev(ns,
872 						   qpair,
873 						   nbdev_io,
874 						   bdev_io->u.bdev.iovs,
875 						   bdev_io->u.bdev.iovcnt,
876 						   bdev_io->u.bdev.fused_iovs,
877 						   bdev_io->u.bdev.fused_iovcnt,
878 						   bdev_io->u.bdev.md_buf,
879 						   bdev_io->u.bdev.num_blocks,
880 						   bdev_io->u.bdev.offset_blocks,
881 						   bdev->dif_check_flags);
882 		break;
883 	case SPDK_BDEV_IO_TYPE_UNMAP:
884 		rc = bdev_nvme_unmap(ns,
885 				     qpair,
886 				     nbdev_io,
887 				     bdev_io->u.bdev.offset_blocks,
888 				     bdev_io->u.bdev.num_blocks);
889 		break;
890 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
891 		rc =  bdev_nvme_write_zeroes(ns, qpair,
892 					     nbdev_io,
893 					     bdev_io->u.bdev.offset_blocks,
894 					     bdev_io->u.bdev.num_blocks);
895 		break;
896 	case SPDK_BDEV_IO_TYPE_RESET:
897 		rc = bdev_nvme_reset_io(nbdev_ch, nbdev_io);
898 		break;
899 	case SPDK_BDEV_IO_TYPE_FLUSH:
900 		rc = bdev_nvme_flush(ns,
901 				     qpair,
902 				     nbdev_io,
903 				     bdev_io->u.bdev.offset_blocks,
904 				     bdev_io->u.bdev.num_blocks);
905 		break;
906 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
907 		rc = bdev_nvme_zone_appendv(ns,
908 					    qpair,
909 					    nbdev_io,
910 					    bdev_io->u.bdev.iovs,
911 					    bdev_io->u.bdev.iovcnt,
912 					    bdev_io->u.bdev.md_buf,
913 					    bdev_io->u.bdev.num_blocks,
914 					    bdev_io->u.bdev.offset_blocks,
915 					    bdev->dif_check_flags);
916 		break;
917 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
918 		rc = bdev_nvme_get_zone_info(ns,
919 					     qpair,
920 					     nbdev_io,
921 					     bdev_io->u.zone_mgmt.zone_id,
922 					     bdev_io->u.zone_mgmt.num_zones,
923 					     bdev_io->u.zone_mgmt.buf);
924 		break;
925 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
926 		rc = bdev_nvme_zone_management(ns,
927 					       qpair,
928 					       nbdev_io,
929 					       bdev_io->u.zone_mgmt.zone_id,
930 					       bdev_io->u.zone_mgmt.zone_action);
931 		break;
932 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
933 		rc = bdev_nvme_admin_passthru(nbdev_ch,
934 					      nbdev_io,
935 					      &bdev_io->u.nvme_passthru.cmd,
936 					      bdev_io->u.nvme_passthru.buf,
937 					      bdev_io->u.nvme_passthru.nbytes);
938 		break;
939 	case SPDK_BDEV_IO_TYPE_NVME_IO:
940 		rc = bdev_nvme_io_passthru(ns,
941 					   qpair,
942 					   nbdev_io,
943 					   &bdev_io->u.nvme_passthru.cmd,
944 					   bdev_io->u.nvme_passthru.buf,
945 					   bdev_io->u.nvme_passthru.nbytes);
946 		break;
947 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
948 		rc = bdev_nvme_io_passthru_md(ns,
949 					      qpair,
950 					      nbdev_io,
951 					      &bdev_io->u.nvme_passthru.cmd,
952 					      bdev_io->u.nvme_passthru.buf,
953 					      bdev_io->u.nvme_passthru.nbytes,
954 					      bdev_io->u.nvme_passthru.md_buf,
955 					      bdev_io->u.nvme_passthru.md_len);
956 		break;
957 	case SPDK_BDEV_IO_TYPE_ABORT:
958 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
959 		rc = bdev_nvme_abort(nbdev_ch,
960 				     nbdev_io,
961 				     nbdev_io_to_abort);
962 		break;
963 	default:
964 		rc = -EINVAL;
965 		break;
966 	}
967 
968 exit:
969 	if (spdk_unlikely(rc != 0)) {
970 		bdev_nvme_io_complete(nbdev_io, rc);
971 	}
972 }
973 
974 static bool
975 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
976 {
977 	struct nvme_bdev *nbdev = ctx;
978 	struct nvme_ns *nvme_ns;
979 	struct spdk_nvme_ns *ns;
980 	struct spdk_nvme_ctrlr *ctrlr;
981 	const struct spdk_nvme_ctrlr_data *cdata;
982 
983 	nvme_ns = nbdev->nvme_ns;
984 	assert(nvme_ns != NULL);
985 	ns = nvme_ns->ns;
986 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
987 
988 	switch (io_type) {
989 	case SPDK_BDEV_IO_TYPE_READ:
990 	case SPDK_BDEV_IO_TYPE_WRITE:
991 	case SPDK_BDEV_IO_TYPE_RESET:
992 	case SPDK_BDEV_IO_TYPE_FLUSH:
993 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
994 	case SPDK_BDEV_IO_TYPE_NVME_IO:
995 	case SPDK_BDEV_IO_TYPE_ABORT:
996 		return true;
997 
998 	case SPDK_BDEV_IO_TYPE_COMPARE:
999 		return spdk_nvme_ns_supports_compare(ns);
1000 
1001 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1002 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
1003 
1004 	case SPDK_BDEV_IO_TYPE_UNMAP:
1005 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1006 		return cdata->oncs.dsm;
1007 
1008 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1009 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1010 		return cdata->oncs.write_zeroes;
1011 
1012 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
1013 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
1014 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
1015 			return true;
1016 		}
1017 		return false;
1018 
1019 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
1020 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
1021 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
1022 
1023 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
1024 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
1025 		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
1026 
1027 	default:
1028 		return false;
1029 	}
1030 }
1031 
1032 static int
1033 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
1034 {
1035 	struct nvme_ctrlr *nvme_ctrlr = io_device;
1036 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
1037 	struct spdk_io_channel *pg_ch;
1038 	int rc;
1039 
1040 	pg_ch = spdk_get_io_channel(&g_nvme_ctrlrs);
1041 	if (!pg_ch) {
1042 		return -1;
1043 	}
1044 
1045 	ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch);
1046 
1047 #ifdef SPDK_CONFIG_VTUNE
1048 	ctrlr_ch->group->collect_spin_stat = true;
1049 #else
1050 	ctrlr_ch->group->collect_spin_stat = false;
1051 #endif
1052 
1053 	TAILQ_INIT(&ctrlr_ch->pending_resets);
1054 
1055 	ctrlr_ch->ctrlr = nvme_ctrlr;
1056 
1057 	rc = bdev_nvme_create_qpair(ctrlr_ch);
1058 	if (rc != 0) {
1059 		goto err_qpair;
1060 	}
1061 
1062 	return 0;
1063 
1064 err_qpair:
1065 	spdk_put_io_channel(pg_ch);
1066 
1067 	return rc;
1068 }
1069 
1070 static void
1071 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
1072 {
1073 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
1074 
1075 	assert(ctrlr_ch->group != NULL);
1076 
1077 	bdev_nvme_destroy_qpair(ctrlr_ch);
1078 
1079 	spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group));
1080 }
1081 
1082 static void
1083 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
1084 			      uint32_t iov_cnt, uint32_t seed,
1085 			      spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
1086 {
1087 	struct nvme_poll_group *group = ctx;
1088 	int rc;
1089 
1090 	assert(group->accel_channel != NULL);
1091 	assert(cb_fn != NULL);
1092 
1093 	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
1094 	if (rc) {
1095 		/* In these two error cases, spdk_accel_submit_crc32cv does not call the user's cb_fn, so call it here. */
1096 		if (rc == -ENOMEM || rc == -EINVAL) {
1097 			cb_fn(cb_arg, rc);
1098 		}
1099 		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
1100 	}
1101 }
1102 
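/* Callback table through which the NVMe driver offloads CRC-32C digest
 * calculation (e.g. for NVMe/TCP data digests) to the accel framework.
 */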
1103 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
1104 	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
1105 	.submit_accel_crc32c	= bdev_nvme_submit_accel_crc32c,
1106 };
1107 
1108 static int
1109 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
1110 {
1111 	struct nvme_poll_group *group = ctx_buf;
1112 
1113 	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
1114 	if (group->group == NULL) {
1115 		return -1;
1116 	}
1117 
1118 	group->accel_channel = spdk_accel_engine_get_io_channel();
1119 	if (!group->accel_channel) {
1120 		spdk_nvme_poll_group_destroy(group->group);
1121 		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
1122 			    group);
1123 		return -1;
1124 	}
1125 
1126 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
1127 
1128 	if (group->poller == NULL) {
1129 		spdk_put_io_channel(group->accel_channel);
1130 		spdk_nvme_poll_group_destroy(group->group);
1131 		return -1;
1132 	}
1133 
1134 	return 0;
1135 }
1136 
1137 static void
1138 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
1139 {
1140 	struct nvme_poll_group *group = ctx_buf;
1141 
1142 	if (group->accel_channel) {
1143 		spdk_put_io_channel(group->accel_channel);
1144 	}
1145 
1146 	spdk_poller_unregister(&group->poller);
1147 	if (spdk_nvme_poll_group_destroy(group->group)) {
1148 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
1149 		assert(false);
1150 	}
1151 }
1152 
1153 static struct spdk_io_channel *
1154 bdev_nvme_get_io_channel(void *ctx)
1155 {
1156 	struct nvme_bdev *nvme_bdev = ctx;
1157 
1158 	return spdk_get_io_channel(nvme_bdev);
1159 }
1160 
1161 static void *
1162 bdev_nvme_get_module_ctx(void *ctx)
1163 {
1164 	struct nvme_bdev *nvme_bdev = ctx;
1165 
1166 	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
1167 }
1168 
1169 static const char *
1170 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
1171 {
1172 	switch (ana_state) {
1173 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
1174 		return "optimized";
1175 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1176 		return "non_optimized";
1177 	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
1178 		return "inaccessible";
1179 	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
1180 		return "persistent_loss";
1181 	case SPDK_NVME_ANA_CHANGE_STATE:
1182 		return "change";
1183 	default:
1184 		return NULL;
1185 	}
1186 }
1187 
1188 static int
1189 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
1190 {
1191 	struct nvme_bdev *nbdev = ctx;
1192 	struct spdk_memory_domain *domain;
1193 
1194 	domain = spdk_nvme_ctrlr_get_memory_domain(nbdev->nvme_ns->ctrlr->ctrlr);
1195 
1196 	if (domain) {
1197 		if (array_size > 0 && domains) {
1198 			domains[0] = domain;
1199 		}
1200 		return 1;
1201 	}
1202 
1203 	return 0;
1204 }
1205 
1206 static int
1207 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
1208 {
1209 	struct nvme_bdev *nvme_bdev = ctx;
1210 	struct nvme_ns *nvme_ns;
1211 	struct spdk_nvme_ns *ns;
1212 	struct spdk_nvme_ctrlr *ctrlr;
1213 	const struct spdk_nvme_ctrlr_data *cdata;
1214 	const struct spdk_nvme_transport_id *trid;
1215 	union spdk_nvme_vs_register vs;
1216 	union spdk_nvme_csts_register csts;
1217 	char buf[128];
1218 
1219 	nvme_ns = nvme_bdev->nvme_ns;
1220 	assert(nvme_ns != NULL);
1221 	ns = nvme_ns->ns;
1222 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1223 
1224 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1225 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
1226 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
1227 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1228 
1229 	spdk_json_write_named_object_begin(w, "nvme");
1230 
1231 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1232 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
1233 	}
1234 
1235 	spdk_json_write_named_object_begin(w, "trid");
1236 
1237 	nvme_bdev_dump_trid_json(trid, w);
1238 
1239 	spdk_json_write_object_end(w);
1240 
1241 #ifdef SPDK_CONFIG_NVME_CUSE
1242 	size_t cuse_name_size = 128;
1243 	char cuse_name[cuse_name_size];
1244 
1245 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
1246 					    cuse_name, &cuse_name_size);
1247 	if (rc == 0) {
1248 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
1249 	}
1250 #endif
1251 
1252 	spdk_json_write_named_object_begin(w, "ctrlr_data");
1253 
1254 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
1255 
1256 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
1257 	spdk_str_trim(buf);
1258 	spdk_json_write_named_string(w, "model_number", buf);
1259 
1260 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
1261 	spdk_str_trim(buf);
1262 	spdk_json_write_named_string(w, "serial_number", buf);
1263 
1264 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
1265 	spdk_str_trim(buf);
1266 	spdk_json_write_named_string(w, "firmware_revision", buf);
1267 
1268 	if (cdata->subnqn[0] != '\0') {
1269 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
1270 	}
1271 
1272 	spdk_json_write_named_object_begin(w, "oacs");
1273 
1274 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
1275 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
1276 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
1277 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
1278 
1279 	spdk_json_write_object_end(w);
1280 
1281 	spdk_json_write_object_end(w);
1282 
1283 	spdk_json_write_named_object_begin(w, "vs");
1284 
1285 	spdk_json_write_name(w, "nvme_version");
1286 	if (vs.bits.ter) {
1287 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
1288 	} else {
1289 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
1290 	}
1291 
1292 	spdk_json_write_object_end(w);
1293 
1294 	spdk_json_write_named_object_begin(w, "csts");
1295 
1296 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
1297 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
1298 
1299 	spdk_json_write_object_end(w);
1300 
1301 	spdk_json_write_named_object_begin(w, "ns_data");
1302 
1303 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
1304 
1305 	if (cdata->cmic.ana_reporting) {
1306 		spdk_json_write_named_string(w, "ana_state",
1307 					     _nvme_ana_state_str(nvme_ns->ana_state));
1308 	}
1309 
1310 	spdk_json_write_object_end(w);
1311 
1312 	if (cdata->oacs.security) {
1313 		spdk_json_write_named_object_begin(w, "security");
1314 
1315 		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);
1316 
1317 		spdk_json_write_object_end(w);
1318 	}
1319 
1320 	spdk_json_write_object_end(w);
1321 
1322 	return 0;
1323 }
1324 
1325 static void
1326 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1327 {
1328 	/* No config per bdev needed */
1329 }
1330 
1331 static uint64_t
1332 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
1333 {
1334 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
1335 	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
1336 	struct nvme_poll_group *group = ctrlr_ch->group;
1337 	uint64_t spin_time;
1338 
1339 	if (!group || !group->collect_spin_stat) {
1340 		return 0;
1341 	}
1342 
1343 	if (group->end_ticks != 0) {
1344 		group->spin_ticks += (group->end_ticks - group->start_ticks);
1345 		group->end_ticks = 0;
1346 	}
1347 
1348 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1349 	group->start_ticks = 0;
1350 	group->spin_ticks = 0;
1351 
1352 	return spin_time;
1353 }
1354 
1355 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1356 	.destruct		= bdev_nvme_destruct,
1357 	.submit_request		= bdev_nvme_submit_request,
1358 	.io_type_supported	= bdev_nvme_io_type_supported,
1359 	.get_io_channel		= bdev_nvme_get_io_channel,
1360 	.dump_info_json		= bdev_nvme_dump_info_json,
1361 	.write_config_json	= bdev_nvme_write_config_json,
1362 	.get_spin_time		= bdev_nvme_get_spin_time,
1363 	.get_module_ctx		= bdev_nvme_get_module_ctx,
1364 	.get_memory_domains	= bdev_nvme_get_memory_domains,
1365 };
1366 
1367 typedef int (*bdev_nvme_parse_ana_log_page_cb)(
1368 	const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);
1369 
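/* Walk the ANA log page and invoke cb_fn for each group descriptor. Each
 * descriptor is first memcpy'd into a scratch buffer (copied_ana_desc),
 * presumably because descriptors in the raw log page are not guaranteed to be
 * properly aligned.
 */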
1370 static int
1371 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
1372 			     bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
1373 {
1374 	struct spdk_nvme_ana_group_descriptor *copied_desc;
1375 	uint8_t *orig_desc;
1376 	uint32_t i, desc_size, copy_len;
1377 	int rc = 0;
1378 
1379 	if (nvme_ctrlr->ana_log_page == NULL) {
1380 		return -EINVAL;
1381 	}
1382 
1383 	copied_desc = nvme_ctrlr->copied_ana_desc;
1384 
1385 	orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
1386 	copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page);
1387 
1388 	for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
1389 		memcpy(copied_desc, orig_desc, copy_len);
1390 
1391 		rc = cb_fn(copied_desc, cb_arg);
1392 		if (rc != 0) {
1393 			break;
1394 		}
1395 
1396 		desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
1397 			    copied_desc->num_of_nsid * sizeof(uint32_t);
1398 		orig_desc += desc_size;
1399 		copy_len -= desc_size;
1400 	}
1401 
1402 	return rc;
1403 }
1404 
1405 static int
1406 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
1407 {
1408 	struct nvme_ns *nvme_ns = cb_arg;
1409 	uint32_t i;
1410 
1411 	for (i = 0; i < desc->num_of_nsid; i++) {
1412 		if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
1413 			continue;
1414 		}
1415 		nvme_ns->ana_group_id = desc->ana_group_id;
1416 		nvme_ns->ana_state = desc->ana_state;
1417 		return 1;
1418 	}
1419 
1420 	return 0;
1421 }
1422 
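/* Fill out the spdk_bdev for a namespace: product name and zoned parameters
 * from the command set, block geometry, identifiers (NGUID or UUID),
 * atomic/physical block sizes, and DIF/metadata settings.
 */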
1423 static int
1424 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
1425 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
1426 		 uint32_t prchk_flags, void *ctx)
1427 {
1428 	const struct spdk_uuid		*uuid;
1429 	const uint8_t *nguid;
1430 	const struct spdk_nvme_ctrlr_data *cdata;
1431 	const struct spdk_nvme_ns_data	*nsdata;
1432 	enum spdk_nvme_csi		csi;
1433 	uint32_t atomic_bs, phys_bs, bs;
1434 
1435 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1436 	csi = spdk_nvme_ns_get_csi(ns);
1437 
1438 	switch (csi) {
1439 	case SPDK_NVME_CSI_NVM:
1440 		disk->product_name = "NVMe disk";
1441 		break;
1442 	case SPDK_NVME_CSI_ZNS:
1443 		disk->product_name = "NVMe ZNS disk";
1444 		disk->zoned = true;
1445 		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
1446 		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
1447 					     spdk_nvme_ns_get_extended_sector_size(ns);
1448 		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
1449 		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
1450 		break;
1451 	default:
1452 		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
1453 		return -ENOTSUP;
1454 	}
1455 
1456 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
1457 	if (!disk->name) {
1458 		return -ENOMEM;
1459 	}
1460 
1461 	disk->write_cache = 0;
1462 	if (cdata->vwc.present) {
1463 		/* Enable if the Volatile Write Cache exists */
1464 		disk->write_cache = 1;
1465 	}
1466 	if (cdata->oncs.write_zeroes) {
1467 		disk->max_write_zeroes = UINT16_MAX + 1;
1468 	}
1469 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1470 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1471 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1472 
1473 	nguid = spdk_nvme_ns_get_nguid(ns);
1474 	if (!nguid) {
1475 		uuid = spdk_nvme_ns_get_uuid(ns);
1476 		if (uuid) {
1477 			disk->uuid = *uuid;
1478 		}
1479 	} else {
1480 		memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
1481 	}
1482 
1483 	nsdata = spdk_nvme_ns_get_data(ns);
1484 	bs = spdk_nvme_ns_get_sector_size(ns);
1485 	atomic_bs = bs;
1486 	phys_bs = bs;
1487 	if (nsdata->nabo == 0) {
1488 		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
1489 			atomic_bs = bs * (1 + nsdata->nawupf);
1490 		} else {
1491 			atomic_bs = bs * (1 + cdata->awupf);
1492 		}
1493 	}
1494 	if (nsdata->nsfeat.optperf) {
1495 		phys_bs = bs * (1 + nsdata->npwg);
1496 	}
1497 	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
1498 
1499 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
1500 	if (disk->md_len != 0) {
1501 		disk->md_interleave = nsdata->flbas.extended;
1502 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1503 		if (disk->dif_type != SPDK_DIF_DISABLE) {
1504 			disk->dif_is_head_of_md = nsdata->dps.md_start;
1505 			disk->dif_check_flags = prchk_flags;
1506 		}
1507 	}
1508 
1509 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
1510 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
1511 		disk->acwu = 0;
1512 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1513 		disk->acwu = nsdata->nacwu;
1514 	} else {
1515 		disk->acwu = cdata->acwu;
1516 	}
1517 
1518 	disk->ctxt = ctx;
1519 	disk->fn_table = &nvmelib_fn_table;
1520 	disk->module = &nvme_if;
1521 
1522 	return 0;
1523 }
1524 
1525 static int
1526 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
1527 {
1528 	struct nvme_bdev *bdev;
1529 	int rc;
1530 
1531 	bdev = calloc(1, sizeof(*bdev));
1532 	if (!bdev) {
1533 		SPDK_ERRLOG("bdev calloc() failed\n");
1534 		return -ENOMEM;
1535 	}
1536 
1537 	bdev->nvme_ns = nvme_ns;
1538 	bdev->opal = nvme_ctrlr->opal_dev != NULL;
1539 
1540 	rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->name, nvme_ctrlr->ctrlr,
1541 			      nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev);
1542 	if (rc != 0) {
1543 		SPDK_ERRLOG("Failed to create NVMe disk\n");
1544 		free(bdev);
1545 		return rc;
1546 	}
1547 
1548 	spdk_io_device_register(bdev,
1549 				bdev_nvme_create_bdev_channel_cb,
1550 				bdev_nvme_destroy_bdev_channel_cb,
1551 				sizeof(struct nvme_bdev_channel),
1552 				bdev->disk.name);
1553 
1554 	rc = spdk_bdev_register(&bdev->disk);
1555 	if (rc != 0) {
1556 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
1557 		spdk_io_device_unregister(bdev, NULL);
1558 		free(bdev->disk.name);
1559 		free(bdev);
1560 		return rc;
1561 	}
1562 
1563 	nvme_ns->bdev = bdev;
1564 
1565 	return 0;
1566 }
1567 
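/* Two namespaces are considered identical only if their NGUIDs and EUI-64s
 * match and both report the same non-NULL UUID.
 */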
1568 static bool
1569 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
1570 {
1571 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
1572 	const struct spdk_uuid *uuid1, *uuid2;
1573 
1574 	nsdata1 = spdk_nvme_ns_get_data(ns1);
1575 	nsdata2 = spdk_nvme_ns_get_data(ns2);
1576 	uuid1 = spdk_nvme_ns_get_uuid(ns1);
1577 	uuid2 = spdk_nvme_ns_get_uuid(ns2);
1578 
1579 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
1580 	       nsdata1->eui64 == nsdata2->eui64 &&
1581 	       uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0;
1582 }
1583 
1584 static bool
1585 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1586 		 struct spdk_nvme_ctrlr_opts *opts)
1587 {
1588 	struct nvme_probe_skip_entry *entry;
1589 
1590 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1591 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1592 			return false;
1593 		}
1594 	}
1595 
1596 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1597 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1598 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1599 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1600 	opts->disable_read_ana_log_page = true;
1601 
1602 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1603 
1604 	return true;
1605 }
1606 
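/* Completion callback for the abort sent from timeout_cb. Per the NVMe spec,
 * bit 0 of cdw0 set to 1 means the command was not aborted, so the controller
 * is reset as a fallback in both failure cases.
 */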
1607 static void
1608 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1609 {
1610 	struct nvme_ctrlr *nvme_ctrlr = ctx;
1611 
1612 	if (spdk_nvme_cpl_is_error(cpl)) {
1613 		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
1614 			     cpl->status.sct);
1615 		bdev_nvme_reset(nvme_ctrlr);
1616 	} else if (cpl->cdw0 & 0x1) {
1617 		SPDK_WARNLOG("Specified command could not be aborted.\n");
1618 		bdev_nvme_reset(nvme_ctrlr);
1619 	}
1620 }
1621 
1622 static void
1623 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1624 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1625 {
1626 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
1627 	union spdk_nvme_csts_register csts;
1628 	int rc;
1629 
1630 	assert(nvme_ctrlr->ctrlr == ctrlr);
1631 
1632 	SPDK_WARNLOG("Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1633 
1634 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
1635 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
1636 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
1637 	 * completion recursively.
1638 	 */
1639 	if (nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
1640 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1641 		if (csts.bits.cfs) {
1642 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1643 			bdev_nvme_reset(nvme_ctrlr);
1644 			return;
1645 		}
1646 	}
1647 
1648 	switch (g_opts.action_on_timeout) {
1649 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1650 		if (qpair) {
1651 			/* Don't send abort to ctrlr when reset is running. */
1652 			pthread_mutex_lock(&nvme_ctrlr->mutex);
1653 			if (nvme_ctrlr->resetting) {
1654 				pthread_mutex_unlock(&nvme_ctrlr->mutex);
1655 				SPDK_NOTICELOG("Skipping abort; ctrlr is in the process of resetting.\n");
1656 				return;
1657 			}
1658 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
1659 
1660 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1661 						       nvme_abort_cpl, nvme_ctrlr);
1662 			if (rc == 0) {
1663 				return;
1664 			}
1665 
1666 			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
1667 		}
1668 
1669 	/* FALLTHROUGH */
1670 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1671 		bdev_nvme_reset(nvme_ctrlr);
1672 		break;
1673 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1674 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1675 		break;
1676 	default:
1677 		SPDK_ERRLOG("Invalid timeout action value.\n");
1678 		break;
1679 	}
1680 }
1681 
1682 static void
1683 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns,
1684 			      struct nvme_async_probe_ctx *ctx)
1685 {
1686 	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
1687 	struct spdk_nvme_ns	*ns;
1688 	int			rc = 0;
1689 
1690 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1691 	if (!ns) {
1692 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
1693 		rc = -EINVAL;
1694 		goto done;
1695 	}
1696 
1697 	nvme_ns->ns = ns;
1698 	nvme_ns->populated = true;
1699 	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
1700 
1701 	if (nvme_ctrlr->ana_log_page != NULL) {
1702 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
1703 	}
1704 
1705 	rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
1706 
1707 done:
1708 	if (rc == 0) {
1709 		pthread_mutex_lock(&nvme_ctrlr->mutex);
1710 		nvme_ctrlr->ref++;
1711 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1712 	} else {
1713 		memset(nvme_ns, 0, sizeof(*nvme_ns));
1714 	}
1715 
1716 	if (ctx) {
1717 		ctx->populates_in_progress--;
1718 		if (ctx->populates_in_progress == 0) {
1719 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
1720 		}
1721 	}
1722 }
1723 
1724 static void
1725 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
1726 {
1727 	struct nvme_bdev *bdev;
1728 
1729 	bdev = nvme_ns->bdev;
1730 	if (bdev != NULL) {
1731 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1732 	}
1733 
1734 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1735 
1736 	nvme_ns->populated = false;
1737 
1738 	if (nvme_ns->bdev != NULL) {
1739 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1740 		return;
1741 	}
1742 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1743 
1744 	nvme_ctrlr_release(nvme_ctrlr);
1745 }
1746 
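/* Reconcile the bdev view of namespaces with the controller's active
 * namespace list: resize bdevs whose namespaces changed size, populate newly
 * active namespaces, and depopulate ones that are no longer active.
 */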
1747 static void
1748 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
1749 			       struct nvme_async_probe_ctx *ctx)
1750 {
1751 	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
1752 	struct nvme_ns	*nvme_ns;
1753 	struct spdk_nvme_ns	*ns;
1754 	struct nvme_bdev	*bdev;
1755 	uint32_t		i;
1756 	int			rc;
1757 	uint64_t		num_sectors;
1758 	bool			ns_is_active;
1759 
1760 	if (ctx) {
1761 		/* Initialize this count to 1 to handle the populate functions
1762 		 * calling nvme_ctrlr_populate_namespaces_done() immediately.
1763 		 */
1764 		ctx->populates_in_progress = 1;
1765 	}
1766 
1767 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
1768 		uint32_t	nsid = i + 1;
1769 
1770 		nvme_ns = nvme_ctrlr->namespaces[i];
1771 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1772 
1773 		if (nvme_ns->populated && ns_is_active) {
1774 			/* NS is still there but attributes may have changed */
1775 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1776 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
1777 			bdev = nvme_ns->bdev;
1778 			assert(bdev != NULL);
1779 			if (bdev->disk.blockcnt != num_sectors) {
1780 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
1781 					       nsid,
1782 					       bdev->disk.name,
1783 					       bdev->disk.blockcnt,
1784 					       num_sectors);
1785 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1786 				if (rc != 0) {
1787 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1788 						    bdev->disk.name, rc);
1789 				}
1790 			}
1791 		}
1792 
1793 		if (!nvme_ns->populated && ns_is_active) {
1794 			nvme_ns->id = nsid;
1795 			nvme_ns->ctrlr = nvme_ctrlr;
1796 
1797 			nvme_ns->bdev = NULL;
1798 
1799 			if (ctx) {
1800 				ctx->populates_in_progress++;
1801 			}
1802 			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns, ctx);
1803 		}
1804 
1805 		if (nvme_ns->populated && !ns_is_active) {
1806 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
1807 		}
1808 	}
1809 
1810 	if (ctx) {
1811 		/* Decrement this count now that the loop is over to account
1812 		 * for the one we started with.  If the count is then 0, we
1813 		 * know any populate_namespace functions completed immediately,
1814 		 * so we'll kick the callback here.
1815 		 */
1816 		ctx->populates_in_progress--;
1817 		if (ctx->populates_in_progress == 0) {
1818 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
1819 		}
1820 	}
1821 
1822 }
1823 
1824 static void
1825 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
1826 {
1827 	uint32_t i;
1828 	struct nvme_ns *nvme_ns;
1829 
1830 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
1831 		uint32_t nsid = i + 1;
1832 
1833 		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
1834 		if (nvme_ns->populated) {
1835 			assert(nvme_ns->id == nsid);
1836 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
1837 		}
1838 	}
1839 }
1840 
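/* Take a reference on the controller unless it is being destructed or reset;
 * returns false (and takes no reference) in those cases.
 */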
1841 static bool
1842 nvme_ctrlr_acquire(struct nvme_ctrlr *nvme_ctrlr)
1843 {
1844 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1845 	if (nvme_ctrlr->destruct || nvme_ctrlr->resetting) {
1846 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1847 		return false;
1848 	}
1849 	nvme_ctrlr->ref++;
1850 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1851 	return true;
1852 }
1853 
1854 static int
1855 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
1856 			  void *cb_arg)
1857 {
1858 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
1859 	struct nvme_ns *nvme_ns;
1860 	uint32_t i, nsid;
1861 
1862 	for (i = 0; i < desc->num_of_nsid; i++) {
1863 		nsid = desc->nsid[i];
1864 		if (nsid == 0 || nsid > nvme_ctrlr->num_ns) {
1865 			continue;
1866 		}
1867 
1868 		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
1869 		assert(nvme_ns != NULL);
1870 
1871 		if (!nvme_ns->populated) {
1872 			continue;
1873 		}
1874 
1875 		nvme_ns->ana_group_id = desc->ana_group_id;
1876 		nvme_ns->ana_state = desc->ana_state;
1877 	}
1878 
1879 	return 0;
1880 }
1881 
1882 static void
1883 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
1884 {
1885 	struct nvme_ctrlr *nvme_ctrlr = ctx;
1886 
1887 	if (spdk_nvme_cpl_is_success(cpl)) {
1888 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
1889 					     nvme_ctrlr);
1890 	}
1891 
1892 	nvme_ctrlr_release(nvme_ctrlr);
1893 }
1894 
1895 static void
1896 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
1897 {
1898 	int rc;
1899 
1900 	if (nvme_ctrlr->ana_log_page == NULL) {
1901 		return;
1902 	}
1903 
1904 	if (!nvme_ctrlr_acquire(nvme_ctrlr)) {
1905 		return;
1906 	}
1907 
1908 	rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
1909 					      SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
1910 					      SPDK_NVME_GLOBAL_NS_TAG,
1911 					      nvme_ctrlr->ana_log_page,
1912 					      nvme_ctrlr->ana_log_page_size, 0,
1913 					      nvme_ctrlr_read_ana_log_page_done,
1914 					      nvme_ctrlr);
1915 	if (rc != 0) {
1916 		nvme_ctrlr_release(nvme_ctrlr);
1917 	}
1918 }
1919 
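/* Asynchronous Event Request completion handler. A namespace attribute change
 * triggers a namespace re-scan; an ANA change triggers a re-read of the ANA
 * log page so the per-namespace ANA states can be refreshed.
 */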
1920 static void
1921 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1922 {
1923 	struct nvme_ctrlr *nvme_ctrlr		= arg;
1924 	union spdk_nvme_async_event_completion	event;
1925 
1926 	if (spdk_nvme_cpl_is_error(cpl)) {
1927 		SPDK_WARNLOG("AER request execution failed\n");
1928 		return;
1929 	}
1930 
1931 	event.raw = cpl->cdw0;
1932 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1933 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1934 		nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
1935 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1936 		   (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
1937 		nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
1938 	}
1939 }
1940 
1941 static void
1942 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1943 {
1944 	if (ctx->cb_fn) {
1945 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1946 	}
1947 
1948 	ctx->namespaces_populated = true;
1949 	if (ctx->probe_done) {
1950 		/* The probe was already completed, so we need to free the context
1951 		 * here.  This can happen for cases like OCSSD, where we need to
1952 		 * send additional commands to the SSD after attach.
1953 		 */
1954 		free(ctx);
1955 	}
1956 }
1957 
1958 static void
1959 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
1960 		       struct nvme_async_probe_ctx *ctx)
1961 {
1962 	spdk_io_device_register(nvme_ctrlr,
1963 				bdev_nvme_create_ctrlr_channel_cb,
1964 				bdev_nvme_destroy_ctrlr_channel_cb,
1965 				sizeof(struct nvme_ctrlr_channel),
1966 				nvme_ctrlr->name);
1967 
1968 	nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
1969 }
1970 
1971 static void
1972 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
1973 {
1974 	struct nvme_ctrlr *nvme_ctrlr = _ctx;
1975 	struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
1976 
1977 	nvme_ctrlr->probe_ctx = NULL;
1978 
1979 	if (spdk_nvme_cpl_is_error(cpl)) {
1980 		nvme_ctrlr_delete(nvme_ctrlr);
1981 
1982 		if (ctx != NULL) {
1983 			populate_namespaces_cb(ctx, 0, -1);
1984 		}
1985 		return;
1986 	}
1987 
1988 	nvme_ctrlr_create_done(nvme_ctrlr, ctx);
1989 }
1990 
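/* Allocate the ANA log page buffer and issue the initial Get Log Page command.
 * The buffer is sized for the worst case reported by Identify Controller:
 * the page header, one descriptor per ANA group (nanagrpid), and one NSID
 * entry per namespace (nn), since every namespace could land in one group.
 */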
1991 static int
1992 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
1993 			     struct nvme_async_probe_ctx *ctx)
1994 {
1995 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
1996 	const struct spdk_nvme_ctrlr_data *cdata;
1997 	uint32_t ana_log_page_size;
1998 
1999 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2000 
2001 	ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
2002 			    sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn *
2003 			    sizeof(uint32_t);
2004 
2005 	nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
2006 						SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
2007 	if (nvme_ctrlr->ana_log_page == NULL) {
2008 		SPDK_ERRLOG("could not allocate ANA log page buffer\n");
2009 		return -ENXIO;
2010 	}
2011 
2012 	/* Each descriptor in an ANA log page is not guaranteed to be 8-byte aligned.
2013 	 * Hence, copy each descriptor to a temporary area when parsing it.
2014 	 *
2015 	 * Allocate a buffer whose size is as large as ANA log page buffer because
2016 	 * we do not know the size of a descriptor until actually reading it.
2017 	 */
2018 	nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
2019 	if (nvme_ctrlr->copied_ana_desc == NULL) {
2020 		SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
2021 		return -ENOMEM;
2022 	}
2023 
2024 	nvme_ctrlr->ana_log_page_size = ana_log_page_size;
2025 
2026 	nvme_ctrlr->probe_ctx = ctx;
2027 
2028 	return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
2029 						SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
2030 						SPDK_NVME_GLOBAL_NS_TAG,
2031 						nvme_ctrlr->ana_log_page,
2032 						nvme_ctrlr->ana_log_page_size, 0,
2033 						nvme_ctrlr_init_ana_log_page_done,
2034 						nvme_ctrlr);
2035 }
2036 
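/* Create the nvme_ctrlr wrapper for a newly attached spdk_nvme_ctrlr:
 * allocate per-namespace bookkeeping, record the connected trid, register
 * the admin queue poller plus timeout/AER/remove callbacks, and finish via
 * nvme_ctrlr_init_ana_log_page() (when ANA reporting is supported) or
 * nvme_ctrlr_create_done(). Any failure tears the controller down again.
 */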
2037 static int
2038 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
2039 		  const char *name,
2040 		  const struct spdk_nvme_transport_id *trid,
2041 		  uint32_t prchk_flags,
2042 		  struct nvme_async_probe_ctx *ctx)
2043 {
2044 	struct nvme_ctrlr *nvme_ctrlr;
2045 	struct nvme_ctrlr_trid *trid_entry;
2046 	uint32_t i, num_ns;
2047 	const struct spdk_nvme_ctrlr_data *cdata;
2048 	int rc;
2049 
2050 	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
2051 	if (nvme_ctrlr == NULL) {
2052 		SPDK_ERRLOG("Failed to allocate device struct\n");
2053 		return -ENOMEM;
2054 	}
2055 
2056 	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
2057 	if (rc != 0) {
2058 		free(nvme_ctrlr);
2059 		return rc;
2060 	}
2061 
2062 	TAILQ_INIT(&nvme_ctrlr->trids);
2063 
2064 	num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
2065 	if (num_ns != 0) {
2066 		nvme_ctrlr->namespaces = calloc(num_ns, sizeof(struct nvme_ns *));
2067 		if (!nvme_ctrlr->namespaces) {
2068 			SPDK_ERRLOG("Failed to allocate block namespaces pointer array\n");
2069 			rc = -ENOMEM;
2070 			goto err;
2071 		}
2072 
2073 		for (i = 0; i < num_ns; i++) {
2074 			nvme_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_ns));
2075 			if (nvme_ctrlr->namespaces[i] == NULL) {
2076 				SPDK_ERRLOG("Failed to allocate block namespace struct\n");
2077 				rc = -ENOMEM;
2078 				goto err;
2079 			}
2080 			nvme_ctrlr->num_ns++;
2081 		}
2082 
2083 		assert(num_ns == nvme_ctrlr->num_ns);
2084 	}
2085 
2086 	trid_entry = calloc(1, sizeof(*trid_entry));
2087 	if (trid_entry == NULL) {
2088 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
2089 		rc = -ENOMEM;
2090 		goto err;
2091 	}
2092 
2093 	trid_entry->trid = *trid;
2094 	nvme_ctrlr->connected_trid = &trid_entry->trid;
2095 	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, trid_entry, link);
2096 
2097 	nvme_ctrlr->thread = spdk_get_thread();
2098 	nvme_ctrlr->ctrlr = ctrlr;
2099 	nvme_ctrlr->ref = 1;
2100 	nvme_ctrlr->name = strdup(name);
2101 	if (nvme_ctrlr->name == NULL) {
2102 		rc = -ENOMEM;
2103 		goto err;
2104 	}
2105 
2106 	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
2107 		SPDK_ERRLOG("OCSSDs are not supported\n");
2108 		rc = -ENOTSUP;
2109 		goto err;
2110 	}
2111 
2112 	nvme_ctrlr->prchk_flags = prchk_flags;
2113 
2114 	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
2115 					  g_opts.nvme_adminq_poll_period_us);
2116 
2117 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2118 	TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, nvme_ctrlr, tailq);
2119 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2120 
2121 	if (g_opts.timeout_us > 0) {
2122 		/* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
2123 		/* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
2124 		uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
2125 					  g_opts.timeout_us : g_opts.timeout_admin_us;
2126 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
2127 				adm_timeout_us, timeout_cb, nvme_ctrlr);
2128 	}
2129 
2130 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
2131 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
2132 
2133 	if (spdk_nvme_ctrlr_get_flags(ctrlr) &
2134 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
2135 		nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
2136 	}
2137 
2138 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2139 
2140 	if (cdata->cmic.ana_reporting) {
2141 		rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
2142 		if (rc == 0) {
2143 			return 0;
2144 		}
2145 	} else {
2146 		nvme_ctrlr_create_done(nvme_ctrlr, ctx);
2147 		return 0;
2148 	}
2149 
2150 err:
2151 	nvme_ctrlr_delete(nvme_ctrlr);
2152 	return rc;
2153 }
2154 
2155 static void
2156 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2157 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2158 {
2159 	struct nvme_probe_ctx *ctx = cb_ctx;
2160 	char *name = NULL;
2161 	uint32_t prchk_flags = 0;
2162 	size_t i;
2163 
2164 	if (ctx) {
2165 		for (i = 0; i < ctx->count; i++) {
2166 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
2167 				prchk_flags = ctx->prchk_flags[i];
2168 				name = strdup(ctx->names[i]);
2169 				break;
2170 			}
2171 		}
2172 	} else {
2173 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
2174 	}
2175 	if (!name) {
2176 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
2177 		return;
2178 	}
2179 
2180 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
2181 
2182 	nvme_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL);
2183 
2184 	free(name);
2185 }
2186 
2187 static void
2188 _nvme_ctrlr_destruct(void *ctx)
2189 {
2190 	struct nvme_ctrlr *nvme_ctrlr = ctx;
2191 
2192 	nvme_ctrlr_depopulate_namespaces(nvme_ctrlr);
2193 	nvme_ctrlr_release(nvme_ctrlr);
2194 }
2195 
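/* Begin destruction of a controller. For a user-requested delete of a PCIe
 * controller (as opposed to a hotplug removal), the trid is remembered in a
 * skip list so the hotplug poller does not immediately re-attach the device.
 */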
2196 static int
2197 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
2198 {
2199 	struct nvme_probe_skip_entry *entry;
2200 
2201 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2202 
2203 	/* The controller's destruction was already started */
2204 	if (nvme_ctrlr->destruct) {
2205 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2206 		return 0;
2207 	}
2208 
2209 	if (!hotplug &&
2210 	    nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2211 		entry = calloc(1, sizeof(*entry));
2212 		if (!entry) {
2213 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
2214 			return -ENOMEM;
2215 		}
2216 		entry->trid = *nvme_ctrlr->connected_trid;
2217 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
2218 	}
2219 
2220 	nvme_ctrlr->destruct = true;
2221 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2222 
2223 	_nvme_ctrlr_destruct(nvme_ctrlr);
2224 
2225 	return 0;
2226 }
2227 
2228 static void
2229 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
2230 {
2231 	struct nvme_ctrlr *nvme_ctrlr = cb_ctx;
2232 
2233 	_bdev_nvme_delete(nvme_ctrlr, true);
2234 }
2235 
2236 static int
2237 bdev_nvme_hotplug_probe(void *arg)
2238 {
2239 	if (g_hotplug_probe_ctx == NULL) {
2240 		spdk_poller_unregister(&g_hotplug_probe_poller);
2241 		return SPDK_POLLER_IDLE;
2242 	}
2243 
2244 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
2245 		g_hotplug_probe_ctx = NULL;
2246 		spdk_poller_unregister(&g_hotplug_probe_poller);
2247 	}
2248 
2249 	return SPDK_POLLER_BUSY;
2250 }
2251 
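/* Periodic hotplug poller: kick off an async PCIe probe for newly inserted
 * devices. While a probe is outstanding, bdev_nvme_hotplug_probe() polls it
 * on a faster (1 ms) poller until it completes.
 */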
2252 static int
2253 bdev_nvme_hotplug(void *arg)
2254 {
2255 	struct spdk_nvme_transport_id trid_pcie;
2256 
2257 	if (g_hotplug_probe_ctx) {
2258 		return SPDK_POLLER_BUSY;
2259 	}
2260 
2261 	memset(&trid_pcie, 0, sizeof(trid_pcie));
2262 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
2263 
2264 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
2265 			      hotplug_probe_cb, attach_cb, NULL);
2266 
2267 	if (g_hotplug_probe_ctx) {
2268 		assert(g_hotplug_probe_poller == NULL);
2269 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
2270 	}
2271 
2272 	return SPDK_POLLER_BUSY;
2273 }
2274 
2275 void
2276 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
2277 {
2278 	*opts = g_opts;
2279 }
2280 
2281 static int
2282 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
2283 {
2284 	if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
2285 		/* Can't set timeout_admin_us without also setting timeout_us */
2286 		SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
2287 		return -EINVAL;
2288 	}
2289 
2290 	return 0;
2291 }
2292 
2293 int
2294 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
2295 {
2296 	int ret = bdev_nvme_validate_opts(opts);
2297 	if (ret) {
2298 		SPDK_WARNLOG("Failed to set nvme opts.\n");
2299 		return ret;
2300 	}
2301 
2302 	if (g_bdev_nvme_init_thread != NULL) {
2303 		if (!TAILQ_EMPTY(&g_nvme_ctrlrs)) {
2304 			return -EPERM;
2305 		}
2306 	}
2307 
2308 	g_opts = *opts;
2309 
2310 	return 0;
2311 }
2312 
2313 struct set_nvme_hotplug_ctx {
2314 	uint64_t period_us;
2315 	bool enabled;
2316 	spdk_msg_fn fn;
2317 	void *fn_ctx;
2318 };
2319 
2320 static void
2321 set_nvme_hotplug_period_cb(void *_ctx)
2322 {
2323 	struct set_nvme_hotplug_ctx *ctx = _ctx;
2324 
2325 	spdk_poller_unregister(&g_hotplug_poller);
2326 	if (ctx->enabled) {
2327 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
2328 	}
2329 
2330 	g_nvme_hotplug_poll_period_us = ctx->period_us;
2331 	g_nvme_hotplug_enabled = ctx->enabled;
2332 	if (ctx->fn) {
2333 		ctx->fn(ctx->fn_ctx);
2334 	}
2335 
2336 	free(ctx);
2337 }
2338 
2339 int
2340 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
2341 {
2342 	struct set_nvme_hotplug_ctx *ctx;
2343 
2344 	if (enabled == true && !spdk_process_is_primary()) {
2345 		return -EPERM;
2346 	}
2347 
2348 	ctx = calloc(1, sizeof(*ctx));
2349 	if (ctx == NULL) {
2350 		return -ENOMEM;
2351 	}
2352 
2353 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
2354 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
2355 	ctx->enabled = enabled;
2356 	ctx->fn = cb;
2357 	ctx->fn_ctx = cb_ctx;
2358 
2359 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
2360 	return 0;
2361 }
2362 
2363 static void
2364 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
2365 				    struct nvme_async_probe_ctx *ctx)
2366 {
2367 	struct nvme_ns	*nvme_ns;
2368 	struct nvme_bdev	*nvme_bdev;
2369 	uint32_t		i, nsid;
2370 	size_t			j;
2371 
2372 	assert(nvme_ctrlr != NULL);
2373 
2374 	/*
2375 	 * Report the new bdevs that were created in this call.
2376 	 * There can be more than one bdev per NVMe controller.
2377 	 */
2378 	j = 0;
2379 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
2380 		nsid = i + 1;
2381 		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
2382 		if (!nvme_ns->populated) {
2383 			continue;
2384 		}
2385 		assert(nvme_ns->id == nsid);
2386 		nvme_bdev = nvme_ns->bdev;
2387 		if (j < ctx->count) {
2388 			ctx->names[j] = nvme_bdev->disk.name;
2389 			j++;
2390 		} else {
2391 			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
2392 				    ctx->count);
2393 			populate_namespaces_cb(ctx, 0, -ERANGE);
2394 			return;
2395 		}
2396 	}
2397 
2398 	populate_namespaces_cb(ctx, j, 0);
2399 }
2400 
2401 static int
2402 bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr,
2403 			struct spdk_nvme_ctrlr *new_ctrlr,
2404 			struct spdk_nvme_transport_id *trid)
2405 {
2406 	struct nvme_ctrlr_trid *tmp_trid;
2407 
2408 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2409 		SPDK_ERRLOG("PCIe failover is not supported.\n");
2410 		return -ENOTSUP;
2411 	}
2412 
2413 	/* Currently we only support failover to the same transport type. */
2414 	if (nvme_ctrlr->connected_trid->trtype != trid->trtype) {
2415 		return -EINVAL;
2416 	}
2417 
2418 	/* Currently we only support failover to the same NQN. */
2419 	if (strncmp(trid->subnqn, nvme_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
2420 		return -EINVAL;
2421 	}
2422 
2423 	/* Skip all the other checks if we've already registered this path. */
2424 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
2425 		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
2426 			return -EEXIST;
2427 		}
2428 	}
2429 
2430 	return 0;
2431 }
2432 
2433 static int
2434 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr,
2435 			     struct spdk_nvme_ctrlr *new_ctrlr)
2436 {
2437 	uint32_t i, nsid;
2438 	struct nvme_ns *nvme_ns;
2439 	struct spdk_nvme_ns *new_ns;
2440 
2441 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_ctrlr->num_ns) {
2442 		return -EINVAL;
2443 	}
2444 
2445 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
2446 		nsid = i + 1;
2447 
2448 		nvme_ns = nvme_ctrlr->namespaces[i];
2449 		if (!nvme_ns->populated) {
2450 			continue;
2451 		}
2452 
2453 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
2454 		assert(new_ns != NULL);
2455 
2456 		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
2457 			return -EINVAL;
2458 		}
2459 	}
2460 
2461 	return 0;
2462 }
2463 
2464 static int
2465 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2466 			      struct spdk_nvme_transport_id *trid)
2467 {
2468 	struct nvme_ctrlr_trid *new_trid, *tmp_trid;
2469 
2470 	new_trid = calloc(1, sizeof(*new_trid));
2471 	if (new_trid == NULL) {
2472 		return -ENOMEM;
2473 	}
2474 	new_trid->trid = *trid;
2475 	new_trid->is_failed = false;
2476 
2477 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
2478 		if (tmp_trid->is_failed) {
2479 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
2480 			return 0;
2481 		}
2482 	}
2483 
2484 	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
2485 	return 0;
2486 }
2487 
2488 /* This handles the case where a secondary path is added to an existing
2489  * nvme_ctrlr for failover. After checking whether it can access the same
2490  * namespaces as the primary path, it is kept disconnected until failover occurs.
2491  */
2492 static int
2493 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2494 			     struct spdk_nvme_ctrlr *new_ctrlr,
2495 			     struct spdk_nvme_transport_id *trid)
2496 {
2497 	int rc;
2498 
2499 	assert(nvme_ctrlr != NULL);
2500 
2501 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2502 
2503 	rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid);
2504 	if (rc != 0) {
2505 		goto exit;
2506 	}
2507 
2508 	rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr);
2509 	if (rc != 0) {
2510 		goto exit;
2511 	}
2512 
2513 	rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
2514 
2515 exit:
2516 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2517 
2518 	spdk_nvme_detach(new_ctrlr);
2519 
2520 	return rc;
2521 }
2522 
2523 static void
2524 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2525 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2526 {
2527 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
2528 	struct nvme_async_probe_ctx *ctx;
2529 	int rc;
2530 
2531 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2532 	ctx->ctrlr_attached = true;
2533 
2534 	rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx);
2535 	if (rc != 0) {
2536 		populate_namespaces_cb(ctx, 0, rc);
2537 	}
2538 }
2539 
2540 static void
2541 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2542 			struct spdk_nvme_ctrlr *ctrlr,
2543 			const struct spdk_nvme_ctrlr_opts *opts)
2544 {
2545 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
2546 	struct nvme_ctrlr *nvme_ctrlr;
2547 	struct nvme_async_probe_ctx *ctx;
2548 	int rc;
2549 
2550 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2551 	ctx->ctrlr_attached = true;
2552 
2553 	nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
2554 	if (nvme_ctrlr) {
2555 		rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
2556 	} else {
2557 		rc = -ENODEV;
2558 	}
2559 
2560 	populate_namespaces_cb(ctx, 0, rc);
2561 }
2562 
2563 static int
2564 bdev_nvme_async_poll(void *arg)
2565 {
2566 	struct nvme_async_probe_ctx	*ctx = arg;
2567 	int				rc;
2568 
2569 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
2570 	if (spdk_unlikely(rc != -EAGAIN)) {
2571 		ctx->probe_done = true;
2572 		spdk_poller_unregister(&ctx->poller);
2573 		if (!ctx->ctrlr_attached) {
2574 			/* The probe is done, but no controller was attached.
2575 			 * That means we had a failure, so report -EIO back to
2576 			 * the caller (usually the RPC). populate_namespaces_cb()
2577 			 * will take care of freeing the nvme_async_probe_ctx.
2578 			 */
2579 			populate_namespaces_cb(ctx, 0, -EIO);
2580 		} else if (ctx->namespaces_populated) {
2581 			/* The namespaces for the attached controller were all
2582 			 * populated and the response was already sent to the
2583 			 * caller (usually the RPC).  So free the context here.
2584 			 */
2585 			free(ctx);
2586 		}
2587 	}
2588 
2589 	return SPDK_POLLER_BUSY;
2590 }
2591 
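/* Asynchronously connect to a controller and create bdevs for its active
 * namespaces. If base_name is new, the attach callback creates a fresh
 * nvme_ctrlr; if a controller with that name already exists, the new path is
 * registered as a secondary (failover) trid instead. Completion is driven by
 * bdev_nvme_async_poll() until spdk_nvme_probe_poll_async() finishes.
 */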
2592 int
2593 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
2594 		 struct spdk_nvme_host_id *hostid,
2595 		 const char *base_name,
2596 		 const char **names,
2597 		 uint32_t count,
2598 		 uint32_t prchk_flags,
2599 		 spdk_bdev_create_nvme_fn cb_fn,
2600 		 void *cb_ctx,
2601 		 struct spdk_nvme_ctrlr_opts *opts)
2602 {
2603 	struct nvme_probe_skip_entry	*entry, *tmp;
2604 	struct nvme_async_probe_ctx	*ctx;
2605 	spdk_nvme_attach_cb attach_cb;
2606 
2607 	/* TODO expand this check to include both the host and target TRIDs.
2608 	 * Only if both are the same should we fail.
2609 	 */
2610 	if (nvme_ctrlr_get(trid) != NULL) {
2611 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
2612 		return -EEXIST;
2613 	}
2614 
2615 	ctx = calloc(1, sizeof(*ctx));
2616 	if (!ctx) {
2617 		return -ENOMEM;
2618 	}
2619 	ctx->base_name = base_name;
2620 	ctx->names = names;
2621 	ctx->count = count;
2622 	ctx->cb_fn = cb_fn;
2623 	ctx->cb_ctx = cb_ctx;
2624 	ctx->prchk_flags = prchk_flags;
2625 	ctx->trid = *trid;
2626 
2627 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2628 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
2629 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2630 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2631 				free(entry);
2632 				break;
2633 			}
2634 		}
2635 	}
2636 
2637 	if (opts) {
2638 		memcpy(&ctx->opts, opts, sizeof(*opts));
2639 	} else {
2640 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
2641 	}
2642 
2643 	ctx->opts.transport_retry_count = g_opts.retry_count;
2644 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
2645 	ctx->opts.disable_read_ana_log_page = true;
2646 
2647 	if (hostid->hostaddr[0] != '\0') {
2648 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
2649 	}
2650 
2651 	if (hostid->hostsvcid[0] != '\0') {
2652 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
2653 	}
2654 
2655 	if (nvme_ctrlr_get_by_name(base_name) == NULL) {
2656 		attach_cb = connect_attach_cb;
2657 	} else {
2658 		attach_cb = connect_set_failover_cb;
2659 	}
2660 
2661 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, attach_cb);
2662 	if (ctx->probe_ctx == NULL) {
2663 		SPDK_ERRLOG("No controller was found with the provided trid (traddr: %s)\n", trid->traddr);
2664 		free(ctx);
2665 		return -ENODEV;
2666 	}
2667 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
2668 
2669 	return 0;
2670 }
2671 
2672 static int
2673 bdev_nvme_delete_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2674 				const struct spdk_nvme_transport_id *trid)
2675 {
2676 	struct nvme_ctrlr_trid	*ctrlr_trid, *tmp_trid;
2677 
2678 	if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) {
2679 		return -EBUSY;
2680 	}
2681 
2682 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_ctrlr->trids, link, tmp_trid) {
2683 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
2684 			TAILQ_REMOVE(&nvme_ctrlr->trids, ctrlr_trid, link);
2685 			free(ctrlr_trid);
2686 			return 0;
2687 		}
2688 	}
2689 
2690 	return -ENXIO;
2691 }
2692 
2693 int
2694 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid)
2695 {
2696 	struct nvme_ctrlr	*nvme_ctrlr;
2697 	struct nvme_ctrlr_trid	*ctrlr_trid;
2698 
2699 	if (name == NULL) {
2700 		return -EINVAL;
2701 	}
2702 
2703 	nvme_ctrlr = nvme_ctrlr_get_by_name(name);
2704 	if (nvme_ctrlr == NULL) {
2705 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2706 		return -ENODEV;
2707 	}
2708 
2709 	/* case 1: remove the controller itself. */
2710 	if (trid == NULL) {
2711 		return _bdev_nvme_delete(nvme_ctrlr, false);
2712 	}
2713 
2714 	/* case 2: we are currently using the path to be removed. */
2715 	if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) {
2716 		ctrlr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
2717 		assert(nvme_ctrlr->connected_trid == &ctrlr_trid->trid);
2718 		/* case 2A: the current path is the only path. */
2719 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
2720 			return _bdev_nvme_delete(nvme_ctrlr, false);
2721 		}
2722 
2723 		/* case 2B: there is an alternative path. */
2724 		return bdev_nvme_failover(nvme_ctrlr, true);
2725 	}
2726 
2727 	/* case 3: We are not using the specified path. */
2728 	return bdev_nvme_delete_secondary_trid(nvme_ctrlr, trid);
2729 }
2730 
2731 static int
2732 bdev_nvme_library_init(void)
2733 {
2734 	g_bdev_nvme_init_thread = spdk_get_thread();
2735 
2736 	spdk_io_device_register(&g_nvme_ctrlrs, bdev_nvme_create_poll_group_cb,
2737 				bdev_nvme_destroy_poll_group_cb,
2738 				sizeof(struct nvme_poll_group),  "nvme_poll_groups");
2739 
2740 	return 0;
2741 }
2742 
2743 static void
2744 bdev_nvme_library_fini(void)
2745 {
2746 	struct nvme_ctrlr *nvme_ctrlr, *tmp;
2747 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2748 
2749 	spdk_poller_unregister(&g_hotplug_poller);
2750 	free(g_hotplug_probe_ctx);
2751 	g_hotplug_probe_ctx = NULL;
2752 
2753 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2754 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2755 		free(entry);
2756 	}
2757 
2758 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2759 	TAILQ_FOREACH_SAFE(nvme_ctrlr, &g_nvme_ctrlrs, tailq, tmp) {
2760 		pthread_mutex_lock(&nvme_ctrlr->mutex);
2761 		if (nvme_ctrlr->destruct) {
2762 			/* This controller's destruction was already started
2763 			 * before the application started shutting down
2764 			 */
2765 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
2766 			continue;
2767 		}
2768 		nvme_ctrlr->destruct = true;
2769 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2770 
2771 		spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
2772 				     nvme_ctrlr);
2773 	}
2774 
2775 	g_bdev_nvme_module_finish = true;
2776 	if (TAILQ_EMPTY(&g_nvme_ctrlrs)) {
2777 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2778 		spdk_io_device_unregister(&g_nvme_ctrlrs, NULL);
2779 		spdk_bdev_module_fini_done();
2780 		return;
2781 	}
2782 
2783 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2784 }
2785 
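/* The device reported a Protection Information error, so re-run DIF/DIX
 * verification on the host to locate and log the failing block. This is
 * diagnostic only; callers still complete the I/O with the saved status.
 */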
2786 static void
2787 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
2788 {
2789 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2790 	struct spdk_bdev *bdev = bdev_io->bdev;
2791 	struct spdk_dif_ctx dif_ctx;
2792 	struct spdk_dif_error err_blk = {};
2793 	int rc;
2794 
2795 	rc = spdk_dif_ctx_init(&dif_ctx,
2796 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2797 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2798 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2799 	if (rc != 0) {
2800 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2801 		return;
2802 	}
2803 
2804 	if (bdev->md_interleave) {
2805 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2806 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2807 	} else {
2808 		struct iovec md_iov = {
2809 			.iov_base	= bdev_io->u.bdev.md_buf,
2810 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2811 		};
2812 
2813 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2814 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2815 	}
2816 
2817 	if (rc != 0) {
2818 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2819 			    err_blk.err_type, err_blk.err_offset);
2820 	} else {
2821 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2822 	}
2823 }
2824 
2825 static void
2826 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2827 {
2828 	struct nvme_bdev_io *bio = ref;
2829 
2830 	if (spdk_nvme_cpl_is_success(cpl)) {
2831 		/* Run PI verification for read data buffer. */
2832 		bdev_nvme_verify_pi_error(bio);
2833 	}
2834 
2835 	/* Return original completion status */
2836 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
2837 }
2838 
2839 static void
2840 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2841 {
2842 	struct nvme_bdev_io *bio = ref;
2843 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2844 	struct nvme_bdev_channel *nbdev_ch;
2845 	struct spdk_nvme_ns *ns;
2846 	struct spdk_nvme_qpair *qpair;
2847 	int ret;
2848 
2849 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2850 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2851 			    cpl->status.sct, cpl->status.sc);
2852 
2853 		/* Save completion status to use after verifying PI error. */
2854 		bio->cpl = *cpl;
2855 
2856 		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2857 
2858 		if (spdk_likely(bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
2859 			/* Read without PI checking to verify PI error. */
2860 			ret = bdev_nvme_no_pi_readv(ns,
2861 						    qpair,
2862 						    bio,
2863 						    bdev_io->u.bdev.iovs,
2864 						    bdev_io->u.bdev.iovcnt,
2865 						    bdev_io->u.bdev.md_buf,
2866 						    bdev_io->u.bdev.num_blocks,
2867 						    bdev_io->u.bdev.offset_blocks);
2868 			if (ret == 0) {
2869 				return;
2870 			}
2871 		}
2872 	}
2873 
2874 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2875 }
2876 
2877 static void
2878 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2879 {
2880 	struct nvme_bdev_io *bio = ref;
2881 
2882 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2883 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2884 			    cpl->status.sct, cpl->status.sc);
2885 		/* Run PI verification for write data buffer if PI error is detected. */
2886 		bdev_nvme_verify_pi_error(bio);
2887 	}
2888 
2889 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2890 }
2891 
2892 static void
2893 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2894 {
2895 	struct nvme_bdev_io *bio = ref;
2896 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2897 
2898 	/* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
2899 	 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
2900 	 */
2901 	bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
2902 
2903 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2904 		SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
2905 			    cpl->status.sct, cpl->status.sc);
2906 		/* Run PI verification for zone append data buffer if PI error is detected. */
2907 		bdev_nvme_verify_pi_error(bio);
2908 	}
2909 
2910 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2911 }
2912 
2913 static void
2914 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2915 {
2916 	struct nvme_bdev_io *bio = ref;
2917 
2918 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2919 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2920 			    cpl->status.sct, cpl->status.sc);
2921 		/* Run PI verification for compare data buffer if PI error is detected. */
2922 		bdev_nvme_verify_pi_error(bio);
2923 	}
2924 
2925 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2926 }
2927 
2928 static void
2929 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2930 {
2931 	struct nvme_bdev_io *bio = ref;
2932 
2933 	/* Compare operation completion */
2934 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2935 		/* Save compare result for write callback */
2936 		bio->cpl = *cpl;
2937 		return;
2938 	}
2939 
2940 	/* Write operation completion */
2941 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2942 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2943 		 * complete the IO with the compare operation's status.
2944 		 */
2945 		if (!spdk_nvme_cpl_is_error(cpl)) {
2946 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2947 		}
2948 
2949 		bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
2950 	} else {
2951 		bdev_nvme_io_complete_nvme_status(bio, cpl);
2952 	}
2953 }
2954 
2955 static void
2956 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2957 {
2958 	struct nvme_bdev_io *bio = ref;
2959 
2960 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2961 }
2962 
2963 static int
2964 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
2965 {
2966 	switch (desc->zs) {
2967 	case SPDK_NVME_ZONE_STATE_EMPTY:
2968 		info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
2969 		break;
2970 	case SPDK_NVME_ZONE_STATE_IOPEN:
2971 		info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
2972 		break;
2973 	case SPDK_NVME_ZONE_STATE_EOPEN:
2974 		info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
2975 		break;
2976 	case SPDK_NVME_ZONE_STATE_CLOSED:
2977 		info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
2978 		break;
2979 	case SPDK_NVME_ZONE_STATE_RONLY:
2980 		info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
2981 		break;
2982 	case SPDK_NVME_ZONE_STATE_FULL:
2983 		info->state = SPDK_BDEV_ZONE_STATE_FULL;
2984 		break;
2985 	case SPDK_NVME_ZONE_STATE_OFFLINE:
2986 		info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
2987 		break;
2988 	default:
2989 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
2990 		return -EIO;
2991 	}
2992 
2993 	info->zone_id = desc->zslba;
2994 	info->write_pointer = desc->wp;
2995 	info->capacity = desc->zcap;
2996 
2997 	return 0;
2998 }
2999 
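/* Completion handler for a Zone Management Receive (report zones) command.
 * Descriptors from the current report are copied into the user's buffer; if
 * more zones were requested than one report could carry, the next report is
 * issued starting at the following zone, chaining back into this callback.
 */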
3000 static void
3001 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
3002 {
3003 	struct nvme_bdev_io *bio = ref;
3004 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3005 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
3006 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
3007 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
3008 	uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
3009 	struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
3010 	uint64_t max_zones_per_buf, i;
3011 	uint32_t zone_report_bufsize;
3012 	struct spdk_nvme_ns *ns;
3013 	struct spdk_nvme_qpair *qpair;
3014 	int ret;
3015 
3016 	if (spdk_nvme_cpl_is_error(cpl)) {
3017 		goto out_complete_io_nvme_cpl;
3018 	}
3019 
3020 	if (!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair)) {
3021 		ret = -ENXIO;
3022 		goto out_complete_io_ret;
3023 	}
3024 
3025 	zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
3026 	max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
3027 			    sizeof(bio->zone_report_buf->descs[0]);
3028 
3029 	if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
3030 		ret = -EINVAL;
3031 		goto out_complete_io_ret;
3032 	}
3033 
3034 	if (!bio->zone_report_buf->nr_zones) {
3035 		ret = -EINVAL;
3036 		goto out_complete_io_ret;
3037 	}
3038 
3039 	for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
3040 		ret = fill_zone_from_report(&info[bio->handled_zones],
3041 					    &bio->zone_report_buf->descs[i]);
3042 		if (ret) {
3043 			goto out_complete_io_ret;
3044 		}
3045 		bio->handled_zones++;
3046 	}
3047 
3048 	if (bio->handled_zones < zones_to_copy) {
3049 		uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
3050 		uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
3051 
3052 		memset(bio->zone_report_buf, 0, zone_report_bufsize);
3053 		ret = spdk_nvme_zns_report_zones(ns, qpair,
3054 						 bio->zone_report_buf, zone_report_bufsize,
3055 						 slba, SPDK_NVME_ZRA_LIST_ALL, true,
3056 						 bdev_nvme_get_zone_info_done, bio);
3057 		if (!ret) {
3058 			return;
3059 		} else {
3060 			goto out_complete_io_ret;
3061 		}
3062 	}
3063 
3064 out_complete_io_nvme_cpl:
3065 	free(bio->zone_report_buf);
3066 	bio->zone_report_buf = NULL;
3067 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3068 	return;
3069 
3070 out_complete_io_ret:
3071 	free(bio->zone_report_buf);
3072 	bio->zone_report_buf = NULL;
3073 	bdev_nvme_io_complete(bio, ret);
3074 }
3075 
3076 static void
3077 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
3078 {
3079 	struct nvme_bdev_io *bio = ref;
3080 
3081 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3082 }
3083 
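/* Admin and abort commands complete on the thread that polls the admin queue,
 * so their completions are forwarded back to the originating thread with
 * spdk_thread_send_msg() before the bdev I/O is completed.
 */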
3084 static void
3085 bdev_nvme_admin_passthru_completion(void *ctx)
3086 {
3087 	struct nvme_bdev_io *bio = ctx;
3088 
3089 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
3090 }
3091 
3092 static void
3093 bdev_nvme_abort_completion(void *ctx)
3094 {
3095 	struct nvme_bdev_io *bio = ctx;
3096 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3097 
3098 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
3099 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
3100 	} else {
3101 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3102 	}
3103 }
3104 
3105 static void
3106 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
3107 {
3108 	struct nvme_bdev_io *bio = ref;
3109 
3110 	bio->cpl = *cpl;
3111 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
3112 }
3113 
3114 static void
3115 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
3116 {
3117 	struct nvme_bdev_io *bio = ref;
3118 
3119 	bio->cpl = *cpl;
3120 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
3121 }
3122 
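/* SGL iterator callbacks handed to the spdk_nvme_ns_cmd_*v() APIs. The driver
 * first calls the reset callback with a starting offset, then calls next_sge
 * repeatedly; (iovpos, iov_offset) track the current position in bio->iovs.
 */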
3123 static void
3124 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
3125 {
3126 	struct nvme_bdev_io *bio = ref;
3127 	struct iovec *iov;
3128 
3129 	bio->iov_offset = sgl_offset;
3130 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
3131 		iov = &bio->iovs[bio->iovpos];
3132 		if (bio->iov_offset < iov->iov_len) {
3133 			break;
3134 		}
3135 
3136 		bio->iov_offset -= iov->iov_len;
3137 	}
3138 }
3139 
3140 static int
3141 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
3142 {
3143 	struct nvme_bdev_io *bio = ref;
3144 	struct iovec *iov;
3145 
3146 	assert(bio->iovpos < bio->iovcnt);
3147 
3148 	iov = &bio->iovs[bio->iovpos];
3149 
3150 	*address = iov->iov_base;
3151 	*length = iov->iov_len;
3152 
3153 	if (bio->iov_offset) {
3154 		assert(bio->iov_offset <= iov->iov_len);
3155 		*address += bio->iov_offset;
3156 		*length -= bio->iov_offset;
3157 	}
3158 
3159 	bio->iov_offset += *length;
3160 	if (bio->iov_offset == iov->iov_len) {
3161 		bio->iovpos++;
3162 		bio->iov_offset = 0;
3163 	}
3164 
3165 	return 0;
3166 }
3167 
3168 static void
3169 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
3170 {
3171 	struct nvme_bdev_io *bio = ref;
3172 	struct iovec *iov;
3173 
3174 	bio->fused_iov_offset = sgl_offset;
3175 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
3176 		iov = &bio->fused_iovs[bio->fused_iovpos];
3177 		if (bio->fused_iov_offset < iov->iov_len) {
3178 			break;
3179 		}
3180 
3181 		bio->fused_iov_offset -= iov->iov_len;
3182 	}
3183 }
3184 
3185 static int
3186 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
3187 {
3188 	struct nvme_bdev_io *bio = ref;
3189 	struct iovec *iov;
3190 
3191 	assert(bio->fused_iovpos < bio->fused_iovcnt);
3192 
3193 	iov = &bio->fused_iovs[bio->fused_iovpos];
3194 
3195 	*address = iov->iov_base;
3196 	*length = iov->iov_len;
3197 
3198 	if (bio->fused_iov_offset) {
3199 		assert(bio->fused_iov_offset <= iov->iov_len);
3200 		*address += bio->fused_iov_offset;
3201 		*length -= bio->fused_iov_offset;
3202 	}
3203 
3204 	bio->fused_iov_offset += *length;
3205 	if (bio->fused_iov_offset == iov->iov_len) {
3206 		bio->fused_iovpos++;
3207 		bio->fused_iov_offset = 0;
3208 	}
3209 
3210 	return 0;
3211 }
3212 
3213 static int
3214 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3215 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
3216 		      void *md, uint64_t lba_count, uint64_t lba)
3217 {
3218 	int rc;
3219 
3220 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
3221 		      lba_count, lba);
3222 
3223 	bio->iovs = iov;
3224 	bio->iovcnt = iovcnt;
3225 	bio->iovpos = 0;
3226 	bio->iov_offset = 0;
3227 
3228 	rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
3229 					    bdev_nvme_no_pi_readv_done, bio, 0,
3230 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3231 					    md, 0, 0);
3232 
3233 	if (rc != 0 && rc != -ENOMEM) {
3234 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
3235 	}
3236 	return rc;
3237 }
3238 
3239 static int
3240 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3241 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
3242 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
3243 		struct spdk_bdev_ext_io_opts *ext_opts)
3244 {
3245 	int rc;
3246 
3247 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3248 		      lba_count, lba);
3249 
3250 	bio->iovs = iov;
3251 	bio->iovcnt = iovcnt;
3252 	bio->iovpos = 0;
3253 	bio->iov_offset = 0;
3254 
3255 	if (ext_opts) {
3256 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
3257 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
3258 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
3259 		bio->ext_opts.io_flags = flags;
3260 		bio->ext_opts.metadata = md;
3261 
3262 		rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
3263 						bdev_nvme_readv_done, bio,
3264 						bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3265 						&bio->ext_opts);
3266 	} else if (iovcnt == 1) {
3267 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
3268 						   lba_count,
3269 						   bdev_nvme_readv_done, bio,
3270 						   flags,
3271 						   0, 0);
3272 	} else {
3273 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
3274 						    bdev_nvme_readv_done, bio, flags,
3275 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3276 						    md, 0, 0);
3277 	}
3278 
3279 	if (rc != 0 && rc != -ENOMEM) {
3280 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
3281 	}
3282 	return rc;
3283 }
3284 
3285 static int
3286 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3287 		 struct nvme_bdev_io *bio,
3288 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
3289 		 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts)
3290 {
3291 	int rc;
3292 
3293 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3294 		      lba_count, lba);
3295 
3296 	bio->iovs = iov;
3297 	bio->iovcnt = iovcnt;
3298 	bio->iovpos = 0;
3299 	bio->iov_offset = 0;
3300 
3301 	if (ext_opts) {
3302 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
3303 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
3304 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
3305 		bio->ext_opts.io_flags = flags;
3306 		bio->ext_opts.metadata = md;
3307 
3308 		rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
3309 						 bdev_nvme_writev_done, bio,
3310 						 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3311 						 &bio->ext_opts);
3312 	} else if (iovcnt == 1) {
3313 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
3314 						    lba_count,
3315 						    bdev_nvme_writev_done, bio,
3316 						    flags,
3317 						    0, 0);
3318 	} else {
3319 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
3320 						     bdev_nvme_writev_done, bio, flags,
3321 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3322 						     md, 0, 0);
3323 	}
3324 
3325 	if (rc != 0 && rc != -ENOMEM) {
3326 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
3327 	}
3328 	return rc;
3329 }
3330 
3331 static int
3332 bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3333 		       struct nvme_bdev_io *bio,
3334 		       struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t zslba,
3335 		       uint32_t flags)
3336 {
3337 	int rc;
3338 
3339 	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
3340 		      lba_count, zslba);
3341 
3342 	bio->iovs = iov;
3343 	bio->iovcnt = iovcnt;
3344 	bio->iovpos = 0;
3345 	bio->iov_offset = 0;
3346 
3347 	if (iovcnt == 1) {
3348 		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
3349 						       lba_count,
3350 						       bdev_nvme_zone_appendv_done, bio,
3351 						       flags,
3352 						       0, 0);
3353 	} else {
3354 		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
3355 							bdev_nvme_zone_appendv_done, bio, flags,
3356 							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3357 							md, 0, 0);
3358 	}
3359 
3360 	if (rc != 0 && rc != -ENOMEM) {
3361 		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
3362 	}
3363 	return rc;
3364 }
3365 
3366 static int
3367 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3368 		   struct nvme_bdev_io *bio,
3369 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
3370 		   uint32_t flags)
3371 {
3372 	int rc;
3373 
3374 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3375 		      lba_count, lba);
3376 
3377 	bio->iovs = iov;
3378 	bio->iovcnt = iovcnt;
3379 	bio->iovpos = 0;
3380 	bio->iov_offset = 0;
3381 
3382 	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
3383 					       bdev_nvme_comparev_done, bio, flags,
3384 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3385 					       md, 0, 0);
3386 
3387 	if (rc != 0 && rc != -ENOMEM) {
3388 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
3389 	}
3390 	return rc;
3391 }
3392 
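/* Submit an atomic compare-and-write as an NVMe fused command pair: a compare
 * with SPDK_NVME_IO_FLAGS_FUSE_FIRST immediately followed by a write with
 * SPDK_NVME_IO_FLAGS_FUSE_SECOND. Both halves share the completion callback,
 * which uses bio->cpl to reconcile the two completions.
 */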
3393 static int
3394 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3395 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
3396 			      struct iovec *write_iov, int write_iovcnt,
3397 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
3398 {
3399 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3400 	int rc;
3401 
3402 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3403 		      lba_count, lba);
3404 
3405 	bio->iovs = cmp_iov;
3406 	bio->iovcnt = cmp_iovcnt;
3407 	bio->iovpos = 0;
3408 	bio->iov_offset = 0;
3409 	bio->fused_iovs = write_iov;
3410 	bio->fused_iovcnt = write_iovcnt;
3411 	bio->fused_iovpos = 0;
3412 	bio->fused_iov_offset = 0;
3413 
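	/* Reset the fused-state flag only on the first submission of this I/O.
	 * On a retry (e.g. after -ENOMEM) the compare may already have been
	 * submitted and must not be issued again.
	 */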
3414 	if (bdev_io->num_retries == 0) {
3415 		bio->first_fused_submitted = false;
3416 	}
3417 
3418 	if (!bio->first_fused_submitted) {
3419 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
3420 		memset(&bio->cpl, 0, sizeof(bio->cpl));
3421 
3422 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
3423 						       bdev_nvme_comparev_and_writev_done, bio, flags,
3424 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
3425 		if (rc == 0) {
3426 			bio->first_fused_submitted = true;
3427 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
3428 		} else {
3429 			if (rc != -ENOMEM) {
3430 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
3431 			}
3432 			return rc;
3433 		}
3434 	}
3435 
3436 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
3437 
3438 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
3439 					     bdev_nvme_comparev_and_writev_done, bio, flags,
3440 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
3441 	if (rc != 0 && rc != -ENOMEM) {
3442 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
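		/* The fused compare has already been submitted, so this I/O will still
		 * be completed through bdev_nvme_comparev_and_writev_done(); returning
		 * an error here would risk completing it twice. The compare likely
		 * fails with a "missing fused command" status in this case.
		 */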
3443 		rc = 0;
3444 	}
3445 
3446 	return rc;
3447 }
3448 
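/* Translate an unmap into a Dataset Management (deallocate) command. The
 * request is split into up to SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES ranges,
 * each covering at most SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks;
 * larger requests are rejected with -EINVAL.
 */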
3449 static int
3450 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3451 		struct nvme_bdev_io *bio,
3452 		uint64_t offset_blocks,
3453 		uint64_t num_blocks)
3454 {
3455 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
3456 	struct spdk_nvme_dsm_range *range;
3457 	uint64_t offset, remaining;
3458 	uint64_t num_ranges_u64;
3459 	uint16_t num_ranges;
3460 	int rc;
3461 
3462 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
3463 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3464 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
3465 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
3466 		return -EINVAL;
3467 	}
3468 	num_ranges = (uint16_t)num_ranges_u64;
3469 
3470 	offset = offset_blocks;
3471 	remaining = num_blocks;
3472 	range = &dsm_ranges[0];
3473 
3474 	/* Fill max-size ranges until the remaining blocks fit into one range */
3475 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
3476 		range->attributes.raw = 0;
3477 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3478 		range->starting_lba = offset;
3479 
3480 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3481 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3482 		range++;
3483 	}
3484 
3485 	/* Final range describes the remaining blocks */
3486 	range->attributes.raw = 0;
3487 	range->length = remaining;
3488 	range->starting_lba = offset;
3489 
3490 	rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair,
3491 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
3492 			dsm_ranges, num_ranges,
3493 			bdev_nvme_queued_done, bio);
3494 
3495 	return rc;
3496 }
3497 
3498 static int
3499 bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3500 		       struct nvme_bdev_io *bio,
3501 		       uint64_t offset_blocks,
3502 		       uint64_t num_blocks)
3503 {
3504 	if (num_blocks > UINT16_MAX + 1) {
3505 		SPDK_ERRLOG("NVMe write zeroes is limited to a 16-bit (0's based) block count\n");
3506 		return -EINVAL;
3507 	}
3508 
3509 	return spdk_nvme_ns_cmd_write_zeroes(ns, qpair,
3510 					     offset_blocks, num_blocks,
3511 					     bdev_nvme_queued_done, bio,
3512 					     0);
3513 }
3514 
3515 static int
3516 bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3517 			struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
3518 			struct spdk_bdev_zone_info *info)
3519 {
3520 	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
3521 	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
3522 	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
3523 
3524 	if (zone_id % zone_size != 0) {
3525 		return -EINVAL;
3526 	}
3527 
3528 	if (num_zones > total_zones || !num_zones) {
3529 		return -EINVAL;
3530 	}
3531 
3532 	assert(!bio->zone_report_buf);
3533 	bio->zone_report_buf = calloc(1, zone_report_bufsize);
3534 	if (!bio->zone_report_buf) {
3535 		return -ENOMEM;
3536 	}
3537 
3538 	bio->handled_zones = 0;
3539 
3540 	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
3541 					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
3542 					  bdev_nvme_get_zone_info_done, bio);
3543 }
3544 
3545 static int
3546 bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3547 			  struct nvme_bdev_io *bio, uint64_t zone_id,
3548 			  enum spdk_bdev_zone_action action)
3549 {
3550 	switch (action) {
3551 	case SPDK_BDEV_ZONE_CLOSE:
3552 		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
3553 						bdev_nvme_zone_management_done, bio);
3554 	case SPDK_BDEV_ZONE_FINISH:
3555 		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
3556 						 bdev_nvme_zone_management_done, bio);
3557 	case SPDK_BDEV_ZONE_OPEN:
3558 		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
3559 					       bdev_nvme_zone_management_done, bio);
3560 	case SPDK_BDEV_ZONE_RESET:
3561 		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
3562 						bdev_nvme_zone_management_done, bio);
3563 	case SPDK_BDEV_ZONE_OFFLINE:
3564 		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
3565 						  bdev_nvme_zone_management_done, bio);
3566 	default:
3567 		return -EINVAL;
3568 	}
3569 }
3570 
3571 static int
3572 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
3573 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
3574 {
3575 	struct nvme_ctrlr *nvme_ctrlr;
3576 	uint32_t max_xfer_size;
3577 
3578 	if (!bdev_nvme_find_admin_path(nbdev_ch, &nvme_ctrlr)) {
3579 		return -EINVAL;
3580 	}
3581 
3582 	max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);
3583 
3584 	if (nbytes > max_xfer_size) {
3585 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3586 		return -EINVAL;
3587 	}
3588 
3589 	bio->orig_thread = spdk_get_thread();
3590 
3591 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf,
3592 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
3593 }
3594 
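/*
 * Pass an arbitrary NVMe I/O command through to this namespace. The
 * transfer is bounded by the controller's MDTS, and the nsid is filled
 * in from the namespace backing the bdev.
 */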
3595 static int
3596 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3597 		      struct nvme_bdev_io *bio,
3598 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
3599 {
3600 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
3601 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3602 
3603 	if (nbytes > max_xfer_size) {
3604 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3605 		return -EINVAL;
3606 	}
3607 
3608 	/*
3609 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require an nsid,
3610 	 * so fill it out automatically.
3611 	 */
3612 	cmd->nsid = spdk_nvme_ns_get_id(ns);
3613 
3614 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
3615 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
3616 }
3617 
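/*
 * Like bdev_nvme_io_passthru, but with a separate metadata buffer.
 * The sector count is derived from nbytes using the extended sector
 * size, and md_len must equal that count times the per-sector metadata
 * size.
 */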
3618 static int
3619 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3620 			 struct nvme_bdev_io *bio,
3621 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
3622 {
3623 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
3624 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
3625 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3626 
3627 	if (nbytes > max_xfer_size) {
3628 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3629 		return -EINVAL;
3630 	}
3631 
3632 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
3633 		SPDK_ERRLOG("invalid metadata buffer size\n");
3634 		return -EINVAL;
3635 	}
3636 
3637 	/*
3638 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require an nsid,
3639 	 * so fill it out automatically.
3640 	 */
3641 	cmd->nsid = spdk_nvme_ns_get_id(ns);
3642 
3643 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
3644 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
3645 }
3646 
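/*
 * Abort bio_to_abort. The Abort command is sent first for the I/O qpair;
 * if the target command is not found there, it may be an admin command,
 * so the abort is retried with a NULL qpair (the admin queue). If it is
 * found on neither, the abort request is completed locally with a
 * "command not aborted" status.
 */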
3647 static int
3648 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
3649 		struct nvme_bdev_io *bio_to_abort)
3650 {
3651 	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
3652 	int rc;
3653 
3654 	bio->orig_thread = spdk_get_thread();
3655 
3656 	rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr,
3657 					   ctrlr_ch->qpair,
3658 					   bio_to_abort,
3659 					   bdev_nvme_abort_done, bio);
3660 	if (rc == -ENOENT) {
3661 		/* If no matching command was found in the I/O qpair, the target
3662 		 * command may be an admin command, so retry against the admin queue.
3663 		 */
3664 		rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr,
3665 						   NULL,
3666 						   bio_to_abort,
3667 						   bdev_nvme_abort_done, bio);
3668 	}
3669 
3670 	if (rc == -ENOENT) {
3671 		/* If no command was found on either queue, fail the abort: per the NVMe spec, cdw0 bit 0 set to 1 means the command was not aborted. */
3672 		bio->cpl.cdw0 |= 1U;
3673 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
3674 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
3675 
3676 		bdev_nvme_abort_completion(bio);
3677 
3678 		rc = 0;
3679 	}
3680 
3681 	return rc;
3682 }
3683 
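/*
 * Emit a bdev_nvme_set_options RPC object that reproduces the current
 * global driver options when a saved JSON config is replayed.
 */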
3684 static void
3685 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
3686 {
3687 	const char	*action;
3688 
3689 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
3690 		action = "reset";
3691 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
3692 		action = "abort";
3693 	} else {
3694 		action = "none";
3695 	}
3696 
3697 	spdk_json_write_object_begin(w);
3698 
3699 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
3700 
3701 	spdk_json_write_named_object_begin(w, "params");
3702 	spdk_json_write_named_string(w, "action_on_timeout", action);
3703 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
3704 	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
3705 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
3706 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
3707 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
3708 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
3709 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
3710 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
3711 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
3712 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
3713 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
3714 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
3715 	spdk_json_write_object_end(w);
3716 
3717 	spdk_json_write_object_end(w);
3718 }
3719 
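/*
 * Emit a bdev_nvme_attach_controller RPC object that recreates this
 * controller on config replay: its name, connected transport ID, and
 * protection-information check flags.
 */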
3720 static void
3721 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
3722 		       struct nvme_ctrlr *nvme_ctrlr)
3723 {
3724 	struct spdk_nvme_transport_id	*trid;
3725 
3726 	trid = nvme_ctrlr->connected_trid;
3727 
3728 	spdk_json_write_object_begin(w);
3729 
3730 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
3731 
3732 	spdk_json_write_named_object_begin(w, "params");
3733 	spdk_json_write_named_string(w, "name", nvme_ctrlr->name);
3734 	nvme_bdev_dump_trid_json(trid, w);
3735 	spdk_json_write_named_bool(w, "prchk_reftag",
3736 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
3737 	spdk_json_write_named_bool(w, "prchk_guard",
3738 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
3739 
3740 	spdk_json_write_object_end(w);
3741 
3742 	spdk_json_write_object_end(w);
3743 }
3744 
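/*
 * Emit a bdev_nvme_set_hotplug RPC object from the current global
 * hotplug settings. The resulting JSON has the shape
 *
 *   { "method": "bdev_nvme_set_hotplug",
 *     "params": { "period_us": ..., "enable": ... } }
 */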
3745 static void
3746 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
3747 {
3748 	spdk_json_write_object_begin(w);
3749 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
3750 
3751 	spdk_json_write_named_object_begin(w, "params");
3752 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
3753 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
3754 	spdk_json_write_object_end(w);
3755 
3756 	spdk_json_write_object_end(w);
3757 }
3758 
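/*
 * Module-level config_json callback: write the RPCs needed to recreate
 * the current state, with global options first, then one
 * attach_controller entry per tracked controller, and the hotplug
 * settings last. g_bdev_nvme_mutex is held while iterating the
 * controller list.
 */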
3759 static int
3760 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
3761 {
3762 	struct nvme_ctrlr	*nvme_ctrlr;
3763 
3764 	bdev_nvme_opts_config_json(w);
3765 
3766 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3767 
3768 	TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) {
3769 		nvme_ctrlr_config_json(w, nvme_ctrlr);
3770 	}
3771 
3772 	/* Dump this last to give all NVMe bdevs a chance to be constructed
3773 	 * before the hotplug poller is enabled.
3774 	 */
3775 	bdev_nvme_hotplug_config_json(w);
3776 
3777 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3778 	return 0;
3779 }
3780 
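/*
 * Look up the spdk_nvme_ctrlr backing a bdev, or return NULL if the bdev
 * does not belong to this module. A caller might use it, illustratively:
 *
 *   struct spdk_nvme_ctrlr *ctrlr = bdev_nvme_get_ctrlr(bdev);
 *   if (ctrlr != NULL) {
 *           ... issue controller-scoped operations ...
 *   }
 */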
3781 struct spdk_nvme_ctrlr *
3782 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
3783 {
3784 	if (!bdev || bdev->module != &nvme_if) {
3785 		return NULL;
3786 	}
3787 
3788 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
3789 }
3790 
3791 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
3792