xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 7b8e7212a6b9541dda27a83042cc40a76c29a58c)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
7  *
8  *   Redistribution and use in source and binary forms, with or without
9  *   modification, are permitted provided that the following conditions
10  *   are met:
11  *
12  *     * Redistributions of source code must retain the above copyright
13  *       notice, this list of conditions and the following disclaimer.
14  *     * Redistributions in binary form must reproduce the above copyright
15  *       notice, this list of conditions and the following disclaimer in
16  *       the documentation and/or other materials provided with the
17  *       distribution.
18  *     * Neither the name of Intel Corporation nor the names of its
19  *       contributors may be used to endorse or promote products derived
20  *       from this software without specific prior written permission.
21  *
22  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 #include "spdk/stdinc.h"
36 
37 #include "bdev_nvme.h"
38 
39 #include "spdk/accel_engine.h"
40 #include "spdk/config.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/likely.h"
45 #include "spdk/nvme.h"
46 #include "spdk/nvme_ocssd.h"
47 #include "spdk/nvme_zns.h"
48 #include "spdk/opal.h"
49 #include "spdk/thread.h"
50 #include "spdk/string.h"
51 #include "spdk/util.h"
52 
53 #include "spdk/bdev_module.h"
54 #include "spdk/log.h"
55 
56 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
57 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
58 
59 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
60 
61 struct nvme_bdev_io {
62 	/** array of iovecs to transfer. */
63 	struct iovec *iovs;
64 
65 	/** Number of iovecs in iovs array. */
66 	int iovcnt;
67 
68 	/** Current iovec position. */
69 	int iovpos;
70 
71 	/** Offset in current iovec. */
72 	uint32_t iov_offset;
73 
74 	/** I/O path the current I/O is submitted on, or the I/O path being reset
75 	 *  in a reset I/O.
76 	 */
77 	struct nvme_io_path *io_path;
78 
79 	/** Array of iovecs to transfer for the fused command. */
80 	struct iovec *fused_iovs;
81 
82 	/** Number of iovecs in the fused_iovs array. */
83 	int fused_iovcnt;
84 
85 	/** Current fused iovec position. */
86 	int fused_iovpos;
87 
88 	/** Offset in the current fused iovec. */
89 	uint32_t fused_iov_offset;
90 
91 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
92 	struct spdk_nvme_cpl cpl;
93 	/** Extended I/O opts passed by the user to the bdev layer and mapped to the NVMe format */
94 	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
95 
96 	/** Originating thread */
97 	struct spdk_thread *orig_thread;
98 
99 	/** Tracks whether the first of the fused commands has been submitted */
100 	bool first_fused_submitted;
101 
102 	/** Temporary pointer to zone report buffer */
103 	struct spdk_nvme_zns_zone_report *zone_report_buf;
104 
105 	/** Tracks how many zones have been copied to the spdk_bdev_zone_info struct */
106 	uint64_t handled_zones;
107 
108 	/** Tick value at which the current I/O becomes eligible for retry. */
109 	uint64_t retry_ticks;
110 
111 	/* How many times the current I/O was retried. */
112 	int32_t retry_count;
113 };
114 
115 struct nvme_probe_ctx {
116 	size_t count;
117 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
118 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
119 	const char *names[NVME_MAX_CONTROLLERS];
120 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
121 	const char *hostnqn;
122 };
123 
124 struct nvme_probe_skip_entry {
125 	struct spdk_nvme_transport_id		trid;
126 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
127 };
128 /* Controllers deleted by the user via RPC are skipped by the hotplug monitor */
129 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
130 			g_skipped_nvme_ctrlrs);
131 
132 static struct spdk_bdev_nvme_opts g_opts = {
133 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
134 	.timeout_us = 0,
135 	.timeout_admin_us = 0,
136 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
137 	.transport_retry_count = 4,
138 	.arbitration_burst = 0,
139 	.low_priority_weight = 0,
140 	.medium_priority_weight = 0,
141 	.high_priority_weight = 0,
142 	.nvme_adminq_poll_period_us = 10000ULL,
143 	.nvme_ioq_poll_period_us = 0,
144 	.io_queue_requests = 0,
145 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
146 	.bdev_retry_count = 0,
147 };
148 
149 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
150 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
151 
152 static int g_hot_insert_nvme_controller_index = 0;
153 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
154 static bool g_nvme_hotplug_enabled = false;
155 static struct spdk_thread *g_bdev_nvme_init_thread;
156 static struct spdk_poller *g_hotplug_poller;
157 static struct spdk_poller *g_hotplug_probe_poller;
158 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
159 
160 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
161 		struct nvme_async_probe_ctx *ctx);
162 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
163 		struct nvme_async_probe_ctx *ctx);
164 static int bdev_nvme_library_init(void);
165 static void bdev_nvme_library_fini(void);
166 static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
167 				     struct spdk_bdev_io *bdev_io);
168 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
169 			   void *md, uint64_t lba_count, uint64_t lba,
170 			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
171 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
172 				 void *md, uint64_t lba_count, uint64_t lba);
173 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
174 			    void *md, uint64_t lba_count, uint64_t lba,
175 			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
176 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
177 				  void *md, uint64_t lba_count,
178 				  uint64_t zslba, uint32_t flags);
179 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
180 			      void *md, uint64_t lba_count, uint64_t lba,
181 			      uint32_t flags);
182 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
183 		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
184 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
185 		uint32_t flags);
186 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
187 				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
188 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
189 				     enum spdk_bdev_zone_action action);
190 static int bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
191 				    struct nvme_bdev_io *bio,
192 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
193 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
194 				 void *buf, size_t nbytes);
195 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
196 				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
197 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
198 			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
199 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
200 static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr);
201 static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
202 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
203 static void nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);
204 
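/* Namespaces are tracked in a per-controller red-black tree keyed by namespace ID.
 * nvme_ns_cmp() is the comparator for the tree operations generated by RB_GENERATE_STATIC() below.
 */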
205 static int
206 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
207 {
208 	return ns1->id - ns2->id;
209 }
210 
211 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);
212 
213 struct spdk_nvme_qpair *
214 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
215 {
216 	struct nvme_ctrlr_channel *ctrlr_ch;
217 
218 	assert(ctrlr_io_ch != NULL);
219 
220 	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
221 
222 	return ctrlr_ch->qpair;
223 }
224 
225 static int
226 bdev_nvme_get_ctx_size(void)
227 {
228 	return sizeof(struct nvme_bdev_io);
229 }
230 
231 static struct spdk_bdev_module nvme_if = {
232 	.name = "nvme",
233 	.async_fini = true,
234 	.module_init = bdev_nvme_library_init,
235 	.module_fini = bdev_nvme_library_fini,
236 	.config_json = bdev_nvme_config_json,
237 	.get_ctx_size = bdev_nvme_get_ctx_size,
238 
239 };
240 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
241 
242 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
243 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
244 bool g_bdev_nvme_module_finish;
245 
246 struct nvme_bdev_ctrlr *
247 nvme_bdev_ctrlr_get_by_name(const char *name)
248 {
249 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
250 
251 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
252 		if (strcmp(name, nbdev_ctrlr->name) == 0) {
253 			break;
254 		}
255 	}
256 
257 	return nbdev_ctrlr;
258 }
259 
260 static struct nvme_ctrlr *
261 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
262 			  const struct spdk_nvme_transport_id *trid)
263 {
264 	struct nvme_ctrlr *nvme_ctrlr;
265 
266 	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
267 		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
268 			break;
269 		}
270 	}
271 
272 	return nvme_ctrlr;
273 }
274 
275 static struct nvme_bdev *
276 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
277 {
278 	struct nvme_bdev *bdev;
279 
280 	pthread_mutex_lock(&g_bdev_nvme_mutex);
281 	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
282 		if (bdev->nsid == nsid) {
283 			break;
284 		}
285 	}
286 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
287 
288 	return bdev;
289 }
290 
291 struct nvme_ns *
292 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
293 {
294 	struct nvme_ns ns;
295 
296 	assert(nsid > 0);
297 
298 	ns.id = nsid;
299 	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
300 }
301 
302 struct nvme_ns *
303 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
304 {
305 	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
306 }
307 
308 struct nvme_ns *
309 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
310 {
311 	if (ns == NULL) {
312 		return NULL;
313 	}
314 
315 	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
316 }
317 
318 static struct nvme_ctrlr *
319 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
320 {
321 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
322 	struct nvme_ctrlr	*nvme_ctrlr = NULL;
323 
324 	pthread_mutex_lock(&g_bdev_nvme_mutex);
325 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
326 		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
327 		if (nvme_ctrlr != NULL) {
328 			break;
329 		}
330 	}
331 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
332 
333 	return nvme_ctrlr;
334 }
335 
336 struct nvme_ctrlr *
337 nvme_ctrlr_get_by_name(const char *name)
338 {
339 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
340 	struct nvme_ctrlr *nvme_ctrlr = NULL;
341 
342 	if (name == NULL) {
343 		return NULL;
344 	}
345 
346 	pthread_mutex_lock(&g_bdev_nvme_mutex);
347 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
348 	if (nbdev_ctrlr != NULL) {
349 		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
350 	}
351 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
352 
353 	return nvme_ctrlr;
354 }
355 
356 void
357 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
358 {
359 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
360 
361 	pthread_mutex_lock(&g_bdev_nvme_mutex);
362 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
363 		fn(nbdev_ctrlr, ctx);
364 	}
365 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
366 }
367 
368 void
369 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
370 {
371 	const char *trtype_str;
372 	const char *adrfam_str;
373 
374 	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
375 	if (trtype_str) {
376 		spdk_json_write_named_string(w, "trtype", trtype_str);
377 	}
378 
379 	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
380 	if (adrfam_str) {
381 		spdk_json_write_named_string(w, "adrfam", adrfam_str);
382 	}
383 
384 	if (trid->traddr[0] != '\0') {
385 		spdk_json_write_named_string(w, "traddr", trid->traddr);
386 	}
387 
388 	if (trid->trsvcid[0] != '\0') {
389 		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
390 	}
391 
392 	if (trid->subnqn[0] != '\0') {
393 		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
394 	}
395 }
396 
397 static void
398 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
399 		       struct nvme_ctrlr *nvme_ctrlr)
400 {
401 	pthread_mutex_lock(&g_bdev_nvme_mutex);
402 
403 	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
404 	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
405 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
406 
407 		return;
408 	}
409 	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
410 
411 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
412 
413 	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));
414 
415 	free(nbdev_ctrlr->name);
416 	free(nbdev_ctrlr);
417 }
418 
419 static void
420 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
421 {
422 	struct nvme_path_id *path_id, *tmp_path;
423 	struct nvme_ns *ns, *tmp_ns;
424 
425 	free(nvme_ctrlr->copied_ana_desc);
426 	spdk_free(nvme_ctrlr->ana_log_page);
427 
428 	if (nvme_ctrlr->opal_dev) {
429 		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
430 		nvme_ctrlr->opal_dev = NULL;
431 	}
432 
433 	if (nvme_ctrlr->nbdev_ctrlr) {
434 		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
435 	}
436 
437 	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
438 		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
439 		free(ns);
440 	}
441 
442 	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
443 		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
444 		free(path_id);
445 	}
446 
447 	pthread_mutex_destroy(&nvme_ctrlr->mutex);
448 
449 	free(nvme_ctrlr);
450 
451 	pthread_mutex_lock(&g_bdev_nvme_mutex);
452 	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
453 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
454 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
455 		spdk_bdev_module_fini_done();
456 		return;
457 	}
458 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
459 }
460 
461 static int
462 nvme_detach_poller(void *arg)
463 {
464 	struct nvme_ctrlr *nvme_ctrlr = arg;
465 	int rc;
466 
467 	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
468 	if (rc != -EAGAIN) {
469 		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
470 		_nvme_ctrlr_delete(nvme_ctrlr);
471 	}
472 
473 	return SPDK_POLLER_BUSY;
474 }
475 
476 static void
477 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
478 {
479 	int rc;
480 
481 	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
482 	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
483 
484 	/* If we got here, the reset/detach poller cannot be active */
485 	assert(nvme_ctrlr->reset_detach_poller == NULL);
486 	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
487 					  nvme_ctrlr, 1000);
488 	if (nvme_ctrlr->reset_detach_poller == NULL) {
489 		SPDK_ERRLOG("Failed to register detach poller\n");
490 		goto error;
491 	}
492 
493 	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
494 	if (rc != 0) {
495 		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
496 		goto error;
497 	}
498 
499 	return;
500 error:
501 	/* We don't have a good way to handle errors here, so just do what we can and delete the
502 	 * controller without detaching the underlying NVMe device.
503 	 */
504 	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
505 	_nvme_ctrlr_delete(nvme_ctrlr);
506 }
507 
508 static void
509 nvme_ctrlr_unregister_cb(void *io_device)
510 {
511 	struct nvme_ctrlr *nvme_ctrlr = io_device;
512 
513 	nvme_ctrlr_delete(nvme_ctrlr);
514 }
515 
516 static void
517 nvme_ctrlr_unregister(void *ctx)
518 {
519 	struct nvme_ctrlr *nvme_ctrlr = ctx;
520 
521 	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
522 }
523 
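/* Drop one reference to the nvme_ctrlr. When the last reference is released and the
 * controller is marked for destruction (and no reset or ANA log page update is in
 * progress), unregister the io_device, which eventually detaches and frees the controller.
 */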
524 static void
525 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
526 {
527 	pthread_mutex_lock(&nvme_ctrlr->mutex);
528 
529 	assert(nvme_ctrlr->ref > 0);
530 	nvme_ctrlr->ref--;
531 
532 	if (nvme_ctrlr->ref > 0 || !nvme_ctrlr->destruct ||
533 	    nvme_ctrlr->resetting || nvme_ctrlr->ana_log_page_updating) {
534 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
535 		return;
536 	}
537 
538 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
539 
540 	nvme_ctrlr_unregister(nvme_ctrlr);
541 }
542 
543 static struct nvme_io_path *
544 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
545 {
546 	struct nvme_io_path *io_path;
547 
548 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
549 		if (io_path->nvme_ns == nvme_ns) {
550 			break;
551 		}
552 	}
553 
554 	return io_path;
555 }
556 
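/* Create an io_path that connects this bdev channel to the given namespace. A reference
 * to the namespace's controller I/O channel is taken, and the new io_path is linked into
 * both the controller channel's and the bdev channel's io_path lists.
 */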
557 static int
558 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
559 {
560 	struct nvme_io_path *io_path;
561 	struct spdk_io_channel *ch;
562 
563 	io_path = calloc(1, sizeof(*io_path));
564 	if (io_path == NULL) {
565 		SPDK_ERRLOG("Failed to alloc io_path.\n");
566 		return -ENOMEM;
567 	}
568 
569 	ch = spdk_get_io_channel(nvme_ns->ctrlr);
570 	if (ch == NULL) {
571 		free(io_path);
572 		SPDK_ERRLOG("Failed to alloc io_channel.\n");
573 		return -ENOMEM;
574 	}
575 
576 	io_path->ctrlr_ch = spdk_io_channel_get_ctx(ch);
577 	TAILQ_INSERT_TAIL(&io_path->ctrlr_ch->io_path_list, io_path, tailq);
578 
579 	io_path->nvme_ns = nvme_ns;
580 
581 	io_path->nbdev_ch = nbdev_ch;
582 	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);
583 
584 	nbdev_ch->current_io_path = NULL;
585 
586 	return 0;
587 }
588 
589 static void
590 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
591 {
592 	struct spdk_io_channel *ch;
593 
594 	nbdev_ch->current_io_path = NULL;
595 
596 	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
597 
598 	TAILQ_REMOVE(&io_path->ctrlr_ch->io_path_list, io_path, tailq);
599 	ch = spdk_io_channel_from_ctx(io_path->ctrlr_ch);
600 	spdk_put_io_channel(ch);
601 
602 	free(io_path);
603 }
604 
605 static void
606 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
607 {
608 	struct nvme_io_path *io_path, *tmp_io_path;
609 
610 	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
611 		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
612 	}
613 }
614 
615 static int
616 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
617 {
618 	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
619 	struct nvme_bdev *nbdev = io_device;
620 	struct nvme_ns *nvme_ns;
621 	int rc;
622 
623 	STAILQ_INIT(&nbdev_ch->io_path_list);
624 	TAILQ_INIT(&nbdev_ch->retry_io_list);
625 
626 	pthread_mutex_lock(&nbdev->mutex);
627 	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
628 		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
629 		if (rc != 0) {
630 			pthread_mutex_unlock(&nbdev->mutex);
631 
632 			_bdev_nvme_delete_io_paths(nbdev_ch);
633 			return rc;
634 		}
635 	}
636 	pthread_mutex_unlock(&nbdev->mutex);
637 
638 	return 0;
639 }
640 
641 static void
642 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
643 {
644 	struct spdk_bdev_io *bdev_io, *tmp_io;
645 
646 	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
647 		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
648 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
649 	}
650 
651 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
652 }
653 
654 static void
655 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
656 {
657 	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
658 
659 	bdev_nvme_abort_retry_ios(nbdev_ch);
660 	_bdev_nvme_delete_io_paths(nbdev_ch);
661 }
662 
663 static inline bool
664 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
665 {
666 	switch (io_type) {
667 	case SPDK_BDEV_IO_TYPE_RESET:
668 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
669 	case SPDK_BDEV_IO_TYPE_ABORT:
670 		return true;
671 	default:
672 		break;
673 	}
674 
675 	return false;
676 }
677 
678 static inline bool
679 nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
680 {
681 	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
682 		return false;
683 	}
684 
685 	switch (nvme_ns->ana_state) {
686 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
687 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
688 		return true;
689 	default:
690 		break;
691 	}
692 
693 	return false;
694 }
695 
696 static inline bool
697 nvme_io_path_is_connected(struct nvme_io_path *io_path)
698 {
699 	return io_path->ctrlr_ch->qpair != NULL;
700 }
701 
702 static inline bool
703 nvme_io_path_is_available(struct nvme_io_path *io_path)
704 {
705 	if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
706 		return false;
707 	}
708 
709 	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
710 		return false;
711 	}
712 
713 	return true;
714 }
715 
716 static bool
717 nvme_io_path_is_failed(struct nvme_io_path *io_path)
718 {
719 	struct nvme_ctrlr *nvme_ctrlr;
720 
721 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
722 
723 	return spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr);
724 }
725 
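/* Select the io_path on which to submit an I/O. An ANA optimized path is preferred and is
 * cached in nbdev_ch->current_io_path; otherwise the first connected non-optimized path is
 * returned without updating the cache.
 */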
726 static inline struct nvme_io_path *
727 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
728 {
729 	struct nvme_io_path *io_path, *non_optimized = NULL;
730 
731 	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
732 		return nbdev_ch->current_io_path;
733 	}
734 
735 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
736 		if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
737 			/* The device is currently resetting. */
738 			continue;
739 		}
740 
741 		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
742 			continue;
743 		}
744 
745 		switch (io_path->nvme_ns->ana_state) {
746 		case SPDK_NVME_ANA_OPTIMIZED_STATE:
747 			nbdev_ch->current_io_path = io_path;
748 			return io_path;
749 		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
750 			if (non_optimized == NULL) {
751 				non_optimized = io_path;
752 			}
753 			break;
754 		default:
755 			break;
756 		}
757 	}
758 
759 	return non_optimized;
760 }
761 
762 /* Return true if there is any io_path whose qpair is active or whose ctrlr is not failed,
763  * or false otherwise.
764  *
765  * If any io_path has an active qpair but find_io_path() returned NULL, its namespace is
766  * likely inaccessible now but may become accessible later.
767  *
768  * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr is likely
769  * resetting now but the reset may succeed. A ctrlr is marked unfailed when a reset starts
770  * and is marked failed only if the reset fails. Hence, if a ctrlr is unfailed, it is likely
771  * either working fine or in the middle of a reset.
772  */
773 static bool
774 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
775 {
776 	struct nvme_io_path *io_path;
777 
778 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
779 		if (nvme_io_path_is_connected(io_path) ||
780 		    !nvme_io_path_is_failed(io_path)) {
781 			return true;
782 		}
783 	}
784 
785 	return false;
786 }
787 
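/* Poller that resubmits queued retry I/Os whose retry time has expired. The retry list is
 * kept sorted by retry_ticks, so iteration stops at the first I/O that is not yet due and
 * the poller is re-armed to fire when that I/O becomes due.
 */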
788 static int
789 bdev_nvme_retry_ios(void *arg)
790 {
791 	struct nvme_bdev_channel *nbdev_ch = arg;
792 	struct spdk_io_channel *ch = spdk_io_channel_from_ctx(nbdev_ch);
793 	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
794 	struct nvme_bdev_io *bio;
795 	uint64_t now, delay_us;
796 
797 	now = spdk_get_ticks();
798 
799 	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
800 		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
801 		if (bio->retry_ticks > now) {
802 			break;
803 		}
804 
805 		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
806 
807 		bdev_nvme_submit_request(ch, bdev_io);
808 	}
809 
810 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
811 
812 	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
813 	if (bdev_io != NULL) {
814 		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
815 
816 		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
817 
818 		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
819 					    delay_us);
820 	}
821 
822 	return SPDK_POLLER_BUSY;
823 }
824 
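/* Queue an I/O to be retried after delay_ms. The I/O is inserted so that the retry list
 * stays sorted by retry_ticks; the retry poller is re-armed only when the new I/O becomes
 * the head of the list.
 */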
825 static void
826 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
827 			 struct nvme_bdev_io *bio, uint64_t delay_ms)
828 {
829 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
830 	struct spdk_bdev_io *tmp_bdev_io;
831 	struct nvme_bdev_io *tmp_bio;
832 
833 	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;
834 
835 	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
836 		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;
837 
838 		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
839 			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
840 					   module_link);
841 			return;
842 		}
843 	}
844 
845 	/* No I/O with an earlier or equal retry time was found. This I/O becomes the new head. */
846 	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);
847 
848 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
849 
850 	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
851 				    delay_ms * 1000ULL);
852 }
853 
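/* Complete an I/O with the given NVMe completion status, or queue it for retry. Path and
 * ANA errors clear the cached I/O path (ANA errors also trigger an ANA log page read) and
 * are retried without delay; other retryable errors are delayed according to the
 * controller's Command Retry Delay (CRDT, reported in units of 100 ms). The I/O is retried
 * only if some io_path may become available again.
 */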
854 static inline void
855 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
856 				  const struct spdk_nvme_cpl *cpl)
857 {
858 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
859 	struct nvme_bdev_channel *nbdev_ch;
860 	struct nvme_ctrlr *nvme_ctrlr;
861 	const struct spdk_nvme_ctrlr_data *cdata;
862 	uint64_t delay_ms;
863 
864 	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
865 		goto complete;
866 	}
867 
868 	if (cpl->status.dnr != 0 || bdev_nvme_io_type_is_admin(bdev_io->type) ||
869 	    (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
870 		goto complete;
871 	}
872 
873 	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
874 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(bio->io_path->ctrlr_ch);
875 
876 	assert(bio->io_path != NULL);
877 
878 	if (spdk_nvme_cpl_is_path_error(cpl) ||
879 	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
880 	    !nvme_io_path_is_available(bio->io_path) ||
881 	    nvme_io_path_is_failed(bio->io_path)) {
882 		nbdev_ch->current_io_path = NULL;
883 		if (spdk_nvme_cpl_is_ana_error(cpl)) {
884 			bio->io_path->nvme_ns->ana_state_updating = true;
885 			nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
886 		}
887 		delay_ms = 0;
888 	} else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) {
889 		goto complete;
890 	} else {
891 		bio->retry_count++;
892 
893 		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
894 
895 		if (cpl->status.crd != 0) {
896 			delay_ms = cdata->crdt[cpl->status.crd] * 100;
897 		} else {
898 			delay_ms = 0;
899 		}
900 	}
901 
902 	if (any_io_path_may_become_available(nbdev_ch)) {
903 		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
904 		return;
905 	}
906 
907 complete:
908 	bio->retry_count = 0;
909 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
910 }
911 
912 static inline void
913 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
914 {
915 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
916 	struct nvme_bdev_channel *nbdev_ch;
917 	enum spdk_bdev_io_status io_status;
918 
919 	switch (rc) {
920 	case 0:
921 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
922 		break;
923 	case -ENOMEM:
924 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
925 		break;
926 	case -ENXIO:
927 		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
928 
929 		nbdev_ch->current_io_path = NULL;
930 
931 		if (!bdev_nvme_io_type_is_admin(bdev_io->type) &&
932 		    any_io_path_may_become_available(nbdev_ch)) {
933 			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
934 			return;
935 		}
936 
937 	/* fallthrough */
938 	default:
939 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
940 		break;
941 	}
942 
943 	bio->retry_count = 0;
944 	spdk_bdev_io_complete(bdev_io, io_status);
945 }
946 
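/* Invalidate the cached I/O path of every bdev channel that has an io_path through this
 * controller channel, forcing path re-selection on the next submission.
 */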
947 static void
948 _bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel *ctrlr_ch)
949 {
950 	struct nvme_io_path *io_path;
951 
952 	TAILQ_FOREACH(io_path, &ctrlr_ch->io_path_list, tailq) {
953 		io_path->nbdev_ch->current_io_path = NULL;
954 	}
955 }
956 
957 static struct nvme_ctrlr_channel *
958 nvme_poll_group_get_ctrlr_channel(struct nvme_poll_group *group,
959 				  struct spdk_nvme_qpair *qpair)
960 {
961 	struct nvme_ctrlr_channel *ctrlr_ch;
962 
963 	TAILQ_FOREACH(ctrlr_ch, &group->ctrlr_ch_list, tailq) {
964 		if (ctrlr_ch->qpair == qpair) {
965 			break;
966 		}
967 	}
968 
969 	return ctrlr_ch;
970 }
971 
972 static void
973 bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
974 {
975 	if (ctrlr_ch->qpair != NULL) {
976 		spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
977 		ctrlr_ch->qpair = NULL;
978 	}
979 
980 	_bdev_nvme_clear_io_path_cache(ctrlr_ch);
981 }
982 
983 static void
984 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
985 {
986 	struct nvme_poll_group *group = poll_group_ctx;
987 	struct nvme_ctrlr_channel *ctrlr_ch;
988 	struct nvme_ctrlr *nvme_ctrlr;
989 
990 	SPDK_NOTICELOG("qpair %p is disconnected, free the qpair and reset controller.\n", qpair);
991 	/*
992 	 * Free the I/O qpair and reset the nvme_ctrlr.
993 	 */
994 	ctrlr_ch = nvme_poll_group_get_ctrlr_channel(group, qpair);
995 	if (ctrlr_ch != NULL) {
996 		bdev_nvme_destroy_qpair(ctrlr_ch);
997 
998 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
999 		bdev_nvme_reset(nvme_ctrlr);
1000 	}
1001 }
1002 
1003 static int
1004 bdev_nvme_poll(void *arg)
1005 {
1006 	struct nvme_poll_group *group = arg;
1007 	int64_t num_completions;
1008 
1009 	if (group->collect_spin_stat && group->start_ticks == 0) {
1010 		group->start_ticks = spdk_get_ticks();
1011 	}
1012 
1013 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
1014 			  bdev_nvme_disconnected_qpair_cb);
1015 	if (group->collect_spin_stat) {
1016 		if (num_completions > 0) {
1017 			if (group->end_ticks != 0) {
1018 				group->spin_ticks += (group->end_ticks - group->start_ticks);
1019 				group->end_ticks = 0;
1020 			}
1021 			group->start_ticks = 0;
1022 		} else {
1023 			group->end_ticks = spdk_get_ticks();
1024 		}
1025 	}
1026 
1027 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
1028 }
1029 
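/* Admin queue poller. A negative return value from processing admin completions indicates
 * a controller failure, in which case a failover (or a plain reset if no alternate path is
 * configured) is triggered.
 */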
1030 static int
1031 bdev_nvme_poll_adminq(void *arg)
1032 {
1033 	int32_t rc;
1034 	struct nvme_ctrlr *nvme_ctrlr = arg;
1035 
1036 	assert(nvme_ctrlr != NULL);
1037 
1038 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
1039 	if (rc < 0) {
1040 		bdev_nvme_failover(nvme_ctrlr, false);
1041 	}
1042 
1043 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
1044 }
1045 
1046 static void
1047 _bdev_nvme_unregister_dev_cb(void *io_device)
1048 {
1049 	struct nvme_bdev *nvme_disk = io_device;
1050 
1051 	free(nvme_disk->disk.name);
1052 	free(nvme_disk);
1053 }
1054 
1055 static int
1056 bdev_nvme_destruct(void *ctx)
1057 {
1058 	struct nvme_bdev *nvme_disk = ctx;
1059 	struct nvme_ns *nvme_ns, *tmp_nvme_ns;
1060 
1061 	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
1062 		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
1063 
1064 		nvme_ns->bdev = NULL;
1065 
1066 		assert(nvme_ns->id > 0);
1067 
1068 		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
1069 			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1070 
1071 			nvme_ctrlr_release(nvme_ns->ctrlr);
1072 			free(nvme_ns);
1073 		} else {
1074 			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1075 		}
1076 	}
1077 
1078 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1079 	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
1080 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1081 
1082 	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);
1083 
1084 	return 0;
1085 }
1086 
1087 static int
1088 bdev_nvme_flush(struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
1089 {
1090 	bdev_nvme_io_complete(bio, 0);
1091 
1092 	return 0;
1093 }
1094 
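/* Allocate an I/O qpair for this controller channel. The qpair is created with create_only
 * and async_mode set, added to the channel's poll group, and then connected.
 */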
1095 static int
1096 bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
1097 {
1098 	struct nvme_ctrlr *nvme_ctrlr;
1099 	struct spdk_nvme_io_qpair_opts opts;
1100 	struct spdk_nvme_qpair *qpair;
1101 	int rc;
1102 
1103 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1104 
1105 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1106 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
1107 	opts.create_only = true;
1108 	opts.async_mode = true;
1109 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
1110 	g_opts.io_queue_requests = opts.io_queue_requests;
1111 
1112 	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1113 	if (qpair == NULL) {
1114 		return -1;
1115 	}
1116 
1117 	assert(ctrlr_ch->group != NULL);
1118 
1119 	rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair);
1120 	if (rc != 0) {
1121 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
1122 		goto err;
1123 	}
1124 
1125 	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
1126 	if (rc != 0) {
1127 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
1128 		goto err;
1129 	}
1130 
1131 	ctrlr_ch->qpair = qpair;
1132 
1133 	return 0;
1134 
1135 err:
1136 	spdk_nvme_ctrlr_free_io_qpair(qpair);
1137 
1138 	return rc;
1139 }
1140 
1141 static void
1142 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
1143 {
1144 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
1145 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
1146 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
1147 	struct spdk_bdev_io *bdev_io;
1148 
1149 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
1150 		status = SPDK_BDEV_IO_STATUS_FAILED;
1151 	}
1152 
1153 	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
1154 		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
1155 		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
1156 		spdk_bdev_io_complete(bdev_io, status);
1157 	}
1158 
1159 	spdk_for_each_channel_continue(i, 0);
1160 }
1161 
1162 static void
1163 _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
1164 {
1165 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1166 	bool success = spdk_io_channel_iter_get_ctx(i) == NULL;
1167 	struct nvme_path_id *path_id;
1168 	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
1169 	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;
1170 	bool complete_pending_destruct = false;
1171 
1172 	nvme_ctrlr->reset_cb_fn = NULL;
1173 	nvme_ctrlr->reset_cb_arg = NULL;
1174 
1175 	if (!success) {
1176 		SPDK_ERRLOG("Resetting controller failed.\n");
1177 	} else {
1178 		SPDK_NOTICELOG("Resetting controller successful.\n");
1179 	}
1180 
1181 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1182 	nvme_ctrlr->resetting = false;
1183 	nvme_ctrlr->failover_in_progress = false;
1184 
1185 	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
1186 	assert(path_id != NULL);
1187 	assert(path_id == nvme_ctrlr->active_path_id);
1188 
1189 	path_id->is_failed = !success;
1190 
1191 	if (nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct &&
1192 	    !nvme_ctrlr->ana_log_page_updating) {
1193 		/* Complete pending destruct after reset completes. */
1194 		complete_pending_destruct = true;
1195 	}
1196 
1197 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1198 
1199 	if (reset_cb_fn) {
1200 		reset_cb_fn(reset_cb_arg, success);
1201 	}
1202 
1203 	if (complete_pending_destruct) {
1204 		spdk_thread_send_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister,
1205 				     nvme_ctrlr);
1206 	}
1207 }
1208 
1209 static void
1210 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success)
1211 {
1212 	/* Make sure any queued pending resets are completed before finishing this reset. */
1213 	spdk_for_each_channel(nvme_ctrlr,
1214 			      bdev_nvme_complete_pending_resets,
1215 			      success ? NULL : (void *)0x1,
1216 			      _bdev_nvme_reset_complete);
1217 }
1218 
1219 static void
1220 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
1221 {
1222 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1223 
1224 	bdev_nvme_reset_complete(nvme_ctrlr, status == 0);
1225 }
1226 
1227 static void
1228 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
1229 {
1230 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
1231 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
1232 	int rc;
1233 
1234 	rc = bdev_nvme_create_qpair(ctrlr_ch);
1235 
1236 	spdk_for_each_channel_continue(i, rc);
1237 }
1238 
1239 static int
1240 bdev_nvme_ctrlr_reset_poll(void *arg)
1241 {
1242 	struct nvme_ctrlr *nvme_ctrlr = arg;
1243 	int rc;
1244 
1245 	rc = spdk_nvme_ctrlr_reset_poll_async(nvme_ctrlr->reset_ctx);
1246 	if (rc == -EAGAIN) {
1247 		return SPDK_POLLER_BUSY;
1248 	}
1249 
1250 	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
1251 	if (rc == 0) {
1252 		/* Recreate all of the I/O queue pairs */
1253 		spdk_for_each_channel(nvme_ctrlr,
1254 				      bdev_nvme_reset_create_qpair,
1255 				      NULL,
1256 				      bdev_nvme_reset_create_qpairs_done);
1257 	} else {
1258 		bdev_nvme_reset_complete(nvme_ctrlr, false);
1259 	}
1260 	return SPDK_POLLER_BUSY;
1261 }
1262 
1263 static void
1264 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
1265 {
1266 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1267 	int rc;
1268 
1269 	if (status) {
1270 		goto err;
1271 	}
1272 
1273 	rc = spdk_nvme_ctrlr_reset_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->reset_ctx);
1274 	if (rc != 0) {
1275 		SPDK_ERRLOG("Create controller reset context failed\n");
1276 		goto err;
1277 	}
1278 	assert(nvme_ctrlr->reset_detach_poller == NULL);
1279 	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_ctrlr_reset_poll,
1280 					  nvme_ctrlr, 0);
1281 
1282 	return;
1283 
1284 err:
1285 	bdev_nvme_reset_complete(nvme_ctrlr, false);
1286 }
1287 
1288 static void
1289 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
1290 {
1291 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
1292 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
1293 
1294 	bdev_nvme_destroy_qpair(ctrlr_ch);
1295 	spdk_for_each_channel_continue(i, 0);
1296 }
1297 
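/* Start an asynchronous reset of the controller. The controller is marked as resetting,
 * all I/O qpairs on every channel are destroyed first, and then the controller itself is
 * reset and the qpairs are recreated. Returns -EBUSY if a reset is already in progress and
 * -ENXIO if the controller is being destructed.
 */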
1298 static int
1299 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
1300 {
1301 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1302 	if (nvme_ctrlr->destruct) {
1303 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1304 		return -ENXIO;
1305 	}
1306 
1307 	if (nvme_ctrlr->resetting) {
1308 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1309 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
1310 		return -EBUSY;
1311 	}
1312 
1313 	nvme_ctrlr->resetting = true;
1314 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1315 	spdk_nvme_ctrlr_prepare_for_reset(nvme_ctrlr->ctrlr);
1316 
1317 	/* First, delete all NVMe I/O queue pairs. */
1318 	spdk_for_each_channel(nvme_ctrlr,
1319 			      bdev_nvme_reset_destroy_qpair,
1320 			      NULL,
1321 			      bdev_nvme_reset_ctrlr);
1322 
1323 	return 0;
1324 }
1325 
1326 int
1327 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
1328 {
1329 	int rc;
1330 
1331 	rc = bdev_nvme_reset(nvme_ctrlr);
1332 	if (rc == 0) {
1333 		nvme_ctrlr->reset_cb_fn = cb_fn;
1334 		nvme_ctrlr->reset_cb_arg = cb_arg;
1335 	}
1336 	return rc;
1337 }
1338 
1339 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio);
1340 
1341 static void
1342 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio, bool success)
1343 {
1344 	enum spdk_bdev_io_status io_status;
1345 
1346 	if (success) {
1347 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1348 	} else {
1349 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
1350 	}
1351 
1352 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
1353 }
1354 
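/* Completion callback for the controller reset started by a reset I/O. On success, if
 * there is another io_path in the bdev channel, reset its controller next; otherwise (or
 * on failure) complete the reset I/O.
 */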
1355 static void
1356 bdev_nvme_reset_io_continue(void *cb_arg, bool success)
1357 {
1358 	struct nvme_bdev_io *bio = cb_arg;
1359 	struct nvme_io_path *prev_io_path, *next_io_path;
1360 	int rc;
1361 
1362 	prev_io_path = bio->io_path;
1363 	bio->io_path = NULL;
1364 
1365 	if (!success) {
1366 		goto complete;
1367 	}
1368 
1369 	next_io_path = STAILQ_NEXT(prev_io_path, stailq);
1370 	if (next_io_path == NULL) {
1371 		goto complete;
1372 	}
1373 
1374 	rc = _bdev_nvme_reset_io(next_io_path, bio);
1375 	if (rc == 0) {
1376 		return;
1377 	}
1378 
1379 	success = false;
1380 
1381 complete:
1382 	bdev_nvme_reset_io_complete(bio, success);
1383 }
1384 
1385 static int
1386 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio)
1387 {
1388 	struct nvme_ctrlr_channel *ctrlr_ch = io_path->ctrlr_ch;
1389 	struct nvme_ctrlr *nvme_ctrlr;
1390 	struct spdk_bdev_io *bdev_io;
1391 	int rc;
1392 
1393 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1394 
1395 	rc = bdev_nvme_reset(nvme_ctrlr);
1396 	if (rc == 0) {
1397 		assert(bio->io_path == NULL);
1398 		bio->io_path = io_path;
1399 
1400 		assert(nvme_ctrlr->reset_cb_fn == NULL);
1401 		assert(nvme_ctrlr->reset_cb_arg == NULL);
1402 		nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue;
1403 		nvme_ctrlr->reset_cb_arg = bio;
1404 	} else if (rc == -EBUSY) {
1405 		/*
1406 		 * The reset is queued only when it comes from the app framework. This is on purpose so
1407 		 * that we do not interfere with the app framework's reset strategy, i.e. we defer to the
1408 		 * upper layer. If it is already in the middle of a reset, we do not schedule another one.
1409 		 */
1410 		bdev_io = spdk_bdev_io_from_ctx(bio);
1411 		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
1412 	} else {
1413 		return rc;
1414 	}
1415 
1416 	return 0;
1417 }
1418 
1419 static void
1420 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
1421 {
1422 	struct nvme_io_path *io_path;
1423 	int rc;
1424 
1425 	/* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now.
1426 	 *
1427 	 * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially.
1428 	 * This will be done in the following patches.
1429 	 */
1430 	io_path = STAILQ_FIRST(&nbdev_ch->io_path_list);
1431 	assert(io_path != NULL);
1432 
1433 	rc = _bdev_nvme_reset_io(io_path, bio);
1434 	if (rc != 0) {
1435 		bdev_nvme_reset_io_complete(bio, false);
1436 	}
1437 }
1438 
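/* Prepare a failover. The active path is marked failed; if an alternate path exists, the
 * controller is failed, its trid is switched to the next path, and the old path is either
 * rotated to the end of the list or freed when `remove` is true. Returns -EBUSY or
 * -EALREADY if a reset is already in progress and -ENXIO if the controller is being
 * destructed.
 */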
1439 static int
1440 bdev_nvme_failover_start(struct nvme_ctrlr *nvme_ctrlr, bool remove)
1441 {
1442 	struct nvme_path_id *path_id = NULL, *next_path = NULL;
1443 	int rc;
1444 
1445 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1446 	if (nvme_ctrlr->destruct) {
1447 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1448 		/* Don't bother resetting if the controller is in the process of being destructed. */
1449 		return -ENXIO;
1450 	}
1451 
1452 	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
1453 	assert(path_id);
1454 	assert(path_id == nvme_ctrlr->active_path_id);
1455 	next_path = TAILQ_NEXT(path_id, link);
1456 
1457 	if (nvme_ctrlr->resetting) {
1458 		if (next_path && !nvme_ctrlr->failover_in_progress) {
1459 			rc = -EBUSY;
1460 		} else {
1461 			rc = -EALREADY;
1462 		}
1463 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1464 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
1465 		return rc;
1466 	}
1467 
1468 	nvme_ctrlr->resetting = true;
1469 	path_id->is_failed = true;
1470 
1471 	if (next_path) {
1472 		assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
1473 
1474 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
1475 			       path_id->trid.trsvcid,	next_path->trid.traddr, next_path->trid.trsvcid);
1476 
1477 		nvme_ctrlr->failover_in_progress = true;
1478 		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
1479 		nvme_ctrlr->active_path_id = next_path;
1480 		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
1481 		assert(rc == 0);
1482 		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
1483 		if (!remove) {
1484 			/* Shuffle the old trid to the end of the list and use the new one.
1485 			 * This allows round-robin through multiple connections.
1486 			 */
1487 			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
1488 		} else {
1489 			free(path_id);
1490 		}
1491 	}
1492 
1493 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1494 	return 0;
1495 }
1496 
1497 static int
1498 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
1499 {
1500 	int rc;
1501 
1502 	rc = bdev_nvme_failover_start(nvme_ctrlr, remove);
1503 	if (rc == 0) {
1504 		/* First, delete all NVMe I/O queue pairs. */
1505 		spdk_for_each_channel(nvme_ctrlr,
1506 				      bdev_nvme_reset_destroy_qpair,
1507 				      NULL,
1508 				      bdev_nvme_reset_ctrlr);
1509 	} else if (rc != -EALREADY) {
1510 		return rc;
1511 	}
1512 
1513 	return 0;
1514 }
1515 
1516 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks,
1517 			   uint64_t num_blocks);
1518 
1519 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks,
1520 				  uint64_t num_blocks);
1521 
1522 static void
1523 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1524 		     bool success)
1525 {
1526 	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1527 	struct spdk_bdev *bdev = bdev_io->bdev;
1528 	int ret;
1529 
1530 	if (!success) {
1531 		ret = -EINVAL;
1532 		goto exit;
1533 	}
1534 
1535 	if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
1536 		ret = -ENXIO;
1537 		goto exit;
1538 	}
1539 
1540 	ret = bdev_nvme_readv(bio,
1541 			      bdev_io->u.bdev.iovs,
1542 			      bdev_io->u.bdev.iovcnt,
1543 			      bdev_io->u.bdev.md_buf,
1544 			      bdev_io->u.bdev.num_blocks,
1545 			      bdev_io->u.bdev.offset_blocks,
1546 			      bdev->dif_check_flags,
1547 			      bdev_io->internal.ext_opts);
1548 
1549 exit:
1550 	if (spdk_unlikely(ret != 0)) {
1551 		bdev_nvme_io_complete(bio, ret);
1552 	}
1553 }
1554 
1555 static void
1556 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
1557 {
1558 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
1559 	struct spdk_bdev *bdev = bdev_io->bdev;
1560 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1561 	struct nvme_bdev_io *nbdev_io_to_abort;
1562 	int rc = 0;
1563 
1564 	nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
1565 	if (spdk_unlikely(!nbdev_io->io_path)) {
1566 		if (!bdev_nvme_io_type_is_admin(bdev_io->type)) {
1567 			rc = -ENXIO;
1568 			goto exit;
1569 		}
1570 
1571 		/* Admin commands do not use the optimal I/O path.
1572 		 * Simply fall through even if it is not found.
1573 		 */
1574 	}
1575 
1576 	switch (bdev_io->type) {
1577 	case SPDK_BDEV_IO_TYPE_READ:
1578 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
1579 			rc = bdev_nvme_readv(nbdev_io,
1580 					     bdev_io->u.bdev.iovs,
1581 					     bdev_io->u.bdev.iovcnt,
1582 					     bdev_io->u.bdev.md_buf,
1583 					     bdev_io->u.bdev.num_blocks,
1584 					     bdev_io->u.bdev.offset_blocks,
1585 					     bdev->dif_check_flags,
1586 					     bdev_io->internal.ext_opts);
1587 		} else {
1588 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
1589 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
1590 			rc = 0;
1591 		}
1592 		break;
1593 	case SPDK_BDEV_IO_TYPE_WRITE:
1594 		rc = bdev_nvme_writev(nbdev_io,
1595 				      bdev_io->u.bdev.iovs,
1596 				      bdev_io->u.bdev.iovcnt,
1597 				      bdev_io->u.bdev.md_buf,
1598 				      bdev_io->u.bdev.num_blocks,
1599 				      bdev_io->u.bdev.offset_blocks,
1600 				      bdev->dif_check_flags,
1601 				      bdev_io->internal.ext_opts);
1602 		break;
1603 	case SPDK_BDEV_IO_TYPE_COMPARE:
1604 		rc = bdev_nvme_comparev(nbdev_io,
1605 					bdev_io->u.bdev.iovs,
1606 					bdev_io->u.bdev.iovcnt,
1607 					bdev_io->u.bdev.md_buf,
1608 					bdev_io->u.bdev.num_blocks,
1609 					bdev_io->u.bdev.offset_blocks,
1610 					bdev->dif_check_flags);
1611 		break;
1612 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
1613 		rc = bdev_nvme_comparev_and_writev(nbdev_io,
1614 						   bdev_io->u.bdev.iovs,
1615 						   bdev_io->u.bdev.iovcnt,
1616 						   bdev_io->u.bdev.fused_iovs,
1617 						   bdev_io->u.bdev.fused_iovcnt,
1618 						   bdev_io->u.bdev.md_buf,
1619 						   bdev_io->u.bdev.num_blocks,
1620 						   bdev_io->u.bdev.offset_blocks,
1621 						   bdev->dif_check_flags);
1622 		break;
1623 	case SPDK_BDEV_IO_TYPE_UNMAP:
1624 		rc = bdev_nvme_unmap(nbdev_io,
1625 				     bdev_io->u.bdev.offset_blocks,
1626 				     bdev_io->u.bdev.num_blocks);
1627 		break;
1628 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1629 		rc = bdev_nvme_write_zeroes(nbdev_io,
1630 					     bdev_io->u.bdev.offset_blocks,
1631 					     bdev_io->u.bdev.num_blocks);
1632 		break;
1633 	case SPDK_BDEV_IO_TYPE_RESET:
1634 		nbdev_io->io_path = NULL;
1635 		bdev_nvme_reset_io(nbdev_ch, nbdev_io);
1636 		break;
1637 	case SPDK_BDEV_IO_TYPE_FLUSH:
1638 		rc = bdev_nvme_flush(nbdev_io,
1639 				     bdev_io->u.bdev.offset_blocks,
1640 				     bdev_io->u.bdev.num_blocks);
1641 		break;
1642 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
1643 		rc = bdev_nvme_zone_appendv(nbdev_io,
1644 					    bdev_io->u.bdev.iovs,
1645 					    bdev_io->u.bdev.iovcnt,
1646 					    bdev_io->u.bdev.md_buf,
1647 					    bdev_io->u.bdev.num_blocks,
1648 					    bdev_io->u.bdev.offset_blocks,
1649 					    bdev->dif_check_flags);
1650 		break;
1651 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
1652 		rc = bdev_nvme_get_zone_info(nbdev_io,
1653 					     bdev_io->u.zone_mgmt.zone_id,
1654 					     bdev_io->u.zone_mgmt.num_zones,
1655 					     bdev_io->u.zone_mgmt.buf);
1656 		break;
1657 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
1658 		rc = bdev_nvme_zone_management(nbdev_io,
1659 					       bdev_io->u.zone_mgmt.zone_id,
1660 					       bdev_io->u.zone_mgmt.zone_action);
1661 		break;
1662 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1663 		nbdev_io->io_path = NULL;
1664 		rc = bdev_nvme_admin_passthru(nbdev_ch,
1665 					      nbdev_io,
1666 					      &bdev_io->u.nvme_passthru.cmd,
1667 					      bdev_io->u.nvme_passthru.buf,
1668 					      bdev_io->u.nvme_passthru.nbytes);
1669 		break;
1670 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1671 		rc = bdev_nvme_io_passthru(nbdev_io,
1672 					   &bdev_io->u.nvme_passthru.cmd,
1673 					   bdev_io->u.nvme_passthru.buf,
1674 					   bdev_io->u.nvme_passthru.nbytes);
1675 		break;
1676 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1677 		rc = bdev_nvme_io_passthru_md(nbdev_io,
1678 					      &bdev_io->u.nvme_passthru.cmd,
1679 					      bdev_io->u.nvme_passthru.buf,
1680 					      bdev_io->u.nvme_passthru.nbytes,
1681 					      bdev_io->u.nvme_passthru.md_buf,
1682 					      bdev_io->u.nvme_passthru.md_len);
1683 		break;
1684 	case SPDK_BDEV_IO_TYPE_ABORT:
1685 		nbdev_io->io_path = NULL;
1686 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
1687 		bdev_nvme_abort(nbdev_ch,
1688 				nbdev_io,
1689 				nbdev_io_to_abort);
1690 		break;
1691 	default:
1692 		rc = -EINVAL;
1693 		break;
1694 	}
1695 
1696 exit:
1697 	if (spdk_unlikely(rc != 0)) {
1698 		bdev_nvme_io_complete(nbdev_io, rc);
1699 	}
1700 }
1701 
1702 static bool
1703 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
1704 {
1705 	struct nvme_bdev *nbdev = ctx;
1706 	struct nvme_ns *nvme_ns;
1707 	struct spdk_nvme_ns *ns;
1708 	struct spdk_nvme_ctrlr *ctrlr;
1709 	const struct spdk_nvme_ctrlr_data *cdata;
1710 
1711 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
1712 	assert(nvme_ns != NULL);
1713 	ns = nvme_ns->ns;
1714 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1715 
1716 	switch (io_type) {
1717 	case SPDK_BDEV_IO_TYPE_READ:
1718 	case SPDK_BDEV_IO_TYPE_WRITE:
1719 	case SPDK_BDEV_IO_TYPE_RESET:
1720 	case SPDK_BDEV_IO_TYPE_FLUSH:
1721 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1722 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1723 	case SPDK_BDEV_IO_TYPE_ABORT:
1724 		return true;
1725 
1726 	case SPDK_BDEV_IO_TYPE_COMPARE:
1727 		return spdk_nvme_ns_supports_compare(ns);
1728 
1729 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1730 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
1731 
1732 	case SPDK_BDEV_IO_TYPE_UNMAP:
1733 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1734 		return cdata->oncs.dsm;
1735 
1736 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1737 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1738 		return cdata->oncs.write_zeroes;
1739 
1740 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
1741 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
1742 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
1743 			return true;
1744 		}
1745 		return false;
1746 
1747 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
1748 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
1749 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
1750 
1751 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
1752 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
1753 		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
1754 
1755 	default:
1756 		return false;
1757 	}
1758 }
1759 
1760 static int
1761 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
1762 {
1763 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
1764 	struct spdk_io_channel *pg_ch;
1765 	int rc;
1766 
1767 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
1768 	if (!pg_ch) {
1769 		return -1;
1770 	}
1771 
1772 	ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch);
1773 	TAILQ_INSERT_TAIL(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq);
1774 
1775 #ifdef SPDK_CONFIG_VTUNE
1776 	ctrlr_ch->group->collect_spin_stat = true;
1777 #else
1778 	ctrlr_ch->group->collect_spin_stat = false;
1779 #endif
1780 
1781 	TAILQ_INIT(&ctrlr_ch->pending_resets);
1782 	TAILQ_INIT(&ctrlr_ch->io_path_list);
1783 
1784 	rc = bdev_nvme_create_qpair(ctrlr_ch);
1785 	if (rc != 0) {
1786 		goto err_qpair;
1787 	}
1788 
1789 	return 0;
1790 
1791 err_qpair:
1792 	spdk_put_io_channel(pg_ch);
1793 
1794 	return rc;
1795 }
1796 
1797 static void
1798 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
1799 {
1800 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
1801 
1802 	assert(ctrlr_ch->group != NULL);
1803 
1804 	bdev_nvme_destroy_qpair(ctrlr_ch);
1805 
1806 	TAILQ_REMOVE(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq);
1807 
1808 	spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group));
1809 }
1810 
1811 static void
1812 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
1813 			      uint32_t iov_cnt, uint32_t seed,
1814 			      spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
1815 {
1816 	struct nvme_poll_group *group = ctx;
1817 	int rc;
1818 
1819 	assert(group->accel_channel != NULL);
1820 	assert(cb_fn != NULL);
1821 
1822 	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
1823 	if (rc) {
1824 		/* For -ENOMEM and -EINVAL, spdk_accel_submit_crc32cv() does not call the user's cb_fn, so call it here. */
1825 		if (rc == -ENOMEM || rc == -EINVAL) {
1826 			cb_fn(cb_arg, rc);
1827 		}
1828 		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
1829 	}
1830 }
1831 
1832 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
1833 	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
1834 	.submit_accel_crc32c	= bdev_nvme_submit_accel_crc32c,
1835 };
1836 
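/*
 * Poll group channel create callback. One nvme_poll_group exists per SPDK thread that
 * uses NVMe bdevs; it owns the spdk_nvme_poll_group, an accel-engine channel used for
 * offloaded crc32c, and the poller that reaps completions every nvme_ioq_poll_period_us.
 */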
1837 static int
1838 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
1839 {
1840 	struct nvme_poll_group *group = ctx_buf;
1841 
1842 	TAILQ_INIT(&group->ctrlr_ch_list);
1843 
1844 	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
1845 	if (group->group == NULL) {
1846 		return -1;
1847 	}
1848 
1849 	group->accel_channel = spdk_accel_engine_get_io_channel();
1850 	if (!group->accel_channel) {
1851 		spdk_nvme_poll_group_destroy(group->group);
1852 		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
1853 			    group);
1854 		return -1;
1855 	}
1856 
1857 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
1858 
1859 	if (group->poller == NULL) {
1860 		spdk_put_io_channel(group->accel_channel);
1861 		spdk_nvme_poll_group_destroy(group->group);
1862 		return -1;
1863 	}
1864 
1865 	return 0;
1866 }
1867 
1868 static void
1869 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
1870 {
1871 	struct nvme_poll_group *group = ctx_buf;
1872 
1873 	assert(TAILQ_EMPTY(&group->ctrlr_ch_list));
1874 
1875 	if (group->accel_channel) {
1876 		spdk_put_io_channel(group->accel_channel);
1877 	}
1878 
1879 	spdk_poller_unregister(&group->poller);
1880 	if (spdk_nvme_poll_group_destroy(group->group)) {
1881 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
1882 		assert(false);
1883 	}
1884 }
1885 
1886 static struct spdk_io_channel *
1887 bdev_nvme_get_io_channel(void *ctx)
1888 {
1889 	struct nvme_bdev *nvme_bdev = ctx;
1890 
1891 	return spdk_get_io_channel(nvme_bdev);
1892 }
1893 
1894 static void *
1895 bdev_nvme_get_module_ctx(void *ctx)
1896 {
1897 	struct nvme_bdev *nvme_bdev = ctx;
1898 	struct nvme_ns *nvme_ns;
1899 
1900 	if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) {
1901 		return NULL;
1902 	}
1903 
1904 	nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list);
1905 	if (!nvme_ns) {
1906 		return NULL;
1907 	}
1908 
1909 	return nvme_ns->ns;
1910 }
1911 
1912 static const char *
1913 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
1914 {
1915 	switch (ana_state) {
1916 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
1917 		return "optimized";
1918 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1919 		return "non_optimized";
1920 	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
1921 		return "inaccessible";
1922 	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
1923 		return "persistent_loss";
1924 	case SPDK_NVME_ANA_CHANGE_STATE:
1925 		return "change";
1926 	default:
1927 		return NULL;
1928 	}
1929 }
1930 
1931 static int
1932 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
1933 {
1934 	struct nvme_bdev *nbdev = ctx;
1935 	struct nvme_ns *nvme_ns;
1936 
1937 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
1938 	assert(nvme_ns != NULL);
1939 
1940 	return spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, domains, array_size);
1941 }
1942 
1943 static void
1944 nvme_namespace_info_json(struct spdk_json_write_ctx *w,
1945 			 struct nvme_ns *nvme_ns)
1946 {
1947 	struct spdk_nvme_ns *ns;
1948 	struct spdk_nvme_ctrlr *ctrlr;
1949 	const struct spdk_nvme_ctrlr_data *cdata;
1950 	const struct spdk_nvme_transport_id *trid;
1951 	union spdk_nvme_vs_register vs;
1952 	char buf[128];
1953 
1954 	ns = nvme_ns->ns;
1955 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1956 
1957 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1958 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
1959 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
1960 
1961 	spdk_json_write_object_begin(w);
1962 
1963 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1964 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
1965 	}
1966 
1967 	spdk_json_write_named_object_begin(w, "trid");
1968 
1969 	nvme_bdev_dump_trid_json(trid, w);
1970 
1971 	spdk_json_write_object_end(w);
1972 
1973 #ifdef SPDK_CONFIG_NVME_CUSE
1974 	size_t cuse_name_size = 128;
1975 	char cuse_name[cuse_name_size];
1976 
1977 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
1978 					    cuse_name, &cuse_name_size);
1979 	if (rc == 0) {
1980 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
1981 	}
1982 #endif
1983 
1984 	spdk_json_write_named_object_begin(w, "ctrlr_data");
1985 
1986 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
1987 
1988 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
1989 	spdk_str_trim(buf);
1990 	spdk_json_write_named_string(w, "model_number", buf);
1991 
1992 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
1993 	spdk_str_trim(buf);
1994 	spdk_json_write_named_string(w, "serial_number", buf);
1995 
1996 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
1997 	spdk_str_trim(buf);
1998 	spdk_json_write_named_string(w, "firmware_revision", buf);
1999 
2000 	if (cdata->subnqn[0] != '\0') {
2001 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
2002 	}
2003 
2004 	spdk_json_write_named_object_begin(w, "oacs");
2005 
2006 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
2007 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
2008 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
2009 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
2010 
2011 	spdk_json_write_object_end(w);
2012 
2013 	spdk_json_write_object_end(w);
2014 
2015 	spdk_json_write_named_object_begin(w, "vs");
2016 
2017 	spdk_json_write_name(w, "nvme_version");
2018 	if (vs.bits.ter) {
2019 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
2020 	} else {
2021 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
2022 	}
2023 
2024 	spdk_json_write_object_end(w);
2025 
2026 	spdk_json_write_named_object_begin(w, "ns_data");
2027 
2028 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
2029 
2030 	if (cdata->cmic.ana_reporting) {
2031 		spdk_json_write_named_string(w, "ana_state",
2032 					     _nvme_ana_state_str(nvme_ns->ana_state));
2033 	}
2034 
2035 	spdk_json_write_object_end(w);
2036 
2037 	if (cdata->oacs.security) {
2038 		spdk_json_write_named_object_begin(w, "security");
2039 
2040 		spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal);
2041 
2042 		spdk_json_write_object_end(w);
2043 	}
2044 
2045 	spdk_json_write_object_end(w);
2046 }
2047 
2048 static int
2049 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
2050 {
2051 	struct nvme_bdev *nvme_bdev = ctx;
2052 	struct nvme_ns *nvme_ns;
2053 
2054 	pthread_mutex_lock(&nvme_bdev->mutex);
2055 	spdk_json_write_named_array_begin(w, "nvme");
2056 	TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) {
2057 		nvme_namespace_info_json(w, nvme_ns);
2058 	}
2059 	spdk_json_write_array_end(w);
2060 	pthread_mutex_unlock(&nvme_bdev->mutex);
2061 
2062 	return 0;
2063 }
2064 
2065 static void
2066 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
2067 {
2068 	/* No config per bdev needed */
2069 }
2070 
2071 static uint64_t
2072 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
2073 {
2074 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
2075 	struct nvme_io_path *io_path;
2076 	struct nvme_poll_group *group;
2077 	uint64_t spin_time = 0;
2078 
2079 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
2080 		group = io_path->ctrlr_ch->group;
2081 
2082 		if (!group || !group->collect_spin_stat) {
2083 			continue;
2084 		}
2085 
2086 		if (group->end_ticks != 0) {
2087 			group->spin_ticks += (group->end_ticks - group->start_ticks);
2088 			group->end_ticks = 0;
2089 		}
2090 
2091 		spin_time += group->spin_ticks;
2092 		group->start_ticks = 0;
2093 		group->spin_ticks = 0;
2094 	}
2095 
2096 	return (spin_time * 1000000ULL) / spdk_get_ticks_hz();
2097 }
2098 
2099 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
2100 	.destruct		= bdev_nvme_destruct,
2101 	.submit_request		= bdev_nvme_submit_request,
2102 	.io_type_supported	= bdev_nvme_io_type_supported,
2103 	.get_io_channel		= bdev_nvme_get_io_channel,
2104 	.dump_info_json		= bdev_nvme_dump_info_json,
2105 	.write_config_json	= bdev_nvme_write_config_json,
2106 	.get_spin_time		= bdev_nvme_get_spin_time,
2107 	.get_module_ctx		= bdev_nvme_get_module_ctx,
2108 	.get_memory_domains	= bdev_nvme_get_memory_domains,
2109 };
2110 
2111 typedef int (*bdev_nvme_parse_ana_log_page_cb)(
2112 	const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);
2113 
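/*
 * Walk the ANA log page previously read into nvme_ctrlr->ana_log_page and invoke cb_fn
 * for each ANA group descriptor. Descriptors are copied into nvme_ctrlr->copied_ana_desc
 * first because they are not guaranteed to be aligned within the raw log page. A non-zero
 * return value from cb_fn stops the iteration.
 */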
2114 static int
2115 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
2116 			     bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
2117 {
2118 	struct spdk_nvme_ana_group_descriptor *copied_desc;
2119 	uint8_t *orig_desc;
2120 	uint32_t i, desc_size, copy_len;
2121 	int rc = 0;
2122 
2123 	if (nvme_ctrlr->ana_log_page == NULL) {
2124 		return -EINVAL;
2125 	}
2126 
2127 	copied_desc = nvme_ctrlr->copied_ana_desc;
2128 
2129 	orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
2130 	copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page);
2131 
2132 	for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
2133 		memcpy(copied_desc, orig_desc, copy_len);
2134 
2135 		rc = cb_fn(copied_desc, cb_arg);
2136 		if (rc != 0) {
2137 			break;
2138 		}
2139 
2140 		desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
2141 			    copied_desc->num_of_nsid * sizeof(uint32_t);
2142 		orig_desc += desc_size;
2143 		copy_len -= desc_size;
2144 	}
2145 
2146 	return rc;
2147 }
2148 
2149 static int
2150 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
2151 {
2152 	struct nvme_ns *nvme_ns = cb_arg;
2153 	uint32_t i;
2154 
2155 	for (i = 0; i < desc->num_of_nsid; i++) {
2156 		if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
2157 			continue;
2158 		}
2159 		nvme_ns->ana_group_id = desc->ana_group_id;
2160 		nvme_ns->ana_state = desc->ana_state;
2161 		return 1;
2162 	}
2163 
2164 	return 0;
2165 }
2166 
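/*
 * Fill in the spdk_bdev structure for a namespace: product name and zoned parameters
 * based on the command set (NVM or ZNS), block size and count, UUID or NGUID, PI
 * settings, and atomic write parameters taken from the controller and namespace data.
 * The resulting bdev is named "<base_name>n<nsid>"; for example, a hypothetical base
 * name "Nvme0" with NSID 1 yields "Nvme0n1".
 */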
2167 static int
2168 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
2169 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
2170 		 uint32_t prchk_flags, void *ctx)
2171 {
2172 	const struct spdk_uuid		*uuid;
2173 	const uint8_t *nguid;
2174 	const struct spdk_nvme_ctrlr_data *cdata;
2175 	const struct spdk_nvme_ns_data	*nsdata;
2176 	enum spdk_nvme_csi		csi;
2177 	uint32_t atomic_bs, phys_bs, bs;
2178 
2179 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2180 	csi = spdk_nvme_ns_get_csi(ns);
2181 
2182 	switch (csi) {
2183 	case SPDK_NVME_CSI_NVM:
2184 		disk->product_name = "NVMe disk";
2185 		break;
2186 	case SPDK_NVME_CSI_ZNS:
2187 		disk->product_name = "NVMe ZNS disk";
2188 		disk->zoned = true;
2189 		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
2190 		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
2191 					     spdk_nvme_ns_get_extended_sector_size(ns);
2192 		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
2193 		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
2194 		break;
2195 	default:
2196 		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
2197 		return -ENOTSUP;
2198 	}
2199 
2200 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
2201 	if (!disk->name) {
2202 		return -ENOMEM;
2203 	}
2204 
2205 	disk->write_cache = 0;
2206 	if (cdata->vwc.present) {
2207 		/* Enable if the Volatile Write Cache exists */
2208 		disk->write_cache = 1;
2209 	}
2210 	if (cdata->oncs.write_zeroes) {
2211 		disk->max_write_zeroes = UINT16_MAX + 1;
2212 	}
2213 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
2214 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
2215 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
2216 
2217 	nguid = spdk_nvme_ns_get_nguid(ns);
2218 	if (!nguid) {
2219 		uuid = spdk_nvme_ns_get_uuid(ns);
2220 		if (uuid) {
2221 			disk->uuid = *uuid;
2222 		}
2223 	} else {
2224 		memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
2225 	}
2226 
2227 	nsdata = spdk_nvme_ns_get_data(ns);
2228 	bs = spdk_nvme_ns_get_sector_size(ns);
2229 	atomic_bs = bs;
2230 	phys_bs = bs;
2231 	if (nsdata->nabo == 0) {
2232 		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
2233 			atomic_bs = bs * (1 + nsdata->nawupf);
2234 		} else {
2235 			atomic_bs = bs * (1 + cdata->awupf);
2236 		}
2237 	}
2238 	if (nsdata->nsfeat.optperf) {
2239 		phys_bs = bs * (1 + nsdata->npwg);
2240 	}
2241 	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
2242 
2243 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
2244 	if (disk->md_len != 0) {
2245 		disk->md_interleave = nsdata->flbas.extended;
2246 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
2247 		if (disk->dif_type != SPDK_DIF_DISABLE) {
2248 			disk->dif_is_head_of_md = nsdata->dps.md_start;
2249 			disk->dif_check_flags = prchk_flags;
2250 		}
2251 	}
2252 
2253 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
2254 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
2255 		disk->acwu = 0;
2256 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
2257 		disk->acwu = nsdata->nacwu;
2258 	} else {
2259 		disk->acwu = cdata->acwu;
2260 	}
2261 
2262 	disk->ctxt = ctx;
2263 	disk->fn_table = &nvmelib_fn_table;
2264 	disk->module = &nvme_if;
2265 
2266 	return 0;
2267 }
2268 
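/*
 * Create a new nvme_bdev backed by the given namespace: register it as an io_device so
 * each thread can get a nvme_bdev_channel, register the bdev with the bdev layer, and
 * link it into the parent nvme_bdev_ctrlr's bdev list.
 */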
2269 static int
2270 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2271 {
2272 	struct nvme_bdev *bdev;
2273 	int rc;
2274 
2275 	bdev = calloc(1, sizeof(*bdev));
2276 	if (!bdev) {
2277 		SPDK_ERRLOG("bdev calloc() failed\n");
2278 		return -ENOMEM;
2279 	}
2280 
2281 	rc = pthread_mutex_init(&bdev->mutex, NULL);
2282 	if (rc != 0) {
2283 		free(bdev);
2284 		return rc;
2285 	}
2286 
2287 	bdev->ref = 1;
2288 	TAILQ_INIT(&bdev->nvme_ns_list);
2289 	TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
2290 	bdev->opal = nvme_ctrlr->opal_dev != NULL;
2291 
2292 	rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr,
2293 			      nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev);
2294 	if (rc != 0) {
2295 		SPDK_ERRLOG("Failed to create NVMe disk\n");
2296 		pthread_mutex_destroy(&bdev->mutex);
2297 		free(bdev);
2298 		return rc;
2299 	}
2300 
2301 	spdk_io_device_register(bdev,
2302 				bdev_nvme_create_bdev_channel_cb,
2303 				bdev_nvme_destroy_bdev_channel_cb,
2304 				sizeof(struct nvme_bdev_channel),
2305 				bdev->disk.name);
2306 
2307 	rc = spdk_bdev_register(&bdev->disk);
2308 	if (rc != 0) {
2309 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
2310 		spdk_io_device_unregister(bdev, NULL);
2311 		pthread_mutex_destroy(&bdev->mutex);
2312 		free(bdev->disk.name);
2313 		free(bdev);
2314 		return rc;
2315 	}
2316 
2317 	nvme_ns->bdev = bdev;
2318 	bdev->nsid = nvme_ns->id;
2319 
2320 	bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr;
2321 	TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq);
2322 
2323 	return 0;
2324 }
2325 
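/*
 * Two namespaces reached over different paths are treated as the same namespace only if
 * their NGUID, EUI64, and UUID all match. As written, this also requires both namespaces
 * to report a UUID.
 */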
2326 static bool
2327 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
2328 {
2329 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
2330 	const struct spdk_uuid *uuid1, *uuid2;
2331 
2332 	nsdata1 = spdk_nvme_ns_get_data(ns1);
2333 	nsdata2 = spdk_nvme_ns_get_data(ns2);
2334 	uuid1 = spdk_nvme_ns_get_uuid(ns1);
2335 	uuid2 = spdk_nvme_ns_get_uuid(ns2);
2336 
2337 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
2338 	       nsdata1->eui64 == nsdata2->eui64 &&
2339 	       uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0;
2340 }
2341 
2342 static bool
2343 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2344 		 struct spdk_nvme_ctrlr_opts *opts)
2345 {
2346 	struct nvme_probe_skip_entry *entry;
2347 
2348 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
2349 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2350 			return false;
2351 		}
2352 	}
2353 
2354 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
2355 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
2356 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
2357 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
2358 	opts->disable_read_ana_log_page = true;
2359 
2360 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
2361 
2362 	return true;
2363 }
2364 
2365 static void
2366 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
2367 {
2368 	struct nvme_ctrlr *nvme_ctrlr = ctx;
2369 
2370 	if (spdk_nvme_cpl_is_error(cpl)) {
2371 		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
2372 			     cpl->status.sct);
2373 		bdev_nvme_reset(nvme_ctrlr);
2374 	} else if (cpl->cdw0 & 0x1) {
2375 		SPDK_WARNLOG("Specified command could not be aborted.\n");
2376 		bdev_nvme_reset(nvme_ctrlr);
2377 	}
2378 }
2379 
2380 static void
2381 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
2382 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
2383 {
2384 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
2385 	union spdk_nvme_csts_register csts;
2386 	int rc;
2387 
2388 	assert(nvme_ctrlr->ctrlr == ctrlr);
2389 
2390 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
2391 
2392 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
2393 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
2394 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
2395 	 * completion recursively.
2396 	 */
2397 	if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
2398 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
2399 		if (csts.bits.cfs) {
2400 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
2401 			bdev_nvme_reset(nvme_ctrlr);
2402 			return;
2403 		}
2404 	}
2405 
2406 	switch (g_opts.action_on_timeout) {
2407 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
2408 		if (qpair) {
2409 			/* Don't send abort to ctrlr when reset is running. */
2410 			pthread_mutex_lock(&nvme_ctrlr->mutex);
2411 			if (nvme_ctrlr->resetting) {
2412 				pthread_mutex_unlock(&nvme_ctrlr->mutex);
2413 				SPDK_NOTICELOG("Quit abort. Ctrlr is in the process of resetting.\n");
2414 				return;
2415 			}
2416 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
2417 
2418 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
2419 						       nvme_abort_cpl, nvme_ctrlr);
2420 			if (rc == 0) {
2421 				return;
2422 			}
2423 
2424 			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
2425 		}
2426 
2427 	/* FALLTHROUGH */
2428 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
2429 		bdev_nvme_reset(nvme_ctrlr);
2430 		break;
2431 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
2432 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
2433 		break;
2434 	default:
2435 		SPDK_ERRLOG("Invalid timeout action value.\n");
2436 		break;
2437 	}
2438 }
2439 
2440 static void
2441 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc)
2442 {
2443 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
2444 	struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx;
2445 
2446 	if (rc == 0) {
2447 		nvme_ns->probe_ctx = NULL;
2448 		pthread_mutex_lock(&nvme_ctrlr->mutex);
2449 		nvme_ctrlr->ref++;
2450 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2451 	} else {
2452 		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
2453 		free(nvme_ns);
2454 	}
2455 
2456 	if (ctx) {
2457 		ctx->populates_in_progress--;
2458 		if (ctx->populates_in_progress == 0) {
2459 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
2460 		}
2461 	}
2462 }
2463 
2464 static void
2465 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i)
2466 {
2467 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2468 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
2469 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2470 	int rc;
2471 
2472 	rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
2473 	if (rc != 0) {
2474 		SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n");
2475 	}
2476 
2477 	spdk_for_each_channel_continue(i, rc);
2478 }
2479 
2480 static void
2481 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i)
2482 {
2483 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2484 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
2485 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2486 	struct nvme_io_path *io_path;
2487 
2488 	io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns);
2489 	if (io_path != NULL) {
2490 		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
2491 	}
2492 
2493 	spdk_for_each_channel_continue(i, 0);
2494 }
2495 
2496 static void
2497 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status)
2498 {
2499 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2500 
2501 	nvme_ctrlr_populate_namespace_done(nvme_ns, -1);
2502 }
2503 
2504 static void
2505 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status)
2506 {
2507 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2508 	struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i);
2509 
2510 	if (status == 0) {
2511 		nvme_ctrlr_populate_namespace_done(nvme_ns, 0);
2512 	} else {
2513 		/* Delete the added io_paths and fail populating the namespace. */
2514 		spdk_for_each_channel(bdev,
2515 				      bdev_nvme_delete_io_path,
2516 				      nvme_ns,
2517 				      bdev_nvme_add_io_path_failed);
2518 	}
2519 }
2520 
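/*
 * Attach a namespace found on an additional path to an existing nvme_bdev. The new
 * namespace must be identical to the one already backing the bdev; if so, an
 * nvme_io_path is added to every existing nvme_bdev_channel via spdk_for_each_channel().
 */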
2521 static int
2522 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns)
2523 {
2524 	struct nvme_ns *tmp_ns;
2525 
2526 	pthread_mutex_lock(&bdev->mutex);
2527 
2528 	tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list);
2529 	assert(tmp_ns != NULL);
2530 
2531 	if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) {
2532 		pthread_mutex_unlock(&bdev->mutex);
2533 		SPDK_ERRLOG("Namespaces are not identical.\n");
2534 		return -EINVAL;
2535 	}
2536 
2537 	bdev->ref++;
2538 	TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
2539 	nvme_ns->bdev = bdev;
2540 
2541 	pthread_mutex_unlock(&bdev->mutex);
2542 
2543 	/* Add nvme_io_path to nvme_bdev_channels dynamically. */
2544 	spdk_for_each_channel(bdev,
2545 			      bdev_nvme_add_io_path,
2546 			      nvme_ns,
2547 			      bdev_nvme_add_io_path_done);
2548 
2549 	return 0;
2550 }
2551 
2552 static void
2553 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2554 {
2555 	struct spdk_nvme_ns	*ns;
2556 	struct nvme_bdev	*bdev;
2557 	int			rc = 0;
2558 
2559 	ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id);
2560 	if (!ns) {
2561 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
2562 		rc = -EINVAL;
2563 		goto done;
2564 	}
2565 
2566 	nvme_ns->ns = ns;
2567 	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
2568 
2569 	if (nvme_ctrlr->ana_log_page != NULL) {
2570 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
2571 	}
2572 
2573 	bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id);
2574 	if (bdev == NULL) {
2575 		rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
2576 	} else {
2577 		rc = nvme_bdev_add_ns(bdev, nvme_ns);
2578 		if (rc == 0) {
2579 			return;
2580 		}
2581 	}
2582 done:
2583 	nvme_ctrlr_populate_namespace_done(nvme_ns, rc);
2584 }
2585 
2586 static void
2587 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns)
2588 {
2589 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
2590 
2591 	assert(nvme_ctrlr != NULL);
2592 
2593 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2594 
2595 	RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
2596 
2597 	if (nvme_ns->bdev != NULL) {
2598 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2599 		return;
2600 	}
2601 
2602 	free(nvme_ns);
2603 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2604 
2605 	nvme_ctrlr_release(nvme_ctrlr);
2606 }
2607 
2608 static void
2609 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status)
2610 {
2611 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2612 
2613 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
2614 }
2615 
2616 static void
2617 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2618 {
2619 	struct nvme_bdev *bdev;
2620 
2621 	bdev = nvme_ns->bdev;
2622 	if (bdev != NULL) {
2623 		pthread_mutex_lock(&bdev->mutex);
2624 
2625 		assert(bdev->ref > 0);
2626 		bdev->ref--;
2627 		if (bdev->ref == 0) {
2628 			pthread_mutex_unlock(&bdev->mutex);
2629 
2630 			spdk_bdev_unregister(&bdev->disk, NULL, NULL);
2631 		} else {
2632 			/* spdk_bdev_unregister() is not called until the last nvme_ns is
2633 			 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list
2634 			 * and clear nvme_ns->bdev here.
2635 			 */
2636 			TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq);
2637 			nvme_ns->bdev = NULL;
2638 
2639 			pthread_mutex_unlock(&bdev->mutex);
2640 
2641 			/* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that,
2642 			 * we call depopulate_namespace_done() to avoid use-after-free.
2643 			 */
2644 			spdk_for_each_channel(bdev,
2645 					      bdev_nvme_delete_io_path,
2646 					      nvme_ns,
2647 					      bdev_nvme_delete_io_path_done);
2648 			return;
2649 		}
2650 	}
2651 
2652 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
2653 }
2654 
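/*
 * Reconcile the bdev module's namespace list with the controller's active namespace
 * list. The first pass walks the namespaces already tracked and handles resize or
 * removal; the second pass walks the controller's active NSIDs and populates any
 * namespaces that are new. ctx->populates_in_progress is primed to 1 so that populate
 * functions completing synchronously cannot fire the done callback before the loop ends.
 */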
2655 static void
2656 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
2657 			       struct nvme_async_probe_ctx *ctx)
2658 {
2659 	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
2660 	struct nvme_ns	*nvme_ns, *next;
2661 	struct spdk_nvme_ns	*ns;
2662 	struct nvme_bdev	*bdev;
2663 	uint32_t		nsid;
2664 	int			rc;
2665 	uint64_t		num_sectors;
2666 
2667 	if (ctx) {
2668 		/* Initialize this count to 1 to handle the populate functions
2669 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
2670 		 */
2671 		ctx->populates_in_progress = 1;
2672 	}
2673 
2674 	/* First loop over our existing namespaces and see if they have been
2675 	 * removed. */
2676 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
2677 	while (nvme_ns != NULL) {
2678 		next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
2679 
2680 		if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
2681 			/* NS is still there but attributes may have changed */
2682 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
2683 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
2684 			bdev = nvme_ns->bdev;
2685 			assert(bdev != NULL);
2686 			if (bdev->disk.blockcnt != num_sectors) {
2687 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
2688 					       nvme_ns->id,
2689 					       bdev->disk.name,
2690 					       bdev->disk.blockcnt,
2691 					       num_sectors);
2692 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
2693 				if (rc != 0) {
2694 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
2695 						    bdev->disk.name, rc);
2696 				}
2697 			}
2698 		} else {
2699 			/* Namespace was removed */
2700 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
2701 		}
2702 
2703 		nvme_ns = next;
2704 	}
2705 
2706 	/* Loop through all of the namespaces at the nvme level and see if any of them are new */
2707 	nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
2708 	while (nsid != 0) {
2709 		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
2710 
2711 		if (nvme_ns == NULL) {
2712 			/* Found a new one */
2713 			nvme_ns = calloc(1, sizeof(struct nvme_ns));
2714 			if (nvme_ns == NULL) {
2715 				SPDK_ERRLOG("Failed to allocate namespace\n");
2716 				/* This just fails to attach the namespace. It may work on a future attempt. */
				/* Advance to the next active NSID so the loop does not spin on this namespace. */
				nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
2717 				continue;
2718 			}
2719 
2720 			nvme_ns->id = nsid;
2721 			nvme_ns->ctrlr = nvme_ctrlr;
2722 
2723 			nvme_ns->bdev = NULL;
2724 
2725 			if (ctx) {
2726 				ctx->populates_in_progress++;
2727 			}
2728 			nvme_ns->probe_ctx = ctx;
2729 
2730 			RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
2731 
2732 			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns);
2733 		}
2734 
2735 		nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
2736 	}
2737 
2738 	if (ctx) {
2739 		/* Decrement this count now that the loop is over to account
2740 		 * for the one we started with.  If the count is then 0, we
2741 		 * know any populate_namespace functions completed immediately,
2742 		 * so we'll kick the callback here.
2743 		 */
2744 		ctx->populates_in_progress--;
2745 		if (ctx->populates_in_progress == 0) {
2746 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
2747 		}
2748 	}
2749 
2750 }
2751 
2752 static void
2753 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
2754 {
2755 	struct nvme_ns *nvme_ns, *tmp;
2756 
2757 	RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) {
2758 		nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
2759 	}
2760 }
2761 
2762 static int
2763 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
2764 			  void *cb_arg)
2765 {
2766 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
2767 	struct nvme_ns *nvme_ns;
2768 	uint32_t i, nsid;
2769 
2770 	for (i = 0; i < desc->num_of_nsid; i++) {
2771 		nsid = desc->nsid[i];
2772 		if (nsid == 0) {
2773 			continue;
2774 		}
2775 
2776 		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
2777 
2778 		assert(nvme_ns != NULL);
2779 		if (nvme_ns == NULL) {
2780 			/* Target told us that an inactive namespace had an ANA change */
2781 			continue;
2782 		}
2783 
2784 		nvme_ns->ana_group_id = desc->ana_group_id;
2785 		nvme_ns->ana_state = desc->ana_state;
2786 		nvme_ns->ana_state_updating = false;
2787 	}
2788 
2789 	return 0;
2790 }
2791 
2792 static void
2793 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
2794 {
2795 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2796 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
2797 
2798 	_bdev_nvme_clear_io_path_cache(ctrlr_ch);
2799 
2800 	spdk_for_each_channel_continue(i, 0);
2801 }
2802 
2803 static void
2804 bdev_nvme_clear_io_path_cache_done(struct spdk_io_channel_iter *i, int status)
2805 {
2806 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
2807 
2808 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2809 
2810 	assert(nvme_ctrlr->ana_log_page_updating == true);
2811 	nvme_ctrlr->ana_log_page_updating = false;
2812 
2813 	if (nvme_ctrlr->ref > 0 || !nvme_ctrlr->destruct ||
2814 	    nvme_ctrlr->resetting) {
2815 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2816 		return;
2817 	}
2818 
2819 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2820 
2821 	nvme_ctrlr_unregister(nvme_ctrlr);
2822 }
2823 
2824 static void
2825 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
2826 {
2827 	struct nvme_ctrlr *nvme_ctrlr = ctx;
2828 
2829 	if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) {
2830 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
2831 					     nvme_ctrlr);
2832 	}
2833 
2834 	spdk_for_each_channel(nvme_ctrlr,
2835 			      bdev_nvme_clear_io_path_cache,
2836 			      NULL,
2837 			      bdev_nvme_clear_io_path_cache_done);
2838 }
2839 
2840 static void
2841 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
2842 {
2843 	int rc;
2844 
2845 	if (nvme_ctrlr->ana_log_page == NULL) {
2846 		return;
2847 	}
2848 
2849 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2850 	if (nvme_ctrlr->destruct || nvme_ctrlr->resetting ||
2851 	    nvme_ctrlr->ana_log_page_updating) {
2852 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2853 		return;
2854 	}
2855 
2856 	nvme_ctrlr->ana_log_page_updating = true;
2857 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2858 
2859 	rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
2860 					      SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
2861 					      SPDK_NVME_GLOBAL_NS_TAG,
2862 					      nvme_ctrlr->ana_log_page,
2863 					      nvme_ctrlr->ana_log_page_size, 0,
2864 					      nvme_ctrlr_read_ana_log_page_done,
2865 					      nvme_ctrlr);
2866 	if (rc != 0) {
2867 		nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL);
2868 	}
2869 }
2870 
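/*
 * Asynchronous Event Request callback. A namespace attribute change notice triggers a
 * re-population of namespaces; an ANA change notice triggers a re-read of the ANA log page.
 */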
2871 static void
2872 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
2873 {
2874 	struct nvme_ctrlr *nvme_ctrlr		= arg;
2875 	union spdk_nvme_async_event_completion	event;
2876 
2877 	if (spdk_nvme_cpl_is_error(cpl)) {
2878 		SPDK_WARNLOG("AER request execution failed\n");
2879 		return;
2880 	}
2881 
2882 	event.raw = cpl->cdw0;
2883 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
2884 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
2885 		nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
2886 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
2887 		   (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
2888 		nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
2889 	}
2890 }
2891 
2892 static void
2893 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
2894 {
2895 	if (ctx->cb_fn) {
2896 		ctx->cb_fn(ctx->cb_ctx, count, rc);
2897 	}
2898 
2899 	ctx->namespaces_populated = true;
2900 	if (ctx->probe_done) {
2901 		/* The probe was already completed, so we need to free the context
2902 		 * here.  This can happen for cases like OCSSD, where we need to
2903 		 * send additional commands to the SSD after attach.
2904 		 */
2905 		free(ctx);
2906 	}
2907 }
2908 
2909 static void
2910 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
2911 		       struct nvme_async_probe_ctx *ctx)
2912 {
2913 	spdk_io_device_register(nvme_ctrlr,
2914 				bdev_nvme_create_ctrlr_channel_cb,
2915 				bdev_nvme_destroy_ctrlr_channel_cb,
2916 				sizeof(struct nvme_ctrlr_channel),
2917 				nvme_ctrlr->nbdev_ctrlr->name);
2918 
2919 	nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
2920 }
2921 
2922 static void
2923 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
2924 {
2925 	struct nvme_ctrlr *nvme_ctrlr = _ctx;
2926 	struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
2927 
2928 	nvme_ctrlr->probe_ctx = NULL;
2929 
2930 	if (spdk_nvme_cpl_is_error(cpl)) {
2931 		nvme_ctrlr_delete(nvme_ctrlr);
2932 
2933 		if (ctx != NULL) {
2934 			populate_namespaces_cb(ctx, 0, -1);
2935 		}
2936 		return;
2937 	}
2938 
2939 	nvme_ctrlr_create_done(nvme_ctrlr, ctx);
2940 }
2941 
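/*
 * Allocate the ANA log page buffer and issue the initial Get Log Page command. The
 * buffer is sized for the worst case: the log page header plus one group descriptor per
 * ANA group (cdata->nanagrpid) plus one NSID entry per namespace (cdata->nn). For
 * example, assuming the 16-byte header and 32-byte descriptors defined by the NVMe spec,
 * a controller with nanagrpid = 32 and nn = 1024 needs 16 + 32 * 32 + 1024 * 4 = 5136 bytes.
 */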
2942 static int
2943 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
2944 			     struct nvme_async_probe_ctx *ctx)
2945 {
2946 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
2947 	const struct spdk_nvme_ctrlr_data *cdata;
2948 	uint32_t ana_log_page_size;
2949 
2950 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2951 
2952 	ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
2953 			    sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn *
2954 			    sizeof(uint32_t);
2955 
2956 	nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
2957 						SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
2958 	if (nvme_ctrlr->ana_log_page == NULL) {
2959 		SPDK_ERRLOG("could not allocate ANA log page buffer\n");
2960 		return -ENXIO;
2961 	}
2962 
2963 	/* Each descriptor in an ANA log page is not guaranteed to be 8-byte aligned.
2964 	 * Hence copy each descriptor to a temporary area when parsing it.
2965 	 *
2966 	 * Allocate a buffer as large as the whole ANA log page buffer because
2967 	 * we do not know the size of a descriptor until actually reading it.
2968 	 */
2969 	nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
2970 	if (nvme_ctrlr->copied_ana_desc == NULL) {
2971 		SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
2972 		return -ENOMEM;
2973 	}
2974 
2975 	nvme_ctrlr->ana_log_page_size = ana_log_page_size;
2976 
2977 	nvme_ctrlr->probe_ctx = ctx;
2978 
2979 	return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
2980 						SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
2981 						SPDK_NVME_GLOBAL_NS_TAG,
2982 						nvme_ctrlr->ana_log_page,
2983 						nvme_ctrlr->ana_log_page_size, 0,
2984 						nvme_ctrlr_init_ana_log_page_done,
2985 						nvme_ctrlr);
2986 }
2987 
2988 /* hostnqn and subnqn were already verified before attaching a controller.
2989  * Hence check only the multipath capability and cntlid here.
2990  */
2991 static bool
2992 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
2993 {
2994 	struct nvme_ctrlr *tmp;
2995 	const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;
2996 
2997 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2998 
2999 	if (!cdata->cmic.multi_ctrlr) {
3000 		SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
3001 		return false;
3002 	}
3003 
3004 	TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
3005 		tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);
3006 
3007 		if (!tmp_cdata->cmic.multi_ctrlr) {
3008 			SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", tmp_cdata->cntlid);
3009 			return false;
3010 		}
3011 		if (cdata->cntlid == tmp_cdata->cntlid) {
3012 			SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid);
3013 			return false;
3014 		}
3015 	}
3016 
3017 	return true;
3018 }
3019 
3020 static int
3021 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
3022 {
3023 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
3024 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
3025 	int rc = 0;
3026 
3027 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3028 
3029 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
3030 	if (nbdev_ctrlr != NULL) {
3031 		if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
3032 			rc = -EINVAL;
3033 			goto exit;
3034 		}
3035 	} else {
3036 		nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
3037 		if (nbdev_ctrlr == NULL) {
3038 			SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
3039 			rc = -ENOMEM;
3040 			goto exit;
3041 		}
3042 		nbdev_ctrlr->name = strdup(name);
3043 		if (nbdev_ctrlr->name == NULL) {
3044 			SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
3045 			free(nbdev_ctrlr);
			rc = -ENOMEM;
3046 			goto exit;
3047 		}
3048 		TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
3049 		TAILQ_INIT(&nbdev_ctrlr->bdevs);
3050 		TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
3051 	}
3052 	nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
3053 	TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
3054 exit:
3055 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3056 	return rc;
3057 }
3058 
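/*
 * Create the nvme_ctrlr wrapper for a newly attached spdk_nvme_ctrlr: record the initial
 * path, start the admin queue poller, register timeout/AER/remove callbacks, attach the
 * controller to (or create) the named nvme_bdev_ctrlr, and then either read the ANA log
 * page (when ANA reporting is supported) or finish creation immediately.
 */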
3059 static int
3060 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
3061 		  const char *name,
3062 		  const struct spdk_nvme_transport_id *trid,
3063 		  uint32_t prchk_flags,
3064 		  struct nvme_async_probe_ctx *ctx)
3065 {
3066 	struct nvme_ctrlr *nvme_ctrlr;
3067 	struct nvme_path_id *path_id;
3068 	const struct spdk_nvme_ctrlr_data *cdata;
3069 	int rc;
3070 
3071 	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
3072 	if (nvme_ctrlr == NULL) {
3073 		SPDK_ERRLOG("Failed to allocate device struct\n");
3074 		return -ENOMEM;
3075 	}
3076 
3077 	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
3078 	if (rc != 0) {
3079 		free(nvme_ctrlr);
3080 		return rc;
3081 	}
3082 
3083 	TAILQ_INIT(&nvme_ctrlr->trids);
3084 
3085 	RB_INIT(&nvme_ctrlr->namespaces);
3086 
3087 	path_id = calloc(1, sizeof(*path_id));
3088 	if (path_id == NULL) {
3089 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
3090 		rc = -ENOMEM;
3091 		goto err;
3092 	}
3093 
3094 	path_id->trid = *trid;
3095 	if (ctx != NULL) {
3096 		memcpy(path_id->hostid.hostaddr, ctx->opts.src_addr, sizeof(path_id->hostid.hostaddr));
3097 		memcpy(path_id->hostid.hostsvcid, ctx->opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
3098 	}
3099 	nvme_ctrlr->active_path_id = path_id;
3100 	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);
3101 
3102 	nvme_ctrlr->thread = spdk_get_thread();
3103 	nvme_ctrlr->ctrlr = ctrlr;
3104 	nvme_ctrlr->ref = 1;
3105 
3106 	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
3107 		SPDK_ERRLOG("OCSSDs are not supported\n");
3108 		rc = -ENOTSUP;
3109 		goto err;
3110 	}
3111 
3112 	nvme_ctrlr->prchk_flags = prchk_flags;
3113 
3114 	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
3115 					  g_opts.nvme_adminq_poll_period_us);
3116 
3117 	if (g_opts.timeout_us > 0) {
3118 		/* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
3119 		/* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
3120 		uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
3121 					  g_opts.timeout_us : g_opts.timeout_admin_us;
3122 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
3123 				adm_timeout_us, timeout_cb, nvme_ctrlr);
3124 	}
3125 
3126 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
3127 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
3128 
3129 	if (spdk_nvme_ctrlr_get_flags(ctrlr) &
3130 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
3131 		nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
3132 	}
3133 
3134 	rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr);
3135 	if (rc != 0) {
3136 		goto err;
3137 	}
3138 
3139 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3140 
3141 	if (cdata->cmic.ana_reporting) {
3142 		rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
3143 		if (rc == 0) {
3144 			return 0;
3145 		}
3146 	} else {
3147 		nvme_ctrlr_create_done(nvme_ctrlr, ctx);
3148 		return 0;
3149 	}
3150 
3151 err:
3152 	nvme_ctrlr_delete(nvme_ctrlr);
3153 	return rc;
3154 }
3155 
3156 static void
3157 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3158 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
3159 {
3160 	struct nvme_probe_ctx *ctx = cb_ctx;
3161 	char *name = NULL;
3162 	uint32_t prchk_flags = 0;
3163 	size_t i;
3164 
3165 	if (ctx) {
3166 		for (i = 0; i < ctx->count; i++) {
3167 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
3168 				prchk_flags = ctx->prchk_flags[i];
3169 				name = strdup(ctx->names[i]);
3170 				break;
3171 			}
3172 		}
3173 	} else {
3174 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
3175 	}
3176 	if (!name) {
3177 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
3178 		return;
3179 	}
3180 
3181 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
3182 
3183 	nvme_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL);
3184 
3185 	free(name);
3186 }
3187 
3188 static void
3189 _nvme_ctrlr_destruct(void *ctx)
3190 {
3191 	struct nvme_ctrlr *nvme_ctrlr = ctx;
3192 
3193 	nvme_ctrlr_depopulate_namespaces(nvme_ctrlr);
3194 	nvme_ctrlr_release(nvme_ctrlr);
3195 }
3196 
3197 static int
3198 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
3199 {
3200 	struct nvme_probe_skip_entry *entry;
3201 
3202 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3203 
3204 	/* The controller's destruction was already started */
3205 	if (nvme_ctrlr->destruct) {
3206 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
3207 		return 0;
3208 	}
3209 
3210 	if (!hotplug &&
3211 	    nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
3212 		entry = calloc(1, sizeof(*entry));
3213 		if (!entry) {
3214 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
3215 			return -ENOMEM;
3216 		}
3217 		entry->trid = nvme_ctrlr->active_path_id->trid;
3218 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
3219 	}
3220 
3221 	nvme_ctrlr->destruct = true;
3222 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3223 
3224 	_nvme_ctrlr_destruct(nvme_ctrlr);
3225 
3226 	return 0;
3227 }
3228 
3229 static void
3230 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
3231 {
3232 	struct nvme_ctrlr *nvme_ctrlr = cb_ctx;
3233 
3234 	_bdev_nvme_delete(nvme_ctrlr, true);
3235 }
3236 
3237 static int
3238 bdev_nvme_hotplug_probe(void *arg)
3239 {
3240 	if (g_hotplug_probe_ctx == NULL) {
3241 		spdk_poller_unregister(&g_hotplug_probe_poller);
3242 		return SPDK_POLLER_IDLE;
3243 	}
3244 
3245 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
3246 		g_hotplug_probe_ctx = NULL;
3247 		spdk_poller_unregister(&g_hotplug_probe_poller);
3248 	}
3249 
3250 	return SPDK_POLLER_BUSY;
3251 }
3252 
3253 static int
3254 bdev_nvme_hotplug(void *arg)
3255 {
3256 	struct spdk_nvme_transport_id trid_pcie;
3257 
3258 	if (g_hotplug_probe_ctx) {
3259 		return SPDK_POLLER_BUSY;
3260 	}
3261 
3262 	memset(&trid_pcie, 0, sizeof(trid_pcie));
3263 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
3264 
3265 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
3266 			      hotplug_probe_cb, attach_cb, NULL);
3267 
3268 	if (g_hotplug_probe_ctx) {
3269 		assert(g_hotplug_probe_poller == NULL);
3270 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
3271 	}
3272 
3273 	return SPDK_POLLER_BUSY;
3274 }
3275 
3276 void
3277 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
3278 {
3279 	*opts = g_opts;
3280 }
3281 
3282 static int
3283 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
3284 {
3285 	if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
3286 		/* Can't set timeout_admin_us without also setting timeout_us */
3287 		SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
3288 		return -EINVAL;
3289 	}
3290 
3291 	if (opts->bdev_retry_count < -1) {
3292 		SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n");
3293 		return -EINVAL;
3294 	}
3295 
3296 	return 0;
3297 }
3298 
3299 int
3300 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
3301 {
3302 	int ret = bdev_nvme_validate_opts(opts);
3303 	if (ret) {
3304 		SPDK_WARNLOG("Failed to set nvme opts.\n");
3305 		return ret;
3306 	}
3307 
3308 	if (g_bdev_nvme_init_thread != NULL) {
3309 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
3310 			return -EPERM;
3311 		}
3312 	}
3313 
3314 	g_opts = *opts;
3315 
3316 	return 0;
3317 }
3318 
3319 struct set_nvme_hotplug_ctx {
3320 	uint64_t period_us;
3321 	bool enabled;
3322 	spdk_msg_fn fn;
3323 	void *fn_ctx;
3324 };
3325 
3326 static void
3327 set_nvme_hotplug_period_cb(void *_ctx)
3328 {
3329 	struct set_nvme_hotplug_ctx *ctx = _ctx;
3330 
3331 	spdk_poller_unregister(&g_hotplug_poller);
3332 	if (ctx->enabled) {
3333 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
3334 	}
3335 
3336 	g_nvme_hotplug_poll_period_us = ctx->period_us;
3337 	g_nvme_hotplug_enabled = ctx->enabled;
3338 	if (ctx->fn) {
3339 		ctx->fn(ctx->fn_ctx);
3340 	}
3341 
3342 	free(ctx);
3343 }
3344 
3345 int
3346 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
3347 {
3348 	struct set_nvme_hotplug_ctx *ctx;
3349 
3350 	if (enabled == true && !spdk_process_is_primary()) {
3351 		return -EPERM;
3352 	}
3353 
3354 	ctx = calloc(1, sizeof(*ctx));
3355 	if (ctx == NULL) {
3356 		return -ENOMEM;
3357 	}
3358 
3359 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
3360 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
3361 	ctx->enabled = enabled;
3362 	ctx->fn = cb;
3363 	ctx->fn_ctx = cb_ctx;
3364 
3365 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
3366 	return 0;
3367 }
3368 
3369 static void
3370 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
3371 				    struct nvme_async_probe_ctx *ctx)
3372 {
3373 	struct nvme_ns	*nvme_ns;
3374 	struct nvme_bdev	*nvme_bdev;
3375 	size_t			j;
3376 
3377 	assert(nvme_ctrlr != NULL);
3378 
3379 	/*
3380 	 * Report the new bdevs that were created in this call.
3381 	 * There can be more than one bdev per NVMe controller.
3382 	 */
3383 	j = 0;
3384 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
3385 	while (nvme_ns != NULL) {
3386 		nvme_bdev = nvme_ns->bdev;
3387 		if (j < ctx->count) {
3388 			ctx->names[j] = nvme_bdev->disk.name;
3389 			j++;
3390 		} else {
3391 			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
3392 				    ctx->count);
3393 			populate_namespaces_cb(ctx, 0, -ERANGE);
3394 			return;
3395 		}
3396 
3397 		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
3398 	}
3399 
3400 	populate_namespaces_cb(ctx, j, 0);
3401 }
3402 
3403 static int
3404 bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr,
3405 			struct spdk_nvme_ctrlr *new_ctrlr,
3406 			struct spdk_nvme_transport_id *trid)
3407 {
3408 	struct nvme_path_id *tmp_trid;
3409 
3410 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
3411 		SPDK_ERRLOG("PCIe failover is not supported.\n");
3412 		return -ENOTSUP;
3413 	}
3414 
3415 	/* Currently we only support failover to the same transport type. */
3416 	if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
3417 		return -EINVAL;
3418 	}
3419 
3420 	/* Currently we only support failover to the same NQN. */
3421 	if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
3422 		return -EINVAL;
3423 	}
3424 
3425 	/* Skip all the other checks if we've already registered this path. */
3426 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
3427 		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
3428 			return -EEXIST;
3429 		}
3430 	}
3431 
3432 	return 0;
3433 }
3434 
3435 static int
3436 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr,
3437 			     struct spdk_nvme_ctrlr *new_ctrlr)
3438 {
3439 	struct nvme_ns *nvme_ns;
3440 	struct spdk_nvme_ns *new_ns;
3441 
3442 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
3443 	while (nvme_ns != NULL) {
3444 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
3445 		assert(new_ns != NULL);
3446 
3447 		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
3448 			return -EINVAL;
3449 		}
3450 
3451 		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
3452 	}
3453 
3454 	return 0;
3455 }
3456 
3457 static int
3458 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
3459 			      struct spdk_nvme_transport_id *trid)
3460 {
3461 	struct nvme_path_id *new_trid, *tmp_trid;
3462 
3463 	new_trid = calloc(1, sizeof(*new_trid));
3464 	if (new_trid == NULL) {
3465 		return -ENOMEM;
3466 	}
3467 	new_trid->trid = *trid;
3468 	new_trid->is_failed = false;
3469 
3470 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
3471 		if (tmp_trid->is_failed) {
3472 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
3473 			return 0;
3474 		}
3475 	}
3476 
3477 	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
3478 	return 0;
3479 }
3480 
3481 /* This is the case that a secondary path is added to an existing
3482  * nvme_ctrlr for failover. After checking if it can access the same
3483  * namespaces as the primary path, it is disconnected until failover occurs.
3484  */
3485 static int
3486 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
3487 			     struct spdk_nvme_ctrlr *new_ctrlr,
3488 			     struct spdk_nvme_transport_id *trid)
3489 {
3490 	int rc;
3491 
3492 	assert(nvme_ctrlr != NULL);
3493 
3494 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3495 
3496 	rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid);
3497 	if (rc != 0) {
3498 		goto exit;
3499 	}
3500 
3501 	rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr);
3502 	if (rc != 0) {
3503 		goto exit;
3504 	}
3505 
3506 	rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
3507 
3508 exit:
3509 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3510 
3511 	spdk_nvme_detach(new_ctrlr);
3512 
3513 	return rc;
3514 }
3515 
3516 static void
3517 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3518 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
3519 {
3520 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
3521 	struct nvme_async_probe_ctx *ctx;
3522 	int rc;
3523 
3524 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
3525 	ctx->ctrlr_attached = true;
3526 
3527 	rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx);
3528 	if (rc != 0) {
3529 		populate_namespaces_cb(ctx, 0, rc);
3530 	}
3531 }
3532 
3533 static void
3534 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3535 			struct spdk_nvme_ctrlr *ctrlr,
3536 			const struct spdk_nvme_ctrlr_opts *opts)
3537 {
3538 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
3539 	struct nvme_ctrlr *nvme_ctrlr;
3540 	struct nvme_async_probe_ctx *ctx;
3541 	int rc;
3542 
3543 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
3544 	ctx->ctrlr_attached = true;
3545 
3546 	nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
3547 	if (nvme_ctrlr) {
3548 		rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
3549 	} else {
3550 		rc = -ENODEV;
3551 	}
3552 
3553 	populate_namespaces_cb(ctx, 0, rc);
3554 }
3555 
3556 static int
3557 bdev_nvme_async_poll(void *arg)
3558 {
3559 	struct nvme_async_probe_ctx	*ctx = arg;
3560 	int				rc;
3561 
3562 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
3563 	if (spdk_unlikely(rc != -EAGAIN)) {
3564 		ctx->probe_done = true;
3565 		spdk_poller_unregister(&ctx->poller);
3566 		if (!ctx->ctrlr_attached) {
3567 			/* The probe is done, but no controller was attached.
3568 			 * That means we had a failure, so report -EIO back to
3569 			 * the caller (usually the RPC). populate_namespaces_cb()
3570 			 * will take care of freeing the nvme_async_probe_ctx.
3571 			 */
3572 			populate_namespaces_cb(ctx, 0, -EIO);
3573 		} else if (ctx->namespaces_populated) {
3574 			/* The namespaces for the attached controller were all
3575 			 * populated and the response was already sent to the
3576 			 * caller (usually the RPC).  So free the context here.
3577 			 */
3578 			free(ctx);
3579 		}
3580 	}
3581 
3582 	return SPDK_POLLER_BUSY;
3583 }
3584 
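/*
 * Public entry point, typically invoked from the RPC layer, to attach a controller and
 * create bdevs for its namespaces. The attach is asynchronous: a nvme_async_probe_ctx is
 * allocated, spdk_nvme_connect_async() is started, and bdev_nvme_async_poll() drives it
 * to completion. If a controller with the same base name already exists and multipath is
 * not requested, the new path is added as a failover path via connect_set_failover_cb.
 */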
3585 int
3586 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
3587 		 const char *base_name,
3588 		 const char **names,
3589 		 uint32_t count,
3590 		 uint32_t prchk_flags,
3591 		 spdk_bdev_create_nvme_fn cb_fn,
3592 		 void *cb_ctx,
3593 		 struct spdk_nvme_ctrlr_opts *opts,
3594 		 bool multipath)
3595 {
3596 	struct nvme_probe_skip_entry	*entry, *tmp;
3597 	struct nvme_async_probe_ctx	*ctx;
3598 	spdk_nvme_attach_cb attach_cb;
3599 
3600 	/* TODO expand this check to include both the host and target TRIDs.
3601 	 * Only if both are the same should we fail.
3602 	 */
3603 	if (nvme_ctrlr_get(trid) != NULL) {
3604 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
3605 		return -EEXIST;
3606 	}
3607 
3608 	ctx = calloc(1, sizeof(*ctx));
3609 	if (!ctx) {
3610 		return -ENOMEM;
3611 	}
3612 	ctx->base_name = base_name;
3613 	ctx->names = names;
3614 	ctx->count = count;
3615 	ctx->cb_fn = cb_fn;
3616 	ctx->cb_ctx = cb_ctx;
3617 	ctx->prchk_flags = prchk_flags;
3618 	ctx->trid = *trid;
3619 
3620 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
3621 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
3622 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
3623 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
3624 				free(entry);
3625 				break;
3626 			}
3627 		}
3628 	}
3629 
3630 	if (opts) {
3631 		memcpy(&ctx->opts, opts, sizeof(*opts));
3632 	} else {
3633 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
3634 	}
3635 
3636 	ctx->opts.transport_retry_count = g_opts.transport_retry_count;
3637 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
3638 	ctx->opts.disable_read_ana_log_page = true;
3639 
3640 	if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) {
3641 		attach_cb = connect_attach_cb;
3642 	} else {
3643 		attach_cb = connect_set_failover_cb;
3644 	}
3645 
3646 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, attach_cb);
3647 	if (ctx->probe_ctx == NULL) {
3648 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
3649 		free(ctx);
3650 		return -ENODEV;
3651 	}
3652 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
3653 
3654 	return 0;
3655 }
3656 
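/* Delete the paths of the named NVMe bdev controller that match path_id.
 * Zeroed fields in path_id act as wildcards.  If a match is the currently
 * active path, the controller either fails over to an alternative path or,
 * if it was the only path, is deleted entirely; non-active matches are simply
 * removed from the trid list.
 */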
3657 int
3658 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id)
3659 {
3660 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
3661 	struct nvme_ctrlr	*nvme_ctrlr, *tmp_nvme_ctrlr;
3662 	struct nvme_path_id	*p, *t;
3663 	int			rc = -ENXIO;
3664 
3665 	if (name == NULL || path_id == NULL) {
3666 		return -EINVAL;
3667 	}
3668 
3669 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
3670 	if (nbdev_ctrlr == NULL) {
3671 		SPDK_ERRLOG("Failed to find NVMe bdev controller\n");
3672 		return -ENODEV;
3673 	}
3674 
3675 	TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) {
3676 		TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) {
3677 			if (path_id->trid.trtype != 0) {
3678 				if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) {
3679 					if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) {
3680 						continue;
3681 					}
3682 				} else {
3683 					if (path_id->trid.trtype != p->trid.trtype) {
3684 						continue;
3685 					}
3686 				}
3687 			}
3688 
3689 			if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) {
3690 				if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) {
3691 					continue;
3692 				}
3693 			}
3694 
3695 			if (path_id->trid.adrfam != 0) {
3696 				if (path_id->trid.adrfam != p->trid.adrfam) {
3697 					continue;
3698 				}
3699 			}
3700 
3701 			if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) {
3702 				if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) {
3703 					continue;
3704 				}
3705 			}
3706 
3707 			if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) {
3708 				if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) {
3709 					continue;
3710 				}
3711 			}
3712 
3713 			if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) {
3714 				if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) {
3715 					continue;
3716 				}
3717 			}
3718 
3719 			if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) {
3720 				if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) {
3721 					continue;
3722 				}
3723 			}
3724 
3725 			/* If we made it here, then this path is a match! Now we need to remove it. */
3726 			if (p == nvme_ctrlr->active_path_id) {
3727 				/* This is the active path in use right now. The active path is always the first in the list. */
3728 
3729 				if (!TAILQ_NEXT(p, link)) {
3730 					/* The current path is the only path. */
3731 					rc = _bdev_nvme_delete(nvme_ctrlr, false);
3732 				} else {
3733 					/* There is an alternative path. */
3734 					rc = bdev_nvme_failover(nvme_ctrlr, true);
3735 				}
3736 			} else {
3737 				/* We are not using the specified path. */
3738 				TAILQ_REMOVE(&nvme_ctrlr->trids, p, link);
3739 				free(p);
3740 				rc = 0;
3741 			}
3742 
3743 			if (rc < 0 && rc != -ENXIO) {
3744 				return rc;
3745 			}
3746 
3747 
3748 		}
3749 	}
3750 
3751 	/* Either all matching nvme_ctrlrs were deleted, or no nvme_ctrlr that had the trid was found. */
3752 	return rc;
3753 }
3754 
3755 static int
3756 bdev_nvme_library_init(void)
3757 {
3758 	g_bdev_nvme_init_thread = spdk_get_thread();
3759 
3760 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb,
3761 				bdev_nvme_destroy_poll_group_cb,
3762 				sizeof(struct nvme_poll_group),  "nvme_poll_groups");
3763 
3764 	return 0;
3765 }
3766 
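/* Module teardown.  Stop hotplug polling, drop the skipped-controller list,
 * and start destruction of every controller that is not already being
 * destructed.  g_bdev_nvme_module_finish is set, and if no controllers remain
 * the poll-group io_device is unregistered and spdk_bdev_module_fini_done()
 * is called right away.
 */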
3767 static void
3768 bdev_nvme_library_fini(void)
3769 {
3770 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
3771 	struct nvme_ctrlr *nvme_ctrlr;
3772 	struct nvme_probe_skip_entry *entry, *entry_tmp;
3773 
3774 	spdk_poller_unregister(&g_hotplug_poller);
3775 	free(g_hotplug_probe_ctx);
3776 	g_hotplug_probe_ctx = NULL;
3777 
3778 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
3779 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
3780 		free(entry);
3781 	}
3782 
3783 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3784 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
3785 		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
3786 			pthread_mutex_lock(&nvme_ctrlr->mutex);
3787 			if (nvme_ctrlr->destruct) {
3788 				/* This controller's destruction was already started
3789 				 * before the application started shutting down
3790 				 * before the application started shutting down.
3791 				pthread_mutex_unlock(&nvme_ctrlr->mutex);
3792 				continue;
3793 			}
3794 			nvme_ctrlr->destruct = true;
3795 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
3796 
3797 			spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
3798 					     nvme_ctrlr);
3799 		}
3800 	}
3801 
3802 	g_bdev_nvme_module_finish = true;
3803 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
3804 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
3805 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
3806 		spdk_bdev_module_fini_done();
3807 		return;
3808 	}
3809 
3810 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3811 }
3812 
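/* Re-run protection information verification in software after the controller
 * reported a PI error.  A DIF context is built from the bdev's format, and the
 * failing block's error type and offset are logged, or a message is logged if
 * no error can be found in the buffers.
 */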
3813 static void
3814 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
3815 {
3816 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3817 	struct spdk_bdev *bdev = bdev_io->bdev;
3818 	struct spdk_dif_ctx dif_ctx;
3819 	struct spdk_dif_error err_blk = {};
3820 	int rc;
3821 
3822 	rc = spdk_dif_ctx_init(&dif_ctx,
3823 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
3824 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
3825 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
3826 	if (rc != 0) {
3827 		SPDK_ERRLOG("Initialization of DIF context failed\n");
3828 		return;
3829 	}
3830 
3831 	if (bdev->md_interleave) {
3832 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
3833 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
3834 	} else {
3835 		struct iovec md_iov = {
3836 			.iov_base	= bdev_io->u.bdev.md_buf,
3837 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
3838 		};
3839 
3840 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
3841 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
3842 	}
3843 
3844 	if (rc != 0) {
3845 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
3846 			    err_blk.err_type, err_blk.err_offset);
3847 	} else {
3848 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
3849 	}
3850 }
3851 
3852 static void
3853 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
3854 {
3855 	struct nvme_bdev_io *bio = ref;
3856 
3857 	if (spdk_nvme_cpl_is_success(cpl)) {
3858 		/* Run PI verification for read data buffer. */
3859 		bdev_nvme_verify_pi_error(bio);
3860 	}
3861 
3862 	/* Return original completion status */
3863 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
3864 }
3865 
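/* Read completion.  If the controller reported a protection information error,
 * the original status is saved and the read is reissued without PI checking so
 * that bdev_nvme_verify_pi_error() can locate the error in software; the I/O
 * is then completed with the original status.
 */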
3866 static void
3867 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
3868 {
3869 	struct nvme_bdev_io *bio = ref;
3870 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3871 	int ret;
3872 
3873 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
3874 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
3875 			    cpl->status.sct, cpl->status.sc);
3876 
3877 		/* Save completion status to use after verifying PI error. */
3878 		bio->cpl = *cpl;
3879 
3880 		if (spdk_likely(nvme_io_path_is_available(bio->io_path))) {
3881 			/* Read without PI checking to verify PI error. */
3882 			ret = bdev_nvme_no_pi_readv(bio,
3883 						    bdev_io->u.bdev.iovs,
3884 						    bdev_io->u.bdev.iovcnt,
3885 						    bdev_io->u.bdev.md_buf,
3886 						    bdev_io->u.bdev.num_blocks,
3887 						    bdev_io->u.bdev.offset_blocks);
3888 			if (ret == 0) {
3889 				return;
3890 			}
3891 		}
3892 	}
3893 
3894 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3895 }
3896 
3897 static void
3898 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
3899 {
3900 	struct nvme_bdev_io *bio = ref;
3901 
3902 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
3903 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
3904 			    cpl->status.sct, cpl->status.sc);
3905 		/* Run PI verification for write data buffer if PI error is detected. */
3906 		bdev_nvme_verify_pi_error(bio);
3907 	}
3908 
3909 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3910 }
3911 
3912 static void
3913 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
3914 {
3915 	struct nvme_bdev_io *bio = ref;
3916 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3917 
3918 	/* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
3919 	 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
3920 	 */
3921 	bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
3922 
3923 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
3924 		SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
3925 			    cpl->status.sct, cpl->status.sc);
3926 		/* Run PI verification for zone append data buffer if PI error is detected. */
3927 		bdev_nvme_verify_pi_error(bio);
3928 	}
3929 
3930 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3931 }
3932 
3933 static void
3934 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
3935 {
3936 	struct nvme_bdev_io *bio = ref;
3937 
3938 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
3939 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
3940 			    cpl->status.sct, cpl->status.sc);
3941 		/* Run PI verification for compare data buffer if PI error is detected. */
3942 		bdev_nvme_verify_pi_error(bio);
3943 	}
3944 
3945 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3946 }
3947 
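/* Completion callback shared by both halves of a fused compare-and-write.
 * The compare completion (identified by the opcode in cdw0) is only recorded;
 * the bdev I/O is completed when the write completes, using the saved compare
 * status if the compare failed.
 */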
3948 static void
3949 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
3950 {
3951 	struct nvme_bdev_io *bio = ref;
3952 
3953 	/* Compare operation completion */
3954 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
3955 		/* Save compare result for write callback */
3956 		bio->cpl = *cpl;
3957 		return;
3958 	}
3959 
3960 	/* Write operation completion */
3961 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
3962 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
3963 		 * complete the IO with the compare operation's status.
3964 		 */
3965 		if (!spdk_nvme_cpl_is_error(cpl)) {
3966 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
3967 		}
3968 
3969 		bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
3970 	} else {
3971 		bdev_nvme_io_complete_nvme_status(bio, cpl);
3972 	}
3973 }
3974 
3975 static void
3976 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
3977 {
3978 	struct nvme_bdev_io *bio = ref;
3979 
3980 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3981 }
3982 
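/* Convert one NVMe ZNS zone descriptor into the generic bdev zone info format
 * (zone state, starting LBA, write pointer, and capacity).
 */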
3983 static int
3984 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
3985 {
3986 	switch (desc->zs) {
3987 	case SPDK_NVME_ZONE_STATE_EMPTY:
3988 		info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
3989 		break;
3990 	case SPDK_NVME_ZONE_STATE_IOPEN:
3991 		info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
3992 		break;
3993 	case SPDK_NVME_ZONE_STATE_EOPEN:
3994 		info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
3995 		break;
3996 	case SPDK_NVME_ZONE_STATE_CLOSED:
3997 		info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
3998 		break;
3999 	case SPDK_NVME_ZONE_STATE_RONLY:
4000 		info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
4001 		break;
4002 	case SPDK_NVME_ZONE_STATE_FULL:
4003 		info->state = SPDK_BDEV_ZONE_STATE_FULL;
4004 		break;
4005 	case SPDK_NVME_ZONE_STATE_OFFLINE:
4006 		info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
4007 		break;
4008 	default:
4009 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
4010 		return -EIO;
4011 	}
4012 
4013 	info->zone_id = desc->zslba;
4014 	info->write_pointer = desc->wp;
4015 	info->capacity = desc->zcap;
4016 
4017 	return 0;
4018 }
4019 
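/* Completion for a Report Zones command issued by bdev_nvme_get_zone_info().
 * On success the returned descriptors are converted into the caller's
 * spdk_bdev_zone_info array; if fewer zones than requested have been handled,
 * another report is issued starting at the next unreported zone.  The report
 * buffer is freed before the bdev I/O is completed.
 */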
4020 static void
4021 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
4022 {
4023 	struct nvme_bdev_io *bio = ref;
4024 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4025 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
4026 	uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
4027 	struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
4028 	uint64_t max_zones_per_buf, i;
4029 	uint32_t zone_report_bufsize;
4030 	struct spdk_nvme_ns *ns;
4031 	struct spdk_nvme_qpair *qpair;
4032 	int ret;
4033 
4034 	if (spdk_nvme_cpl_is_error(cpl)) {
4035 		goto out_complete_io_nvme_cpl;
4036 	}
4037 
4038 	if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
4039 		ret = -ENXIO;
4040 		goto out_complete_io_ret;
4041 	}
4042 
4043 	ns = bio->io_path->nvme_ns->ns;
4044 	qpair = bio->io_path->ctrlr_ch->qpair;
4045 
4046 	zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
4047 	max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
4048 			    sizeof(bio->zone_report_buf->descs[0]);
4049 
4050 	if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
4051 		ret = -EINVAL;
4052 		goto out_complete_io_ret;
4053 	}
4054 
4055 	if (!bio->zone_report_buf->nr_zones) {
4056 		ret = -EINVAL;
4057 		goto out_complete_io_ret;
4058 	}
4059 
4060 	for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
4061 		ret = fill_zone_from_report(&info[bio->handled_zones],
4062 					    &bio->zone_report_buf->descs[i]);
4063 		if (ret) {
4064 			goto out_complete_io_ret;
4065 		}
4066 		bio->handled_zones++;
4067 	}
4068 
4069 	if (bio->handled_zones < zones_to_copy) {
4070 		uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
4071 		uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
4072 
4073 		memset(bio->zone_report_buf, 0, zone_report_bufsize);
4074 		ret = spdk_nvme_zns_report_zones(ns, qpair,
4075 						 bio->zone_report_buf, zone_report_bufsize,
4076 						 slba, SPDK_NVME_ZRA_LIST_ALL, true,
4077 						 bdev_nvme_get_zone_info_done, bio);
4078 		if (!ret) {
4079 			return;
4080 		} else {
4081 			goto out_complete_io_ret;
4082 		}
4083 	}
4084 
4085 out_complete_io_nvme_cpl:
4086 	free(bio->zone_report_buf);
4087 	bio->zone_report_buf = NULL;
4088 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4089 	return;
4090 
4091 out_complete_io_ret:
4092 	free(bio->zone_report_buf);
4093 	bio->zone_report_buf = NULL;
4094 	bdev_nvme_io_complete(bio, ret);
4095 }
4096 
4097 static void
4098 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
4099 {
4100 	struct nvme_bdev_io *bio = ref;
4101 
4102 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4103 }
4104 
4105 static void
4106 bdev_nvme_admin_passthru_completion(void *ctx)
4107 {
4108 	struct nvme_bdev_io *bio = ctx;
4109 
4110 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
4111 }
4112 
4113 static void
4114 bdev_nvme_abort_completion(void *ctx)
4115 {
4116 	struct nvme_bdev_io *bio = ctx;
4117 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4118 
4119 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
4120 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
4121 	} else {
4122 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
4123 	}
4124 }
4125 
4126 static void
4127 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
4128 {
4129 	struct nvme_bdev_io *bio = ref;
4130 
4131 	bio->cpl = *cpl;
4132 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
4133 }
4134 
4135 static void
4136 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
4137 {
4138 	struct nvme_bdev_io *bio = ref;
4139 
4140 	bio->cpl = *cpl;
4141 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
4142 }
4143 
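/* The next two callbacks implement the reset_sgl/next_sge interface that the
 * spdk_nvme_ns_cmd_*v_with_md() submission paths below use to walk the bdev's
 * iovec array: reset_sgl converts an absolute payload offset into an
 * (iovpos, iov_offset) position, and next_sge hands the driver one contiguous
 * segment at a time.  The "fused" variants that follow do the same for the
 * write half of a compare-and-write.
 */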
4144 static void
4145 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
4146 {
4147 	struct nvme_bdev_io *bio = ref;
4148 	struct iovec *iov;
4149 
4150 	bio->iov_offset = sgl_offset;
4151 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
4152 		iov = &bio->iovs[bio->iovpos];
4153 		if (bio->iov_offset < iov->iov_len) {
4154 			break;
4155 		}
4156 
4157 		bio->iov_offset -= iov->iov_len;
4158 	}
4159 }
4160 
4161 static int
4162 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
4163 {
4164 	struct nvme_bdev_io *bio = ref;
4165 	struct iovec *iov;
4166 
4167 	assert(bio->iovpos < bio->iovcnt);
4168 
4169 	iov = &bio->iovs[bio->iovpos];
4170 
4171 	*address = iov->iov_base;
4172 	*length = iov->iov_len;
4173 
4174 	if (bio->iov_offset) {
4175 		assert(bio->iov_offset <= iov->iov_len);
4176 		*address += bio->iov_offset;
4177 		*length -= bio->iov_offset;
4178 	}
4179 
4180 	bio->iov_offset += *length;
4181 	if (bio->iov_offset == iov->iov_len) {
4182 		bio->iovpos++;
4183 		bio->iov_offset = 0;
4184 	}
4185 
4186 	return 0;
4187 }
4188 
4189 static void
4190 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
4191 {
4192 	struct nvme_bdev_io *bio = ref;
4193 	struct iovec *iov;
4194 
4195 	bio->fused_iov_offset = sgl_offset;
4196 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
4197 		iov = &bio->fused_iovs[bio->fused_iovpos];
4198 		if (bio->fused_iov_offset < iov->iov_len) {
4199 			break;
4200 		}
4201 
4202 		bio->fused_iov_offset -= iov->iov_len;
4203 	}
4204 }
4205 
4206 static int
4207 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
4208 {
4209 	struct nvme_bdev_io *bio = ref;
4210 	struct iovec *iov;
4211 
4212 	assert(bio->fused_iovpos < bio->fused_iovcnt);
4213 
4214 	iov = &bio->fused_iovs[bio->fused_iovpos];
4215 
4216 	*address = iov->iov_base;
4217 	*length = iov->iov_len;
4218 
4219 	if (bio->fused_iov_offset) {
4220 		assert(bio->fused_iov_offset <= iov->iov_len);
4221 		*address += bio->fused_iov_offset;
4222 		*length -= bio->fused_iov_offset;
4223 	}
4224 
4225 	bio->fused_iov_offset += *length;
4226 	if (bio->fused_iov_offset == iov->iov_len) {
4227 		bio->fused_iovpos++;
4228 		bio->fused_iov_offset = 0;
4229 	}
4230 
4231 	return 0;
4232 }
4233 
4234 static int
4235 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
4236 		      void *md, uint64_t lba_count, uint64_t lba)
4237 {
4238 	int rc;
4239 
4240 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
4241 		      lba_count, lba);
4242 
4243 	bio->iovs = iov;
4244 	bio->iovcnt = iovcnt;
4245 	bio->iovpos = 0;
4246 	bio->iov_offset = 0;
4247 
4248 	rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns,
4249 					    bio->io_path->ctrlr_ch->qpair,
4250 					    lba, lba_count,
4251 					    bdev_nvme_no_pi_readv_done, bio, 0,
4252 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4253 					    md, 0, 0);
4254 
4255 	if (rc != 0 && rc != -ENOMEM) {
4256 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
4257 	}
4258 	return rc;
4259 }
4260 
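/* Submit a read.  Three submission paths are used: the extended API when the
 * caller passed ext_opts (memory domains), a single contiguous buffer when
 * iovcnt == 1, and otherwise the SGL callbacks above.  bdev_nvme_writev()
 * below follows the same pattern.
 */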
4261 static int
4262 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
4263 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
4264 		struct spdk_bdev_ext_io_opts *ext_opts)
4265 {
4266 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4267 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4268 	int rc;
4269 
4270 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
4271 		      lba_count, lba);
4272 
4273 	bio->iovs = iov;
4274 	bio->iovcnt = iovcnt;
4275 	bio->iovpos = 0;
4276 	bio->iov_offset = 0;
4277 
4278 	if (ext_opts) {
4279 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
4280 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
4281 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
4282 		bio->ext_opts.io_flags = flags;
4283 		bio->ext_opts.metadata = md;
4284 
4285 		rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
4286 						bdev_nvme_readv_done, bio,
4287 						bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4288 						&bio->ext_opts);
4289 	} else if (iovcnt == 1) {
4290 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
4291 						   lba_count,
4292 						   bdev_nvme_readv_done, bio,
4293 						   flags,
4294 						   0, 0);
4295 	} else {
4296 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
4297 						    bdev_nvme_readv_done, bio, flags,
4298 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4299 						    md, 0, 0);
4300 	}
4301 
4302 	if (rc != 0 && rc != -ENOMEM) {
4303 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
4304 	}
4305 	return rc;
4306 }
4307 
4308 static int
4309 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
4310 		 void *md, uint64_t lba_count, uint64_t lba,
4311 		 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts)
4312 {
4313 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4314 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4315 	int rc;
4316 
4317 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
4318 		      lba_count, lba);
4319 
4320 	bio->iovs = iov;
4321 	bio->iovcnt = iovcnt;
4322 	bio->iovpos = 0;
4323 	bio->iov_offset = 0;
4324 
4325 	if (ext_opts) {
4326 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
4327 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
4328 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
4329 		bio->ext_opts.io_flags = flags;
4330 		bio->ext_opts.metadata = md;
4331 
4332 		rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
4333 						 bdev_nvme_writev_done, bio,
4334 						 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4335 						 &bio->ext_opts);
4336 	} else if (iovcnt == 1) {
4337 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
4338 						    lba_count,
4339 						    bdev_nvme_writev_done, bio,
4340 						    flags,
4341 						    0, 0);
4342 	} else {
4343 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
4344 						     bdev_nvme_writev_done, bio, flags,
4345 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4346 						     md, 0, 0);
4347 	}
4348 
4349 	if (rc != 0 && rc != -ENOMEM) {
4350 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
4351 	}
4352 	return rc;
4353 }
4354 
4355 static int
4356 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
4357 		       void *md, uint64_t lba_count, uint64_t zslba,
4358 		       uint32_t flags)
4359 {
4360 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4361 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4362 	int rc;
4363 
4364 	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
4365 		      lba_count, zslba);
4366 
4367 	bio->iovs = iov;
4368 	bio->iovcnt = iovcnt;
4369 	bio->iovpos = 0;
4370 	bio->iov_offset = 0;
4371 
4372 	if (iovcnt == 1) {
4373 		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
4374 						       lba_count,
4375 						       bdev_nvme_zone_appendv_done, bio,
4376 						       flags,
4377 						       0, 0);
4378 	} else {
4379 		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
4380 							bdev_nvme_zone_appendv_done, bio, flags,
4381 							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4382 							md, 0, 0);
4383 	}
4384 
4385 	if (rc != 0 && rc != -ENOMEM) {
4386 		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
4387 	}
4388 	return rc;
4389 }
4390 
4391 static int
4392 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
4393 		   void *md, uint64_t lba_count, uint64_t lba,
4394 		   uint32_t flags)
4395 {
4396 	int rc;
4397 
4398 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
4399 		      lba_count, lba);
4400 
4401 	bio->iovs = iov;
4402 	bio->iovcnt = iovcnt;
4403 	bio->iovpos = 0;
4404 	bio->iov_offset = 0;
4405 
4406 	rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns,
4407 					       bio->io_path->ctrlr_ch->qpair,
4408 					       lba, lba_count,
4409 					       bdev_nvme_comparev_done, bio, flags,
4410 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4411 					       md, 0, 0);
4412 
4413 	if (rc != 0 && rc != -ENOMEM) {
4414 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
4415 	}
4416 	return rc;
4417 }
4418 
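/* Submit a fused compare-and-write.  On the first submission the compare is
 * sent with SPDK_NVME_IO_FLAGS_FUSE_FIRST and its saved status is cleared; the
 * write is then sent with SPDK_NVME_IO_FLAGS_FUSE_SECOND using the fused SGL
 * callbacks.  On a retry after -ENOMEM the compare is not resubmitted.
 */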
4419 static int
4420 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
4421 			      struct iovec *write_iov, int write_iovcnt,
4422 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
4423 {
4424 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4425 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4426 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4427 	int rc;
4428 
4429 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
4430 		      lba_count, lba);
4431 
4432 	bio->iovs = cmp_iov;
4433 	bio->iovcnt = cmp_iovcnt;
4434 	bio->iovpos = 0;
4435 	bio->iov_offset = 0;
4436 	bio->fused_iovs = write_iov;
4437 	bio->fused_iovcnt = write_iovcnt;
4438 	bio->fused_iovpos = 0;
4439 	bio->fused_iov_offset = 0;
4440 
4441 	if (bdev_io->num_retries == 0) {
4442 		bio->first_fused_submitted = false;
4443 	}
4444 
4445 	if (!bio->first_fused_submitted) {
4446 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
4447 		memset(&bio->cpl, 0, sizeof(bio->cpl));
4448 
4449 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
4450 						       bdev_nvme_comparev_and_writev_done, bio, flags,
4451 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
4452 		if (rc == 0) {
4453 			bio->first_fused_submitted = true;
4454 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
4455 		} else {
4456 			if (rc != -ENOMEM) {
4457 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
4458 			}
4459 			return rc;
4460 		}
4461 	}
4462 
4463 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
4464 
4465 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
4466 					     bdev_nvme_comparev_and_writev_done, bio, flags,
4467 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
4468 	if (rc != 0 && rc != -ENOMEM) {
4469 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
4470 		rc = 0;
4471 	}
4472 
4473 	return rc;
4474 }
4475 
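/* Translate an unmap request into NVMe Dataset Management deallocate ranges of
 * at most SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks each, rejecting
 * requests that would need more than SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES
 * ranges.  For illustration only: if the per-range limit were 100 blocks, an
 * unmap of 230 blocks starting at LBA 1000 would become the ranges
 * {1000, 100}, {1100, 100} and {1200, 30}.
 */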
4476 static int
4477 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
4478 {
4479 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
4480 	struct spdk_nvme_dsm_range *range;
4481 	uint64_t offset, remaining;
4482 	uint64_t num_ranges_u64;
4483 	uint16_t num_ranges;
4484 	int rc;
4485 
4486 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
4487 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
4488 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
4489 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
4490 		return -EINVAL;
4491 	}
4492 	num_ranges = (uint16_t)num_ranges_u64;
4493 
4494 	offset = offset_blocks;
4495 	remaining = num_blocks;
4496 	range = &dsm_ranges[0];
4497 
4498 	/* Fill max-size ranges until the remaining blocks fit into one range */
4499 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
4500 		range->attributes.raw = 0;
4501 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
4502 		range->starting_lba = offset;
4503 
4504 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
4505 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
4506 		range++;
4507 	}
4508 
4509 	/* Final range describes the remaining blocks */
4510 	range->attributes.raw = 0;
4511 	range->length = remaining;
4512 	range->starting_lba = offset;
4513 
4514 	rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
4515 			bio->io_path->ctrlr_ch->qpair,
4516 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
4517 			dsm_ranges, num_ranges,
4518 			bdev_nvme_queued_done, bio);
4519 
4520 	return rc;
4521 }
4522 
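/* NVMe Write Zeroes carries a zero-based 16-bit block count, so a single
 * command covers at most UINT16_MAX + 1 (65536) blocks; larger requests are
 * rejected here rather than split.
 */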
4523 static int
4524 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
4525 {
4526 	if (num_blocks > UINT16_MAX + 1) {
4527 		SPDK_ERRLOG("NVMe write zeroes is limited to a 16-bit block count\n");
4528 		return -EINVAL;
4529 	}
4530 
4531 	return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
4532 					     bio->io_path->ctrlr_ch->qpair,
4533 					     offset_blocks, num_blocks,
4534 					     bdev_nvme_queued_done, bio,
4535 					     0);
4536 }
4537 
4538 static int
4539 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
4540 			struct spdk_bdev_zone_info *info)
4541 {
4542 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4543 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4544 	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
4545 	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
4546 	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
4547 
4548 	if (zone_id % zone_size != 0) {
4549 		return -EINVAL;
4550 	}
4551 
4552 	if (num_zones > total_zones || !num_zones) {
4553 		return -EINVAL;
4554 	}
4555 
4556 	assert(!bio->zone_report_buf);
4557 	bio->zone_report_buf = calloc(1, zone_report_bufsize);
4558 	if (!bio->zone_report_buf) {
4559 		return -ENOMEM;
4560 	}
4561 
4562 	bio->handled_zones = 0;
4563 
4564 	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
4565 					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
4566 					  bdev_nvme_get_zone_info_done, bio);
4567 }
4568 
4569 static int
4570 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
4571 			  enum spdk_bdev_zone_action action)
4572 {
4573 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4574 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4575 
4576 	switch (action) {
4577 	case SPDK_BDEV_ZONE_CLOSE:
4578 		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
4579 						bdev_nvme_zone_management_done, bio);
4580 	case SPDK_BDEV_ZONE_FINISH:
4581 		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
4582 						 bdev_nvme_zone_management_done, bio);
4583 	case SPDK_BDEV_ZONE_OPEN:
4584 		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
4585 					       bdev_nvme_zone_management_done, bio);
4586 	case SPDK_BDEV_ZONE_RESET:
4587 		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
4588 						bdev_nvme_zone_management_done, bio);
4589 	case SPDK_BDEV_ZONE_OFFLINE:
4590 		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
4591 						  bdev_nvme_zone_management_done, bio);
4592 	default:
4593 		return -EINVAL;
4594 	}
4595 }
4596 
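/* Submit an admin passthru command on the first controller in this channel's
 * io_path list that is not in the failed state, after checking the transfer
 * size against the controller's maximum.  The completion is forwarded back to
 * the submitting thread before the bdev I/O is completed.
 */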
4597 static int
4598 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
4599 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
4600 {
4601 	struct nvme_io_path *io_path;
4602 	struct nvme_ctrlr *nvme_ctrlr;
4603 	struct spdk_nvme_ctrlr *ctrlr = NULL;
4604 	uint32_t max_xfer_size;
4605 
4606 	/* Choose the first ctrlr that is not in the failed state. */
4607 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
4608 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
4609 		if (!spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
4610 			ctrlr = nvme_ctrlr->ctrlr;
4611 			break;
4612 		}
4613 	}
4614 
4615 	if (ctrlr == NULL) {
4616 		return -ENXIO;
4617 	}
4618 
4619 	max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr);
4620 
4621 	if (nbytes > max_xfer_size) {
4622 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
4623 		return -EINVAL;
4624 	}
4625 
4626 	bio->orig_thread = spdk_get_thread();
4627 
4628 	return spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, cmd, buf, (uint32_t)nbytes,
4629 					     bdev_nvme_admin_passthru_done, bio);
4630 }
4631 
4632 static int
4633 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
4634 		      void *buf, size_t nbytes)
4635 {
4636 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4637 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4638 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
4639 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
4640 
4641 	if (nbytes > max_xfer_size) {
4642 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
4643 		return -EINVAL;
4644 	}
4645 
4646 	/*
4647 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
4648 	 * so fill it out automatically.
4649 	 */
4650 	cmd->nsid = spdk_nvme_ns_get_id(ns);
4651 
4652 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
4653 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
4654 }
4655 
4656 static int
4657 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
4658 			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
4659 {
4660 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4661 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4662 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
4663 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
4664 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
4665 
4666 	if (nbytes > max_xfer_size) {
4667 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
4668 		return -EINVAL;
4669 	}
4670 
4671 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
4672 		SPDK_ERRLOG("invalid metadata buffer size\n");
4673 		return -EINVAL;
4674 	}
4675 
4676 	/*
4677 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
4678 	 * so fill it out automatically.
4679 	 */
4680 	cmd->nsid = spdk_nvme_ns_get_id(ns);
4681 
4682 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
4683 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
4684 }
4685 
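/* Abort a bdev I/O.  If the target I/O is still queued on this channel's retry
 * list it is simply removed and completed as aborted.  Otherwise an NVMe Abort
 * command is sent through each io_path's controller, first against the I/O
 * qpair and then, if the command is not found there, against the admin queue
 * (qpair == NULL).
 */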
4686 static void
4687 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
4688 		struct nvme_bdev_io *bio_to_abort)
4689 {
4690 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4691 	struct spdk_bdev_io *bdev_io_to_abort;
4692 	struct nvme_io_path *io_path;
4693 	struct nvme_ctrlr *nvme_ctrlr;
4694 	int rc = 0;
4695 
4696 	bio->orig_thread = spdk_get_thread();
4697 
4698 	/* Traverse the retry_io_list first. */
4699 	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
4700 		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
4701 			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
4702 			spdk_bdev_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4703 
4704 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
4705 			return;
4706 		}
4707 	}
4708 
4709 	/* Even admin commands are submitted only to nvme_ctrlrs that are on some
4710 	 * io_path, so traverse the io_path list for admin commands as well as
4711 	 * I/O commands.
4712 	 */
4713 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
4714 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
4715 
4716 		rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
4717 						   io_path->ctrlr_ch->qpair,
4718 						   bio_to_abort,
4719 						   bdev_nvme_abort_done, bio);
4720 		if (rc == -ENOENT) {
4721 			/* If no command was found in the I/O qpair, the target command may be
4722 			 * an admin command.
4723 			 */
4724 			rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
4725 							   NULL,
4726 							   bio_to_abort,
4727 							   bdev_nvme_abort_done, bio);
4728 		}
4729 
4730 		if (rc != -ENOENT) {
4731 			break;
4732 		}
4733 	}
4734 
4735 	if (rc != 0) {
4736 		/* If no command was found or there was any error, complete the abort
4737 		 * request with failure.
4738 		 */
4739 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
4740 	}
4741 }
4742 
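/* Emit the module's global options as a "bdev_nvme_set_options" RPC entry.
 * The output has roughly this shape (the values shown are placeholders, the
 * real ones come from g_opts):
 *
 *   { "method": "bdev_nvme_set_options",
 *     "params": { "action_on_timeout": "none", "timeout_us": 0, ... } }
 */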
4743 static void
4744 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
4745 {
4746 	const char	*action;
4747 
4748 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
4749 		action = "reset";
4750 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
4751 		action = "abort";
4752 	} else {
4753 		action = "none";
4754 	}
4755 
4756 	spdk_json_write_object_begin(w);
4757 
4758 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
4759 
4760 	spdk_json_write_named_object_begin(w, "params");
4761 	spdk_json_write_named_string(w, "action_on_timeout", action);
4762 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
4763 	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
4764 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
4765 	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
4766 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
4767 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
4768 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
4769 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
4770 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
4771 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
4772 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
4773 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
4774 	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
4775 	spdk_json_write_object_end(w);
4776 
4777 	spdk_json_write_object_end(w);
4778 }
4779 
4780 static void
4781 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
4782 		       struct nvme_ctrlr *nvme_ctrlr)
4783 {
4784 	struct spdk_nvme_transport_id	*trid;
4785 
4786 	trid = &nvme_ctrlr->active_path_id->trid;
4787 
4788 	spdk_json_write_object_begin(w);
4789 
4790 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
4791 
4792 	spdk_json_write_named_object_begin(w, "params");
4793 	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
4794 	nvme_bdev_dump_trid_json(trid, w);
4795 	spdk_json_write_named_bool(w, "prchk_reftag",
4796 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
4797 	spdk_json_write_named_bool(w, "prchk_guard",
4798 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
4799 
4800 	spdk_json_write_object_end(w);
4801 
4802 	spdk_json_write_object_end(w);
4803 }
4804 
4805 static void
4806 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
4807 {
4808 	spdk_json_write_object_begin(w);
4809 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
4810 
4811 	spdk_json_write_named_object_begin(w, "params");
4812 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
4813 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
4814 	spdk_json_write_object_end(w);
4815 
4816 	spdk_json_write_object_end(w);
4817 }
4818 
4819 static int
4820 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
4821 {
4822 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
4823 	struct nvme_ctrlr	*nvme_ctrlr;
4824 
4825 	bdev_nvme_opts_config_json(w);
4826 
4827 	pthread_mutex_lock(&g_bdev_nvme_mutex);
4828 
4829 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
4830 		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
4831 			nvme_ctrlr_config_json(w, nvme_ctrlr);
4832 		}
4833 	}
4834 
4835 	/* Dump this as the last parameter to give all NVMe bdevs a chance to be
4836 	 * constructed before enabling the hotplug poller.
4837 	 */
4838 	bdev_nvme_hotplug_config_json(w);
4839 
4840 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
4841 	return 0;
4842 }
4843 
4844 struct spdk_nvme_ctrlr *
4845 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
4846 {
4847 	struct nvme_bdev *nbdev;
4848 	struct nvme_ns *nvme_ns;
4849 
4850 	if (!bdev || bdev->module != &nvme_if) {
4851 		return NULL;
4852 	}
4853 
4854 	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
4855 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
4856 	assert(nvme_ns != NULL);
4857 
4858 	return nvme_ns->ctrlr->ctrlr;
4859 }
4860 
4861 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
4862