xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 80e81273e2ea32a96f12f23a7a1cbdb0fe6f70f7)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *   Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
7  *
8  *   Redistribution and use in source and binary forms, with or without
9  *   modification, are permitted provided that the following conditions
10  *   are met:
11  *
12  *     * Redistributions of source code must retain the above copyright
13  *       notice, this list of conditions and the following disclaimer.
14  *     * Redistributions in binary form must reproduce the above copyright
15  *       notice, this list of conditions and the following disclaimer in
16  *       the documentation and/or other materials provided with the
17  *       distribution.
18  *     * Neither the name of Intel Corporation nor the names of its
19  *       contributors may be used to endorse or promote products derived
20  *       from this software without specific prior written permission.
21  *
22  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 #include "spdk/stdinc.h"
36 
37 #include "bdev_nvme.h"
38 
39 #include "spdk/accel_engine.h"
40 #include "spdk/config.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/likely.h"
45 #include "spdk/nvme.h"
46 #include "spdk/nvme_ocssd.h"
47 #include "spdk/nvme_zns.h"
48 #include "spdk/opal.h"
49 #include "spdk/thread.h"
50 #include "spdk/string.h"
51 #include "spdk/util.h"
52 
53 #include "spdk/bdev_module.h"
54 #include "spdk/log.h"
55 
56 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
57 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
58 
59 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
60 
61 struct nvme_bdev_io {
62 	/** array of iovecs to transfer. */
63 	struct iovec *iovs;
64 
65 	/** Number of iovecs in iovs array. */
66 	int iovcnt;
67 
68 	/** Current iovec position. */
69 	int iovpos;
70 
71 	/** Offset in current iovec. */
72 	uint32_t iov_offset;
73 
74 	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
75 	 *  being reset in a reset I/O.
76 	 */
77 	struct nvme_io_path *io_path;
78 
79 	/** array of iovecs to transfer for the fused compare-and-write command. */
80 	struct iovec *fused_iovs;
81 
82 	/** Number of iovecs in fused_iovs array. */
83 	int fused_iovcnt;
84 
85 	/** Current iovec position. */
86 	int fused_iovpos;
87 
88 	/** Offset in current iovec. */
89 	uint32_t fused_iov_offset;
90 
91 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
92 	struct spdk_nvme_cpl cpl;
93 
94 	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
95 	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
96 
97 	/** Originating thread */
98 	struct spdk_thread *orig_thread;
99 
100 	/** Tracks whether the first of the fused commands was submitted */
101 	bool first_fused_submitted;
102 
103 	/** Temporary pointer to zone report buffer */
104 	struct spdk_nvme_zns_zone_report *zone_report_buf;
105 
106 	/** Number of zones copied to the spdk_bdev_zone_info struct so far */
107 	uint64_t handled_zones;
108 
109 	/** Expiration value in ticks to retry the current I/O. */
110 	uint64_t retry_ticks;
111 
112 	/* How many times the current I/O was retried. */
113 	int32_t retry_count;
114 };
115 
116 struct nvme_probe_skip_entry {
117 	struct spdk_nvme_transport_id		trid;
118 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
119 };
120 /* Controllers deleted by users via RPC are skipped by the hotplug monitor */
121 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
122 			g_skipped_nvme_ctrlrs);
123 
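/*
 * Module-wide defaults applied to every controller attached by this module.
 * They are normally adjusted through the bdev_nvme_set_options RPC. Note that
 * bdev_retry_count == 0 disables bdev-level retries of failed I/Os, while -1
 * means retry without limit (see bdev_nvme_io_complete_nvme_status()).
 */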
124 static struct spdk_bdev_nvme_opts g_opts = {
125 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
126 	.timeout_us = 0,
127 	.timeout_admin_us = 0,
128 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
129 	.transport_retry_count = 4,
130 	.arbitration_burst = 0,
131 	.low_priority_weight = 0,
132 	.medium_priority_weight = 0,
133 	.high_priority_weight = 0,
134 	.nvme_adminq_poll_period_us = 10000ULL,
135 	.nvme_ioq_poll_period_us = 0,
136 	.io_queue_requests = 0,
137 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
138 	.bdev_retry_count = 0,
139 };
140 
141 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
142 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
143 
144 static int g_hot_insert_nvme_controller_index = 0;
145 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
146 static bool g_nvme_hotplug_enabled = false;
147 static struct spdk_thread *g_bdev_nvme_init_thread;
148 static struct spdk_poller *g_hotplug_poller;
149 static struct spdk_poller *g_hotplug_probe_poller;
150 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
151 
152 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
153 		struct nvme_async_probe_ctx *ctx);
154 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
155 		struct nvme_async_probe_ctx *ctx);
156 static int bdev_nvme_library_init(void);
157 static void bdev_nvme_library_fini(void);
158 static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
159 				     struct spdk_bdev_io *bdev_io);
160 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
161 			   void *md, uint64_t lba_count, uint64_t lba,
162 			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
163 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
164 				 void *md, uint64_t lba_count, uint64_t lba);
165 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
166 			    void *md, uint64_t lba_count, uint64_t lba,
167 			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
168 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
169 				  void *md, uint64_t lba_count,
170 				  uint64_t zslba, uint32_t flags);
171 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
172 			      void *md, uint64_t lba_count, uint64_t lba,
173 			      uint32_t flags);
174 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
175 		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
176 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
177 		uint32_t flags);
178 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
179 				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
180 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
181 				     enum spdk_bdev_zone_action action);
182 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
183 				     struct nvme_bdev_io *bio,
184 				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
185 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
186 				 void *buf, size_t nbytes);
187 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
188 				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
189 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
190 			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
191 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
192 static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr);
193 static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
194 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
195 static void nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);
196 
197 static int
198 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
199 {
200 	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id; /* avoid unsigned subtraction overflow */
201 }
202 
203 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);
204 
205 struct spdk_nvme_qpair *
206 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
207 {
208 	struct nvme_ctrlr_channel *ctrlr_ch;
209 
210 	assert(ctrlr_io_ch != NULL);
211 
212 	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
213 
214 	return ctrlr_ch->qpair;
215 }
216 
217 static int
218 bdev_nvme_get_ctx_size(void)
219 {
220 	return sizeof(struct nvme_bdev_io);
221 }
222 
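/*
 * Bdev module descriptor. async_fini is set because module teardown finishes
 * asynchronously: spdk_bdev_module_fini_done() is only called once the last
 * controller has been detached (see _nvme_ctrlr_delete()).
 */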
223 static struct spdk_bdev_module nvme_if = {
224 	.name = "nvme",
225 	.async_fini = true,
226 	.module_init = bdev_nvme_library_init,
227 	.module_fini = bdev_nvme_library_fini,
228 	.config_json = bdev_nvme_config_json,
229 	.get_ctx_size = bdev_nvme_get_ctx_size,
230 
231 };
232 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
233 
234 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
235 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
236 bool g_bdev_nvme_module_finish;
237 
238 struct nvme_bdev_ctrlr *
239 nvme_bdev_ctrlr_get_by_name(const char *name)
240 {
241 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
242 
243 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
244 		if (strcmp(name, nbdev_ctrlr->name) == 0) {
245 			break;
246 		}
247 	}
248 
249 	return nbdev_ctrlr;
250 }
251 
252 static struct nvme_ctrlr *
253 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
254 			  const struct spdk_nvme_transport_id *trid)
255 {
256 	struct nvme_ctrlr *nvme_ctrlr;
257 
258 	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
259 		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
260 			break;
261 		}
262 	}
263 
264 	return nvme_ctrlr;
265 }
266 
267 static struct nvme_bdev *
268 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
269 {
270 	struct nvme_bdev *bdev;
271 
272 	pthread_mutex_lock(&g_bdev_nvme_mutex);
273 	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
274 		if (bdev->nsid == nsid) {
275 			break;
276 		}
277 	}
278 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
279 
280 	return bdev;
281 }
282 
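/*
 * Active namespaces of an nvme_ctrlr are kept in a red-black tree keyed by
 * namespace ID (see nvme_ns_cmp()), so nvme_ctrlr_get_ns() is an O(log n)
 * lookup and the first/next accessors below walk the tree in ascending nsid
 * order. An illustrative iteration looks like:
 *
 *   for (ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); ns != NULL;
 *        ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, ns)) {
 *           ... per-namespace work ...
 *   }
 */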
283 struct nvme_ns *
284 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
285 {
286 	struct nvme_ns ns;
287 
288 	assert(nsid > 0);
289 
290 	ns.id = nsid;
291 	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
292 }
293 
294 struct nvme_ns *
295 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
296 {
297 	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
298 }
299 
300 struct nvme_ns *
301 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
302 {
303 	if (ns == NULL) {
304 		return NULL;
305 	}
306 
307 	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
308 }
309 
310 static struct nvme_ctrlr *
311 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
312 {
313 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
314 	struct nvme_ctrlr	*nvme_ctrlr = NULL;
315 
316 	pthread_mutex_lock(&g_bdev_nvme_mutex);
317 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
318 		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
319 		if (nvme_ctrlr != NULL) {
320 			break;
321 		}
322 	}
323 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
324 
325 	return nvme_ctrlr;
326 }
327 
328 struct nvme_ctrlr *
329 nvme_ctrlr_get_by_name(const char *name)
330 {
331 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
332 	struct nvme_ctrlr *nvme_ctrlr = NULL;
333 
334 	if (name == NULL) {
335 		return NULL;
336 	}
337 
338 	pthread_mutex_lock(&g_bdev_nvme_mutex);
339 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
340 	if (nbdev_ctrlr != NULL) {
341 		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
342 	}
343 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
344 
345 	return nvme_ctrlr;
346 }
347 
348 void
349 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
350 {
351 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
352 
353 	pthread_mutex_lock(&g_bdev_nvme_mutex);
354 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
355 		fn(nbdev_ctrlr, ctx);
356 	}
357 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
358 }
359 
360 void
361 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
362 {
363 	const char *trtype_str;
364 	const char *adrfam_str;
365 
366 	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
367 	if (trtype_str) {
368 		spdk_json_write_named_string(w, "trtype", trtype_str);
369 	}
370 
371 	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
372 	if (adrfam_str) {
373 		spdk_json_write_named_string(w, "adrfam", adrfam_str);
374 	}
375 
376 	if (trid->traddr[0] != '\0') {
377 		spdk_json_write_named_string(w, "traddr", trid->traddr);
378 	}
379 
380 	if (trid->trsvcid[0] != '\0') {
381 		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
382 	}
383 
384 	if (trid->subnqn[0] != '\0') {
385 		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
386 	}
387 }
388 
389 static void
390 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
391 		       struct nvme_ctrlr *nvme_ctrlr)
392 {
393 	pthread_mutex_lock(&g_bdev_nvme_mutex);
394 
395 	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
396 	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
397 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
398 
399 		return;
400 	}
401 	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
402 
403 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
404 
405 	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));
406 
407 	free(nbdev_ctrlr->name);
408 	free(nbdev_ctrlr);
409 }
410 
411 static void
412 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
413 {
414 	struct nvme_path_id *path_id, *tmp_path;
415 	struct nvme_ns *ns, *tmp_ns;
416 
417 	free(nvme_ctrlr->copied_ana_desc);
418 	spdk_free(nvme_ctrlr->ana_log_page);
419 
420 	if (nvme_ctrlr->opal_dev) {
421 		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
422 		nvme_ctrlr->opal_dev = NULL;
423 	}
424 
425 	if (nvme_ctrlr->nbdev_ctrlr) {
426 		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
427 	}
428 
429 	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
430 		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
431 		free(ns);
432 	}
433 
434 	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
435 		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
436 		free(path_id);
437 	}
438 
439 	pthread_mutex_destroy(&nvme_ctrlr->mutex);
440 
441 	free(nvme_ctrlr);
442 
443 	pthread_mutex_lock(&g_bdev_nvme_mutex);
444 	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
445 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
446 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
447 		spdk_bdev_module_fini_done();
448 		return;
449 	}
450 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
451 }
452 
453 static int
454 nvme_detach_poller(void *arg)
455 {
456 	struct nvme_ctrlr *nvme_ctrlr = arg;
457 	int rc;
458 
459 	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
460 	if (rc != -EAGAIN) {
461 		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
462 		_nvme_ctrlr_delete(nvme_ctrlr);
463 	}
464 
465 	return SPDK_POLLER_BUSY;
466 }
467 
468 static void
469 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
470 {
471 	int rc;
472 
473 	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
474 
475 	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
476 	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
477 
478 	/* If we got here, the reset/detach poller cannot be active */
479 	assert(nvme_ctrlr->reset_detach_poller == NULL);
480 	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
481 					  nvme_ctrlr, 1000);
482 	if (nvme_ctrlr->reset_detach_poller == NULL) {
483 		SPDK_ERRLOG("Failed to register detach poller\n");
484 		goto error;
485 	}
486 
487 	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
488 	if (rc != 0) {
489 		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
490 		goto error;
491 	}
492 
493 	return;
494 error:
495 	/* We don't have a good way to handle errors here, so just do what we can and delete the
496 	 * controller without detaching the underlying NVMe device.
497 	 */
498 	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
499 	_nvme_ctrlr_delete(nvme_ctrlr);
500 }
501 
502 static void
503 nvme_ctrlr_unregister_cb(void *io_device)
504 {
505 	struct nvme_ctrlr *nvme_ctrlr = io_device;
506 
507 	nvme_ctrlr_delete(nvme_ctrlr);
508 }
509 
510 static void
511 nvme_ctrlr_unregister(struct nvme_ctrlr *nvme_ctrlr)
512 {
513 	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
514 }
515 
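/*
 * An nvme_ctrlr may be unregistered only after destruct has been requested,
 * its reference count has dropped to zero, and neither a reset nor an ANA log
 * page update is in flight.
 */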
516 static bool
517 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
518 {
519 	if (!nvme_ctrlr->destruct) {
520 		return false;
521 	}
522 
523 	if (nvme_ctrlr->ref > 0) {
524 		return false;
525 	}
526 
527 	if (nvme_ctrlr->resetting) {
528 		return false;
529 	}
530 
531 	if (nvme_ctrlr->ana_log_page_updating) {
532 		return false;
533 	}
534 
535 	return true;
536 }
537 
538 static void
539 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
540 {
541 	pthread_mutex_lock(&nvme_ctrlr->mutex);
542 
543 	assert(nvme_ctrlr->ref > 0);
544 	nvme_ctrlr->ref--;
545 
546 	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
547 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
548 		return;
549 	}
550 
551 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
552 
553 	nvme_ctrlr_unregister(nvme_ctrlr);
554 }
555 
556 static struct nvme_io_path *
557 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
558 {
559 	struct nvme_io_path *io_path;
560 
561 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
562 		if (io_path->nvme_ns == nvme_ns) {
563 			break;
564 		}
565 	}
566 
567 	return io_path;
568 }
569 
570 static int
571 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
572 {
573 	struct nvme_io_path *io_path;
574 	struct spdk_io_channel *ch;
575 
576 	io_path = calloc(1, sizeof(*io_path));
577 	if (io_path == NULL) {
578 		SPDK_ERRLOG("Failed to alloc io_path.\n");
579 		return -ENOMEM;
580 	}
581 
582 	ch = spdk_get_io_channel(nvme_ns->ctrlr);
583 	if (ch == NULL) {
584 		free(io_path);
585 		SPDK_ERRLOG("Failed to alloc io_channel.\n");
586 		return -ENOMEM;
587 	}
588 
589 	io_path->ctrlr_ch = spdk_io_channel_get_ctx(ch);
590 	TAILQ_INSERT_TAIL(&io_path->ctrlr_ch->io_path_list, io_path, tailq);
591 
592 	io_path->nvme_ns = nvme_ns;
593 
594 	io_path->nbdev_ch = nbdev_ch;
595 	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);
596 
597 	nbdev_ch->current_io_path = NULL;
598 
599 	return 0;
600 }
601 
602 static void
603 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
604 {
605 	struct spdk_io_channel *ch;
606 
607 	nbdev_ch->current_io_path = NULL;
608 
609 	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
610 
611 	TAILQ_REMOVE(&io_path->ctrlr_ch->io_path_list, io_path, tailq);
612 	ch = spdk_io_channel_from_ctx(io_path->ctrlr_ch);
613 	spdk_put_io_channel(ch);
614 
615 	free(io_path);
616 }
617 
618 static void
619 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
620 {
621 	struct nvme_io_path *io_path, *tmp_io_path;
622 
623 	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
624 		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
625 	}
626 }
627 
628 static int
629 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
630 {
631 	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
632 	struct nvme_bdev *nbdev = io_device;
633 	struct nvme_ns *nvme_ns;
634 	int rc;
635 
636 	STAILQ_INIT(&nbdev_ch->io_path_list);
637 	TAILQ_INIT(&nbdev_ch->retry_io_list);
638 
639 	pthread_mutex_lock(&nbdev->mutex);
640 	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
641 		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
642 		if (rc != 0) {
643 			pthread_mutex_unlock(&nbdev->mutex);
644 
645 			_bdev_nvme_delete_io_paths(nbdev_ch);
646 			return rc;
647 		}
648 	}
649 	pthread_mutex_unlock(&nbdev->mutex);
650 
651 	return 0;
652 }
653 
654 static void
655 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
656 {
657 	struct spdk_bdev_io *bdev_io, *tmp_io;
658 
659 	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
660 		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
661 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
662 	}
663 
664 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
665 }
666 
667 static void
668 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
669 {
670 	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
671 
672 	bdev_nvme_abort_retry_ios(nbdev_ch);
673 	_bdev_nvme_delete_io_paths(nbdev_ch);
674 }
675 
676 static inline bool
677 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
678 {
679 	switch (io_type) {
680 	case SPDK_BDEV_IO_TYPE_RESET:
681 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
682 	case SPDK_BDEV_IO_TYPE_ABORT:
683 		return true;
684 	default:
685 		break;
686 	}
687 
688 	return false;
689 }
690 
691 static inline bool
692 nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
693 {
694 	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
695 		return false;
696 	}
697 
698 	switch (nvme_ns->ana_state) {
699 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
700 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
701 		return true;
702 	default:
703 		break;
704 	}
705 
706 	return false;
707 }
708 
709 static inline bool
710 nvme_io_path_is_connected(struct nvme_io_path *io_path)
711 {
712 	return io_path->ctrlr_ch->qpair != NULL;
713 }
714 
715 static inline bool
716 nvme_io_path_is_available(struct nvme_io_path *io_path)
717 {
718 	if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
719 		return false;
720 	}
721 
722 	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
723 		return false;
724 	}
725 
726 	return true;
727 }
728 
729 static inline bool
730 nvme_io_path_is_failed(struct nvme_io_path *io_path)
731 {
732 	struct nvme_ctrlr *nvme_ctrlr;
733 
734 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
735 
736 	if (nvme_ctrlr->destruct) {
737 		return true;
738 	}
739 
740 	if (nvme_ctrlr->fast_io_fail_timedout) {
741 		return true;
742 	}
743 
744 	if (nvme_ctrlr->resetting) {
745 		if (nvme_ctrlr->reconnect_delay_sec != 0) {
746 			return false;
747 		} else {
748 			return true;
749 		}
750 	}
751 
752 	if (nvme_ctrlr->reconnect_is_delayed) {
753 		return false;
754 	}
755 
756 	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
757 		return true;
758 	} else {
759 		return false;
760 	}
761 }
762 
763 static bool
764 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
765 {
766 	if (nvme_ctrlr->destruct) {
767 		return false;
768 	}
769 
770 	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
771 		return false;
772 	}
773 
774 	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
775 		return false;
776 	}
777 
778 	return true;
779 }
780 
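/*
 * Pick the io_path on which to submit the next I/O. An ANA-optimized path with
 * a connected qpair is preferred and gets cached in nbdev_ch->current_io_path;
 * otherwise the first connected non-optimized path found is returned without
 * being cached. Paths whose qpair is disconnected or whose namespace is in the
 * middle of an ANA state update are skipped.
 */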
781 static inline struct nvme_io_path *
782 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
783 {
784 	struct nvme_io_path *io_path, *non_optimized = NULL;
785 
786 	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
787 		return nbdev_ch->current_io_path;
788 	}
789 
790 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
791 		if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
792 			/* The device is currently resetting. */
793 			continue;
794 		}
795 
796 		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
797 			continue;
798 		}
799 
800 		switch (io_path->nvme_ns->ana_state) {
801 		case SPDK_NVME_ANA_OPTIMIZED_STATE:
802 			nbdev_ch->current_io_path = io_path;
803 			return io_path;
804 		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
805 			if (non_optimized == NULL) {
806 				non_optimized = io_path;
807 			}
808 			break;
809 		default:
810 			break;
811 		}
812 	}
813 
814 	return non_optimized;
815 }
816 
817 /* Return true if there is any io_path whose qpair is active or whose ctrlr is not
818  * failed, or false otherwise.
819  *
820  * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
821  * is likely non-accessible now but may become accessible later.
822  *
823  * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
824  * is likely resetting now and the reset may succeed. A ctrlr is marked unfailed when
825  * a reset starts and marked failed if the reset fails. Hence, if a ctrlr is unfailed,
826  * it is likely either working fine or resetting.
827  */
828 static bool
829 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
830 {
831 	struct nvme_io_path *io_path;
832 
833 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
834 		if (nvme_io_path_is_connected(io_path) ||
835 		    !nvme_io_path_is_failed(io_path)) {
836 			return true;
837 		}
838 	}
839 
840 	return false;
841 }
842 
843 static bool
844 any_ctrlr_may_become_available(struct nvme_bdev_channel *nbdev_ch)
845 {
846 	struct nvme_io_path *io_path;
847 
848 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
849 		if (!nvme_io_path_is_failed(io_path)) {
850 			return true;
851 		}
852 	}
853 
854 	return false;
855 }
856 
857 static int
858 bdev_nvme_retry_ios(void *arg)
859 {
860 	struct nvme_bdev_channel *nbdev_ch = arg;
861 	struct spdk_io_channel *ch = spdk_io_channel_from_ctx(nbdev_ch);
862 	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
863 	struct nvme_bdev_io *bio;
864 	uint64_t now, delay_us;
865 
866 	now = spdk_get_ticks();
867 
868 	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
869 		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
870 		if (bio->retry_ticks > now) {
871 			break;
872 		}
873 
874 		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
875 
876 		bdev_nvme_submit_request(ch, bdev_io);
877 	}
878 
879 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
880 
881 	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
882 	if (bdev_io != NULL) {
883 		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
884 
885 		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
886 
887 		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
888 					    delay_us);
889 	}
890 
891 	return SPDK_POLLER_BUSY;
892 }
893 
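/*
 * Queue bio for a delayed retry. The retry list is kept sorted by ascending
 * retry_ticks so that bdev_nvme_retry_ios() can stop scanning at the first
 * I/O whose deadline has not expired yet. If the new I/O becomes the head of
 * the list, the retry poller is re-armed with the new delay.
 */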
894 static void
895 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
896 			 struct nvme_bdev_io *bio, uint64_t delay_ms)
897 {
898 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
899 	struct spdk_bdev_io *tmp_bdev_io;
900 	struct nvme_bdev_io *tmp_bio;
901 
902 	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;
903 
904 	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
905 		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;
906 
907 		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
908 			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
909 					   module_link);
910 			return;
911 		}
912 	}
913 
914 	/* No earlier I/Os were found. This I/O must be the new head. */
915 	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);
916 
917 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
918 
919 	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
920 				    delay_ms * 1000ULL);
921 }
922 
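/*
 * Complete an I/O based on its NVMe completion status, retrying it when that
 * makes sense. The I/O completes immediately if the status has DNR set, the
 * retry budget (g_opts.bdev_retry_count, -1 meaning unlimited) is exhausted,
 * or it was aborted by request. Path-related failures (path error, aborted SQ
 * deletion, unavailable path or ctrlr) invalidate the cached io_path, trigger
 * an ANA log page re-read for ANA errors, and are retried without delay; other
 * retryable failures honor the controller's Command Retry Delay (crdt values
 * are in units of 100 ms). The retry is queued only if some io_path may still
 * become available; otherwise the NVMe status is propagated to the bdev layer.
 */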
923 static inline void
924 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
925 				  const struct spdk_nvme_cpl *cpl)
926 {
927 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
928 	struct nvme_bdev_channel *nbdev_ch;
929 	struct nvme_ctrlr *nvme_ctrlr;
930 	const struct spdk_nvme_ctrlr_data *cdata;
931 	uint64_t delay_ms;
932 
933 	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
934 
935 	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
936 		goto complete;
937 	}
938 
939 	if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 &&
940 				     bio->retry_count >= g_opts.bdev_retry_count)) {
941 		goto complete;
942 	}
943 
944 	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
945 
946 	assert(bio->io_path != NULL);
947 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(bio->io_path->ctrlr_ch);
948 
949 	if (spdk_nvme_cpl_is_path_error(cpl) ||
950 	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
951 	    !nvme_io_path_is_available(bio->io_path) ||
952 	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
953 		nbdev_ch->current_io_path = NULL;
954 		if (spdk_nvme_cpl_is_ana_error(cpl)) {
955 			bio->io_path->nvme_ns->ana_state_updating = true;
956 			nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
957 		}
958 		delay_ms = 0;
959 	} else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) {
960 		goto complete;
961 	} else {
962 		bio->retry_count++;
963 
964 		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
965 
966 		if (cpl->status.crd != 0) {
967 			delay_ms = cdata->crdt[cpl->status.crd] * 100;
968 		} else {
969 			delay_ms = 0;
970 		}
971 	}
972 
973 	if (any_io_path_may_become_available(nbdev_ch)) {
974 		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
975 		return;
976 	}
977 
978 complete:
979 	bio->retry_count = 0;
980 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
981 }
982 
983 static inline void
984 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
985 {
986 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
987 	struct nvme_bdev_channel *nbdev_ch;
988 	enum spdk_bdev_io_status io_status;
989 
990 	switch (rc) {
991 	case 0:
992 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
993 		break;
994 	case -ENOMEM:
995 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
996 		break;
997 	case -ENXIO:
998 		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
999 
1000 		nbdev_ch->current_io_path = NULL;
1001 
1002 		if (any_io_path_may_become_available(nbdev_ch)) {
1003 			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
1004 			return;
1005 		}
1006 
1007 	/* fallthrough */
1008 	default:
1009 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
1010 		break;
1011 	}
1012 
1013 	bio->retry_count = 0;
1014 	spdk_bdev_io_complete(bdev_io, io_status);
1015 }
1016 
1017 static inline void
1018 bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc)
1019 {
1020 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1021 	struct nvme_bdev_channel *nbdev_ch;
1022 	enum spdk_bdev_io_status io_status;
1023 
1024 	switch (rc) {
1025 	case 0:
1026 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1027 		break;
1028 	case -ENOMEM:
1029 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1030 		break;
1031 	case -ENXIO:
1032 		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1033 
1034 		if (any_ctrlr_may_become_available(nbdev_ch)) {
1035 			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
1036 			return;
1037 		}
1038 
1039 	/* fallthrough */
1040 	default:
1041 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
1042 		break;
1043 	}
1044 
1045 	bio->retry_count = 0;
1046 	spdk_bdev_io_complete(bdev_io, io_status);
1047 }
1048 
1049 static void
1050 _bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel *ctrlr_ch)
1051 {
1052 	struct nvme_io_path *io_path;
1053 
1054 	TAILQ_FOREACH(io_path, &ctrlr_ch->io_path_list, tailq) {
1055 		io_path->nbdev_ch->current_io_path = NULL;
1056 	}
1057 }
1058 
1059 static struct nvme_ctrlr_channel *
1060 nvme_poll_group_get_ctrlr_channel(struct nvme_poll_group *group,
1061 				  struct spdk_nvme_qpair *qpair)
1062 {
1063 	struct nvme_ctrlr_channel *ctrlr_ch;
1064 
1065 	TAILQ_FOREACH(ctrlr_ch, &group->ctrlr_ch_list, tailq) {
1066 		if (ctrlr_ch->qpair == qpair) {
1067 			break;
1068 		}
1069 	}
1070 
1071 	return ctrlr_ch;
1072 }
1073 
1074 static void
1075 bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
1076 {
1077 	if (ctrlr_ch->qpair != NULL) {
1078 		spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
1079 		ctrlr_ch->qpair = NULL;
1080 	}
1081 
1082 	_bdev_nvme_clear_io_path_cache(ctrlr_ch);
1083 }
1084 
1085 static void
1086 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
1087 {
1088 	struct nvme_poll_group *group = poll_group_ctx;
1089 	struct nvme_ctrlr_channel *ctrlr_ch;
1090 	struct nvme_ctrlr *nvme_ctrlr;
1091 
1092 	SPDK_NOTICELOG("qpair %p is disconnected, free the qpair and reset controller.\n", qpair);
1093 	/*
1094 	 * Free the I/O qpair and reset the nvme_ctrlr.
1095 	 */
1096 	ctrlr_ch = nvme_poll_group_get_ctrlr_channel(group, qpair);
1097 	if (ctrlr_ch != NULL) {
1098 		bdev_nvme_destroy_qpair(ctrlr_ch);
1099 
1100 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1101 		bdev_nvme_reset(nvme_ctrlr);
1102 	}
1103 }
1104 
1105 static int
1106 bdev_nvme_poll(void *arg)
1107 {
1108 	struct nvme_poll_group *group = arg;
1109 	int64_t num_completions;
1110 
1111 	if (group->collect_spin_stat && group->start_ticks == 0) {
1112 		group->start_ticks = spdk_get_ticks();
1113 	}
1114 
1115 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
1116 			  bdev_nvme_disconnected_qpair_cb);
1117 	if (group->collect_spin_stat) {
1118 		if (num_completions > 0) {
1119 			if (group->end_ticks != 0) {
1120 				group->spin_ticks += (group->end_ticks - group->start_ticks);
1121 				group->end_ticks = 0;
1122 			}
1123 			group->start_ticks = 0;
1124 		} else {
1125 			group->end_ticks = spdk_get_ticks();
1126 		}
1127 	}
1128 
1129 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
1130 }
1131 
1132 static int
1133 bdev_nvme_poll_adminq(void *arg)
1134 {
1135 	int32_t rc;
1136 	struct nvme_ctrlr *nvme_ctrlr = arg;
1137 
1138 	assert(nvme_ctrlr != NULL);
1139 
1140 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
1141 	if (rc < 0) {
1142 		bdev_nvme_failover(nvme_ctrlr, false);
1143 	}
1144 
1145 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
1146 }
1147 
1148 static void
1149 _bdev_nvme_unregister_dev_cb(void *io_device)
1150 {
1151 	struct nvme_bdev *nvme_disk = io_device;
1152 
1153 	free(nvme_disk->disk.name);
1154 	free(nvme_disk);
1155 }
1156 
1157 static int
1158 bdev_nvme_destruct(void *ctx)
1159 {
1160 	struct nvme_bdev *nvme_disk = ctx;
1161 	struct nvme_ns *nvme_ns, *tmp_nvme_ns;
1162 
1163 	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
1164 		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
1165 
1166 		nvme_ns->bdev = NULL;
1167 
1168 		assert(nvme_ns->id > 0);
1169 
1170 		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
1171 			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1172 
1173 			nvme_ctrlr_release(nvme_ns->ctrlr);
1174 			free(nvme_ns);
1175 		} else {
1176 			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1177 		}
1178 	}
1179 
1180 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1181 	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
1182 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1183 
1184 	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);
1185 
1186 	return 0;
1187 }
1188 
1189 static int
1190 bdev_nvme_flush(struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
1191 {
1192 	bdev_nvme_io_complete(bio, 0);
1193 
1194 	return 0;
1195 }
1196 
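/*
 * Create and connect an I/O qpair for this controller channel. The qpair is
 * allocated with create_only set, added to the thread's poll group, and only
 * then connected; with async_mode set the connection is completed while the
 * qpair is polled as part of the group (see bdev_nvme_poll()).
 */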
1197 static int
1198 bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
1199 {
1200 	struct nvme_ctrlr *nvme_ctrlr;
1201 	struct spdk_nvme_io_qpair_opts opts;
1202 	struct spdk_nvme_qpair *qpair;
1203 	int rc;
1204 
1205 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1206 
1207 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1208 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
1209 	opts.create_only = true;
1210 	opts.async_mode = true;
1211 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
1212 	g_opts.io_queue_requests = opts.io_queue_requests;
1213 
1214 	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1215 	if (qpair == NULL) {
1216 		return -1;
1217 	}
1218 
1219 	assert(ctrlr_ch->group != NULL);
1220 
1221 	rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair);
1222 	if (rc != 0) {
1223 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
1224 		goto err;
1225 	}
1226 
1227 	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
1228 	if (rc != 0) {
1229 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
1230 		goto err;
1231 	}
1232 
1233 	ctrlr_ch->qpair = qpair;
1234 
1235 	_bdev_nvme_clear_io_path_cache(ctrlr_ch);
1236 
1237 	return 0;
1238 
1239 err:
1240 	spdk_nvme_ctrlr_free_io_qpair(qpair);
1241 
1242 	return rc;
1243 }
1244 
1245 static void
1246 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
1247 {
1248 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
1249 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
1250 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
1251 	struct spdk_bdev_io *bdev_io;
1252 
1253 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
1254 		status = SPDK_BDEV_IO_STATUS_FAILED;
1255 	}
1256 
1257 	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
1258 		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
1259 		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
1260 		spdk_bdev_io_complete(bdev_io, status);
1261 	}
1262 
1263 	spdk_for_each_channel_continue(i, 0);
1264 }
1265 
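/*
 * Mark the active path as failed and, if an alternate trid is registered,
 * switch the controller over to it. Unless remove is set, the old trid is
 * moved to the tail of the list so that repeated failovers round-robin through
 * all registered paths. Alternate trids are only expected for fabrics
 * transports (see the assert below).
 */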
1266 static void
1267 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove)
1268 {
1269 	struct nvme_path_id *path_id, *next_path;
1270 	int rc __attribute__((unused));
1271 
1272 	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
1273 	assert(path_id);
1274 	assert(path_id == nvme_ctrlr->active_path_id);
1275 	next_path = TAILQ_NEXT(path_id, link);
1276 
1277 	path_id->is_failed = true;
1278 
1279 	if (next_path) {
1280 		assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
1281 
1282 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
1283 			       path_id->trid.trsvcid,	next_path->trid.traddr, next_path->trid.trsvcid);
1284 
1285 		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
1286 		nvme_ctrlr->active_path_id = next_path;
1287 		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
1288 		assert(rc == 0);
1289 		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
1290 		if (!remove) {
1291 			/** Shuffle the old trid to the end of the list and use the new one.
1292 			 * Allows for round robin through multiple connections.
1293 			 */
1294 			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
1295 		} else {
1296 			free(path_id);
1297 		}
1298 	}
1299 }
1300 
1301 static bool
1302 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
1303 {
1304 	int32_t elapsed;
1305 
1306 	if (nvme_ctrlr->ctrlr_loss_timeout_sec == 0 ||
1307 	    nvme_ctrlr->ctrlr_loss_timeout_sec == -1) {
1308 		return false;
1309 	}
1310 
1311 	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
1312 	if (elapsed >= nvme_ctrlr->ctrlr_loss_timeout_sec) {
1313 		return true;
1314 	} else {
1315 		return false;
1316 	}
1317 }
1318 
1319 static bool
1320 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
1321 {
1322 	uint32_t elapsed;
1323 
1324 	if (nvme_ctrlr->fast_io_fail_timeout_sec == 0) {
1325 		return false;
1326 	}
1327 
1328 	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
1329 	if (elapsed >= nvme_ctrlr->fast_io_fail_timeout_sec) {
1330 		return true;
1331 	} else {
1332 		return false;
1333 	}
1334 }
1335 
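/*
 * Follow-up action chosen after a reset completes (see
 * bdev_nvme_check_op_after_reset()): complete a pending destruct if the ctrlr
 * can now be unregistered; do nothing further on success or when no reconnect
 * delay is configured; destruct the ctrlr once the ctrlr loss timeout has
 * elapsed; otherwise fail over the trid and schedule a delayed reconnect,
 * additionally marking fast I/O failure once that timeout has expired.
 */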
1336 enum bdev_nvme_op_after_reset {
1337 	OP_NONE,
1338 	OP_COMPLETE_PENDING_DESTRUCT,
1339 	OP_DESTRUCT,
1340 	OP_DELAYED_RECONNECT,
1341 };
1342 
1343 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;
1344 
1345 static _bdev_nvme_op_after_reset
1346 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
1347 {
1348 	if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
1349 		/* Complete pending destruct after reset completes. */
1350 		return OP_COMPLETE_PENDING_DESTRUCT;
1351 	} else if (success || nvme_ctrlr->reconnect_delay_sec == 0) {
1352 		nvme_ctrlr->reset_start_tsc = 0;
1353 		return OP_NONE;
1354 	} else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
1355 		return OP_DESTRUCT;
1356 	} else {
1357 		if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) {
1358 			nvme_ctrlr->fast_io_fail_timedout = true;
1359 		}
1360 		bdev_nvme_failover_trid(nvme_ctrlr, false);
1361 		return OP_DELAYED_RECONNECT;
1362 	}
1363 }
1364 
1365 static int _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug);
1366 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
1367 
1368 static int
1369 bdev_nvme_reconnect_delay_timer_expired(void *ctx)
1370 {
1371 	struct nvme_ctrlr *nvme_ctrlr = ctx;
1372 
1373 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1374 
1375 	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
1376 
1377 	assert(nvme_ctrlr->reconnect_is_delayed == true);
1378 	nvme_ctrlr->reconnect_is_delayed = false;
1379 
1380 	if (nvme_ctrlr->destruct) {
1381 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1382 		return SPDK_POLLER_BUSY;
1383 	}
1384 
1385 	assert(nvme_ctrlr->resetting == false);
1386 	nvme_ctrlr->resetting = true;
1387 
1388 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1389 
1390 	spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);
1391 
1392 	bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
1393 	return SPDK_POLLER_BUSY;
1394 }
1395 
1396 static void
1397 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr)
1398 {
1399 	spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);
1400 
1401 	assert(nvme_ctrlr->reconnect_is_delayed == false);
1402 	nvme_ctrlr->reconnect_is_delayed = true;
1403 
1404 	assert(nvme_ctrlr->reconnect_delay_timer == NULL);
1405 	nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired,
1406 					    nvme_ctrlr,
1407 					    nvme_ctrlr->reconnect_delay_sec * SPDK_SEC_TO_USEC);
1408 }
1409 
1410 static void
1411 _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
1412 {
1413 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1414 	bool success = spdk_io_channel_iter_get_ctx(i) == NULL;
1415 	struct nvme_path_id *path_id;
1416 	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
1417 	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;
1418 	enum bdev_nvme_op_after_reset op_after_reset;
1419 
1420 	assert(nvme_ctrlr->thread == spdk_get_thread());
1421 
1422 	nvme_ctrlr->reset_cb_fn = NULL;
1423 	nvme_ctrlr->reset_cb_arg = NULL;
1424 
1425 	if (!success) {
1426 		SPDK_ERRLOG("Resetting controller failed.\n");
1427 	} else {
1428 		SPDK_NOTICELOG("Resetting controller successful.\n");
1429 	}
1430 
1431 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1432 	nvme_ctrlr->resetting = false;
1433 
1434 	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
1435 	assert(path_id != NULL);
1436 	assert(path_id == nvme_ctrlr->active_path_id);
1437 
1438 	path_id->is_failed = !success;
1439 
1440 	op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success);
1441 
1442 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1443 
1444 	if (reset_cb_fn) {
1445 		reset_cb_fn(reset_cb_arg, success);
1446 	}
1447 
1448 	switch (op_after_reset) {
1449 	case OP_COMPLETE_PENDING_DESTRUCT:
1450 		nvme_ctrlr_unregister(nvme_ctrlr);
1451 		break;
1452 	case OP_DESTRUCT:
1453 		_bdev_nvme_delete(nvme_ctrlr, false);
1454 		break;
1455 	case OP_DELAYED_RECONNECT:
1456 		spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
1457 		bdev_nvme_start_reconnect_delay_timer(nvme_ctrlr);
1458 		break;
1459 	default:
1460 		break;
1461 	}
1462 }
1463 
1464 static void
1465 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success)
1466 {
1467 	/* Make sure we clear any pending resets before returning. */
1468 	spdk_for_each_channel(nvme_ctrlr,
1469 			      bdev_nvme_complete_pending_resets,
1470 			      success ? NULL : (void *)0x1,
1471 			      _bdev_nvme_reset_complete);
1472 }
1473 
1474 static void
1475 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status)
1476 {
1477 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1478 
1479 	bdev_nvme_reset_complete(nvme_ctrlr, false);
1480 }
1481 
1482 static void
1483 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
1484 {
1485 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
1486 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
1487 
1488 	bdev_nvme_destroy_qpair(ctrlr_ch);
1489 
1490 	spdk_for_each_channel_continue(i, 0);
1491 }
1492 
1493 static void
1494 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
1495 {
1496 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1497 
1498 	if (status == 0) {
1499 		bdev_nvme_reset_complete(nvme_ctrlr, true);
1500 	} else {
1501 		/* Delete the added qpairs and quiesce ctrlr to make the states clean. */
1502 		spdk_for_each_channel(nvme_ctrlr,
1503 				      bdev_nvme_reset_destroy_qpair,
1504 				      NULL,
1505 				      bdev_nvme_reset_create_qpairs_failed);
1506 	}
1507 }
1508 
1509 static void
1510 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
1511 {
1512 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
1513 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
1514 	int rc;
1515 
1516 	rc = bdev_nvme_create_qpair(ctrlr_ch);
1517 
1518 	spdk_for_each_channel_continue(i, rc);
1519 }
1520 
1521 static int
1522 bdev_nvme_reconnect_ctrlr_poll(void *arg)
1523 {
1524 	struct nvme_ctrlr *nvme_ctrlr = arg;
1525 	int rc = -ETIMEDOUT;
1526 
1527 	if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
1528 		rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr);
1529 		if (rc == -EAGAIN) {
1530 			return SPDK_POLLER_BUSY;
1531 		}
1532 	}
1533 
1534 	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
1535 	if (rc == 0) {
1536 		/* Recreate all of the I/O queue pairs */
1537 		spdk_for_each_channel(nvme_ctrlr,
1538 				      bdev_nvme_reset_create_qpair,
1539 				      NULL,
1540 				      bdev_nvme_reset_create_qpairs_done);
1541 	} else {
1542 		bdev_nvme_reset_complete(nvme_ctrlr, false);
1543 	}
1544 	return SPDK_POLLER_BUSY;
1545 }
1546 
1547 static void
1548 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
1549 {
1550 	spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr);
1551 
1552 	assert(nvme_ctrlr->reset_detach_poller == NULL);
1553 	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll,
1554 					  nvme_ctrlr, 0);
1555 }
1556 
1557 static void
1558 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
1559 {
1560 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1561 	int rc __attribute__((unused));
1562 
1563 	assert(status == 0);
1564 
1565 	/* Disconnect fails only if the ctrlr is already resetting or has been removed.
1566 	 * Neither is possible here: this reset is controlled by us, and the hot-remove
1567 	 * callback is invoked when the ctrlr is hot removed.
1568 	 */
1569 	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
1570 	assert(rc == 0);
1571 
1572 	bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
1573 }
1574 
1575 static void
1576 _bdev_nvme_reset(void *ctx)
1577 {
1578 	struct nvme_ctrlr *nvme_ctrlr = ctx;
1579 
1580 	assert(nvme_ctrlr->resetting == true);
1581 	assert(nvme_ctrlr->thread == spdk_get_thread());
1582 
1583 	spdk_nvme_ctrlr_prepare_for_reset(nvme_ctrlr->ctrlr);
1584 
1585 	/* First, delete all NVMe I/O queue pairs. */
1586 	spdk_for_each_channel(nvme_ctrlr,
1587 			      bdev_nvme_reset_destroy_qpair,
1588 			      NULL,
1589 			      bdev_nvme_reset_ctrlr);
1590 }
1591 
1592 static int
1593 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
1594 {
1595 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1596 	if (nvme_ctrlr->destruct) {
1597 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1598 		return -ENXIO;
1599 	}
1600 
1601 	if (nvme_ctrlr->resetting) {
1602 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1603 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
1604 		return -EBUSY;
1605 	}
1606 
1607 	if (nvme_ctrlr->reconnect_is_delayed) {
1608 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1609 		SPDK_NOTICELOG("Reconnect is already scheduled.\n");
1610 		return -EBUSY;
1611 	}
1612 
1613 	nvme_ctrlr->resetting = true;
1614 
1615 	assert(nvme_ctrlr->reset_start_tsc == 0);
1616 	nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
1617 
1618 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1619 
1620 	spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr);
1621 	return 0;
1622 }
1623 
1624 int
1625 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
1626 {
1627 	int rc;
1628 
1629 	rc = bdev_nvme_reset(nvme_ctrlr);
1630 	if (rc == 0) {
1631 		nvme_ctrlr->reset_cb_fn = cb_fn;
1632 		nvme_ctrlr->reset_cb_arg = cb_arg;
1633 	}
1634 	return rc;
1635 }
1636 
1637 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio);
1638 
1639 static void
1640 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio)
1641 {
1642 	enum spdk_bdev_io_status io_status;
1643 
1644 	if (bio->cpl.cdw0 == 0) {
1645 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1646 	} else {
1647 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
1648 	}
1649 
1650 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
1651 }
1652 
1653 static void
1654 _bdev_nvme_reset_io_continue(void *ctx)
1655 {
1656 	struct nvme_bdev_io *bio = ctx;
1657 	struct nvme_io_path *prev_io_path, *next_io_path;
1658 	int rc;
1659 
1660 	prev_io_path = bio->io_path;
1661 	bio->io_path = NULL;
1662 
1663 	if (bio->cpl.cdw0 != 0) {
1664 		goto complete;
1665 	}
1666 
1667 	next_io_path = STAILQ_NEXT(prev_io_path, stailq);
1668 	if (next_io_path == NULL) {
1669 		goto complete;
1670 	}
1671 
1672 	rc = _bdev_nvme_reset_io(next_io_path, bio);
1673 	if (rc == 0) {
1674 		return;
1675 	}
1676 
1677 	bio->cpl.cdw0 = 1;
1678 
1679 complete:
1680 	bdev_nvme_reset_io_complete(bio);
1681 }
1682 
1683 static void
1684 bdev_nvme_reset_io_continue(void *cb_arg, bool success)
1685 {
1686 	struct nvme_bdev_io *bio = cb_arg;
1687 
1688 	bio->cpl.cdw0 = !success;
1689 
1690 	spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio);
1691 }
1692 
1693 static int
1694 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio)
1695 {
1696 	struct nvme_ctrlr_channel *ctrlr_ch = io_path->ctrlr_ch;
1697 	struct nvme_ctrlr *nvme_ctrlr;
1698 	struct spdk_bdev_io *bdev_io;
1699 	int rc;
1700 
1701 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1702 
1703 	rc = bdev_nvme_reset(nvme_ctrlr);
1704 	if (rc == 0) {
1705 		assert(bio->io_path == NULL);
1706 		bio->io_path = io_path;
1707 
1708 		assert(nvme_ctrlr->reset_cb_fn == NULL);
1709 		assert(nvme_ctrlr->reset_cb_arg == NULL);
1710 		nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue;
1711 		nvme_ctrlr->reset_cb_arg = bio;
1712 	} else if (rc == -EBUSY) {
1713 		/*
1714 		 * The reset is queued only when it comes from the app framework. This is on purpose
1715 		 * so that we don't interfere with the app framework's reset strategy, i.e. we defer to
1716 		 * the upper layer. If it is already in the middle of a reset, we won't schedule another one.
1717 		 */
1718 		bdev_io = spdk_bdev_io_from_ctx(bio);
1719 		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
1720 	} else {
1721 		return rc;
1722 	}
1723 
1724 	return 0;
1725 }
1726 
1727 static void
1728 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
1729 {
1730 	struct nvme_io_path *io_path;
1731 	int rc;
1732 
1733 	bio->cpl.cdw0 = 0;
1734 	bio->orig_thread = spdk_get_thread();
1735 
1736 	/* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now.
1737 	 *
1738 	 * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially.
1739 	 * This will be done in the following patches.
1740 	 */
1741 	io_path = STAILQ_FIRST(&nbdev_ch->io_path_list);
1742 	assert(io_path != NULL);
1743 
1744 	rc = _bdev_nvme_reset_io(io_path, bio);
1745 	if (rc != 0) {
1746 		bio->cpl.cdw0 = 1;
1747 		bdev_nvme_reset_io_complete(bio);
1748 	}
1749 }
1750 
1751 static int
1752 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
1753 {
1754 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1755 	if (nvme_ctrlr->destruct) {
1756 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1757 		/* Don't bother resetting if the controller is in the process of being destructed. */
1758 		return -ENXIO;
1759 	}
1760 
1761 	if (nvme_ctrlr->resetting) {
1762 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1763 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
1764 		return -EBUSY;
1765 	}
1766 
1767 	bdev_nvme_failover_trid(nvme_ctrlr, remove);
1768 
1769 	if (nvme_ctrlr->reconnect_is_delayed) {
1770 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1771 		SPDK_NOTICELOG("Reconnect is already scheduled.\n");
1772 
1773 		/* We rely on the next reconnect for the failover. */
1774 		return 0;
1775 	}
1776 
1777 	nvme_ctrlr->resetting = true;
1778 
1779 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1780 
1781 	spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr);
1782 	return 0;
1783 }
1784 
1785 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks,
1786 			   uint64_t num_blocks);
1787 
1788 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks,
1789 				  uint64_t num_blocks);
1790 
1791 static void
1792 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1793 		     bool success)
1794 {
1795 	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1796 	struct spdk_bdev *bdev = bdev_io->bdev;
1797 	int ret;
1798 
1799 	if (!success) {
1800 		ret = -EINVAL;
1801 		goto exit;
1802 	}
1803 
1804 	if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
1805 		ret = -ENXIO;
1806 		goto exit;
1807 	}
1808 
1809 	ret = bdev_nvme_readv(bio,
1810 			      bdev_io->u.bdev.iovs,
1811 			      bdev_io->u.bdev.iovcnt,
1812 			      bdev_io->u.bdev.md_buf,
1813 			      bdev_io->u.bdev.num_blocks,
1814 			      bdev_io->u.bdev.offset_blocks,
1815 			      bdev->dif_check_flags,
1816 			      bdev_io->internal.ext_opts);
1817 
1818 exit:
1819 	if (spdk_unlikely(ret != 0)) {
1820 		bdev_nvme_io_complete(bio, ret);
1821 	}
1822 }
1823 
1824 static void
1825 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
1826 {
1827 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
1828 	struct spdk_bdev *bdev = bdev_io->bdev;
1829 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1830 	struct nvme_bdev_io *nbdev_io_to_abort;
1831 	int rc = 0;
1832 
1833 	nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
1834 	if (spdk_unlikely(!nbdev_io->io_path)) {
1835 		if (!bdev_nvme_io_type_is_admin(bdev_io->type)) {
1836 			rc = -ENXIO;
1837 			goto exit;
1838 		}
1839 
1840 		/* Admin commands do not use the optimal I/O path.
1841 		 * Simply fall through even if it is not found.
1842 		 */
1843 	}
1844 
1845 	switch (bdev_io->type) {
1846 	case SPDK_BDEV_IO_TYPE_READ:
1847 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
1848 			rc = bdev_nvme_readv(nbdev_io,
1849 					     bdev_io->u.bdev.iovs,
1850 					     bdev_io->u.bdev.iovcnt,
1851 					     bdev_io->u.bdev.md_buf,
1852 					     bdev_io->u.bdev.num_blocks,
1853 					     bdev_io->u.bdev.offset_blocks,
1854 					     bdev->dif_check_flags,
1855 					     bdev_io->internal.ext_opts);
1856 		} else {
1857 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
1858 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
1859 			rc = 0;
1860 		}
1861 		break;
1862 	case SPDK_BDEV_IO_TYPE_WRITE:
1863 		rc = bdev_nvme_writev(nbdev_io,
1864 				      bdev_io->u.bdev.iovs,
1865 				      bdev_io->u.bdev.iovcnt,
1866 				      bdev_io->u.bdev.md_buf,
1867 				      bdev_io->u.bdev.num_blocks,
1868 				      bdev_io->u.bdev.offset_blocks,
1869 				      bdev->dif_check_flags,
1870 				      bdev_io->internal.ext_opts);
1871 		break;
1872 	case SPDK_BDEV_IO_TYPE_COMPARE:
1873 		rc = bdev_nvme_comparev(nbdev_io,
1874 					bdev_io->u.bdev.iovs,
1875 					bdev_io->u.bdev.iovcnt,
1876 					bdev_io->u.bdev.md_buf,
1877 					bdev_io->u.bdev.num_blocks,
1878 					bdev_io->u.bdev.offset_blocks,
1879 					bdev->dif_check_flags);
1880 		break;
1881 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
1882 		rc = bdev_nvme_comparev_and_writev(nbdev_io,
1883 						   bdev_io->u.bdev.iovs,
1884 						   bdev_io->u.bdev.iovcnt,
1885 						   bdev_io->u.bdev.fused_iovs,
1886 						   bdev_io->u.bdev.fused_iovcnt,
1887 						   bdev_io->u.bdev.md_buf,
1888 						   bdev_io->u.bdev.num_blocks,
1889 						   bdev_io->u.bdev.offset_blocks,
1890 						   bdev->dif_check_flags);
1891 		break;
1892 	case SPDK_BDEV_IO_TYPE_UNMAP:
1893 		rc = bdev_nvme_unmap(nbdev_io,
1894 				     bdev_io->u.bdev.offset_blocks,
1895 				     bdev_io->u.bdev.num_blocks);
1896 		break;
1897 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1898 		rc = bdev_nvme_write_zeroes(nbdev_io,
1899 					     bdev_io->u.bdev.offset_blocks,
1900 					     bdev_io->u.bdev.num_blocks);
1901 		break;
1902 	case SPDK_BDEV_IO_TYPE_RESET:
1903 		nbdev_io->io_path = NULL;
1904 		bdev_nvme_reset_io(nbdev_ch, nbdev_io);
1905 		break;
1906 	case SPDK_BDEV_IO_TYPE_FLUSH:
1907 		rc = bdev_nvme_flush(nbdev_io,
1908 				     bdev_io->u.bdev.offset_blocks,
1909 				     bdev_io->u.bdev.num_blocks);
1910 		break;
1911 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
1912 		rc = bdev_nvme_zone_appendv(nbdev_io,
1913 					    bdev_io->u.bdev.iovs,
1914 					    bdev_io->u.bdev.iovcnt,
1915 					    bdev_io->u.bdev.md_buf,
1916 					    bdev_io->u.bdev.num_blocks,
1917 					    bdev_io->u.bdev.offset_blocks,
1918 					    bdev->dif_check_flags);
1919 		break;
1920 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
1921 		rc = bdev_nvme_get_zone_info(nbdev_io,
1922 					     bdev_io->u.zone_mgmt.zone_id,
1923 					     bdev_io->u.zone_mgmt.num_zones,
1924 					     bdev_io->u.zone_mgmt.buf);
1925 		break;
1926 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
1927 		rc = bdev_nvme_zone_management(nbdev_io,
1928 					       bdev_io->u.zone_mgmt.zone_id,
1929 					       bdev_io->u.zone_mgmt.zone_action);
1930 		break;
1931 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1932 		nbdev_io->io_path = NULL;
1933 		bdev_nvme_admin_passthru(nbdev_ch,
1934 					 nbdev_io,
1935 					 &bdev_io->u.nvme_passthru.cmd,
1936 					 bdev_io->u.nvme_passthru.buf,
1937 					 bdev_io->u.nvme_passthru.nbytes);
1938 		break;
1939 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1940 		rc = bdev_nvme_io_passthru(nbdev_io,
1941 					   &bdev_io->u.nvme_passthru.cmd,
1942 					   bdev_io->u.nvme_passthru.buf,
1943 					   bdev_io->u.nvme_passthru.nbytes);
1944 		break;
1945 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1946 		rc = bdev_nvme_io_passthru_md(nbdev_io,
1947 					      &bdev_io->u.nvme_passthru.cmd,
1948 					      bdev_io->u.nvme_passthru.buf,
1949 					      bdev_io->u.nvme_passthru.nbytes,
1950 					      bdev_io->u.nvme_passthru.md_buf,
1951 					      bdev_io->u.nvme_passthru.md_len);
1952 		break;
1953 	case SPDK_BDEV_IO_TYPE_ABORT:
1954 		nbdev_io->io_path = NULL;
1955 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
1956 		bdev_nvme_abort(nbdev_ch,
1957 				nbdev_io,
1958 				nbdev_io_to_abort);
1959 		break;
1960 	default:
1961 		rc = -EINVAL;
1962 		break;
1963 	}
1964 
1965 exit:
1966 	if (spdk_unlikely(rc != 0)) {
1967 		bdev_nvme_io_complete(nbdev_io, rc);
1968 	}
1969 }
1970 
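/*
 * Reports which bdev I/O types this namespace can service. Optional features such as
 * compare, DSM/unmap, write zeroes, fused compare-and-write, and zoned commands are
 * answered from the namespace data, controller data, or controller flags of the first
 * namespace in the bdev's namespace list.
 */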
1971 static bool
1972 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
1973 {
1974 	struct nvme_bdev *nbdev = ctx;
1975 	struct nvme_ns *nvme_ns;
1976 	struct spdk_nvme_ns *ns;
1977 	struct spdk_nvme_ctrlr *ctrlr;
1978 	const struct spdk_nvme_ctrlr_data *cdata;
1979 
1980 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
1981 	assert(nvme_ns != NULL);
1982 	ns = nvme_ns->ns;
1983 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1984 
1985 	switch (io_type) {
1986 	case SPDK_BDEV_IO_TYPE_READ:
1987 	case SPDK_BDEV_IO_TYPE_WRITE:
1988 	case SPDK_BDEV_IO_TYPE_RESET:
1989 	case SPDK_BDEV_IO_TYPE_FLUSH:
1990 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1991 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1992 	case SPDK_BDEV_IO_TYPE_ABORT:
1993 		return true;
1994 
1995 	case SPDK_BDEV_IO_TYPE_COMPARE:
1996 		return spdk_nvme_ns_supports_compare(ns);
1997 
1998 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1999 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
2000 
2001 	case SPDK_BDEV_IO_TYPE_UNMAP:
2002 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2003 		return cdata->oncs.dsm;
2004 
2005 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2006 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2007 		return cdata->oncs.write_zeroes;
2008 
2009 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
2010 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
2011 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
2012 			return true;
2013 		}
2014 		return false;
2015 
2016 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
2017 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
2018 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
2019 
2020 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
2021 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
2022 		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
2023 
2024 	default:
2025 		return false;
2026 	}
2027 }
2028 
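/*
 * I/O channel create callback for a nvme_ctrlr. The new ctrlr channel is attached to
 * the per-thread poll group and an I/O qpair is created for it. A qpair creation
 * failure is tolerated while the controller is resetting, because the qpair will be
 * recreated when the reset completes; otherwise channel creation fails.
 */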
2029 static int
2030 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
2031 {
2032 	struct nvme_ctrlr *nvme_ctrlr = io_device;
2033 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
2034 	struct spdk_io_channel *pg_ch;
2035 	int rc;
2036 
2037 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
2038 	if (!pg_ch) {
2039 		return -1;
2040 	}
2041 
2042 	ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch);
2043 	TAILQ_INSERT_TAIL(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq);
2044 
2045 #ifdef SPDK_CONFIG_VTUNE
2046 	ctrlr_ch->group->collect_spin_stat = true;
2047 #else
2048 	ctrlr_ch->group->collect_spin_stat = false;
2049 #endif
2050 
2051 	TAILQ_INIT(&ctrlr_ch->pending_resets);
2052 	TAILQ_INIT(&ctrlr_ch->io_path_list);
2053 
2054 	rc = bdev_nvme_create_qpair(ctrlr_ch);
2055 	if (rc != 0) {
2056 		/* The NVMe ctrlr cannot create an I/O qpair while it is resetting. In that case the
2057 		 * ctrlr_ch->qpair pointer will be NULL and the I/O qpair will be created when the
2058 		 * reset completes. I/O submitted during the reset is queued and resubmitted later. */
2059 		if (!nvme_ctrlr->resetting) {
2060 			goto err_qpair;
2061 		}
2062 	}
2063 
2064 	return 0;
2065 
2066 err_qpair:
2067 	TAILQ_REMOVE(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq);
2068 	spdk_put_io_channel(pg_ch);
2069 
2070 	return rc;
2071 }
2072 
2073 static void
2074 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
2075 {
2076 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
2077 
2078 	assert(ctrlr_ch->group != NULL);
2079 
2080 	bdev_nvme_destroy_qpair(ctrlr_ch);
2081 
2082 	TAILQ_REMOVE(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq);
2083 
2084 	spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group));
2085 }
2086 
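/*
 * Glue between the NVMe driver and the accel framework: computes a CRC-32C over the
 * given iovec on the poll group's accel channel. If submission fails with -ENOMEM or
 * -EINVAL, the user callback is invoked here because the accel framework does not
 * call it in those cases.
 */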
2087 static void
2088 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
2089 			      uint32_t iov_cnt, uint32_t seed,
2090 			      spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
2091 {
2092 	struct nvme_poll_group *group = ctx;
2093 	int rc;
2094 
2095 	assert(group->accel_channel != NULL);
2096 	assert(cb_fn != NULL);
2097 
2098 	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
2099 	if (rc) {
2100 		/* For -ENOMEM and -EINVAL, spdk_accel_submit_crc32cv() does not call the user's cb_fn, so call it here. */
2101 		if (rc == -ENOMEM || rc == -EINVAL) {
2102 			cb_fn(cb_arg, rc);
2103 		}
2104 		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
2105 	}
2106 }
2107 
2108 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
2109 	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
2110 	.submit_accel_crc32c	= bdev_nvme_submit_accel_crc32c,
2111 };
2112 
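/*
 * Poll group create callback. It creates the underlying spdk_nvme_poll_group, gets an
 * accel engine channel for offloaded CRC-32C, and registers the bdev_nvme_poll()
 * poller with the configured I/O queue poll period. Resources acquired earlier are
 * released again if a later step fails.
 */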
2113 static int
2114 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
2115 {
2116 	struct nvme_poll_group *group = ctx_buf;
2117 
2118 	TAILQ_INIT(&group->ctrlr_ch_list);
2119 
2120 	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
2121 	if (group->group == NULL) {
2122 		return -1;
2123 	}
2124 
2125 	group->accel_channel = spdk_accel_engine_get_io_channel();
2126 	if (!group->accel_channel) {
2127 		spdk_nvme_poll_group_destroy(group->group);
2128 		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
2129 			    group);
2130 		return -1;
2131 	}
2132 
2133 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
2134 
2135 	if (group->poller == NULL) {
2136 		spdk_put_io_channel(group->accel_channel);
2137 		spdk_nvme_poll_group_destroy(group->group);
2138 		return -1;
2139 	}
2140 
2141 	return 0;
2142 }
2143 
2144 static void
2145 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
2146 {
2147 	struct nvme_poll_group *group = ctx_buf;
2148 
2149 	assert(TAILQ_EMPTY(&group->ctrlr_ch_list));
2150 
2151 	if (group->accel_channel) {
2152 		spdk_put_io_channel(group->accel_channel);
2153 	}
2154 
2155 	spdk_poller_unregister(&group->poller);
2156 	if (spdk_nvme_poll_group_destroy(group->group)) {
2157 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
2158 		assert(false);
2159 	}
2160 }
2161 
2162 static struct spdk_io_channel *
2163 bdev_nvme_get_io_channel(void *ctx)
2164 {
2165 	struct nvme_bdev *nvme_bdev = ctx;
2166 
2167 	return spdk_get_io_channel(nvme_bdev);
2168 }
2169 
2170 static void *
2171 bdev_nvme_get_module_ctx(void *ctx)
2172 {
2173 	struct nvme_bdev *nvme_bdev = ctx;
2174 	struct nvme_ns *nvme_ns;
2175 
2176 	if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) {
2177 		return NULL;
2178 	}
2179 
2180 	nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list);
2181 	if (!nvme_ns) {
2182 		return NULL;
2183 	}
2184 
2185 	return nvme_ns->ns;
2186 }
2187 
2188 static const char *
2189 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
2190 {
2191 	switch (ana_state) {
2192 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
2193 		return "optimized";
2194 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
2195 		return "non_optimized";
2196 	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
2197 		return "inaccessible";
2198 	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
2199 		return "persistent_loss";
2200 	case SPDK_NVME_ANA_CHANGE_STATE:
2201 		return "change";
2202 	default:
2203 		return NULL;
2204 	}
2205 }
2206 
2207 static int
2208 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
2209 {
2210 	struct nvme_bdev *nbdev = ctx;
2211 	struct nvme_ns *nvme_ns;
2212 
2213 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
2214 	assert(nvme_ns != NULL);
2215 
2216 	return spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, domains, array_size);
2217 }
2218 
2219 static void
2220 nvme_namespace_info_json(struct spdk_json_write_ctx *w,
2221 			 struct nvme_ns *nvme_ns)
2222 {
2223 	struct spdk_nvme_ns *ns;
2224 	struct spdk_nvme_ctrlr *ctrlr;
2225 	const struct spdk_nvme_ctrlr_data *cdata;
2226 	const struct spdk_nvme_transport_id *trid;
2227 	union spdk_nvme_vs_register vs;
2228 	char buf[128];
2229 
2230 	ns = nvme_ns->ns;
2231 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2232 
2233 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2234 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
2235 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
2236 
2237 	spdk_json_write_object_begin(w);
2238 
2239 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2240 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
2241 	}
2242 
2243 	spdk_json_write_named_object_begin(w, "trid");
2244 
2245 	nvme_bdev_dump_trid_json(trid, w);
2246 
2247 	spdk_json_write_object_end(w);
2248 
2249 #ifdef SPDK_CONFIG_NVME_CUSE
2250 	size_t cuse_name_size = 128;
2251 	char cuse_name[cuse_name_size];
2252 
2253 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
2254 					    cuse_name, &cuse_name_size);
2255 	if (rc == 0) {
2256 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
2257 	}
2258 #endif
2259 
2260 	spdk_json_write_named_object_begin(w, "ctrlr_data");
2261 
2262 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
2263 
2264 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
2265 	spdk_str_trim(buf);
2266 	spdk_json_write_named_string(w, "model_number", buf);
2267 
2268 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
2269 	spdk_str_trim(buf);
2270 	spdk_json_write_named_string(w, "serial_number", buf);
2271 
2272 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
2273 	spdk_str_trim(buf);
2274 	spdk_json_write_named_string(w, "firmware_revision", buf);
2275 
2276 	if (cdata->subnqn[0] != '\0') {
2277 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
2278 	}
2279 
2280 	spdk_json_write_named_object_begin(w, "oacs");
2281 
2282 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
2283 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
2284 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
2285 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
2286 
2287 	spdk_json_write_object_end(w);
2288 
2289 	spdk_json_write_object_end(w);
2290 
2291 	spdk_json_write_named_object_begin(w, "vs");
2292 
2293 	spdk_json_write_name(w, "nvme_version");
2294 	if (vs.bits.ter) {
2295 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
2296 	} else {
2297 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
2298 	}
2299 
2300 	spdk_json_write_object_end(w);
2301 
2302 	spdk_json_write_named_object_begin(w, "ns_data");
2303 
2304 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
2305 
2306 	if (cdata->cmic.ana_reporting) {
2307 		spdk_json_write_named_string(w, "ana_state",
2308 					     _nvme_ana_state_str(nvme_ns->ana_state));
2309 	}
2310 
2311 	spdk_json_write_object_end(w);
2312 
2313 	if (cdata->oacs.security) {
2314 		spdk_json_write_named_object_begin(w, "security");
2315 
2316 		spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal);
2317 
2318 		spdk_json_write_object_end(w);
2319 	}
2320 
2321 	spdk_json_write_object_end(w);
2322 }
2323 
2324 static int
2325 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
2326 {
2327 	struct nvme_bdev *nvme_bdev = ctx;
2328 	struct nvme_ns *nvme_ns;
2329 
2330 	pthread_mutex_lock(&nvme_bdev->mutex);
2331 	spdk_json_write_named_array_begin(w, "nvme");
2332 	TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) {
2333 		nvme_namespace_info_json(w, nvme_ns);
2334 	}
2335 	spdk_json_write_array_end(w);
2336 	pthread_mutex_unlock(&nvme_bdev->mutex);
2337 
2338 	return 0;
2339 }
2340 
2341 static void
2342 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
2343 {
2344 	/* No config per bdev needed */
2345 }
2346 
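/*
 * Aggregates and resets the spin-time counters of every poll group reachable from this
 * bdev channel's I/O paths (only populated when collect_spin_stat is enabled, i.e. in
 * VTune builds) and converts the accumulated ticks to microseconds.
 */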
2347 static uint64_t
2348 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
2349 {
2350 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
2351 	struct nvme_io_path *io_path;
2352 	struct nvme_poll_group *group;
2353 	uint64_t spin_time = 0;
2354 
2355 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
2356 		group = io_path->ctrlr_ch->group;
2357 
2358 		if (!group || !group->collect_spin_stat) {
2359 			continue;
2360 		}
2361 
2362 		if (group->end_ticks != 0) {
2363 			group->spin_ticks += (group->end_ticks - group->start_ticks);
2364 			group->end_ticks = 0;
2365 		}
2366 
2367 		spin_time += group->spin_ticks;
2368 		group->start_ticks = 0;
2369 		group->spin_ticks = 0;
2370 	}
2371 
2372 	return (spin_time * 1000000ULL) / spdk_get_ticks_hz();
2373 }
2374 
2375 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
2376 	.destruct		= bdev_nvme_destruct,
2377 	.submit_request		= bdev_nvme_submit_request,
2378 	.io_type_supported	= bdev_nvme_io_type_supported,
2379 	.get_io_channel		= bdev_nvme_get_io_channel,
2380 	.dump_info_json		= bdev_nvme_dump_info_json,
2381 	.write_config_json	= bdev_nvme_write_config_json,
2382 	.get_spin_time		= bdev_nvme_get_spin_time,
2383 	.get_module_ctx		= bdev_nvme_get_module_ctx,
2384 	.get_memory_domains	= bdev_nvme_get_memory_domains,
2385 };
2386 
2387 typedef int (*bdev_nvme_parse_ana_log_page_cb)(
2388 	const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);
2389 
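/*
 * Walks the cached ANA log page and invokes cb_fn for each group descriptor. Every
 * descriptor is first copied into the pre-allocated copied_ana_desc buffer because
 * descriptors in the log page are not guaranteed to be 8-byte aligned. Iteration
 * stops early if the callback returns non-zero, and that value is returned.
 */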
2390 static int
2391 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
2392 			     bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
2393 {
2394 	struct spdk_nvme_ana_group_descriptor *copied_desc;
2395 	uint8_t *orig_desc;
2396 	uint32_t i, desc_size, copy_len;
2397 	int rc = 0;
2398 
2399 	if (nvme_ctrlr->ana_log_page == NULL) {
2400 		return -EINVAL;
2401 	}
2402 
2403 	copied_desc = nvme_ctrlr->copied_ana_desc;
2404 
2405 	orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
2406 	copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page);
2407 
2408 	for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
2409 		memcpy(copied_desc, orig_desc, copy_len);
2410 
2411 		rc = cb_fn(copied_desc, cb_arg);
2412 		if (rc != 0) {
2413 			break;
2414 		}
2415 
2416 		desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
2417 			    copied_desc->num_of_nsid * sizeof(uint32_t);
2418 		orig_desc += desc_size;
2419 		copy_len -= desc_size;
2420 	}
2421 
2422 	return rc;
2423 }
2424 
2425 static int
2426 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
2427 {
2428 	struct nvme_ns *nvme_ns = cb_arg;
2429 	uint32_t i;
2430 
2431 	for (i = 0; i < desc->num_of_nsid; i++) {
2432 		if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
2433 			continue;
2434 		}
2435 		nvme_ns->ana_group_id = desc->ana_group_id;
2436 		nvme_ns->ana_state = desc->ana_state;
2437 		return 1;
2438 	}
2439 
2440 	return 0;
2441 }
2442 
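/*
 * Fills in an spdk_bdev structure for a namespace: product name and zoned limits based
 * on the command set identifier, block geometry, write cache and write zeroes limits,
 * UUID/NGUID, atomic and physical block sizes derived from the namespace and controller
 * data, metadata/PI settings, and finally the function table and owning module.
 */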
2443 static int
2444 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
2445 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
2446 		 uint32_t prchk_flags, void *ctx)
2447 {
2448 	const struct spdk_uuid		*uuid;
2449 	const uint8_t *nguid;
2450 	const struct spdk_nvme_ctrlr_data *cdata;
2451 	const struct spdk_nvme_ns_data	*nsdata;
2452 	enum spdk_nvme_csi		csi;
2453 	uint32_t atomic_bs, phys_bs, bs;
2454 
2455 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2456 	csi = spdk_nvme_ns_get_csi(ns);
2457 
2458 	switch (csi) {
2459 	case SPDK_NVME_CSI_NVM:
2460 		disk->product_name = "NVMe disk";
2461 		break;
2462 	case SPDK_NVME_CSI_ZNS:
2463 		disk->product_name = "NVMe ZNS disk";
2464 		disk->zoned = true;
2465 		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
2466 		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
2467 					     spdk_nvme_ns_get_extended_sector_size(ns);
2468 		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
2469 		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
2470 		break;
2471 	default:
2472 		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
2473 		return -ENOTSUP;
2474 	}
2475 
2476 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
2477 	if (!disk->name) {
2478 		return -ENOMEM;
2479 	}
2480 
2481 	disk->write_cache = 0;
2482 	if (cdata->vwc.present) {
2483 		/* Enable if the Volatile Write Cache exists */
2484 		disk->write_cache = 1;
2485 	}
2486 	if (cdata->oncs.write_zeroes) {
2487 		disk->max_write_zeroes = UINT16_MAX + 1;
2488 	}
2489 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
2490 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
2491 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
2492 
2493 	nguid = spdk_nvme_ns_get_nguid(ns);
2494 	if (!nguid) {
2495 		uuid = spdk_nvme_ns_get_uuid(ns);
2496 		if (uuid) {
2497 			disk->uuid = *uuid;
2498 		}
2499 	} else {
2500 		memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
2501 	}
2502 
2503 	nsdata = spdk_nvme_ns_get_data(ns);
2504 	bs = spdk_nvme_ns_get_sector_size(ns);
2505 	atomic_bs = bs;
2506 	phys_bs = bs;
2507 	if (nsdata->nabo == 0) {
2508 		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
2509 			atomic_bs = bs * (1 + nsdata->nawupf);
2510 		} else {
2511 			atomic_bs = bs * (1 + cdata->awupf);
2512 		}
2513 	}
2514 	if (nsdata->nsfeat.optperf) {
2515 		phys_bs = bs * (1 + nsdata->npwg);
2516 	}
2517 	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
2518 
2519 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
2520 	if (disk->md_len != 0) {
2521 		disk->md_interleave = nsdata->flbas.extended;
2522 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
2523 		if (disk->dif_type != SPDK_DIF_DISABLE) {
2524 			disk->dif_is_head_of_md = nsdata->dps.md_start;
2525 			disk->dif_check_flags = prchk_flags;
2526 		}
2527 	}
2528 
2529 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
2530 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
2531 		disk->acwu = 0;
2532 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
2533 		disk->acwu = nsdata->nacwu;
2534 	} else {
2535 		disk->acwu = cdata->acwu;
2536 	}
2537 
2538 	disk->ctxt = ctx;
2539 	disk->fn_table = &nvmelib_fn_table;
2540 	disk->module = &nvme_if;
2541 
2542 	return 0;
2543 }
2544 
2545 static int
2546 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2547 {
2548 	struct nvme_bdev *bdev;
2549 	int rc;
2550 
2551 	bdev = calloc(1, sizeof(*bdev));
2552 	if (!bdev) {
2553 		SPDK_ERRLOG("bdev calloc() failed\n");
2554 		return -ENOMEM;
2555 	}
2556 
2557 	rc = pthread_mutex_init(&bdev->mutex, NULL);
2558 	if (rc != 0) {
2559 		free(bdev);
2560 		return rc;
2561 	}
2562 
2563 	bdev->ref = 1;
2564 	TAILQ_INIT(&bdev->nvme_ns_list);
2565 	TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
2566 	bdev->opal = nvme_ctrlr->opal_dev != NULL;
2567 
2568 	rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr,
2569 			      nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev);
2570 	if (rc != 0) {
2571 		SPDK_ERRLOG("Failed to create NVMe disk\n");
2572 		pthread_mutex_destroy(&bdev->mutex);
2573 		free(bdev);
2574 		return rc;
2575 	}
2576 
2577 	spdk_io_device_register(bdev,
2578 				bdev_nvme_create_bdev_channel_cb,
2579 				bdev_nvme_destroy_bdev_channel_cb,
2580 				sizeof(struct nvme_bdev_channel),
2581 				bdev->disk.name);
2582 
2583 	rc = spdk_bdev_register(&bdev->disk);
2584 	if (rc != 0) {
2585 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
2586 		spdk_io_device_unregister(bdev, NULL);
2587 		pthread_mutex_destroy(&bdev->mutex);
2588 		free(bdev->disk.name);
2589 		free(bdev);
2590 		return rc;
2591 	}
2592 
2593 	nvme_ns->bdev = bdev;
2594 	bdev->nsid = nvme_ns->id;
2595 
2596 	bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr;
2597 	TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq);
2598 
2599 	return 0;
2600 }
2601 
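/*
 * Two namespaces are treated as the same (shareable) namespace only if their NGUID,
 * EUI-64, and UUID all match; a missing UUID on either side makes the comparison fail.
 */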
2602 static bool
2603 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
2604 {
2605 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
2606 	const struct spdk_uuid *uuid1, *uuid2;
2607 
2608 	nsdata1 = spdk_nvme_ns_get_data(ns1);
2609 	nsdata2 = spdk_nvme_ns_get_data(ns2);
2610 	uuid1 = spdk_nvme_ns_get_uuid(ns1);
2611 	uuid2 = spdk_nvme_ns_get_uuid(ns2);
2612 
2613 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
2614 	       nsdata1->eui64 == nsdata2->eui64 &&
2615 	       uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0;
2616 }
2617 
2618 static bool
2619 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2620 		 struct spdk_nvme_ctrlr_opts *opts)
2621 {
2622 	struct nvme_probe_skip_entry *entry;
2623 
2624 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
2625 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2626 			return false;
2627 		}
2628 	}
2629 
2630 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
2631 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
2632 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
2633 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
2634 	opts->disable_read_ana_log_page = true;
2635 
2636 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
2637 
2638 	return true;
2639 }
2640 
2641 static void
2642 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
2643 {
2644 	struct nvme_ctrlr *nvme_ctrlr = ctx;
2645 
2646 	if (spdk_nvme_cpl_is_error(cpl)) {
2647 		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
2648 			     cpl->status.sct);
2649 		bdev_nvme_reset(nvme_ctrlr);
2650 	} else if (cpl->cdw0 & 0x1) {
2651 		SPDK_WARNLOG("Specified command could not be aborted.\n");
2652 		bdev_nvme_reset(nvme_ctrlr);
2653 	}
2654 }
2655 
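/*
 * Timeout callback registered with the NVMe driver. For PCIe controllers and I/O queue
 * timeouts it first checks CSTS.CFS and resets on a fatal status. Otherwise the
 * configured action is applied: try to abort the timed-out command (falling back to a
 * reset if the abort cannot be sent), reset the controller, or do nothing.
 */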
2656 static void
2657 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
2658 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
2659 {
2660 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
2661 	union spdk_nvme_csts_register csts;
2662 	int rc;
2663 
2664 	assert(nvme_ctrlr->ctrlr == ctrlr);
2665 
2666 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
2667 
2668 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
2669 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
2670 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
2671 	 * completion recursively.
2672 	 */
2673 	if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
2674 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
2675 		if (csts.bits.cfs) {
2676 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
2677 			bdev_nvme_reset(nvme_ctrlr);
2678 			return;
2679 		}
2680 	}
2681 
2682 	switch (g_opts.action_on_timeout) {
2683 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
2684 		if (qpair) {
2685 			/* Don't send abort to ctrlr when ctrlr is not available. */
2686 			pthread_mutex_lock(&nvme_ctrlr->mutex);
2687 			if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
2688 				pthread_mutex_unlock(&nvme_ctrlr->mutex);
2689 				SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n");
2690 				return;
2691 			}
2692 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
2693 
2694 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
2695 						       nvme_abort_cpl, nvme_ctrlr);
2696 			if (rc == 0) {
2697 				return;
2698 			}
2699 
2700 			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
2701 		}
2702 
2703 	/* FALLTHROUGH */
2704 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
2705 		bdev_nvme_reset(nvme_ctrlr);
2706 		break;
2707 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
2708 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
2709 		break;
2710 	default:
2711 		SPDK_ERRLOG("An invalid timeout action value was found.\n");
2712 		break;
2713 	}
2714 }
2715 
2716 static void
2717 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc)
2718 {
2719 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
2720 	struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx;
2721 
2722 	if (rc == 0) {
2723 		nvme_ns->probe_ctx = NULL;
2724 		pthread_mutex_lock(&nvme_ctrlr->mutex);
2725 		nvme_ctrlr->ref++;
2726 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2727 	} else {
2728 		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
2729 		free(nvme_ns);
2730 	}
2731 
2732 	if (ctx) {
2733 		ctx->populates_in_progress--;
2734 		if (ctx->populates_in_progress == 0) {
2735 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
2736 		}
2737 	}
2738 }
2739 
2740 static void
2741 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i)
2742 {
2743 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2744 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
2745 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2746 	int rc;
2747 
2748 	rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
2749 	if (rc != 0) {
2750 		SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n");
2751 	}
2752 
2753 	spdk_for_each_channel_continue(i, rc);
2754 }
2755 
2756 static void
2757 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i)
2758 {
2759 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2760 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
2761 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2762 	struct nvme_io_path *io_path;
2763 
2764 	io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns);
2765 	if (io_path != NULL) {
2766 		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
2767 	}
2768 
2769 	spdk_for_each_channel_continue(i, 0);
2770 }
2771 
2772 static void
2773 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status)
2774 {
2775 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2776 
2777 	nvme_ctrlr_populate_namespace_done(nvme_ns, -1);
2778 }
2779 
2780 static void
2781 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status)
2782 {
2783 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2784 	struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i);
2785 
2786 	if (status == 0) {
2787 		nvme_ctrlr_populate_namespace_done(nvme_ns, 0);
2788 	} else {
2789 		/* Delete the added io_paths and fail populating the namespace. */
2790 		spdk_for_each_channel(bdev,
2791 				      bdev_nvme_delete_io_path,
2792 				      nvme_ns,
2793 				      bdev_nvme_add_io_path_failed);
2794 	}
2795 }
2796 
2797 static int
2798 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns)
2799 {
2800 	struct nvme_ns *tmp_ns;
2801 	const struct spdk_nvme_ns_data *nsdata;
2802 
2803 	nsdata = spdk_nvme_ns_get_data(nvme_ns->ns);
2804 	if (!nsdata->nmic.can_share) {
2805 		SPDK_ERRLOG("Namespace cannot be shared.\n");
2806 		return -EINVAL;
2807 	}
2808 
2809 	pthread_mutex_lock(&bdev->mutex);
2810 
2811 	tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list);
2812 	assert(tmp_ns != NULL);
2813 
2814 	if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) {
2815 		pthread_mutex_unlock(&bdev->mutex);
2816 		SPDK_ERRLOG("Namespaces are not identical.\n");
2817 		return -EINVAL;
2818 	}
2819 
2820 	bdev->ref++;
2821 	TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
2822 	nvme_ns->bdev = bdev;
2823 
2824 	pthread_mutex_unlock(&bdev->mutex);
2825 
2826 	/* Add nvme_io_path to nvme_bdev_channels dynamically. */
2827 	spdk_for_each_channel(bdev,
2828 			      bdev_nvme_add_io_path,
2829 			      nvme_ns,
2830 			      bdev_nvme_add_io_path_done);
2831 
2832 	return 0;
2833 }
2834 
2835 static void
2836 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2837 {
2838 	struct spdk_nvme_ns	*ns;
2839 	struct nvme_bdev	*bdev;
2840 	int			rc = 0;
2841 
2842 	ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id);
2843 	if (!ns) {
2844 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
2845 		rc = -EINVAL;
2846 		goto done;
2847 	}
2848 
2849 	nvme_ns->ns = ns;
2850 	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
2851 
2852 	if (nvme_ctrlr->ana_log_page != NULL) {
2853 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
2854 	}
2855 
2856 	bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id);
2857 	if (bdev == NULL) {
2858 		rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
2859 	} else {
2860 		rc = nvme_bdev_add_ns(bdev, nvme_ns);
2861 		if (rc == 0) {
2862 			return;
2863 		}
2864 	}
2865 done:
2866 	nvme_ctrlr_populate_namespace_done(nvme_ns, rc);
2867 }
2868 
2869 static void
2870 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns)
2871 {
2872 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
2873 
2874 	assert(nvme_ctrlr != NULL);
2875 
2876 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2877 
2878 	RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
2879 
2880 	if (nvme_ns->bdev != NULL) {
2881 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2882 		return;
2883 	}
2884 
2885 	free(nvme_ns);
2886 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2887 
2888 	nvme_ctrlr_release(nvme_ctrlr);
2889 }
2890 
2891 static void
2892 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status)
2893 {
2894 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2895 
2896 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
2897 }
2898 
2899 static void
2900 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2901 {
2902 	struct nvme_bdev *bdev;
2903 
2904 	bdev = nvme_ns->bdev;
2905 	if (bdev != NULL) {
2906 		pthread_mutex_lock(&bdev->mutex);
2907 
2908 		assert(bdev->ref > 0);
2909 		bdev->ref--;
2910 		if (bdev->ref == 0) {
2911 			pthread_mutex_unlock(&bdev->mutex);
2912 
2913 			spdk_bdev_unregister(&bdev->disk, NULL, NULL);
2914 		} else {
2915 			/* spdk_bdev_unregister() is not called until the last nvme_ns is
2916 			 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list
2917 			 * and clear nvme_ns->bdev here.
2918 			 */
2919 			TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq);
2920 			nvme_ns->bdev = NULL;
2921 
2922 			pthread_mutex_unlock(&bdev->mutex);
2923 
2924 			/* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that,
2925 			 * we call depopulate_namespace_done() to avoid use-after-free.
2926 			 */
2927 			spdk_for_each_channel(bdev,
2928 					      bdev_nvme_delete_io_path,
2929 					      nvme_ns,
2930 					      bdev_nvme_delete_io_path_done);
2931 			return;
2932 		}
2933 	}
2934 
2935 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
2936 }
2937 
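/*
 * Reconciles the controller's active namespace list with what is currently attached:
 * existing namespaces are resized or depopulated as needed, and newly reported active
 * namespaces are allocated and populated. The populates_in_progress counter starts at
 * one so that the completion callback fires exactly once even when every populate call
 * finishes synchronously.
 */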
2938 static void
2939 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
2940 			       struct nvme_async_probe_ctx *ctx)
2941 {
2942 	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
2943 	struct nvme_ns	*nvme_ns, *next;
2944 	struct spdk_nvme_ns	*ns;
2945 	struct nvme_bdev	*bdev;
2946 	uint32_t		nsid;
2947 	int			rc;
2948 	uint64_t		num_sectors;
2949 
2950 	if (ctx) {
2951 		/* Initialize this count to 1 to handle the populate functions
2952 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
2953 		 */
2954 		ctx->populates_in_progress = 1;
2955 	}
2956 
2957 	/* First loop over our existing namespaces and see if they have been
2958 	 * removed. */
2959 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
2960 	while (nvme_ns != NULL) {
2961 		next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
2962 
2963 		if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
2964 			/* NS is still there but attributes may have changed */
2965 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
2966 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
2967 			bdev = nvme_ns->bdev;
2968 			assert(bdev != NULL);
2969 			if (bdev->disk.blockcnt != num_sectors) {
2970 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
2971 					       nvme_ns->id,
2972 					       bdev->disk.name,
2973 					       bdev->disk.blockcnt,
2974 					       num_sectors);
2975 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
2976 				if (rc != 0) {
2977 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
2978 						    bdev->disk.name, rc);
2979 				}
2980 			}
2981 		} else {
2982 			/* Namespace was removed */
2983 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
2984 		}
2985 
2986 		nvme_ns = next;
2987 	}
2988 
2989 	/* Loop through all of the namespaces at the nvme level and see if any of them are new */
2990 	nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
2991 	while (nsid != 0) {
2992 		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
2993 
2994 		if (nvme_ns == NULL) {
2995 			/* Found a new one */
2996 			nvme_ns = calloc(1, sizeof(struct nvme_ns));
2997 			if (nvme_ns == NULL) {
2998 				SPDK_ERRLOG("Failed to allocate namespace\n");
2999 				/* This just fails to attach the namespace. It may work on a future attempt,
				 * but advance nsid so that this loop does not retry the same namespace forever. */
3000 				nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
				continue;
3001 			}
3002 
3003 			nvme_ns->id = nsid;
3004 			nvme_ns->ctrlr = nvme_ctrlr;
3005 
3006 			nvme_ns->bdev = NULL;
3007 
3008 			if (ctx) {
3009 				ctx->populates_in_progress++;
3010 			}
3011 			nvme_ns->probe_ctx = ctx;
3012 
3013 			RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
3014 
3015 			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns);
3016 		}
3017 
3018 		nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
3019 	}
3020 
3021 	if (ctx) {
3022 		/* Decrement this count now that the loop is over to account
3023 		 * for the one we started with.  If the count is then 0, we
3024 		 * know any populate_namespace functions completed immediately,
3025 		 * so we'll kick the callback here.
3026 		 */
3027 		ctx->populates_in_progress--;
3028 		if (ctx->populates_in_progress == 0) {
3029 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
3030 		}
3031 	}
3032 
3033 }
3034 
3035 static void
3036 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
3037 {
3038 	struct nvme_ns *nvme_ns, *tmp;
3039 
3040 	RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) {
3041 		nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
3042 	}
3043 }
3044 
3045 static int
3046 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
3047 			  void *cb_arg)
3048 {
3049 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
3050 	struct nvme_ns *nvme_ns;
3051 	uint32_t i, nsid;
3052 
3053 	for (i = 0; i < desc->num_of_nsid; i++) {
3054 		nsid = desc->nsid[i];
3055 		if (nsid == 0) {
3056 			continue;
3057 		}
3058 
3059 		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
3060 
3061 		assert(nvme_ns != NULL);
3062 		if (nvme_ns == NULL) {
3063 			/* Target told us that an inactive namespace had an ANA change */
3064 			continue;
3065 		}
3066 
3067 		nvme_ns->ana_group_id = desc->ana_group_id;
3068 		nvme_ns->ana_state = desc->ana_state;
3069 		nvme_ns->ana_state_updating = false;
3070 	}
3071 
3072 	return 0;
3073 }
3074 
3075 static void
3076 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
3077 {
3078 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
3079 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
3080 
3081 	_bdev_nvme_clear_io_path_cache(ctrlr_ch);
3082 
3083 	spdk_for_each_channel_continue(i, 0);
3084 }
3085 
3086 static void
3087 bdev_nvme_clear_io_path_cache_done(struct spdk_io_channel_iter *i, int status)
3088 {
3089 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
3090 
3091 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3092 
3093 	assert(nvme_ctrlr->ana_log_page_updating == true);
3094 	nvme_ctrlr->ana_log_page_updating = false;
3095 
3096 	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
3097 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
3098 		return;
3099 	}
3100 
3101 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3102 
3103 	nvme_ctrlr_unregister(nvme_ctrlr);
3104 }
3105 
3106 static void
3107 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
3108 {
3109 	struct nvme_ctrlr *nvme_ctrlr = ctx;
3110 
3111 	if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) {
3112 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
3113 					     nvme_ctrlr);
3114 	}
3115 
3116 	spdk_for_each_channel(nvme_ctrlr,
3117 			      bdev_nvme_clear_io_path_cache,
3118 			      NULL,
3119 			      bdev_nvme_clear_io_path_cache_done);
3120 }
3121 
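/*
 * Refreshes the cached ANA log page, e.g. after an ANA change asynchronous event. Only
 * one update runs at a time; on completion the per-namespace ANA states are re-parsed
 * and every controller channel clears its cached I/O path selection.
 */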
3122 static void
3123 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
3124 {
3125 	int rc;
3126 
3127 	if (nvme_ctrlr->ana_log_page == NULL) {
3128 		return;
3129 	}
3130 
3131 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3132 	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
3133 	    nvme_ctrlr->ana_log_page_updating) {
3134 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
3135 		return;
3136 	}
3137 
3138 	nvme_ctrlr->ana_log_page_updating = true;
3139 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3140 
3141 	rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
3142 					      SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
3143 					      SPDK_NVME_GLOBAL_NS_TAG,
3144 					      nvme_ctrlr->ana_log_page,
3145 					      nvme_ctrlr->ana_log_page_size, 0,
3146 					      nvme_ctrlr_read_ana_log_page_done,
3147 					      nvme_ctrlr);
3148 	if (rc != 0) {
3149 		nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL);
3150 	}
3151 }
3152 
3153 static void
3154 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
3155 {
3156 	struct nvme_ctrlr *nvme_ctrlr		= arg;
3157 	union spdk_nvme_async_event_completion	event;
3158 
3159 	if (spdk_nvme_cpl_is_error(cpl)) {
3160 		SPDK_WARNLOG("AER request execution failed\n");
3161 		return;
3162 	}
3163 
3164 	event.raw = cpl->cdw0;
3165 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
3166 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
3167 		nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
3168 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
3169 		   (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
3170 		nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
3171 	}
3172 }
3173 
3174 static void
3175 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
3176 {
3177 	if (ctx->cb_fn) {
3178 		ctx->cb_fn(ctx->cb_ctx, count, rc);
3179 	}
3180 
3181 	ctx->namespaces_populated = true;
3182 	if (ctx->probe_done) {
3183 		/* The probe was already completed, so we need to free the context
3184 		 * here.  This can happen for cases like OCSSD, where we need to
3185 		 * send additional commands to the SSD after attach.
3186 		 */
3187 		free(ctx);
3188 	}
3189 }
3190 
3191 static void
3192 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
3193 		       struct nvme_async_probe_ctx *ctx)
3194 {
3195 	spdk_io_device_register(nvme_ctrlr,
3196 				bdev_nvme_create_ctrlr_channel_cb,
3197 				bdev_nvme_destroy_ctrlr_channel_cb,
3198 				sizeof(struct nvme_ctrlr_channel),
3199 				nvme_ctrlr->nbdev_ctrlr->name);
3200 
3201 	nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
3202 }
3203 
3204 static void
3205 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
3206 {
3207 	struct nvme_ctrlr *nvme_ctrlr = _ctx;
3208 	struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
3209 
3210 	nvme_ctrlr->probe_ctx = NULL;
3211 
3212 	if (spdk_nvme_cpl_is_error(cpl)) {
3213 		nvme_ctrlr_delete(nvme_ctrlr);
3214 
3215 		if (ctx != NULL) {
3216 			populate_namespaces_cb(ctx, 0, -1);
3217 		}
3218 		return;
3219 	}
3220 
3221 	nvme_ctrlr_create_done(nvme_ctrlr, ctx);
3222 }
3223 
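/*
 * Allocates the ANA log page buffer, sized for the worst case implied by NANAGRPID and
 * NN, plus a scratch descriptor buffer, and then issues the initial Get Log Page
 * command. Controller creation continues in nvme_ctrlr_init_ana_log_page_done().
 */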
3224 static int
3225 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
3226 			     struct nvme_async_probe_ctx *ctx)
3227 {
3228 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
3229 	const struct spdk_nvme_ctrlr_data *cdata;
3230 	uint32_t ana_log_page_size;
3231 
3232 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3233 
3234 	ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
3235 			    sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn *
3236 			    sizeof(uint32_t);
3237 
3238 	nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
3239 						SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3240 	if (nvme_ctrlr->ana_log_page == NULL) {
3241 		SPDK_ERRLOG("could not allocate ANA log page buffer\n");
3242 		return -ENXIO;
3243 	}
3244 
3245 	/* Each descriptor in an ANA log page is not guaranteed to be 8-byte aligned.
3246 	 * Hence copy each descriptor to a temporary area when parsing it.
3247 	 *
3248 	 * Allocate a buffer as large as the ANA log page buffer because we do not
3249 	 * know the size of a descriptor until actually reading it.
3250 	 */
3251 	nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
3252 	if (nvme_ctrlr->copied_ana_desc == NULL) {
3253 		SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
3254 		return -ENOMEM;
3255 	}
3256 
3257 	nvme_ctrlr->ana_log_page_size = ana_log_page_size;
3258 
3259 	nvme_ctrlr->probe_ctx = ctx;
3260 
3261 	return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
3262 						SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
3263 						SPDK_NVME_GLOBAL_NS_TAG,
3264 						nvme_ctrlr->ana_log_page,
3265 						nvme_ctrlr->ana_log_page_size, 0,
3266 						nvme_ctrlr_init_ana_log_page_done,
3267 						nvme_ctrlr);
3268 }
3269 
3270 /* hostnqn and subnqn were already verified before attaching a controller.
3271  * Hence check only the multipath capability and cntlid here.
3272  */
3273 static bool
3274 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
3275 {
3276 	struct nvme_ctrlr *tmp;
3277 	const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;
3278 
3279 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3280 
3281 	if (!cdata->cmic.multi_ctrlr) {
3282 		SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
3283 		return false;
3284 	}
3285 
3286 	TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
3287 		tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);
3288 
3289 		if (!tmp_cdata->cmic.multi_ctrlr) {
3290 			SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
3291 			return false;
3292 		}
3293 		if (cdata->cntlid == tmp_cdata->cntlid) {
3294 			SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid);
3295 			return false;
3296 		}
3297 	}
3298 
3299 	return true;
3300 }
3301 
3302 static int
3303 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
3304 {
3305 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
3306 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
3307 	int rc = 0;
3308 
3309 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3310 
3311 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
3312 	if (nbdev_ctrlr != NULL) {
3313 		if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
3314 			rc = -EINVAL;
3315 			goto exit;
3316 		}
3317 	} else {
3318 		nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
3319 		if (nbdev_ctrlr == NULL) {
3320 			SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
3321 			rc = -ENOMEM;
3322 			goto exit;
3323 		}
3324 		nbdev_ctrlr->name = strdup(name);
3325 		if (nbdev_ctrlr->name == NULL) {
3326 			SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
3327 			free(nbdev_ctrlr);
3328 			free(nbdev_ctrlr);
			rc = -ENOMEM;
3329 			goto exit;
3330 		TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
3331 		TAILQ_INIT(&nbdev_ctrlr->bdevs);
3332 		TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
3333 	}
3334 	nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
3335 	TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
3336 exit:
3337 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3338 	return rc;
3339 }
3340 
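/*
 * Builds the nvme_ctrlr wrapper around a newly attached spdk_nvme_ctrlr: records the
 * initial path, registers the admin queue poller, the AER and remove callbacks, and
 * (when a timeout is configured) the timeout callback, optionally constructs an Opal
 * device, links the controller into its nvme_bdev_ctrlr, and either reads the ANA log
 * page first (if ANA reporting is supported) or finishes creation immediately. Any
 * failure tears the controller down via nvme_ctrlr_delete().
 */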
3341 static int
3342 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
3343 		  const char *name,
3344 		  const struct spdk_nvme_transport_id *trid,
3345 		  struct nvme_async_probe_ctx *ctx)
3346 {
3347 	struct nvme_ctrlr *nvme_ctrlr;
3348 	struct nvme_path_id *path_id;
3349 	const struct spdk_nvme_ctrlr_data *cdata;
3350 	int rc;
3351 
3352 	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
3353 	if (nvme_ctrlr == NULL) {
3354 		SPDK_ERRLOG("Failed to allocate device struct\n");
3355 		return -ENOMEM;
3356 	}
3357 
3358 	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
3359 	if (rc != 0) {
3360 		free(nvme_ctrlr);
3361 		return rc;
3362 	}
3363 
3364 	TAILQ_INIT(&nvme_ctrlr->trids);
3365 
3366 	RB_INIT(&nvme_ctrlr->namespaces);
3367 
3368 	path_id = calloc(1, sizeof(*path_id));
3369 	if (path_id == NULL) {
3370 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
3371 		rc = -ENOMEM;
3372 		goto err;
3373 	}
3374 
3375 	path_id->trid = *trid;
3376 	if (ctx != NULL) {
3377 		memcpy(path_id->hostid.hostaddr, ctx->opts.src_addr, sizeof(path_id->hostid.hostaddr));
3378 		memcpy(path_id->hostid.hostsvcid, ctx->opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
3379 	}
3380 	nvme_ctrlr->active_path_id = path_id;
3381 	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);
3382 
3383 	nvme_ctrlr->thread = spdk_get_thread();
3384 	nvme_ctrlr->ctrlr = ctrlr;
3385 	nvme_ctrlr->ref = 1;
3386 
3387 	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
3388 		SPDK_ERRLOG("OCSSDs are not supported\n");
3389 		rc = -ENOTSUP;
3390 		goto err;
3391 	}
3392 
3393 	if (ctx != NULL) {
3394 		nvme_ctrlr->prchk_flags = ctx->prchk_flags;
3395 		nvme_ctrlr->ctrlr_loss_timeout_sec = ctx->ctrlr_loss_timeout_sec;
3396 		nvme_ctrlr->reconnect_delay_sec = ctx->reconnect_delay_sec;
3397 		nvme_ctrlr->fast_io_fail_timeout_sec = ctx->fast_io_fail_timeout_sec;
3398 	}
3399 
3400 	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
3401 					  g_opts.nvme_adminq_poll_period_us);
3402 
3403 	if (g_opts.timeout_us > 0) {
3404 		/* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
3405 		/* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
3406 		uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
3407 					  g_opts.timeout_us : g_opts.timeout_admin_us;
3408 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
3409 				adm_timeout_us, timeout_cb, nvme_ctrlr);
3410 	}
3411 
3412 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
3413 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
3414 
3415 	if (spdk_nvme_ctrlr_get_flags(ctrlr) &
3416 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
3417 		nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
3418 	}
3419 
3420 	rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr);
3421 	if (rc != 0) {
3422 		goto err;
3423 	}
3424 
3425 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3426 
3427 	if (cdata->cmic.ana_reporting) {
3428 		rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
3429 		if (rc == 0) {
3430 			return 0;
3431 		}
3432 	} else {
3433 		nvme_ctrlr_create_done(nvme_ctrlr, ctx);
3434 		return 0;
3435 	}
3436 
3437 err:
3438 	nvme_ctrlr_delete(nvme_ctrlr);
3439 	return rc;
3440 }
3441 
3442 static void
3443 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3444 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
3445 {
3446 	char *name;
3447 
3448 	name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
3449 	if (!name) {
3450 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
3451 		return;
3452 	}
3453 
3454 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
3455 
3456 	nvme_ctrlr_create(ctrlr, name, trid, NULL);
3457 
3458 	free(name);
3459 }
3460 
3461 static void
3462 _nvme_ctrlr_destruct(void *ctx)
3463 {
3464 	struct nvme_ctrlr *nvme_ctrlr = ctx;
3465 
3466 	nvme_ctrlr_depopulate_namespaces(nvme_ctrlr);
3467 	nvme_ctrlr_release(nvme_ctrlr);
3468 }
3469 
3470 static int
3471 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
3472 {
3473 	struct nvme_probe_skip_entry *entry;
3474 
3475 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3476 
3477 	/* The controller's destruction was already started */
3478 	if (nvme_ctrlr->destruct) {
3479 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
3480 		return 0;
3481 	}
3482 
3483 	if (!hotplug &&
3484 	    nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
3485 		entry = calloc(1, sizeof(*entry));
3486 		if (!entry) {
3487 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
3488 			return -ENOMEM;
3489 		}
3490 		entry->trid = nvme_ctrlr->active_path_id->trid;
3491 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
3492 	}
3493 
3494 	nvme_ctrlr->destruct = true;
3495 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3496 
3497 	_nvme_ctrlr_destruct(nvme_ctrlr);
3498 
3499 	return 0;
3500 }
3501 
3502 static void
3503 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
3504 {
3505 	struct nvme_ctrlr *nvme_ctrlr = cb_ctx;
3506 
3507 	_bdev_nvme_delete(nvme_ctrlr, true);
3508 }
3509 
3510 static int
3511 bdev_nvme_hotplug_probe(void *arg)
3512 {
3513 	if (g_hotplug_probe_ctx == NULL) {
3514 		spdk_poller_unregister(&g_hotplug_probe_poller);
3515 		return SPDK_POLLER_IDLE;
3516 	}
3517 
3518 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
3519 		g_hotplug_probe_ctx = NULL;
3520 		spdk_poller_unregister(&g_hotplug_probe_poller);
3521 	}
3522 
3523 	return SPDK_POLLER_BUSY;
3524 }
3525 
3526 static int
3527 bdev_nvme_hotplug(void *arg)
3528 {
3529 	struct spdk_nvme_transport_id trid_pcie;
3530 
3531 	if (g_hotplug_probe_ctx) {
3532 		return SPDK_POLLER_BUSY;
3533 	}
3534 
3535 	memset(&trid_pcie, 0, sizeof(trid_pcie));
3536 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
3537 
3538 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
3539 			      hotplug_probe_cb, attach_cb, NULL);
3540 
3541 	if (g_hotplug_probe_ctx) {
3542 		assert(g_hotplug_probe_poller == NULL);
3543 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
3544 	}
3545 
3546 	return SPDK_POLLER_BUSY;
3547 }
3548 
3549 void
3550 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
3551 {
3552 	*opts = g_opts;
3553 }
3554 
3555 static int
3556 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
3557 {
3558 	if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
3559 		/* Can't set timeout_admin_us without also setting timeout_us */
3560 		SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
3561 		return -EINVAL;
3562 	}
3563 
3564 	if (opts->bdev_retry_count < -1) {
3565 		SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n");
3566 		return -EINVAL;
3567 	}
3568 
3569 	return 0;
3570 }
3571 
3572 int
3573 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
3574 {
3575 	int ret = bdev_nvme_validate_opts(opts);
3576 	if (ret) {
3577 		SPDK_WARNLOG("Failed to set nvme opts.\n");
3578 		return ret;
3579 	}
3580 
3581 	if (g_bdev_nvme_init_thread != NULL) {
3582 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
3583 			return -EPERM;
3584 		}
3585 	}
3586 
3587 	g_opts = *opts;
3588 
3589 	return 0;
3590 }
3591 
3592 struct set_nvme_hotplug_ctx {
3593 	uint64_t period_us;
3594 	bool enabled;
3595 	spdk_msg_fn fn;
3596 	void *fn_ctx;
3597 };
3598 
3599 static void
3600 set_nvme_hotplug_period_cb(void *_ctx)
3601 {
3602 	struct set_nvme_hotplug_ctx *ctx = _ctx;
3603 
3604 	spdk_poller_unregister(&g_hotplug_poller);
3605 	if (ctx->enabled) {
3606 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
3607 	}
3608 
3609 	g_nvme_hotplug_poll_period_us = ctx->period_us;
3610 	g_nvme_hotplug_enabled = ctx->enabled;
3611 	if (ctx->fn) {
3612 		ctx->fn(ctx->fn_ctx);
3613 	}
3614 
3615 	free(ctx);
3616 }
3617 
3618 int
3619 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
3620 {
3621 	struct set_nvme_hotplug_ctx *ctx;
3622 
3623 	if (enabled == true && !spdk_process_is_primary()) {
3624 		return -EPERM;
3625 	}
3626 
3627 	ctx = calloc(1, sizeof(*ctx));
3628 	if (ctx == NULL) {
3629 		return -ENOMEM;
3630 	}
3631 
3632 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
3633 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
3634 	ctx->enabled = enabled;
3635 	ctx->fn = cb;
3636 	ctx->fn_ctx = cb_ctx;
3637 
3638 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
3639 	return 0;
3640 }
3641 
3642 static void
3643 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
3644 				    struct nvme_async_probe_ctx *ctx)
3645 {
3646 	struct nvme_ns	*nvme_ns;
3647 	struct nvme_bdev	*nvme_bdev;
3648 	size_t			j;
3649 
3650 	assert(nvme_ctrlr != NULL);
3651 
3652 	if (ctx->names == NULL) {
3653 		populate_namespaces_cb(ctx, 0, 0);
3654 		return;
3655 	}
3656 
3657 	/*
3658 	 * Report the new bdevs that were created in this call.
3659 	 * There can be more than one bdev per NVMe controller.
3660 	 */
3661 	j = 0;
3662 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
3663 	while (nvme_ns != NULL) {
3664 		nvme_bdev = nvme_ns->bdev;
3665 		if (j < ctx->count) {
3666 			ctx->names[j] = nvme_bdev->disk.name;
3667 			j++;
3668 		} else {
3669 			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
3670 				    ctx->count);
3671 			populate_namespaces_cb(ctx, 0, -ERANGE);
3672 			return;
3673 		}
3674 
3675 		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
3676 	}
3677 
3678 	populate_namespaces_cb(ctx, j, 0);
3679 }
3680 
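/* Check whether the given trid may be added to nvme_ctrlr as a failover path:
 * PCIe failover is not supported, the transport type and subsystem NQN must
 * match the active path, and the trid must not be registered already.
 */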
3681 static int
3682 bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr,
3683 			struct spdk_nvme_ctrlr *new_ctrlr,
3684 			struct spdk_nvme_transport_id *trid)
3685 {
3686 	struct nvme_path_id *tmp_trid;
3687 
3688 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
3689 		SPDK_ERRLOG("PCIe failover is not supported.\n");
3690 		return -ENOTSUP;
3691 	}
3692 
3693 	/* Currently we only support failover to the same transport type. */
3694 	if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
3695 		return -EINVAL;
3696 	}
3697 
3698 	/* Currently we only support failover to the same NQN. */
3699 	if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
3700 		return -EINVAL;
3701 	}
3702 
3703 	/* Skip all the other checks if we've already registered this path. */
3704 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
3705 		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
3706 			return -EEXIST;
3707 		}
3708 	}
3709 
3710 	return 0;
3711 }
3712 
3713 static int
3714 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr,
3715 			     struct spdk_nvme_ctrlr *new_ctrlr)
3716 {
3717 	struct nvme_ns *nvme_ns;
3718 	struct spdk_nvme_ns *new_ns;
3719 
3720 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
3721 	while (nvme_ns != NULL) {
3722 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
3723 		assert(new_ns != NULL);
3724 
3725 		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
3726 			return -EINVAL;
3727 		}
3728 
3729 		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
3730 	}
3731 
3732 	return 0;
3733 }
3734 
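/* Insert the new path before the first failed, non-active path so that
 * previously failed paths are tried last during failover; otherwise append it
 * to the tail of the trid list.
 */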
3735 static int
3736 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
3737 			      struct spdk_nvme_transport_id *trid)
3738 {
3739 	struct nvme_path_id *new_trid, *tmp_trid;
3740 
3741 	new_trid = calloc(1, sizeof(*new_trid));
3742 	if (new_trid == NULL) {
3743 		return -ENOMEM;
3744 	}
3745 	new_trid->trid = *trid;
3746 	new_trid->is_failed = false;
3747 
3748 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
3749 		if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) {
3750 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
3751 			return 0;
3752 		}
3753 	}
3754 
3755 	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
3756 	return 0;
3757 }
3758 
3759 /* Handle the case where a secondary path is added to an existing
3760  * nvme_ctrlr for failover. After verifying that it can access the same
3761  * namespaces as the primary path, the new path is kept disconnected until failover occurs.
3762  */
3763 static int
3764 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
3765 			     struct spdk_nvme_ctrlr *new_ctrlr,
3766 			     struct spdk_nvme_transport_id *trid)
3767 {
3768 	int rc;
3769 
3770 	assert(nvme_ctrlr != NULL);
3771 
3772 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3773 
3774 	rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid);
3775 	if (rc != 0) {
3776 		goto exit;
3777 	}
3778 
3779 	rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr);
3780 	if (rc != 0) {
3781 		goto exit;
3782 	}
3783 
3784 	rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
3785 
3786 exit:
3787 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3788 
3789 	spdk_nvme_detach(new_ctrlr);
3790 
3791 	return rc;
3792 }
3793 
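/* Attach callback used when connecting a new controller (or an additional path
 * in multipath mode). Creates the nvme_ctrlr; on failure the probe context is
 * completed with the error through populate_namespaces_cb().
 */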
3794 static void
3795 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3796 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
3797 {
3798 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
3799 	struct nvme_async_probe_ctx *ctx;
3800 	int rc;
3801 
3802 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
3803 	ctx->ctrlr_attached = true;
3804 
3805 	rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx);
3806 	if (rc != 0) {
3807 		populate_namespaces_cb(ctx, 0, rc);
3808 	}
3809 }
3810 
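/* Attach callback used when a controller with the same name already exists and
 * the new trid should only be registered as a failover path. The temporary
 * ctrlr handle is detached again inside bdev_nvme_add_secondary_trid().
 */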
3811 static void
3812 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3813 			struct spdk_nvme_ctrlr *ctrlr,
3814 			const struct spdk_nvme_ctrlr_opts *opts)
3815 {
3816 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
3817 	struct nvme_ctrlr *nvme_ctrlr;
3818 	struct nvme_async_probe_ctx *ctx;
3819 	int rc;
3820 
3821 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
3822 	ctx->ctrlr_attached = true;
3823 
3824 	nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
3825 	if (nvme_ctrlr) {
3826 		rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
3827 	} else {
3828 		rc = -ENODEV;
3829 	}
3830 
3831 	populate_namespaces_cb(ctx, 0, rc);
3832 }
3833 
3834 static int
3835 bdev_nvme_async_poll(void *arg)
3836 {
3837 	struct nvme_async_probe_ctx	*ctx = arg;
3838 	int				rc;
3839 
3840 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
3841 	if (spdk_unlikely(rc != -EAGAIN)) {
3842 		ctx->probe_done = true;
3843 		spdk_poller_unregister(&ctx->poller);
3844 		if (!ctx->ctrlr_attached) {
3845 			/* The probe is done, but no controller was attached.
3846 			 * That means we had a failure, so report -EIO back to
3847 			 * the caller (usually the RPC). populate_namespaces_cb()
3848 			 * will take care of freeing the nvme_async_probe_ctx.
3849 			 */
3850 			populate_namespaces_cb(ctx, 0, -EIO);
3851 		} else if (ctx->namespaces_populated) {
3852 			/* The namespaces for the attached controller were all
3853 			 * populated and the response was already sent to the
3854 			 * caller (usually the RPC).  So free the context here.
3855 			 */
3856 			free(ctx);
3857 		}
3858 	}
3859 
3860 	return SPDK_POLLER_BUSY;
3861 }
3862 
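/* Validate the reconnect parameters as a group:
 * ctrlr_loss_timeout_sec == -1 means reconnect indefinitely; reconnect_delay_sec must
 * then be non-zero and fast_io_fail_timeout_sec, if set, must not be smaller than it.
 * A positive ctrlr_loss_timeout_sec additionally requires that reconnect_delay_sec and
 * fast_io_fail_timeout_sec, if set, do not exceed ctrlr_loss_timeout_sec.
 * If ctrlr_loss_timeout_sec is 0, the other two parameters must be 0 as well.
 */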
3863 static bool
3864 bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec,
3865 				 uint32_t reconnect_delay_sec,
3866 				 uint32_t fast_io_fail_timeout_sec)
3867 {
3868 	if (ctrlr_loss_timeout_sec < -1) {
3869 		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
3870 		return false;
3871 	} else if (ctrlr_loss_timeout_sec == -1) {
3872 		if (reconnect_delay_sec == 0) {
3873 			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
3874 			return false;
3875 		} else if (fast_io_fail_timeout_sec != 0 &&
3876 			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
3877 			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
3878 			return false;
3879 		}
3880 	} else if (ctrlr_loss_timeout_sec != 0) {
3881 		if (reconnect_delay_sec == 0) {
3882 			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
3883 			return false;
3884 		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
3885 			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
3886 			return false;
3887 		} else if (fast_io_fail_timeout_sec != 0) {
3888 			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
3889 				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
3890 				return false;
3891 			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
3892 				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
3893 				return false;
3894 			}
3895 		}
3896 	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
3897 		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
3898 		return false;
3899 	}
3900 
3901 	return true;
3902 }
3903 
3904 int
3905 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
3906 		 const char *base_name,
3907 		 const char **names,
3908 		 uint32_t count,
3909 		 uint32_t prchk_flags,
3910 		 spdk_bdev_create_nvme_fn cb_fn,
3911 		 void *cb_ctx,
3912 		 struct spdk_nvme_ctrlr_opts *opts,
3913 		 bool multipath,
3914 		 int32_t ctrlr_loss_timeout_sec,
3915 		 uint32_t reconnect_delay_sec,
3916 		 uint32_t fast_io_fail_timeout_sec)
3917 {
3918 	struct nvme_probe_skip_entry	*entry, *tmp;
3919 	struct nvme_async_probe_ctx	*ctx;
3920 	spdk_nvme_attach_cb attach_cb;
3921 
3922 	/* TODO expand this check to include both the host and target TRIDs.
3923 	 * Only if both are the same should we fail.
3924 	 */
3925 	if (nvme_ctrlr_get(trid) != NULL) {
3926 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
3927 		return -EEXIST;
3928 	}
3929 
3930 	if (!bdev_nvme_check_multipath_params(ctrlr_loss_timeout_sec, reconnect_delay_sec,
3931 					      fast_io_fail_timeout_sec)) {
3932 		return -EINVAL;
3933 	}
3934 
3935 	ctx = calloc(1, sizeof(*ctx));
3936 	if (!ctx) {
3937 		return -ENOMEM;
3938 	}
3939 	ctx->base_name = base_name;
3940 	ctx->names = names;
3941 	ctx->count = count;
3942 	ctx->cb_fn = cb_fn;
3943 	ctx->cb_ctx = cb_ctx;
3944 	ctx->prchk_flags = prchk_flags;
3945 	ctx->trid = *trid;
3946 	ctx->ctrlr_loss_timeout_sec = ctrlr_loss_timeout_sec;
3947 	ctx->reconnect_delay_sec = reconnect_delay_sec;
3948 	ctx->fast_io_fail_timeout_sec = fast_io_fail_timeout_sec;
3949 
3950 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
3951 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
3952 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
3953 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
3954 				free(entry);
3955 				break;
3956 			}
3957 		}
3958 	}
3959 
3960 	if (opts) {
3961 		memcpy(&ctx->opts, opts, sizeof(*opts));
3962 	} else {
3963 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
3964 	}
3965 
3966 	ctx->opts.transport_retry_count = g_opts.transport_retry_count;
3967 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
3968 	ctx->opts.disable_read_ana_log_page = true;
3969 
3970 	if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) {
3971 		attach_cb = connect_attach_cb;
3972 	} else {
3973 		attach_cb = connect_set_failover_cb;
3974 	}
3975 
3976 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, attach_cb);
3977 	if (ctx->probe_ctx == NULL) {
3978 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
3979 		free(ctx);
3980 		return -ENODEV;
3981 	}
3982 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
3983 
3984 	return 0;
3985 }
3986 
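/* Delete the paths of the named controller that match path_id. Fields of path_id
 * that are left zeroed act as wildcards and match any value. Deleting the active
 * path triggers a failover when an alternate path exists; if it is the only path,
 * the whole nvme_ctrlr is deleted.
 */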
3987 int
3988 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id)
3989 {
3990 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
3991 	struct nvme_ctrlr	*nvme_ctrlr, *tmp_nvme_ctrlr;
3992 	struct nvme_path_id	*p, *t;
3993 	int			rc = -ENXIO;
3994 
3995 	if (name == NULL || path_id == NULL) {
3996 		return -EINVAL;
3997 	}
3998 
3999 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
4000 	if (nbdev_ctrlr == NULL) {
4001 		SPDK_ERRLOG("Failed to find NVMe bdev controller\n");
4002 		return -ENODEV;
4003 	}
4004 
4005 	TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) {
4006 		TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) {
4007 			if (path_id->trid.trtype != 0) {
4008 				if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) {
4009 					if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) {
4010 						continue;
4011 					}
4012 				} else {
4013 					if (path_id->trid.trtype != p->trid.trtype) {
4014 						continue;
4015 					}
4016 				}
4017 			}
4018 
4019 			if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) {
4020 				if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) {
4021 					continue;
4022 				}
4023 			}
4024 
4025 			if (path_id->trid.adrfam != 0) {
4026 				if (path_id->trid.adrfam != p->trid.adrfam) {
4027 					continue;
4028 				}
4029 			}
4030 
4031 			if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) {
4032 				if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) {
4033 					continue;
4034 				}
4035 			}
4036 
4037 			if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) {
4038 				if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) {
4039 					continue;
4040 				}
4041 			}
4042 
4043 			if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) {
4044 				if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) {
4045 					continue;
4046 				}
4047 			}
4048 
4049 			if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) {
4050 				if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) {
4051 					continue;
4052 				}
4053 			}
4054 
4055 			/* If we made it here, then this path is a match! Now we need to remove it. */
4056 			if (p == nvme_ctrlr->active_path_id) {
4057 				/* This is the active path in use right now. The active path is always the first in the list. */
4058 
4059 				if (!TAILQ_NEXT(p, link)) {
4060 					/* The current path is the only path. */
4061 					rc = _bdev_nvme_delete(nvme_ctrlr, false);
4062 				} else {
4063 					/* There is an alternative path. */
4064 					rc = bdev_nvme_failover(nvme_ctrlr, true);
4065 				}
4066 			} else {
4067 				/* We are not using the specified path. */
4068 				TAILQ_REMOVE(&nvme_ctrlr->trids, p, link);
4069 				free(p);
4070 				rc = 0;
4071 			}
4072 
4073 			if (rc < 0 && rc != -ENXIO) {
4074 				return rc;
4075 			}
4076 
4077 
4078 		}
4079 	}
4080 
4081 	/* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */
4082 	return rc;
4083 }
4084 
4085 struct discovery_ctrlr_ctx {
4086 	char						name[128];
4087 	struct spdk_nvme_transport_id			trid;
4088 	struct spdk_nvme_ctrlr_opts			opts;
4089 	struct spdk_nvmf_discovery_log_page_entry	entry;
4090 	TAILQ_ENTRY(discovery_ctrlr_ctx)		tailq;
4091 	struct discovery_ctx				*ctx;
4092 };
4093 
4094 struct discovery_ctx {
4095 	char					*name;
4096 	spdk_bdev_nvme_start_discovery_fn	start_cb_fn;
4097 	spdk_bdev_nvme_stop_discovery_fn	stop_cb_fn;
4098 	void					*cb_ctx;
4099 	struct spdk_nvme_probe_ctx		*probe_ctx;
4100 	struct spdk_nvme_detach_ctx		*detach_ctx;
4101 	struct spdk_nvme_ctrlr			*ctrlr;
4102 	struct spdk_poller			*poller;
4103 	struct spdk_nvme_ctrlr_opts		opts;
4104 	TAILQ_ENTRY(discovery_ctx)		tailq;
4105 	TAILQ_HEAD(, discovery_ctrlr_ctx)	ctrlr_ctxs;
4106 	int					rc;
4107 	/* Denotes if a discovery is currently in progress for this context.
4108 	 * That includes connecting to newly discovered subsystems.  Used to
4109 	 * ensure we do not start a new discovery until an existing one is
4110 	 * complete.
4111 	 */
4112 	bool					in_progress;
4113 
4114 	/* Denotes if another discovery is needed after the one in progress
4115 	 * completes.  Set when we receive an AER completion while a discovery
4116 	 * is already in progress.
4117 	 */
4118 	bool					pending;
4119 
4120 	/* Signal to the discovery context poller that it should detach from
4121 	 * the discovery controller.
4122 	 */
4123 	bool					detach;
4124 
4125 	struct spdk_thread			*calling_thread;
4126 	uint32_t				index;
4127 	uint32_t				attach_in_progress;
4128 	char					*hostnqn;
4129 };
4130 
4131 TAILQ_HEAD(discovery_ctxs, discovery_ctx);
4132 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs);
4133 
4134 static void get_discovery_log_page(struct discovery_ctx *ctx);
4135 
4136 static void
4137 free_discovery_ctx(struct discovery_ctx *ctx)
4138 {
4139 	free(ctx->hostnqn);
4140 	free(ctx->name);
4141 	free(ctx);
4142 }
4143 
4144 static void
4145 discovery_complete(struct discovery_ctx *ctx)
4146 {
4147 	ctx->in_progress = false;
4148 	if (ctx->pending) {
4149 		ctx->pending = false;
4150 		get_discovery_log_page(ctx);
4151 	}
4152 }
4153 
4154 static void
4155 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid,
4156 			       struct spdk_nvmf_discovery_log_page_entry *entry)
4157 {
4158 	char *space;
4159 
4160 	trid->trtype = entry->trtype;
4161 	trid->adrfam = entry->adrfam;
4162 	memcpy(trid->traddr, entry->traddr, sizeof(trid->traddr));
4163 	memcpy(trid->trsvcid, entry->trsvcid, sizeof(trid->trsvcid));
4164 	memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn));
4165 
4166 	/* We want the traddr, trsvcid and subnqn fields to be NULL-terminated.
4167 	 * But the log page entries typically pad them with spaces, not zeroes.
4168 	 * So add a NULL terminator to each of these fields at the appropriate
4169 	 * location.
4170 	 */
4171 	space = strchr(trid->traddr, ' ');
4172 	if (space) {
4173 		*space = 0;
4174 	}
4175 	space = strchr(trid->trsvcid, ' ');
4176 	if (space) {
4177 		*space = 0;
4178 	}
4179 	space = strchr(trid->subnqn, ' ');
4180 	if (space) {
4181 		*space = 0;
4182 	}
4183 }
4184 
4185 static void
4186 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc)
4187 {
4188 	struct discovery_ctrlr_ctx *ctrlr_ctx = cb_ctx;
4189 	struct discovery_ctx *ctx = ctrlr_ctx->ctx;
4190 
4191 	SPDK_DEBUGLOG(bdev_nvme, "attach %s done\n", ctrlr_ctx->name);
4192 	ctx->attach_in_progress--;
4193 	if (ctx->attach_in_progress == 0) {
4194 		discovery_complete(ctx);
4195 	}
4196 }
4197 
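/* Reconcile the set of attached controllers with the latest discovery log page:
 * controllers whose log page entries have disappeared are deleted, and new entries
 * are attached with bdev_nvme_create(). Entries that share a subsystem NQN with an
 * already known entry reuse its controller name so they become additional paths.
 */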
4198 static void
4199 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl,
4200 		      struct spdk_nvmf_discovery_log_page *log_page)
4201 {
4202 	struct discovery_ctx *ctx = cb_arg;
4203 	struct discovery_ctrlr_ctx *ctrlr_ctx, *tmp;
4204 	struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry;
4205 	uint64_t numrec, i;
4206 	bool found;
4207 
4208 	if (rc || spdk_nvme_cpl_is_error(cpl)) {
4209 		SPDK_ERRLOG("could not get discovery log page\n");
4210 		return;
4211 	}
4212 
4213 	assert(ctx->attach_in_progress == 0);
4214 	numrec = from_le64(&log_page->numrec);
4215 	TAILQ_FOREACH_SAFE(ctrlr_ctx, &ctx->ctrlr_ctxs, tailq, tmp) {
4216 		found = false;
4217 		old_entry = &ctrlr_ctx->entry;
4218 		for (i = 0; i < numrec; i++) {
4219 			new_entry = &log_page->entries[i];
4220 			if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) {
4221 				found = true;
4222 				break;
4223 			}
4224 		}
4225 		if (!found) {
4226 			struct nvme_path_id path = {};
4227 
4228 			SPDK_DEBUGLOG(bdev_nvme, "detach controller\n");
4229 
4230 			path.trid = ctrlr_ctx->trid;
4231 			bdev_nvme_delete(ctrlr_ctx->name, &path);
4232 			TAILQ_REMOVE(&ctx->ctrlr_ctxs, ctrlr_ctx, tailq);
4233 			free(ctrlr_ctx);
4234 		}
4235 	}
4236 	for (i = 0; i < numrec; i++) {
4237 		found = false;
4238 		new_entry = &log_page->entries[i];
4239 		TAILQ_FOREACH(ctrlr_ctx, &ctx->ctrlr_ctxs, tailq) {
4240 			old_entry = &ctrlr_ctx->entry;
4241 			if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) {
4242 				found = true;
4243 				break;
4244 			}
4245 		}
4246 		if (!found) {
4247 			struct discovery_ctrlr_ctx *subnqn_ctx, *new_ctx;
4248 
4249 			TAILQ_FOREACH(subnqn_ctx, &ctx->ctrlr_ctxs, tailq) {
4250 				if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn,
4251 					    sizeof(new_entry->subnqn))) {
4252 					break;
4253 				}
4254 			}
4255 
4256 			new_ctx = calloc(1, sizeof(*new_ctx));
4257 			if (new_ctx == NULL) {
4258 				SPDK_ERRLOG("could not allocate new ctrlr_ctx\n");
4259 				break;
4260 			}
4261 
4262 			new_ctx->ctx = ctx;
4263 			memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry));
4264 			build_trid_from_log_page_entry(&new_ctx->trid, new_entry);
4265 			if (subnqn_ctx) {
4266 				snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name);
4267 			} else {
4268 				snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%u", ctx->name, ctx->index++);
4269 			}
4270 			spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->opts, sizeof(new_ctx->opts));
4271 			snprintf(new_ctx->opts.hostnqn, sizeof(new_ctx->opts.hostnqn), "%s", ctx->hostnqn);
4272 			rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 0,
4273 					      discovery_attach_controller_done, new_ctx,
4274 					      &new_ctx->opts, true, 0, 0, 0);
4275 			if (rc == 0) {
4276 				TAILQ_INSERT_TAIL(&ctx->ctrlr_ctxs, new_ctx, tailq);
4277 				ctx->attach_in_progress++;
4278 			} else {
4279 				SPDK_ERRLOG("bdev_nvme_create failed (%s)\n", spdk_strerror(-rc));
4280 			}
4281 		}
4282 	}
4283 	free(log_page);
4284 
4285 	if (ctx->attach_in_progress == 0) {
4286 		discovery_complete(ctx);
4287 	}
4288 }
4289 
4290 static void
4291 get_discovery_log_page(struct discovery_ctx *ctx)
4292 {
4293 	int rc;
4294 
4295 	assert(ctx->in_progress == false);
4296 	ctx->in_progress = true;
4297 	rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx);
4298 	if (rc != 0) {
4299 		SPDK_ERRLOG("could not get discovery log page\n");
4300 	}
4301 	SPDK_DEBUGLOG(bdev_nvme, "sent discovery log page command\n");
4302 }
4303 
4304 static void
4305 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
4306 {
4307 	struct discovery_ctx *ctx = arg;
4308 	uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
4309 
4310 	if (spdk_nvme_cpl_is_error(cpl)) {
4311 		SPDK_ERRLOG("aer failed\n");
4312 		return;
4313 	}
4314 
4315 	if (log_page_id != SPDK_NVME_LOG_DISCOVERY) {
4316 		SPDK_ERRLOG("unexpected log page 0x%x\n", log_page_id);
4317 		return;
4318 	}
4319 
4320 	SPDK_DEBUGLOG(bdev_nvme, "got aer\n");
4321 	if (ctx->in_progress) {
4322 		ctx->pending = true;
4323 		return;
4324 	}
4325 
4326 	get_discovery_log_page(ctx);
4327 }
4328 
4329 static void
4330 start_discovery_done(void *cb_ctx)
4331 {
4332 	struct discovery_ctx *ctx = cb_ctx;
4333 
4334 	SPDK_DEBUGLOG(bdev_nvme, "start discovery done\n");
4335 	ctx->start_cb_fn(ctx->cb_ctx, ctx->rc);
4336 	if (ctx->rc != 0) {
4337 		SPDK_ERRLOG("could not connect to discovery ctrlr\n");
4338 		TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
4339 		free_discovery_ctx(ctx);
4340 	}
4341 }
4342 
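/* Poller that drives a discovery context through its three states: finishing the
 * asynchronous connect to the discovery controller, detaching from it when a stop
 * was requested, and otherwise processing admin completions so that AER
 * notifications are delivered.
 */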
4343 static int
4344 discovery_poller(void *arg)
4345 {
4346 	struct discovery_ctx *ctx = arg;
4347 	int rc;
4348 
4349 	if (ctx->probe_ctx) {
4350 		rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
4351 		if (rc != -EAGAIN) {
4352 			ctx->rc = rc;
4353 			spdk_thread_send_msg(ctx->calling_thread, start_discovery_done, ctx);
4354 			if (rc == 0) {
4355 				get_discovery_log_page(ctx);
4356 			}
4357 		}
4358 	} else if (ctx->detach) {
4359 		bool detach_done = false;
4360 
4361 		if (ctx->detach_ctx == NULL) {
4362 			rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
4363 			if (rc != 0) {
4364 				SPDK_ERRLOG("could not detach discovery ctrlr\n");
4365 				detach_done = true;
4366 			}
4367 		} else {
4368 			rc = spdk_nvme_detach_poll_async(ctx->detach_ctx);
4369 			if (rc != -EAGAIN) {
4370 				detach_done = true;
4371 			}
4372 		}
4373 		if (detach_done) {
4374 			spdk_poller_unregister(&ctx->poller);
4375 			TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
4376 			ctx->stop_cb_fn(ctx->cb_ctx);
4377 			free_discovery_ctx(ctx);
4378 		}
4379 	} else {
4380 		spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr);
4381 	}
4382 
4383 	return SPDK_POLLER_BUSY;
4384 }
4385 
4386 static void
4387 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
4388 		    struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
4389 {
4390 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
4391 	struct discovery_ctx *ctx;
4392 
4393 	ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, opts);
4394 
4395 	SPDK_DEBUGLOG(bdev_nvme, "discovery ctrlr attached\n");
4396 	ctx->probe_ctx = NULL;
4397 	ctx->ctrlr = ctrlr;
4398 	spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx);
4399 }
4400 
4401 static void
4402 start_discovery_poller(void *arg)
4403 {
4404 	struct discovery_ctx *ctx = arg;
4405 
4406 	TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq);
4407 	ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000);
4408 }
4409 
4410 int
4411 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid,
4412 			  const char *base_name,
4413 			  struct spdk_nvme_ctrlr_opts *opts,
4414 			  spdk_bdev_nvme_start_discovery_fn cb_fn,
4415 			  void *cb_ctx)
4416 {
4417 	struct discovery_ctx *ctx;
4418 
4419 	ctx = calloc(1, sizeof(*ctx));
4420 	if (ctx == NULL) {
4421 		return -ENOMEM;
4422 	}
4423 
4424 	ctx->name = strdup(base_name);
4425 	if (ctx->name == NULL) {
4426 		free_discovery_ctx(ctx);
4427 		return -ENOMEM;
4428 	}
4429 	ctx->start_cb_fn = cb_fn;
4430 	ctx->cb_ctx = cb_ctx;
4431 	memcpy(&ctx->opts, opts, sizeof(*opts));
4432 	ctx->calling_thread = spdk_get_thread();
4433 	TAILQ_INIT(&ctx->ctrlr_ctxs);
4434 	snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
4435 	/* Even if the user did not specify a hostnqn, opts.hostnqn is a valid (possibly empty) string, so strdup() it unconditionally. */
4436 	ctx->hostnqn = strdup(ctx->opts.hostnqn);
4437 	if (ctx->hostnqn == NULL) {
4438 		free_discovery_ctx(ctx);
4439 		return -ENOMEM;
4440 	}
4441 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, discovery_attach_cb);
4442 	if (ctx->probe_ctx == NULL) {
4443 		SPDK_ERRLOG("could not start discovery connect\n");
4444 		free_discovery_ctx(ctx);
4445 		return -EIO;
4446 	}
4447 
4448 	spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx);
4449 	return 0;
4450 }
4451 
4452 int
4453 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx)
4454 {
4455 	struct discovery_ctx *ctx;
4456 
4457 	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
4458 		if (strcmp(name, ctx->name) == 0) {
4459 			if (ctx->detach) {
4460 				return -EALREADY;
4461 			}
4462 			ctx->detach = true;
4463 			ctx->stop_cb_fn = cb_fn;
4464 			ctx->cb_ctx = cb_ctx;
4465 			while (!TAILQ_EMPTY(&ctx->ctrlr_ctxs)) {
4466 				struct discovery_ctrlr_ctx *ctrlr_ctx;
4467 				struct nvme_path_id path = {};
4468 
4469 				ctrlr_ctx = TAILQ_FIRST(&ctx->ctrlr_ctxs);
4470 				path.trid = ctrlr_ctx->trid;
4471 				bdev_nvme_delete(ctrlr_ctx->name, &path);
4472 				TAILQ_REMOVE(&ctx->ctrlr_ctxs, ctrlr_ctx, tailq);
4473 				free(ctrlr_ctx);
4474 			}
4475 			return 0;
4476 		}
4477 	}
4478 
4479 	return -ENOENT;
4480 }
4481 
4482 static int
4483 bdev_nvme_library_init(void)
4484 {
4485 	g_bdev_nvme_init_thread = spdk_get_thread();
4486 
4487 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb,
4488 				bdev_nvme_destroy_poll_group_cb,
4489 				sizeof(struct nvme_poll_group),  "nvme_poll_groups");
4490 
4491 	return 0;
4492 }
4493 
4494 static void
4495 bdev_nvme_fini_destruct_ctrlrs(void)
4496 {
4497 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
4498 	struct nvme_ctrlr *nvme_ctrlr;
4499 
4500 	pthread_mutex_lock(&g_bdev_nvme_mutex);
4501 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
4502 		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
4503 			pthread_mutex_lock(&nvme_ctrlr->mutex);
4504 			if (nvme_ctrlr->destruct) {
4505 				/* This controller's destruction was already started
4506 				 * before the application started shutting down
4507 				 */
4508 				pthread_mutex_unlock(&nvme_ctrlr->mutex);
4509 				continue;
4510 			}
4511 			nvme_ctrlr->destruct = true;
4512 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
4513 
4514 			spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
4515 					     nvme_ctrlr);
4516 		}
4517 	}
4518 
4519 	g_bdev_nvme_module_finish = true;
4520 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
4521 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
4522 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
4523 		spdk_bdev_module_fini_done();
4524 		return;
4525 	}
4526 
4527 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
4528 }
4529 
4530 static void
4531 check_discovery_fini(void *arg)
4532 {
4533 	if (TAILQ_EMPTY(&g_discovery_ctxs)) {
4534 		bdev_nvme_fini_destruct_ctrlrs();
4535 	}
4536 }
4537 
4538 static void
4539 bdev_nvme_library_fini(void)
4540 {
4541 	struct nvme_probe_skip_entry *entry, *entry_tmp;
4542 	struct discovery_ctx *ctx;
4543 
4544 	spdk_poller_unregister(&g_hotplug_poller);
4545 	free(g_hotplug_probe_ctx);
4546 	g_hotplug_probe_ctx = NULL;
4547 
4548 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
4549 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
4550 		free(entry);
4551 	}
4552 
4553 	assert(spdk_get_thread() == g_bdev_nvme_init_thread);
4554 	if (TAILQ_EMPTY(&g_discovery_ctxs)) {
4555 		bdev_nvme_fini_destruct_ctrlrs();
4556 	} else {
4557 		TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
4558 			ctx->detach = true;
4559 			ctx->stop_cb_fn = check_discovery_fini;
4560 		}
4561 	}
4562 }
4563 
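/* Re-run DIF/DIX verification in software after the controller reported a protection
 * information error, so that the failing block and error type can be logged.
 */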
4564 static void
4565 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
4566 {
4567 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4568 	struct spdk_bdev *bdev = bdev_io->bdev;
4569 	struct spdk_dif_ctx dif_ctx;
4570 	struct spdk_dif_error err_blk = {};
4571 	int rc;
4572 
4573 	rc = spdk_dif_ctx_init(&dif_ctx,
4574 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
4575 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
4576 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
4577 	if (rc != 0) {
4578 		SPDK_ERRLOG("Initialization of DIF context failed\n");
4579 		return;
4580 	}
4581 
4582 	if (bdev->md_interleave) {
4583 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
4584 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
4585 	} else {
4586 		struct iovec md_iov = {
4587 			.iov_base	= bdev_io->u.bdev.md_buf,
4588 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
4589 		};
4590 
4591 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
4592 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
4593 	}
4594 
4595 	if (rc != 0) {
4596 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
4597 			    err_blk.err_type, err_blk.err_offset);
4598 	} else {
4599 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
4600 	}
4601 }
4602 
4603 static void
4604 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
4605 {
4606 	struct nvme_bdev_io *bio = ref;
4607 
4608 	if (spdk_nvme_cpl_is_success(cpl)) {
4609 		/* Run PI verification for read data buffer. */
4610 		bdev_nvme_verify_pi_error(bio);
4611 	}
4612 
4613 	/* Return original completion status */
4614 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
4615 }
4616 
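/* Read completion. On a PI error the original completion status is saved and the read
 * is retried without PI checking so that bdev_nvme_verify_pi_error() can locate the
 * error; the saved status is then returned to the bdev layer.
 */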
4617 static void
4618 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
4619 {
4620 	struct nvme_bdev_io *bio = ref;
4621 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4622 	int ret;
4623 
4624 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
4625 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
4626 			    cpl->status.sct, cpl->status.sc);
4627 
4628 		/* Save completion status to use after verifying PI error. */
4629 		bio->cpl = *cpl;
4630 
4631 		if (spdk_likely(nvme_io_path_is_available(bio->io_path))) {
4632 			/* Read without PI checking to verify PI error. */
4633 			ret = bdev_nvme_no_pi_readv(bio,
4634 						    bdev_io->u.bdev.iovs,
4635 						    bdev_io->u.bdev.iovcnt,
4636 						    bdev_io->u.bdev.md_buf,
4637 						    bdev_io->u.bdev.num_blocks,
4638 						    bdev_io->u.bdev.offset_blocks);
4639 			if (ret == 0) {
4640 				return;
4641 			}
4642 		}
4643 	}
4644 
4645 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4646 }
4647 
4648 static void
4649 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
4650 {
4651 	struct nvme_bdev_io *bio = ref;
4652 
4653 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
4654 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
4655 			    cpl->status.sct, cpl->status.sc);
4656 		/* Run PI verification for write data buffer if PI error is detected. */
4657 		bdev_nvme_verify_pi_error(bio);
4658 	}
4659 
4660 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4661 }
4662 
4663 static void
4664 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
4665 {
4666 	struct nvme_bdev_io *bio = ref;
4667 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4668 
4669 	/* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
4670 	 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
4671 	 */
4672 	bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
4673 
4674 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
4675 		SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
4676 			    cpl->status.sct, cpl->status.sc);
4677 		/* Run PI verification for zone append data buffer if PI error is detected. */
4678 		bdev_nvme_verify_pi_error(bio);
4679 	}
4680 
4681 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4682 }
4683 
4684 static void
4685 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
4686 {
4687 	struct nvme_bdev_io *bio = ref;
4688 
4689 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
4690 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
4691 			    cpl->status.sct, cpl->status.sc);
4692 		/* Run PI verification for compare data buffer if PI error is detected. */
4693 		bdev_nvme_verify_pi_error(bio);
4694 	}
4695 
4696 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4697 }
4698 
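/* Completion callback shared by both halves of a fused compare-and-write. The compare
 * completion only records its status in bio->cpl; the write completion finishes the
 * bdev I/O, using the compare status if the compare failed.
 */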
4699 static void
4700 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
4701 {
4702 	struct nvme_bdev_io *bio = ref;
4703 
4704 	/* Compare operation completion */
4705 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
4706 		/* Save compare result for write callback */
4707 		bio->cpl = *cpl;
4708 		return;
4709 	}
4710 
4711 	/* Write operation completion */
4712 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
4713 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
4714 		 * complete the IO with the compare operation's status.
4715 		 */
4716 		if (!spdk_nvme_cpl_is_error(cpl)) {
4717 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
4718 		}
4719 
4720 		bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
4721 	} else {
4722 		bdev_nvme_io_complete_nvme_status(bio, cpl);
4723 	}
4724 }
4725 
4726 static void
4727 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
4728 {
4729 	struct nvme_bdev_io *bio = ref;
4730 
4731 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4732 }
4733 
4734 static int
4735 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
4736 {
4737 	switch (desc->zs) {
4738 	case SPDK_NVME_ZONE_STATE_EMPTY:
4739 		info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
4740 		break;
4741 	case SPDK_NVME_ZONE_STATE_IOPEN:
4742 		info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
4743 		break;
4744 	case SPDK_NVME_ZONE_STATE_EOPEN:
4745 		info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
4746 		break;
4747 	case SPDK_NVME_ZONE_STATE_CLOSED:
4748 		info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
4749 		break;
4750 	case SPDK_NVME_ZONE_STATE_RONLY:
4751 		info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
4752 		break;
4753 	case SPDK_NVME_ZONE_STATE_FULL:
4754 		info->state = SPDK_BDEV_ZONE_STATE_FULL;
4755 		break;
4756 	case SPDK_NVME_ZONE_STATE_OFFLINE:
4757 		info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
4758 		break;
4759 	default:
4760 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
4761 		return -EIO;
4762 	}
4763 
4764 	info->zone_id = desc->zslba;
4765 	info->write_pointer = desc->wp;
4766 	info->capacity = desc->zcap;
4767 
4768 	return 0;
4769 }
4770 
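/* Completion for a report zones command. Copies the returned zone descriptors into
 * the caller's buffer and, if more zones were requested than fit in one report,
 * issues the next report zones command starting at the following zone. The report
 * buffer is freed once the I/O is completed.
 */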
4771 static void
4772 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
4773 {
4774 	struct nvme_bdev_io *bio = ref;
4775 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4776 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
4777 	uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
4778 	struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
4779 	uint64_t max_zones_per_buf, i;
4780 	uint32_t zone_report_bufsize;
4781 	struct spdk_nvme_ns *ns;
4782 	struct spdk_nvme_qpair *qpair;
4783 	int ret;
4784 
4785 	if (spdk_nvme_cpl_is_error(cpl)) {
4786 		goto out_complete_io_nvme_cpl;
4787 	}
4788 
4789 	if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
4790 		ret = -ENXIO;
4791 		goto out_complete_io_ret;
4792 	}
4793 
4794 	ns = bio->io_path->nvme_ns->ns;
4795 	qpair = bio->io_path->ctrlr_ch->qpair;
4796 
4797 	zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
4798 	max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
4799 			    sizeof(bio->zone_report_buf->descs[0]);
4800 
4801 	if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
4802 		ret = -EINVAL;
4803 		goto out_complete_io_ret;
4804 	}
4805 
4806 	if (!bio->zone_report_buf->nr_zones) {
4807 		ret = -EINVAL;
4808 		goto out_complete_io_ret;
4809 	}
4810 
4811 	for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
4812 		ret = fill_zone_from_report(&info[bio->handled_zones],
4813 					    &bio->zone_report_buf->descs[i]);
4814 		if (ret) {
4815 			goto out_complete_io_ret;
4816 		}
4817 		bio->handled_zones++;
4818 	}
4819 
4820 	if (bio->handled_zones < zones_to_copy) {
4821 		uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
4822 		uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
4823 
4824 		memset(bio->zone_report_buf, 0, zone_report_bufsize);
4825 		ret = spdk_nvme_zns_report_zones(ns, qpair,
4826 						 bio->zone_report_buf, zone_report_bufsize,
4827 						 slba, SPDK_NVME_ZRA_LIST_ALL, true,
4828 						 bdev_nvme_get_zone_info_done, bio);
4829 		if (!ret) {
4830 			return;
4831 		} else {
4832 			goto out_complete_io_ret;
4833 		}
4834 	}
4835 
4836 out_complete_io_nvme_cpl:
4837 	free(bio->zone_report_buf);
4838 	bio->zone_report_buf = NULL;
4839 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4840 	return;
4841 
4842 out_complete_io_ret:
4843 	free(bio->zone_report_buf);
4844 	bio->zone_report_buf = NULL;
4845 	bdev_nvme_io_complete(bio, ret);
4846 }
4847 
4848 static void
4849 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
4850 {
4851 	struct nvme_bdev_io *bio = ref;
4852 
4853 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4854 }
4855 
4856 static void
4857 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx)
4858 {
4859 	struct nvme_bdev_io *bio = ctx;
4860 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4861 	const struct spdk_nvme_cpl *cpl = &bio->cpl;
4862 	struct nvme_bdev_channel *nbdev_ch;
4863 	struct nvme_ctrlr *nvme_ctrlr;
4864 	const struct spdk_nvme_ctrlr_data *cdata;
4865 	uint64_t delay_ms;
4866 
4867 	assert(bdev_nvme_io_type_is_admin(bdev_io->type));
4868 
4869 	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
4870 		goto complete;
4871 	}
4872 
4873 	if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 &&
4874 				     bio->retry_count >= g_opts.bdev_retry_count)) {
4875 		goto complete;
4876 	}
4877 
4878 	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
4879 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(bio->io_path->ctrlr_ch);
4880 
4881 	if (spdk_nvme_cpl_is_path_error(cpl) ||
4882 	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
4883 	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
4884 		delay_ms = 0;
4885 	} else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) {
4886 		goto complete;
4887 	} else {
4888 		bio->retry_count++;
4889 
4890 		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
4891 
4892 		if (cpl->status.crd != 0) {
4893 			delay_ms = cdata->crdt[cpl->status.crd] * 100;
4894 		} else {
4895 			delay_ms = 0;
4896 		}
4897 	}
4898 
4899 	if (any_ctrlr_may_become_available(nbdev_ch)) {
4900 		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
4901 		return;
4902 	}
4903 
4904 complete:
4905 	bio->retry_count = 0;
4906 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
4907 }
4908 
4909 static void
4910 bdev_nvme_abort_complete(void *ctx)
4911 {
4912 	struct nvme_bdev_io *bio = ctx;
4913 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4914 
4915 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
4916 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
4917 	} else {
4918 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
4919 	}
4920 }
4921 
4922 static void
4923 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
4924 {
4925 	struct nvme_bdev_io *bio = ref;
4926 
4927 	bio->cpl = *cpl;
4928 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio);
4929 }
4930 
4931 static void
4932 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
4933 {
4934 	struct nvme_bdev_io *bio = ref;
4935 
4936 	bio->cpl = *cpl;
4937 	spdk_thread_send_msg(bio->orig_thread,
4938 			     bdev_nvme_admin_passthru_complete_nvme_status, bio);
4939 }
4940 
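/* SGL callbacks used by the NVMe driver to walk the bdev I/O's iovec array while
 * building the command's scatter-gather list.
 */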
4941 static void
4942 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
4943 {
4944 	struct nvme_bdev_io *bio = ref;
4945 	struct iovec *iov;
4946 
4947 	bio->iov_offset = sgl_offset;
4948 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
4949 		iov = &bio->iovs[bio->iovpos];
4950 		if (bio->iov_offset < iov->iov_len) {
4951 			break;
4952 		}
4953 
4954 		bio->iov_offset -= iov->iov_len;
4955 	}
4956 }
4957 
4958 static int
4959 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
4960 {
4961 	struct nvme_bdev_io *bio = ref;
4962 	struct iovec *iov;
4963 
4964 	assert(bio->iovpos < bio->iovcnt);
4965 
4966 	iov = &bio->iovs[bio->iovpos];
4967 
4968 	*address = iov->iov_base;
4969 	*length = iov->iov_len;
4970 
4971 	if (bio->iov_offset) {
4972 		assert(bio->iov_offset <= iov->iov_len);
4973 		*address += bio->iov_offset;
4974 		*length -= bio->iov_offset;
4975 	}
4976 
4977 	bio->iov_offset += *length;
4978 	if (bio->iov_offset == iov->iov_len) {
4979 		bio->iovpos++;
4980 		bio->iov_offset = 0;
4981 	}
4982 
4983 	return 0;
4984 }
4985 
4986 static void
4987 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
4988 {
4989 	struct nvme_bdev_io *bio = ref;
4990 	struct iovec *iov;
4991 
4992 	bio->fused_iov_offset = sgl_offset;
4993 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
4994 		iov = &bio->fused_iovs[bio->fused_iovpos];
4995 		if (bio->fused_iov_offset < iov->iov_len) {
4996 			break;
4997 		}
4998 
4999 		bio->fused_iov_offset -= iov->iov_len;
5000 	}
5001 }
5002 
5003 static int
5004 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
5005 {
5006 	struct nvme_bdev_io *bio = ref;
5007 	struct iovec *iov;
5008 
5009 	assert(bio->fused_iovpos < bio->fused_iovcnt);
5010 
5011 	iov = &bio->fused_iovs[bio->fused_iovpos];
5012 
5013 	*address = iov->iov_base;
5014 	*length = iov->iov_len;
5015 
5016 	if (bio->fused_iov_offset) {
5017 		assert(bio->fused_iov_offset <= iov->iov_len);
5018 		*address += bio->fused_iov_offset;
5019 		*length -= bio->fused_iov_offset;
5020 	}
5021 
5022 	bio->fused_iov_offset += *length;
5023 	if (bio->fused_iov_offset == iov->iov_len) {
5024 		bio->fused_iovpos++;
5025 		bio->fused_iov_offset = 0;
5026 	}
5027 
5028 	return 0;
5029 }
5030 
5031 static int
5032 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
5033 		      void *md, uint64_t lba_count, uint64_t lba)
5034 {
5035 	int rc;
5036 
5037 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
5038 		      lba_count, lba);
5039 
5040 	bio->iovs = iov;
5041 	bio->iovcnt = iovcnt;
5042 	bio->iovpos = 0;
5043 	bio->iov_offset = 0;
5044 
5045 	rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns,
5046 					    bio->io_path->ctrlr_ch->qpair,
5047 					    lba, lba_count,
5048 					    bdev_nvme_no_pi_readv_done, bio, 0,
5049 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5050 					    md, 0, 0);
5051 
5052 	if (rc != 0 && rc != -ENOMEM) {
5053 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
5054 	}
5055 	return rc;
5056 }
5057 
5058 static int
5059 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
5060 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
5061 		struct spdk_bdev_ext_io_opts *ext_opts)
5062 {
5063 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5064 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5065 	int rc;
5066 
5067 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
5068 		      lba_count, lba);
5069 
5070 	bio->iovs = iov;
5071 	bio->iovcnt = iovcnt;
5072 	bio->iovpos = 0;
5073 	bio->iov_offset = 0;
5074 
5075 	if (ext_opts) {
5076 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
5077 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
5078 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
5079 		bio->ext_opts.io_flags = flags;
5080 		bio->ext_opts.metadata = md;
5081 
5082 		rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
5083 						bdev_nvme_readv_done, bio,
5084 						bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5085 						&bio->ext_opts);
5086 	} else if (iovcnt == 1) {
5087 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
5088 						   lba_count,
5089 						   bdev_nvme_readv_done, bio,
5090 						   flags,
5091 						   0, 0);
5092 	} else {
5093 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
5094 						    bdev_nvme_readv_done, bio, flags,
5095 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5096 						    md, 0, 0);
5097 	}
5098 
5099 	if (rc != 0 && rc != -ENOMEM) {
5100 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
5101 	}
5102 	return rc;
5103 }
5104 
5105 static int
5106 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
5107 		 void *md, uint64_t lba_count, uint64_t lba,
5108 		 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts)
5109 {
5110 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5111 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5112 	int rc;
5113 
5114 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
5115 		      lba_count, lba);
5116 
5117 	bio->iovs = iov;
5118 	bio->iovcnt = iovcnt;
5119 	bio->iovpos = 0;
5120 	bio->iov_offset = 0;
5121 
5122 	if (ext_opts) {
5123 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
5124 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
5125 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
5126 		bio->ext_opts.io_flags = flags;
5127 		bio->ext_opts.metadata = md;
5128 
5129 		rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
5130 						 bdev_nvme_writev_done, bio,
5131 						 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5132 						 &bio->ext_opts);
5133 	} else if (iovcnt == 1) {
5134 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
5135 						    lba_count,
5136 						    bdev_nvme_writev_done, bio,
5137 						    flags,
5138 						    0, 0);
5139 	} else {
5140 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
5141 						     bdev_nvme_writev_done, bio, flags,
5142 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5143 						     md, 0, 0);
5144 	}
5145 
5146 	if (rc != 0 && rc != -ENOMEM) {
5147 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
5148 	}
5149 	return rc;
5150 }
5151 
5152 static int
5153 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
5154 		       void *md, uint64_t lba_count, uint64_t zslba,
5155 		       uint32_t flags)
5156 {
5157 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5158 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5159 	int rc;
5160 
5161 	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
5162 		      lba_count, zslba);
5163 
5164 	bio->iovs = iov;
5165 	bio->iovcnt = iovcnt;
5166 	bio->iovpos = 0;
5167 	bio->iov_offset = 0;
5168 
5169 	if (iovcnt == 1) {
5170 		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
5171 						       lba_count,
5172 						       bdev_nvme_zone_appendv_done, bio,
5173 						       flags,
5174 						       0, 0);
5175 	} else {
5176 		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
5177 							bdev_nvme_zone_appendv_done, bio, flags,
5178 							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5179 							md, 0, 0);
5180 	}
5181 
5182 	if (rc != 0 && rc != -ENOMEM) {
5183 		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
5184 	}
5185 	return rc;
5186 }
5187 
5188 static int
5189 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
5190 		   void *md, uint64_t lba_count, uint64_t lba,
5191 		   uint32_t flags)
5192 {
5193 	int rc;
5194 
5195 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
5196 		      lba_count, lba);
5197 
5198 	bio->iovs = iov;
5199 	bio->iovcnt = iovcnt;
5200 	bio->iovpos = 0;
5201 	bio->iov_offset = 0;
5202 
5203 	rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns,
5204 					       bio->io_path->ctrlr_ch->qpair,
5205 					       lba, lba_count,
5206 					       bdev_nvme_comparev_done, bio, flags,
5207 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5208 					       md, 0, 0);
5209 
5210 	if (rc != 0 && rc != -ENOMEM) {
5211 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
5212 	}
5213 	return rc;
5214 }
5215 
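/* Submit a fused compare-and-write. The compare is sent with FUSE_FIRST only once
 * (tracked via first_fused_submitted so that retries resubmit just the write), and
 * the write follows with FUSE_SECOND.
 */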
5216 static int
5217 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
5218 			      struct iovec *write_iov, int write_iovcnt,
5219 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
5220 {
5221 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5222 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5223 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
5224 	int rc;
5225 
5226 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
5227 		      lba_count, lba);
5228 
5229 	bio->iovs = cmp_iov;
5230 	bio->iovcnt = cmp_iovcnt;
5231 	bio->iovpos = 0;
5232 	bio->iov_offset = 0;
5233 	bio->fused_iovs = write_iov;
5234 	bio->fused_iovcnt = write_iovcnt;
5235 	bio->fused_iovpos = 0;
5236 	bio->fused_iov_offset = 0;
5237 
5238 	if (bdev_io->num_retries == 0) {
5239 		bio->first_fused_submitted = false;
5240 	}
5241 
5242 	if (!bio->first_fused_submitted) {
5243 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
5244 		memset(&bio->cpl, 0, sizeof(bio->cpl));
5245 
5246 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
5247 						       bdev_nvme_comparev_and_writev_done, bio, flags,
5248 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
5249 		if (rc == 0) {
5250 			bio->first_fused_submitted = true;
5251 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
5252 		} else {
5253 			if (rc != -ENOMEM) {
5254 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
5255 			}
5256 			return rc;
5257 		}
5258 	}
5259 
5260 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
5261 
5262 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
5263 					     bdev_nvme_comparev_and_writev_done, bio, flags,
5264 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
5265 	if (rc != 0 && rc != -ENOMEM) {
5266 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
5267 		rc = 0;
5268 	}
5269 
5270 	return rc;
5271 }
5272 
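/* Translate an unmap request into a Dataset Management (deallocate) command. The
 * block range is split into ranges of at most
 * SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks each; requests needing more
 * than SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES ranges are rejected with -EINVAL.
 */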
5273 static int
5274 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
5275 {
5276 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
5277 	struct spdk_nvme_dsm_range *range;
5278 	uint64_t offset, remaining;
5279 	uint64_t num_ranges_u64;
5280 	uint16_t num_ranges;
5281 	int rc;
5282 
5283 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
5284 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
5285 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
5286 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
5287 		return -EINVAL;
5288 	}
5289 	num_ranges = (uint16_t)num_ranges_u64;
5290 
5291 	offset = offset_blocks;
5292 	remaining = num_blocks;
5293 	range = &dsm_ranges[0];
5294 
5295 	/* Fill max-size ranges until the remaining blocks fit into one range */
5296 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
5297 		range->attributes.raw = 0;
5298 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
5299 		range->starting_lba = offset;
5300 
5301 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
5302 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
5303 		range++;
5304 	}
5305 
5306 	/* Final range describes the remaining blocks */
5307 	range->attributes.raw = 0;
5308 	range->length = remaining;
5309 	range->starting_lba = offset;
5310 
5311 	rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
5312 			bio->io_path->ctrlr_ch->qpair,
5313 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
5314 			dsm_ranges, num_ranges,
5315 			bdev_nvme_queued_done, bio);
5316 
5317 	return rc;
5318 }
5319 
5320 static int
5321 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
5322 {
5323 	if (num_blocks > UINT16_MAX + 1) {
5324 		SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
5325 		return -EINVAL;
5326 	}
5327 
5328 	return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
5329 					     bio->io_path->ctrlr_ch->qpair,
5330 					     offset_blocks, num_blocks,
5331 					     bdev_nvme_queued_done, bio,
5332 					     0);
5333 }
5334 
5335 static int
5336 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
5337 			struct spdk_bdev_zone_info *info)
5338 {
5339 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5340 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5341 	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
5342 	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
5343 	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
5344 
5345 	if (zone_id % zone_size != 0) {
5346 		return -EINVAL;
5347 	}
5348 
5349 	if (num_zones > total_zones || !num_zones) {
5350 		return -EINVAL;
5351 	}
5352 
5353 	assert(!bio->zone_report_buf);
5354 	bio->zone_report_buf = calloc(1, zone_report_bufsize);
5355 	if (!bio->zone_report_buf) {
5356 		return -ENOMEM;
5357 	}
5358 
5359 	bio->handled_zones = 0;
5360 
5361 	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
5362 					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
5363 					  bdev_nvme_get_zone_info_done, bio);
5364 }
5365 
5366 static int
5367 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
5368 			  enum spdk_bdev_zone_action action)
5369 {
5370 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5371 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5372 
5373 	switch (action) {
5374 	case SPDK_BDEV_ZONE_CLOSE:
5375 		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
5376 						bdev_nvme_zone_management_done, bio);
5377 	case SPDK_BDEV_ZONE_FINISH:
5378 		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
5379 						 bdev_nvme_zone_management_done, bio);
5380 	case SPDK_BDEV_ZONE_OPEN:
5381 		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
5382 					       bdev_nvme_zone_management_done, bio);
5383 	case SPDK_BDEV_ZONE_RESET:
5384 		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
5385 						bdev_nvme_zone_management_done, bio);
5386 	case SPDK_BDEV_ZONE_OFFLINE:
5387 		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
5388 						  bdev_nvme_zone_management_done, bio);
5389 	default:
5390 		return -EINVAL;
5391 	}
5392 }
5393 
5394 static void
5395 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
5396 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
5397 {
5398 	struct nvme_io_path *io_path;
5399 	struct nvme_ctrlr *nvme_ctrlr;
5400 	uint32_t max_xfer_size;
5401 	int rc = -ENXIO;
5402 
5403 	/* Choose the first ctrlr which is not failed. */
5404 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
5405 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
5406 
5407 		/* We should skip any unavailable nvme_ctrlr rather than checking
5408 		 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
5409 		 */
5410 		if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
5411 			continue;
5412 		}
5413 
5414 		max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);
5415 
5416 		if (nbytes > max_xfer_size) {
5417 			SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
5418 			rc = -EINVAL;
5419 			goto err;
5420 		}
5421 
5422 		bio->io_path = io_path;
5423 		bio->orig_thread = spdk_get_thread();
5424 
5425 		rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
5426 						   bdev_nvme_admin_passthru_done, bio);
5427 		if (rc == 0) {
5428 			return;
5429 		}
5430 	}
5431 
5432 err:
5433 	bdev_nvme_admin_passthru_complete(bio, rc);
5434 }
5435 
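/*
 * Submit a raw NVMe I/O command (without metadata) on the bio's io_path.
 * The namespace ID is filled in on behalf of the caller, and transfers larger
 * than the controller's MDTS are rejected with -EINVAL.
 */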
5436 static int
5437 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
5438 		      void *buf, size_t nbytes)
5439 {
5440 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5441 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5442 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
5443 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
5444 
5445 	if (nbytes > max_xfer_size) {
5446 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
5447 		return -EINVAL;
5448 	}
5449 
5450 	/*
5451 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
5452 	 * so fill it out automatically.
5453 	 */
5454 	cmd->nsid = spdk_nvme_ns_get_id(ns);
5455 
5456 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
5457 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
5458 }
5459 
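/*
 * Submit a raw NVMe I/O command with a separate metadata buffer. Besides the
 * MDTS check, md_len must equal the per-sector metadata size multiplied by the
 * number of sectors implied by nbytes.
 */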
5460 static int
5461 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
5462 			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
5463 {
5464 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5465 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5466 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
5467 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
5468 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
5469 
5470 	if (nbytes > max_xfer_size) {
5471 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
5472 		return -EINVAL;
5473 	}
5474 
5475 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
5476 		SPDK_ERRLOG("invalid metadata buffer size\n");
5477 		return -EINVAL;
5478 	}
5479 
5480 	/*
5481 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
5482 	 * so fill it out automatically.
5483 	 */
5484 	cmd->nsid = spdk_nvme_ns_get_id(ns);
5485 
5486 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
5487 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
5488 }
5489 
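/*
 * Abort a previously submitted bdev I/O. If the target I/O is still sitting on
 * the channel's retry list, it is completed locally as ABORTED. Otherwise an
 * NVMe Abort command is issued over each io_path, first against the I/O qpair
 * and, if the command is not found there, against the admin queue.
 */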
5490 static void
5491 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
5492 		struct nvme_bdev_io *bio_to_abort)
5493 {
5494 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
5495 	struct spdk_bdev_io *bdev_io_to_abort;
5496 	struct nvme_io_path *io_path;
5497 	struct nvme_ctrlr *nvme_ctrlr;
5498 	int rc = 0;
5499 
5500 	bio->orig_thread = spdk_get_thread();
5501 
5502 	/* Traverse the retry_io_list first. */
5503 	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
5504 		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
5505 			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
5506 			spdk_bdev_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
5507 
5508 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
5509 			return;
5510 		}
5511 	}
5512 
5513 	/* Even admin commands are submitted only to nvme_ctrlrs that are reachable
5514 	 * through some io_path. Hence traverse the io_path list not only for I/O
5515 	 * commands but also for admin commands.
5516 	 */
5517 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
5518 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
5519 
5520 		rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
5521 						   io_path->ctrlr_ch->qpair,
5522 						   bio_to_abort,
5523 						   bdev_nvme_abort_done, bio);
5524 		if (rc == -ENOENT) {
5525 			/* If no matching command was found on the I/O qpair, the target
5526 			 * command may be an admin command.
5527 			 */
5528 			rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
5529 							   NULL,
5530 							   bio_to_abort,
5531 							   bdev_nvme_abort_done, bio);
5532 		}
5533 
5534 		if (rc != -ENOENT) {
5535 			break;
5536 		}
5537 	}
5538 
5539 	if (rc != 0) {
5540 		/* If no command was found or an error occurred, complete the abort
5541 		 * request with failure.
5542 		 */
5543 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
5544 	}
5545 }
5546 
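/*
 * Write the current global options as a "bdev_nvme_set_options" RPC object so
 * they can be replayed from a saved configuration, e.g. (values illustrative
 * only):
 *
 *   {
 *     "method": "bdev_nvme_set_options",
 *     "params": { "action_on_timeout": "none", "timeout_us": 0, ... }
 *   }
 */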
5547 static void
5548 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
5549 {
5550 	const char	*action;
5551 
5552 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
5553 		action = "reset";
5554 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
5555 		action = "abort";
5556 	} else {
5557 		action = "none";
5558 	}
5559 
5560 	spdk_json_write_object_begin(w);
5561 
5562 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
5563 
5564 	spdk_json_write_named_object_begin(w, "params");
5565 	spdk_json_write_named_string(w, "action_on_timeout", action);
5566 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
5567 	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
5568 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
5569 	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
5570 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
5571 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
5572 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
5573 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
5574 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
5575 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
5576 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
5577 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
5578 	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
5579 	spdk_json_write_object_end(w);
5580 
5581 	spdk_json_write_object_end(w);
5582 }
5583 
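/*
 * Write one "bdev_nvme_attach_controller" RPC object for the given controller:
 * its name, the transport ID of the active path, the PI check flags, and the
 * reconnect/timeout parameters.
 */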
5584 static void
5585 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
5586 		       struct nvme_ctrlr *nvme_ctrlr)
5587 {
5588 	struct spdk_nvme_transport_id	*trid;
5589 
5590 	trid = &nvme_ctrlr->active_path_id->trid;
5591 
5592 	spdk_json_write_object_begin(w);
5593 
5594 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
5595 
5596 	spdk_json_write_named_object_begin(w, "params");
5597 	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
5598 	nvme_bdev_dump_trid_json(trid, w);
5599 	spdk_json_write_named_bool(w, "prchk_reftag",
5600 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
5601 	spdk_json_write_named_bool(w, "prchk_guard",
5602 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
5603 	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->ctrlr_loss_timeout_sec);
5604 	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->reconnect_delay_sec);
5605 	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", nvme_ctrlr->fast_io_fail_timeout_sec);
5606 
5607 	spdk_json_write_object_end(w);
5608 
5609 	spdk_json_write_object_end(w);
5610 }
5611 
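/*
 * Write the hotplug poller settings as a "bdev_nvme_set_hotplug" RPC object,
 * e.g. { "method": "bdev_nvme_set_hotplug",
 *        "params": { "period_us": 100000, "enable": true } }
 * (values illustrative only).
 */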
5612 static void
5613 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
5614 {
5615 	spdk_json_write_object_begin(w);
5616 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
5617 
5618 	spdk_json_write_named_object_begin(w, "params");
5619 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
5620 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
5621 	spdk_json_write_object_end(w);
5622 
5623 	spdk_json_write_object_end(w);
5624 }
5625 
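/*
 * Module hook for saving the runtime configuration: dump the global options
 * first, then one attach call per controller, and the hotplug settings last.
 */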
5626 static int
5627 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
5628 {
5629 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
5630 	struct nvme_ctrlr	*nvme_ctrlr;
5631 
5632 	bdev_nvme_opts_config_json(w);
5633 
5634 	pthread_mutex_lock(&g_bdev_nvme_mutex);
5635 
5636 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
5637 		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
5638 			nvme_ctrlr_config_json(w, nvme_ctrlr);
5639 		}
5640 	}
5641 
5642 	/* Dump this last to give all NVMe bdevs a chance to be constructed
5643 	 * before the hotplug poller is enabled.
5644 	 */
5645 	bdev_nvme_hotplug_config_json(w);
5646 
5647 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
5648 	return 0;
5649 }
5650 
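/*
 * Return the spdk_nvme_ctrlr backing the given bdev, or NULL if the bdev does
 * not belong to this module. For a bdev that aggregates multiple namespaces,
 * the controller of the first namespace in the list is returned.
 */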
5651 struct spdk_nvme_ctrlr *
5652 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
5653 {
5654 	struct nvme_bdev *nbdev;
5655 	struct nvme_ns *nvme_ns;
5656 
5657 	if (!bdev || bdev->module != &nvme_if) {
5658 		return NULL;
5659 	}
5660 
5661 	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
5662 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
5663 	assert(nvme_ns != NULL);
5664 
5665 	return nvme_ns->ctrlr->ctrlr;
5666 }
5667 
5668 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
5669