xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 1a00f5c09488e7466a331b8c75cde4969740357f)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *   Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
7  *
8  *   Redistribution and use in source and binary forms, with or without
9  *   modification, are permitted provided that the following conditions
10  *   are met:
11  *
12  *     * Redistributions of source code must retain the above copyright
13  *       notice, this list of conditions and the following disclaimer.
14  *     * Redistributions in binary form must reproduce the above copyright
15  *       notice, this list of conditions and the following disclaimer in
16  *       the documentation and/or other materials provided with the
17  *       distribution.
18  *     * Neither the name of Intel Corporation nor the names of its
19  *       contributors may be used to endorse or promote products derived
20  *       from this software without specific prior written permission.
21  *
22  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 #include "spdk/stdinc.h"
36 
37 #include "bdev_nvme.h"
38 
39 #include "spdk/accel_engine.h"
40 #include "spdk/config.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/likely.h"
45 #include "spdk/nvme.h"
46 #include "spdk/nvme_ocssd.h"
47 #include "spdk/nvme_zns.h"
48 #include "spdk/opal.h"
49 #include "spdk/thread.h"
50 #include "spdk/string.h"
51 #include "spdk/util.h"
52 
53 #include "spdk/bdev_module.h"
54 #include "spdk/log.h"
55 
56 #include "spdk_internal/usdt.h"
57 
58 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
59 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
60 
61 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
62 
63 struct nvme_bdev_io {
64 	/** array of iovecs to transfer. */
65 	struct iovec *iovs;
66 
67 	/** Number of iovecs in iovs array. */
68 	int iovcnt;
69 
70 	/** Current iovec position. */
71 	int iovpos;
72 
73 	/** Offset in current iovec. */
74 	uint32_t iov_offset;
75 
76 	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
77 	 *  being reset in a reset I/O.
78 	 */
79 	struct nvme_io_path *io_path;
80 
81 	/** array of iovecs used by fused compare-and-write operations. */
82 	struct iovec *fused_iovs;
83 
84 	/** Number of iovecs in the fused_iovs array. */
85 	int fused_iovcnt;
86 
87 	/** Current position in the fused_iovs array. */
88 	int fused_iovpos;
89 
90 	/** Offset in the current fused iovec. */
91 	uint32_t fused_iov_offset;
92 
93 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
94 	struct spdk_nvme_cpl cpl;
95 
96 	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
97 	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
98 
99 	/** Originating thread */
100 	struct spdk_thread *orig_thread;
101 
102 	/** Tracks whether the first of the fused commands has been submitted. */
103 	bool first_fused_submitted;
104 
105 	/** Temporary pointer to zone report buffer */
106 	struct spdk_nvme_zns_zone_report *zone_report_buf;
107 
108 	/** Keeps track of how many zones have been copied to the spdk_bdev_zone_info structs. */
109 	uint64_t handled_zones;
110 
111 	/** Expiration value in ticks to retry the current I/O. */
112 	uint64_t retry_ticks;
113 
114 	/* How many times the current I/O was retried. */
115 	int32_t retry_count;
116 };
117 
118 struct nvme_probe_skip_entry {
119 	struct spdk_nvme_transport_id		trid;
120 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
121 };
122 /* All the controllers deleted by users via RPC are skipped by the hotplug monitor. */
123 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
124 			g_skipped_nvme_ctrlrs);
125 
126 static struct spdk_bdev_nvme_opts g_opts = {
127 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
128 	.timeout_us = 0,
129 	.timeout_admin_us = 0,
130 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
131 	.transport_retry_count = 4,
132 	.arbitration_burst = 0,
133 	.low_priority_weight = 0,
134 	.medium_priority_weight = 0,
135 	.high_priority_weight = 0,
136 	.nvme_adminq_poll_period_us = 10000ULL,
137 	.nvme_ioq_poll_period_us = 0,
138 	.io_queue_requests = 0,
139 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
140 	.bdev_retry_count = 3,
141 	.transport_ack_timeout = 0,
142 };
143 
144 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
145 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
146 
147 static int g_hot_insert_nvme_controller_index = 0;
148 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
149 static bool g_nvme_hotplug_enabled = false;
150 static struct spdk_thread *g_bdev_nvme_init_thread;
151 static struct spdk_poller *g_hotplug_poller;
152 static struct spdk_poller *g_hotplug_probe_poller;
153 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
154 
155 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
156 		struct nvme_async_probe_ctx *ctx);
157 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
158 		struct nvme_async_probe_ctx *ctx);
159 static int bdev_nvme_library_init(void);
160 static void bdev_nvme_library_fini(void);
161 static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
162 				     struct spdk_bdev_io *bdev_io);
163 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
164 			   void *md, uint64_t lba_count, uint64_t lba,
165 			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
166 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
167 				 void *md, uint64_t lba_count, uint64_t lba);
168 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
169 			    void *md, uint64_t lba_count, uint64_t lba,
170 			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
171 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
172 				  void *md, uint64_t lba_count,
173 				  uint64_t zslba, uint32_t flags);
174 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
175 			      void *md, uint64_t lba_count, uint64_t lba,
176 			      uint32_t flags);
177 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
178 		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
179 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
180 		uint32_t flags);
181 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
182 				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
183 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
184 				     enum spdk_bdev_zone_action action);
185 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
186 				     struct nvme_bdev_io *bio,
187 				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
188 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
189 				 void *buf, size_t nbytes);
190 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
191 				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
192 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
193 			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
194 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
195 static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr);
196 static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
197 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
198 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);
199 
200 static int
201 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
202 {
203 	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
204 }
205 
206 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);
207 
208 struct spdk_nvme_qpair *
209 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
210 {
211 	struct nvme_ctrlr_channel *ctrlr_ch;
212 
213 	assert(ctrlr_io_ch != NULL);
214 
215 	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
216 
217 	return ctrlr_ch->qpair;
218 }
219 
220 static int
221 bdev_nvme_get_ctx_size(void)
222 {
223 	return sizeof(struct nvme_bdev_io);
224 }
225 
226 static struct spdk_bdev_module nvme_if = {
227 	.name = "nvme",
228 	.async_fini = true,
229 	.module_init = bdev_nvme_library_init,
230 	.module_fini = bdev_nvme_library_fini,
231 	.config_json = bdev_nvme_config_json,
232 	.get_ctx_size = bdev_nvme_get_ctx_size,
233 
234 };
235 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
236 
237 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
238 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
239 bool g_bdev_nvme_module_finish;
240 
241 struct nvme_bdev_ctrlr *
242 nvme_bdev_ctrlr_get_by_name(const char *name)
243 {
244 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
245 
246 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
247 		if (strcmp(name, nbdev_ctrlr->name) == 0) {
248 			break;
249 		}
250 	}
251 
252 	return nbdev_ctrlr;
253 }
254 
255 static struct nvme_ctrlr *
256 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
257 			  const struct spdk_nvme_transport_id *trid)
258 {
259 	struct nvme_ctrlr *nvme_ctrlr;
260 
261 	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
262 		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
263 			break;
264 		}
265 	}
266 
267 	return nvme_ctrlr;
268 }
269 
270 static struct nvme_bdev *
271 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
272 {
273 	struct nvme_bdev *bdev;
274 
275 	pthread_mutex_lock(&g_bdev_nvme_mutex);
276 	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
277 		if (bdev->nsid == nsid) {
278 			break;
279 		}
280 	}
281 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
282 
283 	return bdev;
284 }
285 
286 struct nvme_ns *
287 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
288 {
289 	struct nvme_ns ns;
290 
291 	assert(nsid > 0);
292 
293 	ns.id = nsid;
294 	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
295 }
296 
297 struct nvme_ns *
298 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
299 {
300 	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
301 }
302 
303 struct nvme_ns *
304 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
305 {
306 	if (ns == NULL) {
307 		return NULL;
308 	}
309 
310 	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
311 }
312 
313 static struct nvme_ctrlr *
314 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
315 {
316 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
317 	struct nvme_ctrlr	*nvme_ctrlr = NULL;
318 
319 	pthread_mutex_lock(&g_bdev_nvme_mutex);
320 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
321 		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
322 		if (nvme_ctrlr != NULL) {
323 			break;
324 		}
325 	}
326 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
327 
328 	return nvme_ctrlr;
329 }
330 
331 struct nvme_ctrlr *
332 nvme_ctrlr_get_by_name(const char *name)
333 {
334 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
335 	struct nvme_ctrlr *nvme_ctrlr = NULL;
336 
337 	if (name == NULL) {
338 		return NULL;
339 	}
340 
341 	pthread_mutex_lock(&g_bdev_nvme_mutex);
342 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
343 	if (nbdev_ctrlr != NULL) {
344 		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
345 	}
346 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
347 
348 	return nvme_ctrlr;
349 }
350 
351 void
352 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
353 {
354 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
355 
356 	pthread_mutex_lock(&g_bdev_nvme_mutex);
357 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
358 		fn(nbdev_ctrlr, ctx);
359 	}
360 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
361 }
362 
363 void
364 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
365 {
366 	const char *trtype_str;
367 	const char *adrfam_str;
368 
369 	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
370 	if (trtype_str) {
371 		spdk_json_write_named_string(w, "trtype", trtype_str);
372 	}
373 
374 	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
375 	if (adrfam_str) {
376 		spdk_json_write_named_string(w, "adrfam", adrfam_str);
377 	}
378 
379 	if (trid->traddr[0] != '\0') {
380 		spdk_json_write_named_string(w, "traddr", trid->traddr);
381 	}
382 
383 	if (trid->trsvcid[0] != '\0') {
384 		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
385 	}
386 
387 	if (trid->subnqn[0] != '\0') {
388 		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
389 	}
390 }
391 
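/* Detach an nvme_ctrlr from its parent nvme_bdev_ctrlr. When the last ctrlr is
 * removed, the nvme_bdev_ctrlr itself is unlinked from the global list and freed.
 */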
392 static void
393 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
394 		       struct nvme_ctrlr *nvme_ctrlr)
395 {
396 	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
397 	pthread_mutex_lock(&g_bdev_nvme_mutex);
398 
399 	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
400 	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
401 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
402 
403 		return;
404 	}
405 	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
406 
407 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
408 
409 	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));
410 
411 	free(nbdev_ctrlr->name);
412 	free(nbdev_ctrlr);
413 }
414 
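/* Free all resources owned by an nvme_ctrlr: the copied ANA descriptor and log page,
 * the opal device, every namespace node, and every queued path ID. If this was the
 * last controller and module finish was requested, complete the module shutdown.
 */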
415 static void
416 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
417 {
418 	struct nvme_path_id *path_id, *tmp_path;
419 	struct nvme_ns *ns, *tmp_ns;
420 
421 	free(nvme_ctrlr->copied_ana_desc);
422 	spdk_free(nvme_ctrlr->ana_log_page);
423 
424 	if (nvme_ctrlr->opal_dev) {
425 		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
426 		nvme_ctrlr->opal_dev = NULL;
427 	}
428 
429 	if (nvme_ctrlr->nbdev_ctrlr) {
430 		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
431 	}
432 
433 	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
434 		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
435 		free(ns);
436 	}
437 
438 	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
439 		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
440 		free(path_id);
441 	}
442 
443 	pthread_mutex_destroy(&nvme_ctrlr->mutex);
444 
445 	free(nvme_ctrlr);
446 
447 	pthread_mutex_lock(&g_bdev_nvme_mutex);
448 	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
449 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
450 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
451 		spdk_bdev_module_fini_done();
452 		return;
453 	}
454 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
455 }
456 
457 static int
458 nvme_detach_poller(void *arg)
459 {
460 	struct nvme_ctrlr *nvme_ctrlr = arg;
461 	int rc;
462 
463 	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
464 	if (rc != -EAGAIN) {
465 		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
466 		_nvme_ctrlr_delete(nvme_ctrlr);
467 	}
468 
469 	return SPDK_POLLER_BUSY;
470 }
471 
472 static void
473 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
474 {
475 	int rc;
476 
477 	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
478 
479 	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
480 	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
481 
482 	/* If we got here, the reset/detach poller cannot be active */
483 	assert(nvme_ctrlr->reset_detach_poller == NULL);
484 	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
485 					  nvme_ctrlr, 1000);
486 	if (nvme_ctrlr->reset_detach_poller == NULL) {
487 		SPDK_ERRLOG("Failed to register detach poller\n");
488 		goto error;
489 	}
490 
491 	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
492 	if (rc != 0) {
493 		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
494 		goto error;
495 	}
496 
497 	return;
498 error:
499 	/* We don't have a good way to handle errors here, so just do what we can and delete the
500 	 * controller without detaching the underlying NVMe device.
501 	 */
502 	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
503 	_nvme_ctrlr_delete(nvme_ctrlr);
504 }
505 
506 static void
507 nvme_ctrlr_unregister_cb(void *io_device)
508 {
509 	struct nvme_ctrlr *nvme_ctrlr = io_device;
510 
511 	nvme_ctrlr_delete(nvme_ctrlr);
512 }
513 
514 static void
515 nvme_ctrlr_unregister(struct nvme_ctrlr *nvme_ctrlr)
516 {
517 	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
518 }
519 
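/* An nvme_ctrlr may be unregistered only when destruction has been requested, no
 * references remain, and neither a reset nor an ANA log page update is in progress.
 */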
520 static bool
521 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
522 {
523 	if (!nvme_ctrlr->destruct) {
524 		return false;
525 	}
526 
527 	if (nvme_ctrlr->ref > 0) {
528 		return false;
529 	}
530 
531 	if (nvme_ctrlr->resetting) {
532 		return false;
533 	}
534 
535 	if (nvme_ctrlr->ana_log_page_updating) {
536 		return false;
537 	}
538 
539 	return true;
540 }
541 
542 static void
543 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
544 {
545 	pthread_mutex_lock(&nvme_ctrlr->mutex);
546 	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);
547 
548 	assert(nvme_ctrlr->ref > 0);
549 	nvme_ctrlr->ref--;
550 
551 	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
552 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
553 		return;
554 	}
555 
556 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
557 
558 	nvme_ctrlr_unregister(nvme_ctrlr);
559 }
560 
561 static struct nvme_io_path *
562 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
563 {
564 	struct nvme_io_path *io_path;
565 
566 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
567 		if (io_path->nvme_ns == nvme_ns) {
568 			break;
569 		}
570 	}
571 
572 	return io_path;
573 }
574 
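/* Create an io_path that links the bdev channel to the given namespace and its
 * controller channel, and invalidate the channel's cached I/O path.
 */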
575 static int
576 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
577 {
578 	struct nvme_io_path *io_path;
579 	struct spdk_io_channel *ch;
580 
581 	io_path = calloc(1, sizeof(*io_path));
582 	if (io_path == NULL) {
583 		SPDK_ERRLOG("Failed to alloc io_path.\n");
584 		return -ENOMEM;
585 	}
586 
587 	ch = spdk_get_io_channel(nvme_ns->ctrlr);
588 	if (ch == NULL) {
589 		free(io_path);
590 		SPDK_ERRLOG("Failed to alloc io_channel.\n");
591 		return -ENOMEM;
592 	}
593 
594 	io_path->ctrlr_ch = spdk_io_channel_get_ctx(ch);
595 	TAILQ_INSERT_TAIL(&io_path->ctrlr_ch->io_path_list, io_path, tailq);
596 
597 	io_path->nvme_ns = nvme_ns;
598 
599 	io_path->nbdev_ch = nbdev_ch;
600 	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);
601 
602 	nbdev_ch->current_io_path = NULL;
603 
604 	return 0;
605 }
606 
607 static void
608 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
609 {
610 	struct spdk_io_channel *ch;
611 
612 	nbdev_ch->current_io_path = NULL;
613 
614 	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
615 
616 	TAILQ_REMOVE(&io_path->ctrlr_ch->io_path_list, io_path, tailq);
617 	ch = spdk_io_channel_from_ctx(io_path->ctrlr_ch);
618 	spdk_put_io_channel(ch);
619 
620 	free(io_path);
621 }
622 
623 static void
624 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
625 {
626 	struct nvme_io_path *io_path, *tmp_io_path;
627 
628 	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
629 		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
630 	}
631 }
632 
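/* I/O channel create callback for an nvme_bdev: build one io_path for every
 * namespace that backs the bdev, tearing everything down again on failure.
 */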
633 static int
634 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
635 {
636 	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
637 	struct nvme_bdev *nbdev = io_device;
638 	struct nvme_ns *nvme_ns;
639 	int rc;
640 
641 	STAILQ_INIT(&nbdev_ch->io_path_list);
642 	TAILQ_INIT(&nbdev_ch->retry_io_list);
643 
644 	pthread_mutex_lock(&nbdev->mutex);
645 	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
646 		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
647 		if (rc != 0) {
648 			pthread_mutex_unlock(&nbdev->mutex);
649 
650 			_bdev_nvme_delete_io_paths(nbdev_ch);
651 			return rc;
652 		}
653 	}
654 	pthread_mutex_unlock(&nbdev->mutex);
655 
656 	return 0;
657 }
658 
659 static void
660 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
661 {
662 	struct spdk_bdev_io *bdev_io, *tmp_io;
663 
664 	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
665 		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
666 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
667 	}
668 
669 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
670 }
671 
672 static void
673 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
674 {
675 	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
676 
677 	bdev_nvme_abort_retry_ios(nbdev_ch);
678 	_bdev_nvme_delete_io_paths(nbdev_ch);
679 }
680 
681 static inline bool
682 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
683 {
684 	switch (io_type) {
685 	case SPDK_BDEV_IO_TYPE_RESET:
686 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
687 	case SPDK_BDEV_IO_TYPE_ABORT:
688 		return true;
689 	default:
690 		break;
691 	}
692 
693 	return false;
694 }
695 
696 static inline bool
697 nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
698 {
699 	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
700 		return false;
701 	}
702 
703 	switch (nvme_ns->ana_state) {
704 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
705 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
706 		return true;
707 	default:
708 		break;
709 	}
710 
711 	return false;
712 }
713 
714 static inline bool
715 nvme_io_path_is_connected(struct nvme_io_path *io_path)
716 {
717 	return io_path->ctrlr_ch->qpair != NULL;
718 }
719 
720 static inline bool
721 nvme_io_path_is_available(struct nvme_io_path *io_path)
722 {
723 	if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
724 		return false;
725 	}
726 
727 	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
728 		return false;
729 	}
730 
731 	return true;
732 }
733 
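/* Treat an io_path as failed if its ctrlr is being destructed, has hit the
 * fast-I/O-fail timeout, is resetting with no reconnect delay configured, or is
 * reported failed by the driver while no delayed reconnect is pending.
 */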
734 static inline bool
735 nvme_io_path_is_failed(struct nvme_io_path *io_path)
736 {
737 	struct nvme_ctrlr *nvme_ctrlr;
738 
739 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
740 
741 	if (nvme_ctrlr->destruct) {
742 		return true;
743 	}
744 
745 	if (nvme_ctrlr->fast_io_fail_timedout) {
746 		return true;
747 	}
748 
749 	if (nvme_ctrlr->resetting) {
750 		if (nvme_ctrlr->reconnect_delay_sec != 0) {
751 			return false;
752 		} else {
753 			return true;
754 		}
755 	}
756 
757 	if (nvme_ctrlr->reconnect_is_delayed) {
758 		return false;
759 	}
760 
761 	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
762 		return true;
763 	} else {
764 		return false;
765 	}
766 }
767 
768 static bool
769 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
770 {
771 	if (nvme_ctrlr->destruct) {
772 		return false;
773 	}
774 
775 	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
776 		return false;
777 	}
778 
779 	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
780 		return false;
781 	}
782 
783 	return true;
784 }
785 
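/* Select the io_path to submit I/O on: prefer an ANA optimized path, fall back to a
 * non-optimized one, and cache the result on the channel for subsequent I/Os.
 */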
786 static inline struct nvme_io_path *
787 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
788 {
789 	struct nvme_io_path *io_path, *non_optimized = NULL;
790 
791 	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
792 		return nbdev_ch->current_io_path;
793 	}
794 
795 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
796 		if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
797 			/* The device is currently resetting. */
798 			continue;
799 		}
800 
801 		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
802 			continue;
803 		}
804 
805 		switch (io_path->nvme_ns->ana_state) {
806 		case SPDK_NVME_ANA_OPTIMIZED_STATE:
807 			nbdev_ch->current_io_path = io_path;
808 			return io_path;
809 		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
810 			if (non_optimized == NULL) {
811 				non_optimized = io_path;
812 			}
813 			break;
814 		default:
815 			break;
816 		}
817 	}
818 
819 	return non_optimized;
820 }
821 
822 /* Return true if there is any io_path whose qpair is active or whose ctrlr is not
823  * failed, and false otherwise.
824  *
825  * If an io_path has an active qpair but find_io_path() returned NULL, its namespace
826  * is probably inaccessible right now but may become accessible again.
827  *
828  * If an io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr is
829  * probably resetting and the reset may succeed. A ctrlr is marked unfailed when a
830  * reset starts and marked failed only if that reset fails. Hence, an unfailed ctrlr
831  * is likely either working fine or in the middle of a reset.
832  */
833 static bool
834 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
835 {
836 	struct nvme_io_path *io_path;
837 
838 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
839 		if (nvme_io_path_is_connected(io_path) ||
840 		    !nvme_io_path_is_failed(io_path)) {
841 			return true;
842 		}
843 	}
844 
845 	return false;
846 }
847 
848 static bool
849 any_ctrlr_may_become_available(struct nvme_bdev_channel *nbdev_ch)
850 {
851 	struct nvme_io_path *io_path;
852 
853 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
854 		if (!nvme_io_path_is_failed(io_path)) {
855 			return true;
856 		}
857 	}
858 
859 	return false;
860 }
861 
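/* Poller that resubmits queued I/Os whose retry deadline has expired and then
 * re-arms itself for the next pending retry, if any remain.
 */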
862 static int
863 bdev_nvme_retry_ios(void *arg)
864 {
865 	struct nvme_bdev_channel *nbdev_ch = arg;
866 	struct spdk_io_channel *ch = spdk_io_channel_from_ctx(nbdev_ch);
867 	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
868 	struct nvme_bdev_io *bio;
869 	uint64_t now, delay_us;
870 
871 	now = spdk_get_ticks();
872 
873 	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
874 		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
875 		if (bio->retry_ticks > now) {
876 			break;
877 		}
878 
879 		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
880 
881 		bdev_nvme_submit_request(ch, bdev_io);
882 	}
883 
884 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
885 
886 	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
887 	if (bdev_io != NULL) {
888 		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
889 
890 		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
891 
892 		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
893 					    delay_us);
894 	}
895 
896 	return SPDK_POLLER_BUSY;
897 }
898 
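/* Queue an I/O for retry after delay_ms, keeping the retry list sorted by
 * expiration tick and restarting the retry poller when the new I/O becomes the
 * earliest entry.
 */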
899 static void
900 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
901 			 struct nvme_bdev_io *bio, uint64_t delay_ms)
902 {
903 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
904 	struct spdk_bdev_io *tmp_bdev_io;
905 	struct nvme_bdev_io *tmp_bio;
906 
907 	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;
908 
909 	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
910 		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;
911 
912 		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
913 			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
914 					   module_link);
915 			return;
916 		}
917 	}
918 
919 	/* No earlier I/Os were found. This I/O must be the new head. */
920 	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);
921 
922 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
923 
924 	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
925 				    delay_ms * 1000ULL);
926 }
927 
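/* Complete an I/O according to its NVMe completion status. Retriable failures are
 * requeued instead: path and ANA errors are retried immediately on another path,
 * while other transient errors honor the controller's Command Retry Delay values,
 * as long as the bdev retry budget is not exhausted and some path may recover.
 */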
928 static inline void
929 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
930 				  const struct spdk_nvme_cpl *cpl)
931 {
932 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
933 	struct nvme_bdev_channel *nbdev_ch;
934 	struct nvme_ctrlr *nvme_ctrlr;
935 	const struct spdk_nvme_ctrlr_data *cdata;
936 	uint64_t delay_ms;
937 
938 	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
939 
940 	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
941 		goto complete;
942 	}
943 
944 	if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 &&
945 				     bio->retry_count >= g_opts.bdev_retry_count)) {
946 		goto complete;
947 	}
948 
949 	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
950 
951 	assert(bio->io_path != NULL);
952 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(bio->io_path->ctrlr_ch);
953 
954 	if (spdk_nvme_cpl_is_path_error(cpl) ||
955 	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
956 	    !nvme_io_path_is_available(bio->io_path) ||
957 	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
958 		nbdev_ch->current_io_path = NULL;
959 		if (spdk_nvme_cpl_is_ana_error(cpl)) {
960 			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
961 				bio->io_path->nvme_ns->ana_state_updating = true;
962 			}
963 		}
964 		delay_ms = 0;
965 	} else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) {
966 		goto complete;
967 	} else {
968 		bio->retry_count++;
969 
970 		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
971 
972 		if (cpl->status.crd != 0) {
973 			delay_ms = cdata->crdt[cpl->status.crd] * 100;
974 		} else {
975 			delay_ms = 0;
976 		}
977 	}
978 
979 	if (any_io_path_may_become_available(nbdev_ch)) {
980 		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
981 		return;
982 	}
983 
984 complete:
985 	bio->retry_count = 0;
986 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
987 }
988 
989 static inline void
990 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
991 {
992 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
993 	struct nvme_bdev_channel *nbdev_ch;
994 	enum spdk_bdev_io_status io_status;
995 
996 	switch (rc) {
997 	case 0:
998 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
999 		break;
1000 	case -ENOMEM:
1001 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1002 		break;
1003 	case -ENXIO:
1004 		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1005 
1006 		nbdev_ch->current_io_path = NULL;
1007 
1008 		if (any_io_path_may_become_available(nbdev_ch)) {
1009 			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
1010 			return;
1011 		}
1012 
1013 	/* fallthrough */
1014 	default:
1015 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
1016 		break;
1017 	}
1018 
1019 	bio->retry_count = 0;
1020 	spdk_bdev_io_complete(bdev_io, io_status);
1021 }
1022 
1023 static inline void
1024 bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc)
1025 {
1026 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1027 	struct nvme_bdev_channel *nbdev_ch;
1028 	enum spdk_bdev_io_status io_status;
1029 
1030 	switch (rc) {
1031 	case 0:
1032 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1033 		break;
1034 	case -ENOMEM:
1035 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1036 		break;
1037 	case -ENXIO:
1038 		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1039 
1040 		if (any_ctrlr_may_become_available(nbdev_ch)) {
1041 			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
1042 			return;
1043 		}
1044 
1045 	/* fallthrough */
1046 	default:
1047 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
1048 		break;
1049 	}
1050 
1051 	bio->retry_count = 0;
1052 	spdk_bdev_io_complete(bdev_io, io_status);
1053 }
1054 
1055 static void
1056 _bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel *ctrlr_ch)
1057 {
1058 	struct nvme_io_path *io_path;
1059 
1060 	TAILQ_FOREACH(io_path, &ctrlr_ch->io_path_list, tailq) {
1061 		io_path->nbdev_ch->current_io_path = NULL;
1062 	}
1063 }
1064 
1065 static struct nvme_ctrlr_channel *
1066 nvme_poll_group_get_ctrlr_channel(struct nvme_poll_group *group,
1067 				  struct spdk_nvme_qpair *qpair)
1068 {
1069 	struct nvme_ctrlr_channel *ctrlr_ch;
1070 
1071 	TAILQ_FOREACH(ctrlr_ch, &group->ctrlr_ch_list, tailq) {
1072 		if (ctrlr_ch->qpair == qpair) {
1073 			break;
1074 		}
1075 	}
1076 
1077 	return ctrlr_ch;
1078 }
1079 
1080 static void
1081 bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
1082 {
1083 	struct nvme_ctrlr *nvme_ctrlr __attribute__((unused));
1084 
1085 	if (ctrlr_ch->qpair != NULL) {
1086 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1087 		SPDK_DTRACE_PROBE2(bdev_nvme_destroy_qpair, nvme_ctrlr->nbdev_ctrlr->name,
1088 				   spdk_nvme_qpair_get_id(ctrlr_ch->qpair));
1089 		spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
1090 		ctrlr_ch->qpair = NULL;
1091 	}
1092 
1093 	_bdev_nvme_clear_io_path_cache(ctrlr_ch);
1094 }
1095 
1096 static void
1097 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
1098 {
1099 	struct nvme_poll_group *group = poll_group_ctx;
1100 	struct nvme_ctrlr_channel *ctrlr_ch;
1101 	struct nvme_ctrlr *nvme_ctrlr;
1102 
1103 	SPDK_NOTICELOG("qpair %p is disconnected, free the qpair and reset controller.\n", qpair);
1104 	/*
1105 	 * Free the I/O qpair and reset the nvme_ctrlr.
1106 	 */
1107 	ctrlr_ch = nvme_poll_group_get_ctrlr_channel(group, qpair);
1108 	if (ctrlr_ch != NULL) {
1109 		bdev_nvme_destroy_qpair(ctrlr_ch);
1110 
1111 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1112 		bdev_nvme_reset(nvme_ctrlr);
1113 	}
1114 }
1115 
1116 static int
1117 bdev_nvme_poll(void *arg)
1118 {
1119 	struct nvme_poll_group *group = arg;
1120 	int64_t num_completions;
1121 
1122 	if (group->collect_spin_stat && group->start_ticks == 0) {
1123 		group->start_ticks = spdk_get_ticks();
1124 	}
1125 
1126 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
1127 			  bdev_nvme_disconnected_qpair_cb);
1128 	if (group->collect_spin_stat) {
1129 		if (num_completions > 0) {
1130 			if (group->end_ticks != 0) {
1131 				group->spin_ticks += (group->end_ticks - group->start_ticks);
1132 				group->end_ticks = 0;
1133 			}
1134 			group->start_ticks = 0;
1135 		} else {
1136 			group->end_ticks = spdk_get_ticks();
1137 		}
1138 	}
1139 
1140 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
1141 }
1142 
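/* Admin queue poller: a negative return from processing admin completions triggers
 * a failover/reset of the controller.
 */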
1143 static int
1144 bdev_nvme_poll_adminq(void *arg)
1145 {
1146 	int32_t rc;
1147 	struct nvme_ctrlr *nvme_ctrlr = arg;
1148 
1149 	assert(nvme_ctrlr != NULL);
1150 
1151 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
1152 	if (rc < 0) {
1153 		bdev_nvme_failover(nvme_ctrlr, false);
1154 	}
1155 
1156 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
1157 }
1158 
1159 static void
1160 _bdev_nvme_unregister_dev_cb(void *io_device)
1161 {
1162 	struct nvme_bdev *nvme_disk = io_device;
1163 
1164 	free(nvme_disk->disk.name);
1165 	free(nvme_disk);
1166 }
1167 
1168 static int
1169 bdev_nvme_destruct(void *ctx)
1170 {
1171 	struct nvme_bdev *nvme_disk = ctx;
1172 	struct nvme_ns *nvme_ns, *tmp_nvme_ns;
1173 
1174 	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);
1175 
1176 	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
1177 		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
1178 
1179 		nvme_ns->bdev = NULL;
1180 
1181 		assert(nvme_ns->id > 0);
1182 
1183 		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
1184 			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1185 
1186 			nvme_ctrlr_release(nvme_ns->ctrlr);
1187 			free(nvme_ns);
1188 		} else {
1189 			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1190 		}
1191 	}
1192 
1193 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1194 	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
1195 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1196 
1197 	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);
1198 
1199 	return 0;
1200 }
1201 
1202 static int
1203 bdev_nvme_flush(struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
1204 {
1205 	bdev_nvme_io_complete(bio, 0);
1206 
1207 	return 0;
1208 }
1209 
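/* Allocate an I/O qpair for the controller channel, add it to the channel's poll
 * group, connect it, and invalidate any cached I/O paths that point at the channel.
 */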
1210 static int
1211 bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
1212 {
1213 	struct nvme_ctrlr *nvme_ctrlr;
1214 	struct spdk_nvme_io_qpair_opts opts;
1215 	struct spdk_nvme_qpair *qpair;
1216 	int rc;
1217 
1218 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1219 
1220 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1221 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
1222 	opts.create_only = true;
1223 	opts.async_mode = true;
1224 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
1225 	g_opts.io_queue_requests = opts.io_queue_requests;
1226 
1227 	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1228 	if (qpair == NULL) {
1229 		return -1;
1230 	}
1231 
1232 	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
1233 			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));
1234 
1235 	assert(ctrlr_ch->group != NULL);
1236 
1237 	rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair);
1238 	if (rc != 0) {
1239 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
1240 		goto err;
1241 	}
1242 
1243 	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
1244 	if (rc != 0) {
1245 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
1246 		goto err;
1247 	}
1248 
1249 	ctrlr_ch->qpair = qpair;
1250 
1251 	_bdev_nvme_clear_io_path_cache(ctrlr_ch);
1252 
1253 	return 0;
1254 
1255 err:
1256 	spdk_nvme_ctrlr_free_io_qpair(qpair);
1257 
1258 	return rc;
1259 }
1260 
1261 static void
1262 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
1263 {
1264 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
1265 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
1266 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
1267 	struct spdk_bdev_io *bdev_io;
1268 
1269 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
1270 		status = SPDK_BDEV_IO_STATUS_FAILED;
1271 	}
1272 
1273 	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
1274 		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
1275 		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
1276 		spdk_bdev_io_complete(bdev_io, status);
1277 	}
1278 
1279 	spdk_for_each_channel_continue(i, 0);
1280 }
1281 
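/* Mark the active path as failed and, if an alternate trid is configured, fail the
 * ctrlr over to it. The old trid is either moved to the tail of the list for round
 * robin reuse or freed when the failover was triggered by a remove.
 */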
1282 static void
1283 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove)
1284 {
1285 	struct nvme_path_id *path_id, *next_path;
1286 	int rc __attribute__((unused));
1287 
1288 	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
1289 	assert(path_id);
1290 	assert(path_id == nvme_ctrlr->active_path_id);
1291 	next_path = TAILQ_NEXT(path_id, link);
1292 
1293 	path_id->is_failed = true;
1294 
1295 	if (next_path) {
1296 		assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
1297 
1298 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
1299 			       path_id->trid.trsvcid,	next_path->trid.traddr, next_path->trid.trsvcid);
1300 
1301 		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
1302 		nvme_ctrlr->active_path_id = next_path;
1303 		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
1304 		assert(rc == 0);
1305 		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
1306 		if (!remove) {
1307 			/** Shuffle the old trid to the end of the list and use the new one.
1308 			 * Allows for round robin through multiple connections.
1309 			 */
1310 			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
1311 		} else {
1312 			free(path_id);
1313 		}
1314 	}
1315 }
1316 
1317 static bool
1318 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
1319 {
1320 	int32_t elapsed;
1321 
1322 	if (nvme_ctrlr->ctrlr_loss_timeout_sec == 0 ||
1323 	    nvme_ctrlr->ctrlr_loss_timeout_sec == -1) {
1324 		return false;
1325 	}
1326 
1327 	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
1328 	if (elapsed >= nvme_ctrlr->ctrlr_loss_timeout_sec) {
1329 		return true;
1330 	} else {
1331 		return false;
1332 	}
1333 }
1334 
1335 static bool
1336 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
1337 {
1338 	uint32_t elapsed;
1339 
1340 	if (nvme_ctrlr->fast_io_fail_timeout_sec == 0) {
1341 		return false;
1342 	}
1343 
1344 	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
1345 	if (elapsed >= nvme_ctrlr->fast_io_fail_timeout_sec) {
1346 		return true;
1347 	} else {
1348 		return false;
1349 	}
1350 }
1351 
1352 enum bdev_nvme_op_after_reset {
1353 	OP_NONE,
1354 	OP_COMPLETE_PENDING_DESTRUCT,
1355 	OP_DESTRUCT,
1356 	OP_DELAYED_RECONNECT,
1357 };
1358 
1359 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;
1360 
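/* Decide the follow-up action once a reset attempt finishes: complete a pending
 * destruct, do nothing on success, destruct the ctrlr when the loss timeout has
 * expired, or otherwise fail over the trid and schedule a delayed reconnect.
 */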
1361 static _bdev_nvme_op_after_reset
1362 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
1363 {
1364 	if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
1365 		/* Complete pending destruct after reset completes. */
1366 		return OP_COMPLETE_PENDING_DESTRUCT;
1367 	} else if (success || nvme_ctrlr->reconnect_delay_sec == 0) {
1368 		nvme_ctrlr->reset_start_tsc = 0;
1369 		return OP_NONE;
1370 	} else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
1371 		return OP_DESTRUCT;
1372 	} else {
1373 		if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) {
1374 			nvme_ctrlr->fast_io_fail_timedout = true;
1375 		}
1376 		bdev_nvme_failover_trid(nvme_ctrlr, false);
1377 		return OP_DELAYED_RECONNECT;
1378 	}
1379 }
1380 
1381 static int _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug);
1382 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
1383 
1384 static int
1385 bdev_nvme_reconnect_delay_timer_expired(void *ctx)
1386 {
1387 	struct nvme_ctrlr *nvme_ctrlr = ctx;
1388 
1389 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1390 
1391 	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
1392 
1393 	assert(nvme_ctrlr->reconnect_is_delayed == true);
1394 	nvme_ctrlr->reconnect_is_delayed = false;
1395 
1396 	if (nvme_ctrlr->destruct) {
1397 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1398 		return SPDK_POLLER_BUSY;
1399 	}
1400 
1401 	assert(nvme_ctrlr->resetting == false);
1402 	nvme_ctrlr->resetting = true;
1403 
1404 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1405 
1406 	spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);
1407 
1408 	bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
1409 	return SPDK_POLLER_BUSY;
1410 }
1411 
1412 static void
1413 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr)
1414 {
1415 	spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);
1416 
1417 	assert(nvme_ctrlr->reconnect_is_delayed == false);
1418 	nvme_ctrlr->reconnect_is_delayed = true;
1419 
1420 	assert(nvme_ctrlr->reconnect_delay_timer == NULL);
1421 	nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired,
1422 					    nvme_ctrlr,
1423 					    nvme_ctrlr->reconnect_delay_sec * SPDK_SEC_TO_USEC);
1424 }
1425 
1426 static void
1427 _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
1428 {
1429 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1430 	bool success = spdk_io_channel_iter_get_ctx(i) == NULL;
1431 	struct nvme_path_id *path_id;
1432 	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
1433 	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;
1434 	enum bdev_nvme_op_after_reset op_after_reset;
1435 
1436 	assert(nvme_ctrlr->thread == spdk_get_thread());
1437 
1438 	nvme_ctrlr->reset_cb_fn = NULL;
1439 	nvme_ctrlr->reset_cb_arg = NULL;
1440 
1441 	if (!success) {
1442 		SPDK_ERRLOG("Resetting controller failed.\n");
1443 	} else {
1444 		SPDK_NOTICELOG("Resetting controller successful.\n");
1445 	}
1446 
1447 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1448 	nvme_ctrlr->resetting = false;
1449 
1450 	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
1451 	assert(path_id != NULL);
1452 	assert(path_id == nvme_ctrlr->active_path_id);
1453 
1454 	path_id->is_failed = !success;
1455 
1456 	op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success);
1457 
1458 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1459 
1460 	if (reset_cb_fn) {
1461 		reset_cb_fn(reset_cb_arg, success);
1462 	}
1463 
1464 	switch (op_after_reset) {
1465 	case OP_COMPLETE_PENDING_DESTRUCT:
1466 		nvme_ctrlr_unregister(nvme_ctrlr);
1467 		break;
1468 	case OP_DESTRUCT:
1469 		_bdev_nvme_delete(nvme_ctrlr, false);
1470 		break;
1471 	case OP_DELAYED_RECONNECT:
1472 		spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
1473 		bdev_nvme_start_reconnect_delay_timer(nvme_ctrlr);
1474 		break;
1475 	default:
1476 		break;
1477 	}
1478 }
1479 
1480 static void
1481 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success)
1482 {
1483 	/* Make sure we clear any pending resets before returning. */
1484 	spdk_for_each_channel(nvme_ctrlr,
1485 			      bdev_nvme_complete_pending_resets,
1486 			      success ? NULL : (void *)0x1,
1487 			      _bdev_nvme_reset_complete);
1488 }
1489 
1490 static void
1491 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status)
1492 {
1493 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1494 
1495 	bdev_nvme_reset_complete(nvme_ctrlr, false);
1496 }
1497 
1498 static void
1499 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
1500 {
1501 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
1502 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
1503 
1504 	bdev_nvme_destroy_qpair(ctrlr_ch);
1505 
1506 	spdk_for_each_channel_continue(i, 0);
1507 }
1508 
1509 static void
1510 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
1511 {
1512 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1513 
1514 	if (status == 0) {
1515 		bdev_nvme_reset_complete(nvme_ctrlr, true);
1516 	} else {
1517 		/* Delete the added qpairs and quiesce ctrlr to make the states clean. */
1518 		spdk_for_each_channel(nvme_ctrlr,
1519 				      bdev_nvme_reset_destroy_qpair,
1520 				      NULL,
1521 				      bdev_nvme_reset_create_qpairs_failed);
1522 	}
1523 }
1524 
1525 static void
1526 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
1527 {
1528 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
1529 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
1530 	int rc;
1531 
1532 	rc = bdev_nvme_create_qpair(ctrlr_ch);
1533 
1534 	spdk_for_each_channel_continue(i, rc);
1535 }
1536 
1537 static int
1538 bdev_nvme_reconnect_ctrlr_poll(void *arg)
1539 {
1540 	struct nvme_ctrlr *nvme_ctrlr = arg;
1541 	int rc = -ETIMEDOUT;
1542 
1543 	if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
1544 		rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr);
1545 		if (rc == -EAGAIN) {
1546 			return SPDK_POLLER_BUSY;
1547 		}
1548 	}
1549 
1550 	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
1551 	if (rc == 0) {
1552 		/* Recreate all of the I/O queue pairs */
1553 		spdk_for_each_channel(nvme_ctrlr,
1554 				      bdev_nvme_reset_create_qpair,
1555 				      NULL,
1556 				      bdev_nvme_reset_create_qpairs_done);
1557 	} else {
1558 		bdev_nvme_reset_complete(nvme_ctrlr, false);
1559 	}
1560 	return SPDK_POLLER_BUSY;
1561 }
1562 
1563 static void
1564 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
1565 {
1566 	spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr);
1567 
1568 	assert(nvme_ctrlr->reset_detach_poller == NULL);
1569 	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll,
1570 					  nvme_ctrlr, 0);
1571 }
1572 
1573 static void
1574 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
1575 {
1576 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1577 	int rc __attribute__((unused));
1578 
1579 	assert(status == 0);
1580 
1581 	/* Disconnect fails only if the ctrlr is already resetting or has been removed.
1582 	 * Neither is possible here: this reset is controlled by the module, and the
1583 	 * hot-remove callback is invoked when the ctrlr is hot removed.
1584 	 */
1585 	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
1586 	assert(rc == 0);
1587 
1588 	bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
1589 }
1590 
1591 static void
1592 _bdev_nvme_reset(void *ctx)
1593 {
1594 	struct nvme_ctrlr *nvme_ctrlr = ctx;
1595 
1596 	assert(nvme_ctrlr->resetting == true);
1597 	assert(nvme_ctrlr->thread == spdk_get_thread());
1598 
1599 	spdk_nvme_ctrlr_prepare_for_reset(nvme_ctrlr->ctrlr);
1600 
1601 	/* First, delete all NVMe I/O queue pairs. */
1602 	spdk_for_each_channel(nvme_ctrlr,
1603 			      bdev_nvme_reset_destroy_qpair,
1604 			      NULL,
1605 			      bdev_nvme_reset_ctrlr);
1606 }
1607 
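/* Start an asynchronous controller reset unless the ctrlr is being destructed,
 * already resetting, or waiting for a delayed reconnect.
 */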
1608 static int
1609 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
1610 {
1611 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1612 	if (nvme_ctrlr->destruct) {
1613 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1614 		return -ENXIO;
1615 	}
1616 
1617 	if (nvme_ctrlr->resetting) {
1618 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1619 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
1620 		return -EBUSY;
1621 	}
1622 
1623 	if (nvme_ctrlr->reconnect_is_delayed) {
1624 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1625 		SPDK_NOTICELOG("Reconnect is already scheduled.\n");
1626 		return -EBUSY;
1627 	}
1628 
1629 	nvme_ctrlr->resetting = true;
1630 
1631 	assert(nvme_ctrlr->reset_start_tsc == 0);
1632 	nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
1633 
1634 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1635 
1636 	spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr);
1637 	return 0;
1638 }
1639 
1640 int
1641 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
1642 {
1643 	int rc;
1644 
1645 	rc = bdev_nvme_reset(nvme_ctrlr);
1646 	if (rc == 0) {
1647 		nvme_ctrlr->reset_cb_fn = cb_fn;
1648 		nvme_ctrlr->reset_cb_arg = cb_arg;
1649 	}
1650 	return rc;
1651 }
1652 
1653 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio);
1654 
1655 static void
1656 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio)
1657 {
1658 	enum spdk_bdev_io_status io_status;
1659 
1660 	if (bio->cpl.cdw0 == 0) {
1661 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1662 	} else {
1663 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
1664 	}
1665 
1666 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
1667 }
1668 
1669 static void
1670 _bdev_nvme_reset_io_continue(void *ctx)
1671 {
1672 	struct nvme_bdev_io *bio = ctx;
1673 	struct nvme_io_path *prev_io_path, *next_io_path;
1674 	int rc;
1675 
1676 	prev_io_path = bio->io_path;
1677 	bio->io_path = NULL;
1678 
1679 	if (bio->cpl.cdw0 != 0) {
1680 		goto complete;
1681 	}
1682 
1683 	next_io_path = STAILQ_NEXT(prev_io_path, stailq);
1684 	if (next_io_path == NULL) {
1685 		goto complete;
1686 	}
1687 
1688 	rc = _bdev_nvme_reset_io(next_io_path, bio);
1689 	if (rc == 0) {
1690 		return;
1691 	}
1692 
1693 	bio->cpl.cdw0 = 1;
1694 
1695 complete:
1696 	bdev_nvme_reset_io_complete(bio);
1697 }
1698 
1699 static void
1700 bdev_nvme_reset_io_continue(void *cb_arg, bool success)
1701 {
1702 	struct nvme_bdev_io *bio = cb_arg;
1703 
1704 	bio->cpl.cdw0 = !success;
1705 
1706 	spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio);
1707 }
1708 
1709 static int
1710 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio)
1711 {
1712 	struct nvme_ctrlr_channel *ctrlr_ch = io_path->ctrlr_ch;
1713 	struct nvme_ctrlr *nvme_ctrlr;
1714 	struct spdk_bdev_io *bdev_io;
1715 	int rc;
1716 
1717 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1718 
1719 	rc = bdev_nvme_reset(nvme_ctrlr);
1720 	if (rc == 0) {
1721 		assert(bio->io_path == NULL);
1722 		bio->io_path = io_path;
1723 
1724 		assert(nvme_ctrlr->reset_cb_fn == NULL);
1725 		assert(nvme_ctrlr->reset_cb_arg == NULL);
1726 		nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue;
1727 		nvme_ctrlr->reset_cb_arg = bio;
1728 	} else if (rc == -EBUSY) {
1729 		/*
1730 		 * A reset call is queued only if it came from the app framework. This is deliberate
1731 		 * so that we don't interfere with the app framework's reset strategy, i.e. we defer
1732 		 * to the upper level. If it is in the middle of a reset, we won't schedule another one.
1733 		 */
1734 		bdev_io = spdk_bdev_io_from_ctx(bio);
1735 		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
1736 	} else {
1737 		return rc;
1738 	}
1739 
1740 	return 0;
1741 }
1742 
1743 static void
1744 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
1745 {
1746 	struct nvme_io_path *io_path;
1747 	int rc;
1748 
1749 	bio->cpl.cdw0 = 0;
1750 	bio->orig_thread = spdk_get_thread();
1751 
1752 	/* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now.
1753 	 *
1754 	 * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially.
1755 	 * This will be done in the following patches.
1756 	 */
1757 	io_path = STAILQ_FIRST(&nbdev_ch->io_path_list);
1758 	assert(io_path != NULL);
1759 
1760 	rc = _bdev_nvme_reset_io(io_path, bio);
1761 	if (rc != 0) {
1762 		bio->cpl.cdw0 = 1;
1763 		bdev_nvme_reset_io_complete(bio);
1764 	}
1765 }
1766 
1767 static int
1768 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
1769 {
1770 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1771 	if (nvme_ctrlr->destruct) {
1772 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1773 		/* Don't bother resetting if the controller is in the process of being destructed. */
1774 		return -ENXIO;
1775 	}
1776 
1777 	if (nvme_ctrlr->resetting) {
1778 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1779 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
1780 		return -EBUSY;
1781 	}
1782 
1783 	bdev_nvme_failover_trid(nvme_ctrlr, remove);
1784 
1785 	if (nvme_ctrlr->reconnect_is_delayed) {
1786 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1787 		SPDK_NOTICELOG("Reconnect is already scheduled.\n");
1788 
1789 		/* We rely on the next reconnect for the failover. */
1790 		return 0;
1791 	}
1792 
1793 	nvme_ctrlr->resetting = true;
1794 
1795 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1796 
1797 	spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr);
1798 	return 0;
1799 }
1800 
1801 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks,
1802 			   uint64_t num_blocks);
1803 
1804 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks,
1805 				  uint64_t num_blocks);
1806 
1807 static void
1808 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1809 		     bool success)
1810 {
1811 	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1812 	struct spdk_bdev *bdev = bdev_io->bdev;
1813 	int ret;
1814 
1815 	if (!success) {
1816 		ret = -EINVAL;
1817 		goto exit;
1818 	}
1819 
1820 	if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
1821 		ret = -ENXIO;
1822 		goto exit;
1823 	}
1824 
1825 	ret = bdev_nvme_readv(bio,
1826 			      bdev_io->u.bdev.iovs,
1827 			      bdev_io->u.bdev.iovcnt,
1828 			      bdev_io->u.bdev.md_buf,
1829 			      bdev_io->u.bdev.num_blocks,
1830 			      bdev_io->u.bdev.offset_blocks,
1831 			      bdev->dif_check_flags,
1832 			      bdev_io->internal.ext_opts);
1833 
1834 exit:
1835 	if (spdk_unlikely(ret != 0)) {
1836 		bdev_nvme_io_complete(bio, ret);
1837 	}
1838 }
1839 
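/* Main submission entry point for the bdev layer: pick an io_path for the channel
 * and dispatch the request according to its I/O type.
 */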
1840 static void
1841 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
1842 {
1843 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
1844 	struct spdk_bdev *bdev = bdev_io->bdev;
1845 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1846 	struct nvme_bdev_io *nbdev_io_to_abort;
1847 	int rc = 0;
1848 
1849 	nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
1850 	if (spdk_unlikely(!nbdev_io->io_path)) {
1851 		if (!bdev_nvme_io_type_is_admin(bdev_io->type)) {
1852 			rc = -ENXIO;
1853 			goto exit;
1854 		}
1855 
1856 		/* Admin commands do not use the optimal I/O path.
1857 		 * Simply fall through even if it is not found.
1858 		 */
1859 	}
1860 
1861 	switch (bdev_io->type) {
1862 	case SPDK_BDEV_IO_TYPE_READ:
1863 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
1864 			rc = bdev_nvme_readv(nbdev_io,
1865 					     bdev_io->u.bdev.iovs,
1866 					     bdev_io->u.bdev.iovcnt,
1867 					     bdev_io->u.bdev.md_buf,
1868 					     bdev_io->u.bdev.num_blocks,
1869 					     bdev_io->u.bdev.offset_blocks,
1870 					     bdev->dif_check_flags,
1871 					     bdev_io->internal.ext_opts);
1872 		} else {
1873 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
1874 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
1875 			rc = 0;
1876 		}
1877 		break;
1878 	case SPDK_BDEV_IO_TYPE_WRITE:
1879 		rc = bdev_nvme_writev(nbdev_io,
1880 				      bdev_io->u.bdev.iovs,
1881 				      bdev_io->u.bdev.iovcnt,
1882 				      bdev_io->u.bdev.md_buf,
1883 				      bdev_io->u.bdev.num_blocks,
1884 				      bdev_io->u.bdev.offset_blocks,
1885 				      bdev->dif_check_flags,
1886 				      bdev_io->internal.ext_opts);
1887 		break;
1888 	case SPDK_BDEV_IO_TYPE_COMPARE:
1889 		rc = bdev_nvme_comparev(nbdev_io,
1890 					bdev_io->u.bdev.iovs,
1891 					bdev_io->u.bdev.iovcnt,
1892 					bdev_io->u.bdev.md_buf,
1893 					bdev_io->u.bdev.num_blocks,
1894 					bdev_io->u.bdev.offset_blocks,
1895 					bdev->dif_check_flags);
1896 		break;
1897 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
1898 		rc = bdev_nvme_comparev_and_writev(nbdev_io,
1899 						   bdev_io->u.bdev.iovs,
1900 						   bdev_io->u.bdev.iovcnt,
1901 						   bdev_io->u.bdev.fused_iovs,
1902 						   bdev_io->u.bdev.fused_iovcnt,
1903 						   bdev_io->u.bdev.md_buf,
1904 						   bdev_io->u.bdev.num_blocks,
1905 						   bdev_io->u.bdev.offset_blocks,
1906 						   bdev->dif_check_flags);
1907 		break;
1908 	case SPDK_BDEV_IO_TYPE_UNMAP:
1909 		rc = bdev_nvme_unmap(nbdev_io,
1910 				     bdev_io->u.bdev.offset_blocks,
1911 				     bdev_io->u.bdev.num_blocks);
1912 		break;
1913 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1914 		rc = bdev_nvme_write_zeroes(nbdev_io,
1915 					     bdev_io->u.bdev.offset_blocks,
1916 					     bdev_io->u.bdev.num_blocks);
1917 		break;
1918 	case SPDK_BDEV_IO_TYPE_RESET:
1919 		nbdev_io->io_path = NULL;
1920 		bdev_nvme_reset_io(nbdev_ch, nbdev_io);
1921 		break;
1922 	case SPDK_BDEV_IO_TYPE_FLUSH:
1923 		rc = bdev_nvme_flush(nbdev_io,
1924 				     bdev_io->u.bdev.offset_blocks,
1925 				     bdev_io->u.bdev.num_blocks);
1926 		break;
1927 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
1928 		rc = bdev_nvme_zone_appendv(nbdev_io,
1929 					    bdev_io->u.bdev.iovs,
1930 					    bdev_io->u.bdev.iovcnt,
1931 					    bdev_io->u.bdev.md_buf,
1932 					    bdev_io->u.bdev.num_blocks,
1933 					    bdev_io->u.bdev.offset_blocks,
1934 					    bdev->dif_check_flags);
1935 		break;
1936 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
1937 		rc = bdev_nvme_get_zone_info(nbdev_io,
1938 					     bdev_io->u.zone_mgmt.zone_id,
1939 					     bdev_io->u.zone_mgmt.num_zones,
1940 					     bdev_io->u.zone_mgmt.buf);
1941 		break;
1942 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
1943 		rc = bdev_nvme_zone_management(nbdev_io,
1944 					       bdev_io->u.zone_mgmt.zone_id,
1945 					       bdev_io->u.zone_mgmt.zone_action);
1946 		break;
1947 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1948 		nbdev_io->io_path = NULL;
1949 		bdev_nvme_admin_passthru(nbdev_ch,
1950 					 nbdev_io,
1951 					 &bdev_io->u.nvme_passthru.cmd,
1952 					 bdev_io->u.nvme_passthru.buf,
1953 					 bdev_io->u.nvme_passthru.nbytes);
1954 		break;
1955 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1956 		rc = bdev_nvme_io_passthru(nbdev_io,
1957 					   &bdev_io->u.nvme_passthru.cmd,
1958 					   bdev_io->u.nvme_passthru.buf,
1959 					   bdev_io->u.nvme_passthru.nbytes);
1960 		break;
1961 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1962 		rc = bdev_nvme_io_passthru_md(nbdev_io,
1963 					      &bdev_io->u.nvme_passthru.cmd,
1964 					      bdev_io->u.nvme_passthru.buf,
1965 					      bdev_io->u.nvme_passthru.nbytes,
1966 					      bdev_io->u.nvme_passthru.md_buf,
1967 					      bdev_io->u.nvme_passthru.md_len);
1968 		break;
1969 	case SPDK_BDEV_IO_TYPE_ABORT:
1970 		nbdev_io->io_path = NULL;
1971 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
1972 		bdev_nvme_abort(nbdev_ch,
1973 				nbdev_io,
1974 				nbdev_io_to_abort);
1975 		break;
1976 	default:
1977 		rc = -EINVAL;
1978 		break;
1979 	}
1980 
1981 exit:
1982 	if (spdk_unlikely(rc != 0)) {
1983 		bdev_nvme_io_complete(nbdev_io, rc);
1984 	}
1985 }
1986 
1987 static bool
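/* Report which bdev I/O types this bdev supports. The first namespace in the bdev's
 * namespace list is used as the representative when checking controller and namespace
 * capabilities (ONCS bits, metadata size, CSI, etc.).
 */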
1988 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
1989 {
1990 	struct nvme_bdev *nbdev = ctx;
1991 	struct nvme_ns *nvme_ns;
1992 	struct spdk_nvme_ns *ns;
1993 	struct spdk_nvme_ctrlr *ctrlr;
1994 	const struct spdk_nvme_ctrlr_data *cdata;
1995 
1996 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
1997 	assert(nvme_ns != NULL);
1998 	ns = nvme_ns->ns;
1999 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2000 
2001 	switch (io_type) {
2002 	case SPDK_BDEV_IO_TYPE_READ:
2003 	case SPDK_BDEV_IO_TYPE_WRITE:
2004 	case SPDK_BDEV_IO_TYPE_RESET:
2005 	case SPDK_BDEV_IO_TYPE_FLUSH:
2006 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
2007 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2008 	case SPDK_BDEV_IO_TYPE_ABORT:
2009 		return true;
2010 
2011 	case SPDK_BDEV_IO_TYPE_COMPARE:
2012 		return spdk_nvme_ns_supports_compare(ns);
2013 
2014 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2015 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
2016 
2017 	case SPDK_BDEV_IO_TYPE_UNMAP:
2018 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2019 		return cdata->oncs.dsm;
2020 
2021 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2022 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2023 		return cdata->oncs.write_zeroes;
2024 
2025 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
2026 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
2027 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
2028 			return true;
2029 		}
2030 		return false;
2031 
2032 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
2033 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
2034 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
2035 
2036 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
2037 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
2038 		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
2039 
2040 	default:
2041 		return false;
2042 	}
2043 }
2044 
2045 static int
2046 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
2047 {
2048 	struct nvme_ctrlr *nvme_ctrlr = io_device;
2049 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
2050 	struct spdk_io_channel *pg_ch;
2051 	int rc;
2052 
2053 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
2054 	if (!pg_ch) {
2055 		return -1;
2056 	}
2057 
2058 	ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch);
2059 	TAILQ_INSERT_TAIL(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq);
2060 
2061 #ifdef SPDK_CONFIG_VTUNE
2062 	ctrlr_ch->group->collect_spin_stat = true;
2063 #else
2064 	ctrlr_ch->group->collect_spin_stat = false;
2065 #endif
2066 
2067 	TAILQ_INIT(&ctrlr_ch->pending_resets);
2068 	TAILQ_INIT(&ctrlr_ch->io_path_list);
2069 
2070 	rc = bdev_nvme_create_qpair(ctrlr_ch);
2071 	if (rc != 0) {
2072 		/* nvme ctrlr can't create IO qpair during reset. In that case ctrlr_ch->qpair
2073 		 * pointer will be NULL and IO qpair will be created when reset completes.
2074 		 * If the user submits IO requests during reset, they will be queued and resubmitted later */
2075 		if (!nvme_ctrlr->resetting) {
2076 			goto err_qpair;
2077 		}
2078 	}
2079 
2080 	return 0;
2081 
2082 err_qpair:
2083 	TAILQ_REMOVE(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq);
2084 	spdk_put_io_channel(pg_ch);
2085 
2086 	return rc;
2087 }
2088 
2089 static void
2090 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
2091 {
2092 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
2093 
2094 	assert(ctrlr_ch->group != NULL);
2095 
2096 	bdev_nvme_destroy_qpair(ctrlr_ch);
2097 
2098 	TAILQ_REMOVE(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq);
2099 
2100 	spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group));
2101 }
2102 
2103 static void
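/* CRC-32C offload hook registered with the NVMe driver via g_bdev_nvme_accel_fn_table
 * (used, for example, to compute NVMe/TCP data digests). If submission to the accel
 * engine fails with -ENOMEM or -EINVAL, the user's callback is invoked directly.
 */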
2104 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
2105 			      uint32_t iov_cnt, uint32_t seed,
2106 			      spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
2107 {
2108 	struct nvme_poll_group *group = ctx;
2109 	int rc;
2110 
2111 	assert(group->accel_channel != NULL);
2112 	assert(cb_fn != NULL);
2113 
2114 	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
2115 	if (rc) {
2116 		/* In these error cases spdk_accel_submit_crc32cv() does not call the user's cb_fn, so call it here. */
2117 		if (rc == -ENOMEM || rc == -EINVAL) {
2118 			cb_fn(cb_arg, rc);
2119 		}
2120 		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
2121 	}
2122 }
2123 
2124 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
2125 	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
2126 	.submit_accel_crc32c	= bdev_nvme_submit_accel_crc32c,
2127 };
2128 
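/* Create the per-thread NVMe poll group: an spdk_nvme_poll_group using the accel
 * function table above, an accel engine channel for offloads, and a poller that
 * drives I/O completions at g_opts.nvme_ioq_poll_period_us.
 */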
2129 static int
2130 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
2131 {
2132 	struct nvme_poll_group *group = ctx_buf;
2133 
2134 	TAILQ_INIT(&group->ctrlr_ch_list);
2135 
2136 	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
2137 	if (group->group == NULL) {
2138 		return -1;
2139 	}
2140 
2141 	group->accel_channel = spdk_accel_engine_get_io_channel();
2142 	if (!group->accel_channel) {
2143 		spdk_nvme_poll_group_destroy(group->group);
2144 		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
2145 			    group);
2146 		return -1;
2147 	}
2148 
2149 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
2150 
2151 	if (group->poller == NULL) {
2152 		spdk_put_io_channel(group->accel_channel);
2153 		spdk_nvme_poll_group_destroy(group->group);
2154 		return -1;
2155 	}
2156 
2157 	return 0;
2158 }
2159 
2160 static void
2161 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
2162 {
2163 	struct nvme_poll_group *group = ctx_buf;
2164 
2165 	assert(TAILQ_EMPTY(&group->ctrlr_ch_list));
2166 
2167 	if (group->accel_channel) {
2168 		spdk_put_io_channel(group->accel_channel);
2169 	}
2170 
2171 	spdk_poller_unregister(&group->poller);
2172 	if (spdk_nvme_poll_group_destroy(group->group)) {
2173 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
2174 		assert(false);
2175 	}
2176 }
2177 
2178 static struct spdk_io_channel *
2179 bdev_nvme_get_io_channel(void *ctx)
2180 {
2181 	struct nvme_bdev *nvme_bdev = ctx;
2182 
2183 	return spdk_get_io_channel(nvme_bdev);
2184 }
2185 
2186 static void *
2187 bdev_nvme_get_module_ctx(void *ctx)
2188 {
2189 	struct nvme_bdev *nvme_bdev = ctx;
2190 	struct nvme_ns *nvme_ns;
2191 
2192 	if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) {
2193 		return NULL;
2194 	}
2195 
2196 	nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list);
2197 	if (!nvme_ns) {
2198 		return NULL;
2199 	}
2200 
2201 	return nvme_ns->ns;
2202 }
2203 
2204 static const char *
2205 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
2206 {
2207 	switch (ana_state) {
2208 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
2209 		return "optimized";
2210 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
2211 		return "non_optimized";
2212 	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
2213 		return "inaccessible";
2214 	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
2215 		return "persistent_loss";
2216 	case SPDK_NVME_ANA_CHANGE_STATE:
2217 		return "change";
2218 	default:
2219 		return NULL;
2220 	}
2221 }
2222 
2223 static int
2224 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
2225 {
2226 	struct nvme_bdev *nbdev = ctx;
2227 	struct nvme_ns *nvme_ns;
2228 
2229 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
2230 	assert(nvme_ns != NULL);
2231 
2232 	return spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, domains, array_size);
2233 }
2234 
2235 static void
2236 nvme_namespace_info_json(struct spdk_json_write_ctx *w,
2237 			 struct nvme_ns *nvme_ns)
2238 {
2239 	struct spdk_nvme_ns *ns;
2240 	struct spdk_nvme_ctrlr *ctrlr;
2241 	const struct spdk_nvme_ctrlr_data *cdata;
2242 	const struct spdk_nvme_transport_id *trid;
2243 	union spdk_nvme_vs_register vs;
2244 	char buf[128];
2245 
2246 	ns = nvme_ns->ns;
2247 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2248 
2249 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2250 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
2251 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
2252 
2253 	spdk_json_write_object_begin(w);
2254 
2255 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2256 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
2257 	}
2258 
2259 	spdk_json_write_named_object_begin(w, "trid");
2260 
2261 	nvme_bdev_dump_trid_json(trid, w);
2262 
2263 	spdk_json_write_object_end(w);
2264 
2265 #ifdef SPDK_CONFIG_NVME_CUSE
2266 	size_t cuse_name_size = 128;
2267 	char cuse_name[cuse_name_size];
2268 
2269 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
2270 					    cuse_name, &cuse_name_size);
2271 	if (rc == 0) {
2272 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
2273 	}
2274 #endif
2275 
2276 	spdk_json_write_named_object_begin(w, "ctrlr_data");
2277 
2278 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
2279 
2280 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
2281 	spdk_str_trim(buf);
2282 	spdk_json_write_named_string(w, "model_number", buf);
2283 
2284 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
2285 	spdk_str_trim(buf);
2286 	spdk_json_write_named_string(w, "serial_number", buf);
2287 
2288 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
2289 	spdk_str_trim(buf);
2290 	spdk_json_write_named_string(w, "firmware_revision", buf);
2291 
2292 	if (cdata->subnqn[0] != '\0') {
2293 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
2294 	}
2295 
2296 	spdk_json_write_named_object_begin(w, "oacs");
2297 
2298 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
2299 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
2300 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
2301 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
2302 
2303 	spdk_json_write_object_end(w);
2304 
2305 	spdk_json_write_object_end(w);
2306 
2307 	spdk_json_write_named_object_begin(w, "vs");
2308 
2309 	spdk_json_write_name(w, "nvme_version");
2310 	if (vs.bits.ter) {
2311 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
2312 	} else {
2313 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
2314 	}
2315 
2316 	spdk_json_write_object_end(w);
2317 
2318 	spdk_json_write_named_object_begin(w, "ns_data");
2319 
2320 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
2321 
2322 	if (cdata->cmic.ana_reporting) {
2323 		spdk_json_write_named_string(w, "ana_state",
2324 					     _nvme_ana_state_str(nvme_ns->ana_state));
2325 	}
2326 
2327 	spdk_json_write_object_end(w);
2328 
2329 	if (cdata->oacs.security) {
2330 		spdk_json_write_named_object_begin(w, "security");
2331 
2332 		spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal);
2333 
2334 		spdk_json_write_object_end(w);
2335 	}
2336 
2337 	spdk_json_write_object_end(w);
2338 }
2339 
2340 static int
2341 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
2342 {
2343 	struct nvme_bdev *nvme_bdev = ctx;
2344 	struct nvme_ns *nvme_ns;
2345 
2346 	pthread_mutex_lock(&nvme_bdev->mutex);
2347 	spdk_json_write_named_array_begin(w, "nvme");
2348 	TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) {
2349 		nvme_namespace_info_json(w, nvme_ns);
2350 	}
2351 	spdk_json_write_array_end(w);
2352 	pthread_mutex_unlock(&nvme_bdev->mutex);
2353 
2354 	return 0;
2355 }
2356 
2357 static void
2358 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
2359 {
2360 	/* No config per bdev needed */
2361 }
2362 
2363 static uint64_t
2364 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
2365 {
2366 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
2367 	struct nvme_io_path *io_path;
2368 	struct nvme_poll_group *group;
2369 	uint64_t spin_time = 0;
2370 
2371 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
2372 		group = io_path->ctrlr_ch->group;
2373 
2374 		if (!group || !group->collect_spin_stat) {
2375 			continue;
2376 		}
2377 
2378 		if (group->end_ticks != 0) {
2379 			group->spin_ticks += (group->end_ticks - group->start_ticks);
2380 			group->end_ticks = 0;
2381 		}
2382 
2383 		spin_time += group->spin_ticks;
2384 		group->start_ticks = 0;
2385 		group->spin_ticks = 0;
2386 	}
2387 
2388 	return (spin_time * 1000000ULL) / spdk_get_ticks_hz();
2389 }
2390 
2391 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
2392 	.destruct		= bdev_nvme_destruct,
2393 	.submit_request		= bdev_nvme_submit_request,
2394 	.io_type_supported	= bdev_nvme_io_type_supported,
2395 	.get_io_channel		= bdev_nvme_get_io_channel,
2396 	.dump_info_json		= bdev_nvme_dump_info_json,
2397 	.write_config_json	= bdev_nvme_write_config_json,
2398 	.get_spin_time		= bdev_nvme_get_spin_time,
2399 	.get_module_ctx		= bdev_nvme_get_module_ctx,
2400 	.get_memory_domains	= bdev_nvme_get_memory_domains,
2401 };
2402 
2403 typedef int (*bdev_nvme_parse_ana_log_page_cb)(
2404 	const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);
2405 
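/* Walk the cached ANA log page and invoke cb_fn for each group descriptor. Each
 * descriptor is first copied into the pre-allocated scratch buffer (copied_ana_desc)
 * because descriptors in the log page are not guaranteed to be aligned. Iteration
 * stops early if cb_fn returns non-zero.
 */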
2406 static int
2407 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
2408 			     bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
2409 {
2410 	struct spdk_nvme_ana_group_descriptor *copied_desc;
2411 	uint8_t *orig_desc;
2412 	uint32_t i, desc_size, copy_len;
2413 	int rc = 0;
2414 
2415 	if (nvme_ctrlr->ana_log_page == NULL) {
2416 		return -EINVAL;
2417 	}
2418 
2419 	copied_desc = nvme_ctrlr->copied_ana_desc;
2420 
2421 	orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
2422 	copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page);
2423 
2424 	for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
2425 		memcpy(copied_desc, orig_desc, copy_len);
2426 
2427 		rc = cb_fn(copied_desc, cb_arg);
2428 		if (rc != 0) {
2429 			break;
2430 		}
2431 
2432 		desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
2433 			    copied_desc->num_of_nsid * sizeof(uint32_t);
2434 		orig_desc += desc_size;
2435 		copy_len -= desc_size;
2436 	}
2437 
2438 	return rc;
2439 }
2440 
2441 static int
2442 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
2443 {
2444 	struct nvme_ns *nvme_ns = cb_arg;
2445 	uint32_t i;
2446 
2447 	for (i = 0; i < desc->num_of_nsid; i++) {
2448 		if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
2449 			continue;
2450 		}
2451 		nvme_ns->ana_group_id = desc->ana_group_id;
2452 		nvme_ns->ana_state = desc->ana_state;
2453 		return 1;
2454 	}
2455 
2456 	return 0;
2457 }
2458 
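/* Fill in the spdk_bdev fields from the namespace and controller identify data:
 * product name and zone limits depending on the command set (NVM or ZNS), block
 * count and sizes, UUID/NGUID, atomic and physical block sizes, and DIF settings.
 */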
2459 static int
2460 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
2461 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
2462 		 uint32_t prchk_flags, void *ctx)
2463 {
2464 	const struct spdk_uuid		*uuid;
2465 	const uint8_t *nguid;
2466 	const struct spdk_nvme_ctrlr_data *cdata;
2467 	const struct spdk_nvme_ns_data	*nsdata;
2468 	enum spdk_nvme_csi		csi;
2469 	uint32_t atomic_bs, phys_bs, bs;
2470 
2471 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2472 	csi = spdk_nvme_ns_get_csi(ns);
2473 
2474 	switch (csi) {
2475 	case SPDK_NVME_CSI_NVM:
2476 		disk->product_name = "NVMe disk";
2477 		break;
2478 	case SPDK_NVME_CSI_ZNS:
2479 		disk->product_name = "NVMe ZNS disk";
2480 		disk->zoned = true;
2481 		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
2482 		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
2483 					     spdk_nvme_ns_get_extended_sector_size(ns);
2484 		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
2485 		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
2486 		break;
2487 	default:
2488 		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
2489 		return -ENOTSUP;
2490 	}
2491 
2492 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
2493 	if (!disk->name) {
2494 		return -ENOMEM;
2495 	}
2496 
2497 	disk->write_cache = 0;
2498 	if (cdata->vwc.present) {
2499 		/* Enable if the Volatile Write Cache exists */
2500 		disk->write_cache = 1;
2501 	}
2502 	if (cdata->oncs.write_zeroes) {
2503 		disk->max_write_zeroes = UINT16_MAX + 1;
2504 	}
2505 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
2506 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
2507 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
2508 
2509 	nguid = spdk_nvme_ns_get_nguid(ns);
2510 	if (!nguid) {
2511 		uuid = spdk_nvme_ns_get_uuid(ns);
2512 		if (uuid) {
2513 			disk->uuid = *uuid;
2514 		}
2515 	} else {
2516 		memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
2517 	}
2518 
2519 	nsdata = spdk_nvme_ns_get_data(ns);
2520 	bs = spdk_nvme_ns_get_sector_size(ns);
2521 	atomic_bs = bs;
2522 	phys_bs = bs;
2523 	if (nsdata->nabo == 0) {
2524 		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
2525 			atomic_bs = bs * (1 + nsdata->nawupf);
2526 		} else {
2527 			atomic_bs = bs * (1 + cdata->awupf);
2528 		}
2529 	}
2530 	if (nsdata->nsfeat.optperf) {
2531 		phys_bs = bs * (1 + nsdata->npwg);
2532 	}
2533 	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
2534 
2535 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
2536 	if (disk->md_len != 0) {
2537 		disk->md_interleave = nsdata->flbas.extended;
2538 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
2539 		if (disk->dif_type != SPDK_DIF_DISABLE) {
2540 			disk->dif_is_head_of_md = nsdata->dps.md_start;
2541 			disk->dif_check_flags = prchk_flags;
2542 		}
2543 	}
2544 
2545 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
2546 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
2547 		disk->acwu = 0;
2548 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
2549 		disk->acwu = nsdata->nacwu;
2550 	} else {
2551 		disk->acwu = cdata->acwu;
2552 	}
2553 
2554 	disk->ctxt = ctx;
2555 	disk->fn_table = &nvmelib_fn_table;
2556 	disk->module = &nvme_if;
2557 
2558 	return 0;
2559 }
2560 
2561 static int
2562 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2563 {
2564 	struct nvme_bdev *bdev;
2565 	int rc;
2566 
2567 	bdev = calloc(1, sizeof(*bdev));
2568 	if (!bdev) {
2569 		SPDK_ERRLOG("bdev calloc() failed\n");
2570 		return -ENOMEM;
2571 	}
2572 
2573 	rc = pthread_mutex_init(&bdev->mutex, NULL);
2574 	if (rc != 0) {
2575 		free(bdev);
2576 		return rc;
2577 	}
2578 
2579 	bdev->ref = 1;
2580 	TAILQ_INIT(&bdev->nvme_ns_list);
2581 	TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
2582 	bdev->opal = nvme_ctrlr->opal_dev != NULL;
2583 
2584 	rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr,
2585 			      nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev);
2586 	if (rc != 0) {
2587 		SPDK_ERRLOG("Failed to create NVMe disk\n");
2588 		pthread_mutex_destroy(&bdev->mutex);
2589 		free(bdev);
2590 		return rc;
2591 	}
2592 
2593 	spdk_io_device_register(bdev,
2594 				bdev_nvme_create_bdev_channel_cb,
2595 				bdev_nvme_destroy_bdev_channel_cb,
2596 				sizeof(struct nvme_bdev_channel),
2597 				bdev->disk.name);
2598 
2599 	rc = spdk_bdev_register(&bdev->disk);
2600 	if (rc != 0) {
2601 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
2602 		spdk_io_device_unregister(bdev, NULL);
2603 		pthread_mutex_destroy(&bdev->mutex);
2604 		free(bdev->disk.name);
2605 		free(bdev);
2606 		return rc;
2607 	}
2608 
2609 	nvme_ns->bdev = bdev;
2610 	bdev->nsid = nvme_ns->id;
2611 
2612 	bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr;
2613 	TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq);
2614 
2615 	return 0;
2616 }
2617 
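/* Two namespaces are considered the same namespace only if their NGUID, EUI64,
 * UUID (either both absent or both equal), and command set identifier all match.
 */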
2618 static bool
2619 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
2620 {
2621 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
2622 	const struct spdk_uuid *uuid1, *uuid2;
2623 
2624 	nsdata1 = spdk_nvme_ns_get_data(ns1);
2625 	nsdata2 = spdk_nvme_ns_get_data(ns2);
2626 	uuid1 = spdk_nvme_ns_get_uuid(ns1);
2627 	uuid2 = spdk_nvme_ns_get_uuid(ns2);
2628 
2629 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
2630 	       nsdata1->eui64 == nsdata2->eui64 &&
2631 	       ((uuid1 == NULL && uuid2 == NULL) ||
2632 		(uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) &&
2633 	       spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2);
2634 }
2635 
2636 static bool
2637 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2638 		 struct spdk_nvme_ctrlr_opts *opts)
2639 {
2640 	struct nvme_probe_skip_entry *entry;
2641 
2642 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
2643 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2644 			return false;
2645 		}
2646 	}
2647 
2648 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
2649 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
2650 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
2651 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
2652 	opts->disable_read_ana_log_page = true;
2653 
2654 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
2655 
2656 	return true;
2657 }
2658 
2659 static void
2660 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
2661 {
2662 	struct nvme_ctrlr *nvme_ctrlr = ctx;
2663 
2664 	if (spdk_nvme_cpl_is_error(cpl)) {
2665 		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
2666 			     cpl->status.sct);
2667 		bdev_nvme_reset(nvme_ctrlr);
2668 	} else if (cpl->cdw0 & 0x1) {
2669 		SPDK_WARNLOG("Specified command could not be aborted.\n");
2670 		bdev_nvme_reset(nvme_ctrlr);
2671 	}
2672 }
2673 
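/* Command timeout handler registered with the NVMe driver. A controller reporting
 * Controller Fatal Status is reset immediately; otherwise the configured
 * action_on_timeout (abort the command, reset the controller, or do nothing) is
 * applied.
 */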
2674 static void
2675 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
2676 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
2677 {
2678 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
2679 	union spdk_nvme_csts_register csts;
2680 	int rc;
2681 
2682 	assert(nvme_ctrlr->ctrlr == ctrlr);
2683 
2684 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
2685 
2686 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
2687 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
2688 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
2689 	 * completion recursively.
2690 	 */
2691 	if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
2692 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
2693 		if (csts.bits.cfs) {
2694 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
2695 			bdev_nvme_reset(nvme_ctrlr);
2696 			return;
2697 		}
2698 	}
2699 
2700 	switch (g_opts.action_on_timeout) {
2701 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
2702 		if (qpair) {
2703 			/* Don't send abort to ctrlr when ctrlr is not available. */
2704 			pthread_mutex_lock(&nvme_ctrlr->mutex);
2705 			if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
2706 				pthread_mutex_unlock(&nvme_ctrlr->mutex);
2707 				SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n");
2708 				return;
2709 			}
2710 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
2711 
2712 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
2713 						       nvme_abort_cpl, nvme_ctrlr);
2714 			if (rc == 0) {
2715 				return;
2716 			}
2717 
2718 			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
2719 		}
2720 
2721 	/* FALLTHROUGH */
2722 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
2723 		bdev_nvme_reset(nvme_ctrlr);
2724 		break;
2725 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
2726 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
2727 		break;
2728 	default:
2729 		SPDK_ERRLOG("Invalid timeout action value.\n");
2730 		break;
2731 	}
2732 }
2733 
2734 static void
2735 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc)
2736 {
2737 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
2738 	struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx;
2739 
2740 	if (rc == 0) {
2741 		nvme_ns->probe_ctx = NULL;
2742 		pthread_mutex_lock(&nvme_ctrlr->mutex);
2743 		nvme_ctrlr->ref++;
2744 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2745 	} else {
2746 		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
2747 		free(nvme_ns);
2748 	}
2749 
2750 	if (ctx) {
2751 		ctx->populates_in_progress--;
2752 		if (ctx->populates_in_progress == 0) {
2753 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
2754 		}
2755 	}
2756 }
2757 
2758 static void
2759 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i)
2760 {
2761 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2762 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
2763 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2764 	int rc;
2765 
2766 	rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
2767 	if (rc != 0) {
2768 		SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n");
2769 	}
2770 
2771 	spdk_for_each_channel_continue(i, rc);
2772 }
2773 
2774 static void
2775 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i)
2776 {
2777 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2778 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
2779 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2780 	struct nvme_io_path *io_path;
2781 
2782 	io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns);
2783 	if (io_path != NULL) {
2784 		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
2785 	}
2786 
2787 	spdk_for_each_channel_continue(i, 0);
2788 }
2789 
2790 static void
2791 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status)
2792 {
2793 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2794 
2795 	nvme_ctrlr_populate_namespace_done(nvme_ns, -1);
2796 }
2797 
2798 static void
2799 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status)
2800 {
2801 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2802 	struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i);
2803 
2804 	if (status == 0) {
2805 		nvme_ctrlr_populate_namespace_done(nvme_ns, 0);
2806 	} else {
2807 		/* Delete the added io_paths and fail populating the namespace. */
2808 		spdk_for_each_channel(bdev,
2809 				      bdev_nvme_delete_io_path,
2810 				      nvme_ns,
2811 				      bdev_nvme_add_io_path_failed);
2812 	}
2813 }
2814 
2815 static int
2816 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns)
2817 {
2818 	struct nvme_ns *tmp_ns;
2819 	const struct spdk_nvme_ns_data *nsdata;
2820 
2821 	nsdata = spdk_nvme_ns_get_data(nvme_ns->ns);
2822 	if (!nsdata->nmic.can_share) {
2823 		SPDK_ERRLOG("Namespace cannot be shared.\n");
2824 		return -EINVAL;
2825 	}
2826 
2827 	pthread_mutex_lock(&bdev->mutex);
2828 
2829 	tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list);
2830 	assert(tmp_ns != NULL);
2831 
2832 	if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) {
2833 		pthread_mutex_unlock(&bdev->mutex);
2834 		SPDK_ERRLOG("Namespaces are not identical.\n");
2835 		return -EINVAL;
2836 	}
2837 
2838 	bdev->ref++;
2839 	TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
2840 	nvme_ns->bdev = bdev;
2841 
2842 	pthread_mutex_unlock(&bdev->mutex);
2843 
2844 	/* Add nvme_io_path to nvme_bdev_channels dynamically. */
2845 	spdk_for_each_channel(bdev,
2846 			      bdev_nvme_add_io_path,
2847 			      nvme_ns,
2848 			      bdev_nvme_add_io_path_done);
2849 
2850 	return 0;
2851 }
2852 
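/* Attach a single active namespace: look up its ANA state if an ANA log page is
 * cached, then either create a new nvme_bdev for it or, for multipath, add it to
 * the existing bdev after verifying the namespaces are identical.
 */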
2853 static void
2854 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2855 {
2856 	struct spdk_nvme_ns	*ns;
2857 	struct nvme_bdev	*bdev;
2858 	int			rc = 0;
2859 
2860 	ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id);
2861 	if (!ns) {
2862 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
2863 		rc = -EINVAL;
2864 		goto done;
2865 	}
2866 
2867 	nvme_ns->ns = ns;
2868 	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
2869 
2870 	if (nvme_ctrlr->ana_log_page != NULL) {
2871 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
2872 	}
2873 
2874 	bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id);
2875 	if (bdev == NULL) {
2876 		rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
2877 	} else {
2878 		rc = nvme_bdev_add_ns(bdev, nvme_ns);
2879 		if (rc == 0) {
2880 			return;
2881 		}
2882 	}
2883 done:
2884 	nvme_ctrlr_populate_namespace_done(nvme_ns, rc);
2885 }
2886 
2887 static void
2888 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns)
2889 {
2890 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
2891 
2892 	assert(nvme_ctrlr != NULL);
2893 
2894 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2895 
2896 	RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
2897 
2898 	if (nvme_ns->bdev != NULL) {
2899 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2900 		return;
2901 	}
2902 
2903 	free(nvme_ns);
2904 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2905 
2906 	nvme_ctrlr_release(nvme_ctrlr);
2907 }
2908 
2909 static void
2910 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status)
2911 {
2912 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2913 
2914 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
2915 }
2916 
2917 static void
2918 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2919 {
2920 	struct nvme_bdev *bdev;
2921 
2922 	bdev = nvme_ns->bdev;
2923 	if (bdev != NULL) {
2924 		pthread_mutex_lock(&bdev->mutex);
2925 
2926 		assert(bdev->ref > 0);
2927 		bdev->ref--;
2928 		if (bdev->ref == 0) {
2929 			pthread_mutex_unlock(&bdev->mutex);
2930 
2931 			spdk_bdev_unregister(&bdev->disk, NULL, NULL);
2932 		} else {
2933 			/* spdk_bdev_unregister() is not called until the last nvme_ns is
2934 			 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list
2935 			 * and clear nvme_ns->bdev here.
2936 			 */
2937 			TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq);
2938 			nvme_ns->bdev = NULL;
2939 
2940 			pthread_mutex_unlock(&bdev->mutex);
2941 
2942 			/* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that,
2943 			 * we call depopulate_namespace_done() to avoid use-after-free.
2944 			 */
2945 			spdk_for_each_channel(bdev,
2946 					      bdev_nvme_delete_io_path,
2947 					      nvme_ns,
2948 					      bdev_nvme_delete_io_path_done);
2949 			return;
2950 		}
2951 	}
2952 
2953 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
2954 }
2955 
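/* Reconcile our namespace list with the controller's active namespaces: resize bdevs
 * whose capacity changed, depopulate namespaces that disappeared, and populate newly
 * reported ones. When ctx is provided it counts outstanding populate operations so
 * that the attach callback fires only after all of them have completed.
 */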
2956 static void
2957 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
2958 			       struct nvme_async_probe_ctx *ctx)
2959 {
2960 	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
2961 	struct nvme_ns	*nvme_ns, *next;
2962 	struct spdk_nvme_ns	*ns;
2963 	struct nvme_bdev	*bdev;
2964 	uint32_t		nsid;
2965 	int			rc;
2966 	uint64_t		num_sectors;
2967 
2968 	if (ctx) {
2969 		/* Initialize this count to 1 to handle the populate functions
2970 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
2971 		 */
2972 		ctx->populates_in_progress = 1;
2973 	}
2974 
2975 	/* First loop over our existing namespaces and see if they have been
2976 	 * removed. */
2977 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
2978 	while (nvme_ns != NULL) {
2979 		next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
2980 
2981 		if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
2982 			/* NS is still there but attributes may have changed */
2983 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
2984 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
2985 			bdev = nvme_ns->bdev;
2986 			assert(bdev != NULL);
2987 			if (bdev->disk.blockcnt != num_sectors) {
2988 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
2989 					       nvme_ns->id,
2990 					       bdev->disk.name,
2991 					       bdev->disk.blockcnt,
2992 					       num_sectors);
2993 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
2994 				if (rc != 0) {
2995 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
2996 						    bdev->disk.name, rc);
2997 				}
2998 			}
2999 		} else {
3000 			/* Namespace was removed */
3001 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
3002 		}
3003 
3004 		nvme_ns = next;
3005 	}
3006 
3007 	/* Loop through all of the namespaces at the nvme level and see if any of them are new */
3008 	nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
3009 	while (nsid != 0) {
3010 		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
3011 
3012 		if (nvme_ns == NULL) {
3013 			/* Found a new one */
3014 			nvme_ns = calloc(1, sizeof(struct nvme_ns));
3015 			if (nvme_ns == NULL) {
3016 				SPDK_ERRLOG("Failed to allocate namespace\n");
3017 				/* This just fails to attach the namespace. It may work on a future
				 * attempt. Advance nsid first so that a persistent allocation failure
				 * cannot spin forever on the same namespace.
				 */
				nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
3018 				continue;
3019 			}
3020 
3021 			nvme_ns->id = nsid;
3022 			nvme_ns->ctrlr = nvme_ctrlr;
3023 
3024 			nvme_ns->bdev = NULL;
3025 
3026 			if (ctx) {
3027 				ctx->populates_in_progress++;
3028 			}
3029 			nvme_ns->probe_ctx = ctx;
3030 
3031 			RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
3032 
3033 			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns);
3034 		}
3035 
3036 		nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
3037 	}
3038 
3039 	if (ctx) {
3040 		/* Decrement this count now that the loop is over to account
3041 		 * for the one we started with.  If the count is then 0, we
3042 		 * know any populate_namespace functions completed immediately,
3043 		 * so we'll kick the callback here.
3044 		 */
3045 		ctx->populates_in_progress--;
3046 		if (ctx->populates_in_progress == 0) {
3047 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
3048 		}
3049 	}
3050 
3051 }
3052 
3053 static void
3054 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
3055 {
3056 	struct nvme_ns *nvme_ns, *tmp;
3057 
3058 	RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) {
3059 		nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
3060 	}
3061 }
3062 
3063 static int
3064 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
3065 			  void *cb_arg)
3066 {
3067 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
3068 	struct nvme_ns *nvme_ns;
3069 	uint32_t i, nsid;
3070 
3071 	for (i = 0; i < desc->num_of_nsid; i++) {
3072 		nsid = desc->nsid[i];
3073 		if (nsid == 0) {
3074 			continue;
3075 		}
3076 
3077 		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
3078 
3079 		assert(nvme_ns != NULL);
3080 		if (nvme_ns == NULL) {
3081 			/* Target told us that an inactive namespace had an ANA change */
3082 			continue;
3083 		}
3084 
3085 		nvme_ns->ana_group_id = desc->ana_group_id;
3086 		nvme_ns->ana_state = desc->ana_state;
3087 		nvme_ns->ana_state_updating = false;
3088 	}
3089 
3090 	return 0;
3091 }
3092 
3093 static void
3094 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
3095 {
3096 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
3097 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
3098 
3099 	_bdev_nvme_clear_io_path_cache(ctrlr_ch);
3100 
3101 	spdk_for_each_channel_continue(i, 0);
3102 }
3103 
3104 static void
3105 bdev_nvme_clear_io_path_cache_done(struct spdk_io_channel_iter *i, int status)
3106 {
3107 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
3108 
3109 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3110 
3111 	assert(nvme_ctrlr->ana_log_page_updating == true);
3112 	nvme_ctrlr->ana_log_page_updating = false;
3113 
3114 	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
3115 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
3116 		return;
3117 	}
3118 
3119 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3120 
3121 	nvme_ctrlr_unregister(nvme_ctrlr);
3122 }
3123 
3124 static void
3125 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
3126 {
3127 	struct nvme_ns *nvme_ns;
3128 
3129 	spdk_free(nvme_ctrlr->ana_log_page);
3130 	nvme_ctrlr->ana_log_page = NULL;
3131 
3132 	for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
3133 	     nvme_ns != NULL;
3134 	     nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
3135 		nvme_ns->ana_state_updating = false;
3136 		nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
3137 	}
3138 }
3139 
3140 static void
3141 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
3142 {
3143 	struct nvme_ctrlr *nvme_ctrlr = ctx;
3144 
3145 	if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) {
3146 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
3147 					     nvme_ctrlr);
3148 	} else {
3149 		bdev_nvme_disable_read_ana_log_page(nvme_ctrlr);
3150 	}
3151 
3152 	spdk_for_each_channel(nvme_ctrlr,
3153 			      bdev_nvme_clear_io_path_cache,
3154 			      NULL,
3155 			      bdev_nvme_clear_io_path_cache_done);
3156 }
3157 
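/* Issue an asynchronous Get Log Page for the ANA log. Only one update may be in
 * flight per controller; on completion the per-namespace ANA states are refreshed
 * and every channel's cached I/O path is cleared.
 */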
3158 static int
3159 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
3160 {
3161 	int rc;
3162 
3163 	if (nvme_ctrlr->ana_log_page == NULL) {
3164 		return -EINVAL;
3165 	}
3166 
3167 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3168 	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
3169 	    nvme_ctrlr->ana_log_page_updating) {
3170 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
3171 		return -EBUSY;
3172 	}
3173 
3174 	nvme_ctrlr->ana_log_page_updating = true;
3175 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3176 
3177 	rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
3178 					      SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
3179 					      SPDK_NVME_GLOBAL_NS_TAG,
3180 					      nvme_ctrlr->ana_log_page,
3181 					      nvme_ctrlr->ana_log_page_size, 0,
3182 					      nvme_ctrlr_read_ana_log_page_done,
3183 					      nvme_ctrlr);
3184 	if (rc != 0) {
3185 		nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL);
3186 	}
3187 
3188 	return rc;
3189 }
3190 
3191 static void
3192 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
3193 {
3194 	struct nvme_ctrlr *nvme_ctrlr		= arg;
3195 	union spdk_nvme_async_event_completion	event;
3196 
3197 	if (spdk_nvme_cpl_is_error(cpl)) {
3198 		SPDK_WARNLOG("AER request execution failed\n");
3199 		return;
3200 	}
3201 
3202 	event.raw = cpl->cdw0;
3203 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
3204 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
3205 		nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
3206 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
3207 		   (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
3208 		nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
3209 	}
3210 }
3211 
3212 static void
3213 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
3214 {
3215 	if (ctx->cb_fn) {
3216 		ctx->cb_fn(ctx->cb_ctx, count, rc);
3217 	}
3218 
3219 	ctx->namespaces_populated = true;
3220 	if (ctx->probe_done) {
3221 		/* The probe was already completed, so we need to free the context
3222 		 * here.  This can happen for cases like OCSSD, where we need to
3223 		 * send additional commands to the SSD after attach.
3224 		 */
3225 		free(ctx);
3226 	}
3227 }
3228 
3229 static void
3230 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
3231 		       struct nvme_async_probe_ctx *ctx)
3232 {
3233 	spdk_io_device_register(nvme_ctrlr,
3234 				bdev_nvme_create_ctrlr_channel_cb,
3235 				bdev_nvme_destroy_ctrlr_channel_cb,
3236 				sizeof(struct nvme_ctrlr_channel),
3237 				nvme_ctrlr->nbdev_ctrlr->name);
3238 
3239 	nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
3240 }
3241 
3242 static void
3243 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
3244 {
3245 	struct nvme_ctrlr *nvme_ctrlr = _ctx;
3246 	struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
3247 
3248 	nvme_ctrlr->probe_ctx = NULL;
3249 
3250 	if (spdk_nvme_cpl_is_error(cpl)) {
3251 		nvme_ctrlr_delete(nvme_ctrlr);
3252 
3253 		if (ctx != NULL) {
3254 			populate_namespaces_cb(ctx, 0, -1);
3255 		}
3256 		return;
3257 	}
3258 
3259 	nvme_ctrlr_create_done(nvme_ctrlr, ctx);
3260 }
3261 
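/* Allocate the ANA log page buffer and read it for the first time during controller
 * creation. The buffer is sized for the worst case reported by Identify Controller:
 * the page header plus NANAGRPID group descriptors plus NN namespace ID entries.
 */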
3262 static int
3263 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
3264 			     struct nvme_async_probe_ctx *ctx)
3265 {
3266 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
3267 	const struct spdk_nvme_ctrlr_data *cdata;
3268 	uint32_t ana_log_page_size;
3269 
3270 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3271 
3272 	ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
3273 			    sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn *
3274 			    sizeof(uint32_t);
3275 
3276 	nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
3277 						SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3278 	if (nvme_ctrlr->ana_log_page == NULL) {
3279 		SPDK_ERRLOG("could not allocate ANA log page buffer\n");
3280 		return -ENXIO;
3281 	}
3282 
3283 	/* Each descriptor in an ANA log page is not guaranteed to be 8-byte aligned.
3284 	 * Hence copy each descriptor to a temporary area when parsing it.
3285 	 *
3286 	 * Allocate a buffer as large as the ANA log page buffer because we do not
3287 	 * know the size of a descriptor until actually reading it.
3288 	 */
3289 	nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
3290 	if (nvme_ctrlr->copied_ana_desc == NULL) {
3291 		SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
3292 		return -ENOMEM;
3293 	}
3294 
3295 	nvme_ctrlr->ana_log_page_size = ana_log_page_size;
3296 
3297 	nvme_ctrlr->probe_ctx = ctx;
3298 
3299 	return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
3300 						SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
3301 						SPDK_NVME_GLOBAL_NS_TAG,
3302 						nvme_ctrlr->ana_log_page,
3303 						nvme_ctrlr->ana_log_page_size, 0,
3304 						nvme_ctrlr_init_ana_log_page_done,
3305 						nvme_ctrlr);
3306 }
3307 
3308 /* hostnqn and subnqn were already verified before attaching a controller.
3309  * Hence check only the multipath capability and cntlid here.
3310  */
3311 static bool
3312 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
3313 {
3314 	struct nvme_ctrlr *tmp;
3315 	const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;
3316 
3317 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3318 
3319 	if (!cdata->cmic.multi_ctrlr) {
3320 		SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
3321 		return false;
3322 	}
3323 
3324 	TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
3325 		tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);
3326 
3327 		if (!tmp_cdata->cmic.multi_ctrlr) {
3328 			SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", tmp_cdata->cntlid);
3329 			return false;
3330 		}
3331 		if (cdata->cntlid == tmp_cdata->cntlid) {
3332 			SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid);
3333 			return false;
3334 		}
3335 	}
3336 
3337 	return true;
3338 }
3339 
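/* Register nvme_ctrlr under the named nvme_bdev_ctrlr, creating the latter on first
 * use. If the name already exists, the new controller is accepted only as an
 * additional path, i.e. both controllers must report multipath support and have
 * distinct CNTLIDs.
 */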
3340 static int
3341 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
3342 {
3343 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
3344 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
3345 	int rc = 0;
3346 
3347 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3348 
3349 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
3350 	if (nbdev_ctrlr != NULL) {
3351 		if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
3352 			rc = -EINVAL;
3353 			goto exit;
3354 		}
3355 	} else {
3356 		nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
3357 		if (nbdev_ctrlr == NULL) {
3358 			SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
3359 			rc = -ENOMEM;
3360 			goto exit;
3361 		}
3362 		nbdev_ctrlr->name = strdup(name);
3363 		if (nbdev_ctrlr->name == NULL) {
3364 			SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
3365 			free(nbdev_ctrlr);
			rc = -ENOMEM;
3366 			goto exit;
3367 		}
3368 		TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
3369 		TAILQ_INIT(&nbdev_ctrlr->bdevs);
3370 		TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
3371 	}
3372 	nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
3373 	TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
3374 exit:
3375 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3376 	return rc;
3377 }
3378 
3379 static int
3380 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
3381 		  const char *name,
3382 		  const struct spdk_nvme_transport_id *trid,
3383 		  struct nvme_async_probe_ctx *ctx)
3384 {
3385 	struct nvme_ctrlr *nvme_ctrlr;
3386 	struct nvme_path_id *path_id;
3387 	const struct spdk_nvme_ctrlr_data *cdata;
3388 	int rc;
3389 
3390 	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
3391 	if (nvme_ctrlr == NULL) {
3392 		SPDK_ERRLOG("Failed to allocate device struct\n");
3393 		return -ENOMEM;
3394 	}
3395 
3396 	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
3397 	if (rc != 0) {
3398 		free(nvme_ctrlr);
3399 		return rc;
3400 	}
3401 
3402 	TAILQ_INIT(&nvme_ctrlr->trids);
3403 
3404 	RB_INIT(&nvme_ctrlr->namespaces);
3405 
3406 	path_id = calloc(1, sizeof(*path_id));
3407 	if (path_id == NULL) {
3408 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
3409 		rc = -ENOMEM;
3410 		goto err;
3411 	}
3412 
3413 	path_id->trid = *trid;
3414 	if (ctx != NULL) {
3415 		memcpy(path_id->hostid.hostaddr, ctx->opts.src_addr, sizeof(path_id->hostid.hostaddr));
3416 		memcpy(path_id->hostid.hostsvcid, ctx->opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
3417 	}
3418 	nvme_ctrlr->active_path_id = path_id;
3419 	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);
3420 
3421 	nvme_ctrlr->thread = spdk_get_thread();
3422 	nvme_ctrlr->ctrlr = ctrlr;
3423 	nvme_ctrlr->ref = 1;
3424 
3425 	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
3426 		SPDK_ERRLOG("OCSSDs are not supported\n");
3427 		rc = -ENOTSUP;
3428 		goto err;
3429 	}
3430 
3431 	if (ctx != NULL) {
3432 		nvme_ctrlr->prchk_flags = ctx->prchk_flags;
3433 		nvme_ctrlr->ctrlr_loss_timeout_sec = ctx->ctrlr_loss_timeout_sec;
3434 		nvme_ctrlr->reconnect_delay_sec = ctx->reconnect_delay_sec;
3435 		nvme_ctrlr->fast_io_fail_timeout_sec = ctx->fast_io_fail_timeout_sec;
3436 	}
3437 
3438 	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
3439 					  g_opts.nvme_adminq_poll_period_us);
3440 
3441 	if (g_opts.timeout_us > 0) {
3442 		/* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
3443 		/* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
3444 		uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
3445 					  g_opts.timeout_us : g_opts.timeout_admin_us;
3446 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
3447 				adm_timeout_us, timeout_cb, nvme_ctrlr);
3448 	}
3449 
3450 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
3451 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
3452 
3453 	if (spdk_nvme_ctrlr_get_flags(ctrlr) &
3454 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
3455 		nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
3456 	}
3457 
3458 	rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr);
3459 	if (rc != 0) {
3460 		goto err;
3461 	}
3462 
3463 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3464 
3465 	if (cdata->cmic.ana_reporting) {
3466 		rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
3467 		if (rc == 0) {
3468 			return 0;
3469 		}
3470 	} else {
3471 		nvme_ctrlr_create_done(nvme_ctrlr, ctx);
3472 		return 0;
3473 	}
3474 
3475 err:
3476 	nvme_ctrlr_delete(nvme_ctrlr);
3477 	return rc;
3478 }
3479 
3480 static void
3481 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3482 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
3483 {
3484 	char *name;
3485 
3486 	name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
3487 	if (!name) {
3488 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
3489 		return;
3490 	}
3491 
3492 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
3493 
3494 	nvme_ctrlr_create(ctrlr, name, trid, NULL);
3495 
3496 	free(name);
3497 }
3498 
3499 static void
3500 _nvme_ctrlr_destruct(void *ctx)
3501 {
3502 	struct nvme_ctrlr *nvme_ctrlr = ctx;
3503 
3504 	nvme_ctrlr_depopulate_namespaces(nvme_ctrlr);
3505 	nvme_ctrlr_release(nvme_ctrlr);
3506 }
3507 
3508 static int
3509 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
3510 {
3511 	struct nvme_probe_skip_entry *entry;
3512 
3513 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3514 
3515 	/* The controller's destruction was already started */
3516 	if (nvme_ctrlr->destruct) {
3517 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
3518 		return 0;
3519 	}
3520 
3521 	if (!hotplug &&
3522 	    nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
3523 		entry = calloc(1, sizeof(*entry));
3524 		if (!entry) {
3525 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
3526 			return -ENOMEM;
3527 		}
3528 		entry->trid = nvme_ctrlr->active_path_id->trid;
3529 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
3530 	}
3531 
3532 	nvme_ctrlr->destruct = true;
3533 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3534 
3535 	_nvme_ctrlr_destruct(nvme_ctrlr);
3536 
3537 	return 0;
3538 }
3539 
3540 static void
3541 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
3542 {
3543 	struct nvme_ctrlr *nvme_ctrlr = cb_ctx;
3544 
3545 	_bdev_nvme_delete(nvme_ctrlr, true);
3546 }
3547 
3548 static int
3549 bdev_nvme_hotplug_probe(void *arg)
3550 {
3551 	if (g_hotplug_probe_ctx == NULL) {
3552 		spdk_poller_unregister(&g_hotplug_probe_poller);
3553 		return SPDK_POLLER_IDLE;
3554 	}
3555 
3556 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
3557 		g_hotplug_probe_ctx = NULL;
3558 		spdk_poller_unregister(&g_hotplug_probe_poller);
3559 	}
3560 
3561 	return SPDK_POLLER_BUSY;
3562 }
3563 
3564 static int
3565 bdev_nvme_hotplug(void *arg)
3566 {
3567 	struct spdk_nvme_transport_id trid_pcie;
3568 
3569 	if (g_hotplug_probe_ctx) {
3570 		return SPDK_POLLER_BUSY;
3571 	}
3572 
3573 	memset(&trid_pcie, 0, sizeof(trid_pcie));
3574 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
3575 
3576 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
3577 			      hotplug_probe_cb, attach_cb, NULL);
3578 
3579 	if (g_hotplug_probe_ctx) {
3580 		assert(g_hotplug_probe_poller == NULL);
3581 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
3582 	}
3583 
3584 	return SPDK_POLLER_BUSY;
3585 }
3586 
3587 void
3588 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
3589 {
3590 	*opts = g_opts;
3591 }
3592 
3593 static int
3594 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
3595 {
3596 	if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
3597 		/* Can't set timeout_admin_us without also setting timeout_us */
3598 		SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
3599 		return -EINVAL;
3600 	}
3601 
3602 	if (opts->bdev_retry_count < -1) {
3603 		SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n");
3604 		return -EINVAL;
3605 	}
3606 
3607 	return 0;
3608 }
3609 
3610 int
3611 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
3612 {
3613 	int ret = bdev_nvme_validate_opts(opts);
3614 	if (ret) {
3615 		SPDK_WARNLOG("Failed to set nvme opts.\n");
3616 		return ret;
3617 	}
3618 
3619 	if (g_bdev_nvme_init_thread != NULL) {
3620 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
3621 			return -EPERM;
3622 		}
3623 	}
3624 
3625 	g_opts = *opts;
3626 
3627 	return 0;
3628 }
3629 
3630 struct set_nvme_hotplug_ctx {
3631 	uint64_t period_us;
3632 	bool enabled;
3633 	spdk_msg_fn fn;
3634 	void *fn_ctx;
3635 };
3636 
3637 static void
3638 set_nvme_hotplug_period_cb(void *_ctx)
3639 {
3640 	struct set_nvme_hotplug_ctx *ctx = _ctx;
3641 
3642 	spdk_poller_unregister(&g_hotplug_poller);
3643 	if (ctx->enabled) {
3644 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
3645 	}
3646 
3647 	g_nvme_hotplug_poll_period_us = ctx->period_us;
3648 	g_nvme_hotplug_enabled = ctx->enabled;
3649 	if (ctx->fn) {
3650 		ctx->fn(ctx->fn_ctx);
3651 	}
3652 
3653 	free(ctx);
3654 }
3655 
3656 int
3657 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
3658 {
3659 	struct set_nvme_hotplug_ctx *ctx;
3660 
3661 	if (enabled == true && !spdk_process_is_primary()) {
3662 		return -EPERM;
3663 	}
3664 
3665 	ctx = calloc(1, sizeof(*ctx));
3666 	if (ctx == NULL) {
3667 		return -ENOMEM;
3668 	}
3669 
3670 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
3671 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
3672 	ctx->enabled = enabled;
3673 	ctx->fn = cb;
3674 	ctx->fn_ctx = cb_ctx;
3675 
3676 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
3677 	return 0;
3678 }
3679 
3680 static void
3681 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
3682 				    struct nvme_async_probe_ctx *ctx)
3683 {
3684 	struct nvme_ns	*nvme_ns;
3685 	struct nvme_bdev	*nvme_bdev;
3686 	size_t			j;
3687 
3688 	assert(nvme_ctrlr != NULL);
3689 
3690 	if (ctx->names == NULL) {
3691 		populate_namespaces_cb(ctx, 0, 0);
3692 		return;
3693 	}
3694 
3695 	/*
3696 	 * Report the new bdevs that were created in this call.
3697 	 * There can be more than one bdev per NVMe controller.
3698 	 */
3699 	j = 0;
3700 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
3701 	while (nvme_ns != NULL) {
3702 		nvme_bdev = nvme_ns->bdev;
3703 		if (j < ctx->count) {
3704 			ctx->names[j] = nvme_bdev->disk.name;
3705 			j++;
3706 		} else {
3707 			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
3708 				    ctx->count);
3709 			populate_namespaces_cb(ctx, 0, -ERANGE);
3710 			return;
3711 		}
3712 
3713 		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
3714 	}
3715 
3716 	populate_namespaces_cb(ctx, j, 0);
3717 }
3718 
3719 static int
3720 bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr,
3721 			struct spdk_nvme_ctrlr *new_ctrlr,
3722 			struct spdk_nvme_transport_id *trid)
3723 {
3724 	struct nvme_path_id *tmp_trid;
3725 
3726 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
3727 		SPDK_ERRLOG("PCIe failover is not supported.\n");
3728 		return -ENOTSUP;
3729 	}
3730 
3731 	/* Currently we only support failover to the same transport type. */
3732 	if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
3733 		return -EINVAL;
3734 	}
3735 
3736 	/* Currently we only support failover to the same NQN. */
3737 	if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
3738 		return -EINVAL;
3739 	}
3740 
3741 	/* Skip all the other checks if we've already registered this path. */
3742 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
3743 		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
3744 			return -EEXIST;
3745 		}
3746 	}
3747 
3748 	return 0;
3749 }
3750 
3751 static int
3752 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr,
3753 			     struct spdk_nvme_ctrlr *new_ctrlr)
3754 {
3755 	struct nvme_ns *nvme_ns;
3756 	struct spdk_nvme_ns *new_ns;
3757 
3758 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
3759 	while (nvme_ns != NULL) {
3760 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
3761 		assert(new_ns != NULL);
3762 
3763 		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
3764 			return -EINVAL;
3765 		}
3766 
3767 		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
3768 	}
3769 
3770 	return 0;
3771 }
3772 
3773 static int
3774 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
3775 			      struct spdk_nvme_transport_id *trid)
3776 {
3777 	struct nvme_path_id *new_trid, *tmp_trid;
3778 
3779 	new_trid = calloc(1, sizeof(*new_trid));
3780 	if (new_trid == NULL) {
3781 		return -ENOMEM;
3782 	}
3783 	new_trid->trid = *trid;
3784 	new_trid->is_failed = false;
3785 
3786 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
3787 		if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) {
3788 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
3789 			return 0;
3790 		}
3791 	}
3792 
3793 	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
3794 	return 0;
3795 }
3796 
3797 /* This is the case that a secondary path is added to an existing
3798  * nvme_ctrlr for failover. After checking if it can access the same
3799  * namespaces as the primary path, it is disconnected until failover occurs.
3800  */
3801 static int
3802 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
3803 			     struct spdk_nvme_ctrlr *new_ctrlr,
3804 			     struct spdk_nvme_transport_id *trid)
3805 {
3806 	int rc;
3807 
3808 	assert(nvme_ctrlr != NULL);
3809 
3810 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3811 
3812 	rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid);
3813 	if (rc != 0) {
3814 		goto exit;
3815 	}
3816 
3817 	rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr);
3818 	if (rc != 0) {
3819 		goto exit;
3820 	}
3821 
3822 	rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
3823 
3824 exit:
3825 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3826 
3827 	spdk_nvme_detach(new_ctrlr);
3828 
3829 	return rc;
3830 }
3831 
3832 static void
3833 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3834 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
3835 {
3836 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
3837 	struct nvme_async_probe_ctx *ctx;
3838 	int rc;
3839 
3840 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
3841 	ctx->ctrlr_attached = true;
3842 
3843 	rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx);
3844 	if (rc != 0) {
3845 		populate_namespaces_cb(ctx, 0, rc);
3846 	}
3847 }
3848 
3849 static void
3850 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3851 			struct spdk_nvme_ctrlr *ctrlr,
3852 			const struct spdk_nvme_ctrlr_opts *opts)
3853 {
3854 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
3855 	struct nvme_ctrlr *nvme_ctrlr;
3856 	struct nvme_async_probe_ctx *ctx;
3857 	int rc;
3858 
3859 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
3860 	ctx->ctrlr_attached = true;
3861 
3862 	nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
3863 	if (nvme_ctrlr) {
3864 		rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
3865 	} else {
3866 		rc = -ENODEV;
3867 	}
3868 
3869 	populate_namespaces_cb(ctx, 0, rc);
3870 }
3871 
3872 static int
3873 bdev_nvme_async_poll(void *arg)
3874 {
3875 	struct nvme_async_probe_ctx	*ctx = arg;
3876 	int				rc;
3877 
3878 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
3879 	if (spdk_unlikely(rc != -EAGAIN)) {
3880 		ctx->probe_done = true;
3881 		spdk_poller_unregister(&ctx->poller);
3882 		if (!ctx->ctrlr_attached) {
3883 			/* The probe is done, but no controller was attached.
3884 			 * That means we had a failure, so report -EIO back to
3885 			 * the caller (usually the RPC). populate_namespaces_cb()
3886 			 * will take care of freeing the nvme_async_probe_ctx.
3887 			 */
3888 			populate_namespaces_cb(ctx, 0, -EIO);
3889 		} else if (ctx->namespaces_populated) {
3890 			/* The namespaces for the attached controller were all
3891 			 * populated and the response was already sent to the
3892 			 * caller (usually the RPC).  So free the context here.
3893 			 */
3894 			free(ctx);
3895 		}
3896 	}
3897 
3898 	return SPDK_POLLER_BUSY;
3899 }
3900 
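/* Validate the reconnect/failover timers as a set.  Accepted combinations:
 *   - ctrlr_loss_timeout_sec == 0: reconnect_delay_sec and
 *     fast_io_fail_timeout_sec must both be 0.
 *   - ctrlr_loss_timeout_sec == -1 (reconnect is retried indefinitely):
 *     reconnect_delay_sec must be non-zero and, if set,
 *     fast_io_fail_timeout_sec must be at least reconnect_delay_sec.
 *   - ctrlr_loss_timeout_sec > 0: reconnect_delay_sec must be non-zero and no
 *     larger than ctrlr_loss_timeout_sec; if set, fast_io_fail_timeout_sec
 *     must fall between reconnect_delay_sec and ctrlr_loss_timeout_sec.
 */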
3901 static bool
3902 bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec,
3903 				 uint32_t reconnect_delay_sec,
3904 				 uint32_t fast_io_fail_timeout_sec)
3905 {
3906 	if (ctrlr_loss_timeout_sec < -1) {
3907 		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
3908 		return false;
3909 	} else if (ctrlr_loss_timeout_sec == -1) {
3910 		if (reconnect_delay_sec == 0) {
3911 			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
3912 			return false;
3913 		} else if (fast_io_fail_timeout_sec != 0 &&
3914 			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
3915 			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
3916 			return false;
3917 		}
3918 	} else if (ctrlr_loss_timeout_sec != 0) {
3919 		if (reconnect_delay_sec == 0) {
3920 			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
3921 			return false;
3922 		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
3923 			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
3924 			return false;
3925 		} else if (fast_io_fail_timeout_sec != 0) {
3926 			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
3927 				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
3928 				return false;
3929 			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
3930 				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
3931 				return false;
3932 			}
3933 		}
3934 	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
3935 		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
3936 		return false;
3937 	}
3938 
3939 	return true;
3940 }
3941 
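/* Asynchronously attach the controller at *trid and create a bdev per active
 * namespace.  cb_fn is invoked once attach and namespace population finish,
 * with the number of bdevs created; their names are returned through the
 * names/count arrays.  If an nvme_bdev_ctrlr named base_name already exists
 * and multipath is false, the new trid is added as a failover path via
 * connect_set_failover_cb instead of creating new bdevs.
 */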
3942 int
3943 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
3944 		 const char *base_name,
3945 		 const char **names,
3946 		 uint32_t count,
3947 		 uint32_t prchk_flags,
3948 		 spdk_bdev_create_nvme_fn cb_fn,
3949 		 void *cb_ctx,
3950 		 struct spdk_nvme_ctrlr_opts *opts,
3951 		 bool multipath,
3952 		 int32_t ctrlr_loss_timeout_sec,
3953 		 uint32_t reconnect_delay_sec,
3954 		 uint32_t fast_io_fail_timeout_sec)
3955 {
3956 	struct nvme_probe_skip_entry	*entry, *tmp;
3957 	struct nvme_async_probe_ctx	*ctx;
3958 	spdk_nvme_attach_cb attach_cb;
3959 
3960 	/* TODO expand this check to include both the host and target TRIDs.
3961 	 * Only if both are the same should we fail.
3962 	 */
3963 	if (nvme_ctrlr_get(trid) != NULL) {
3964 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
3965 		return -EEXIST;
3966 	}
3967 
3968 	if (!bdev_nvme_check_multipath_params(ctrlr_loss_timeout_sec, reconnect_delay_sec,
3969 					      fast_io_fail_timeout_sec)) {
3970 		return -EINVAL;
3971 	}
3972 
3973 	ctx = calloc(1, sizeof(*ctx));
3974 	if (!ctx) {
3975 		return -ENOMEM;
3976 	}
3977 	ctx->base_name = base_name;
3978 	ctx->names = names;
3979 	ctx->count = count;
3980 	ctx->cb_fn = cb_fn;
3981 	ctx->cb_ctx = cb_ctx;
3982 	ctx->prchk_flags = prchk_flags;
3983 	ctx->trid = *trid;
3984 	ctx->ctrlr_loss_timeout_sec = ctrlr_loss_timeout_sec;
3985 	ctx->reconnect_delay_sec = reconnect_delay_sec;
3986 	ctx->fast_io_fail_timeout_sec = fast_io_fail_timeout_sec;
3987 
3988 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
3989 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
3990 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
3991 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
3992 				free(entry);
3993 				break;
3994 			}
3995 		}
3996 	}
3997 
3998 	if (opts) {
3999 		memcpy(&ctx->opts, opts, sizeof(*opts));
4000 	} else {
4001 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
4002 	}
4003 
4004 	ctx->opts.transport_retry_count = g_opts.transport_retry_count;
4005 	ctx->opts.transport_ack_timeout = g_opts.transport_ack_timeout;
4006 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
4007 	ctx->opts.disable_read_ana_log_page = true;
4008 
4009 	if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) {
4010 		attach_cb = connect_attach_cb;
4011 	} else {
4012 		attach_cb = connect_set_failover_cb;
4013 	}
4014 
4015 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, attach_cb);
4016 	if (ctx->probe_ctx == NULL) {
4017 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
4018 		free(ctx);
4019 		return -ENODEV;
4020 	}
4021 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
4022 
4023 	return 0;
4024 }
4025 
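/* Remove the path(s) of the named nvme_bdev_ctrlr that match *path_id.  Fields
 * of path_id that are zero (or empty strings) act as wildcards, so a zeroed
 * path_id matches every registered path.  Removing the active path triggers a
 * failover if an alternate path exists, or destructs the nvme_ctrlr if it was
 * the only path; non-active paths are simply unlinked and freed.
 */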
4026 int
4027 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id)
4028 {
4029 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
4030 	struct nvme_ctrlr	*nvme_ctrlr, *tmp_nvme_ctrlr;
4031 	struct nvme_path_id	*p, *t;
4032 	int			rc = -ENXIO;
4033 
4034 	if (name == NULL || path_id == NULL) {
4035 		return -EINVAL;
4036 	}
4037 
4038 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
4039 	if (nbdev_ctrlr == NULL) {
4040 		SPDK_ERRLOG("Failed to find NVMe bdev controller\n");
4041 		return -ENODEV;
4042 	}
4043 
4044 	TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) {
4045 		TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) {
4046 			if (path_id->trid.trtype != 0) {
4047 				if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) {
4048 					if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) {
4049 						continue;
4050 					}
4051 				} else {
4052 					if (path_id->trid.trtype != p->trid.trtype) {
4053 						continue;
4054 					}
4055 				}
4056 			}
4057 
4058 			if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) {
4059 				if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) {
4060 					continue;
4061 				}
4062 			}
4063 
4064 			if (path_id->trid.adrfam != 0) {
4065 				if (path_id->trid.adrfam != p->trid.adrfam) {
4066 					continue;
4067 				}
4068 			}
4069 
4070 			if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) {
4071 				if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) {
4072 					continue;
4073 				}
4074 			}
4075 
4076 			if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) {
4077 				if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) {
4078 					continue;
4079 				}
4080 			}
4081 
4082 			if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) {
4083 				if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) {
4084 					continue;
4085 				}
4086 			}
4087 
4088 			if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) {
4089 				if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) {
4090 					continue;
4091 				}
4092 			}
4093 
4094 			/* If we made it here, then this path is a match! Now we need to remove it. */
4095 			if (p == nvme_ctrlr->active_path_id) {
4096 				/* This is the active path in use right now. The active path is always the first in the list. */
4097 
4098 				if (!TAILQ_NEXT(p, link)) {
4099 					/* The current path is the only path. */
4100 					rc = _bdev_nvme_delete(nvme_ctrlr, false);
4101 				} else {
4102 					/* There is an alternative path. */
4103 					rc = bdev_nvme_failover(nvme_ctrlr, true);
4104 				}
4105 			} else {
4106 				/* We are not using the specified path. */
4107 				TAILQ_REMOVE(&nvme_ctrlr->trids, p, link);
4108 				free(p);
4109 				rc = 0;
4110 			}
4111 
4112 			if (rc < 0 && rc != -ENXIO) {
4113 				return rc;
4114 			}
4115 
4116 
4117 		}
4118 	}
4119 
4120 	/* All nvme_ctrlrs were deleted, or no nvme_ctrlr that had the trid was found. */
4121 	return rc;
4122 }
4123 
4124 #define DISCOVERY_DEBUGLOG(ctx, format, ...) \
4125 	SPDK_DEBUGLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__);
4126 
4127 #define DISCOVERY_ERRLOG(ctx, format, ...) \
4128 	SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__);
4129 
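/* Discovery service context.  A discovery_ctx tracks one connection to a
 * discovery controller: it fetches the discovery log page, calls
 * bdev_nvme_create() for each NVM subsystem entry (reusing the existing name
 * when an entry shares a subnqn with one already attached), deletes
 * controllers for entries that disappear from the log page, and re-reads the
 * log page when the discovery controller signals a change via AER.
 */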
4130 struct discovery_entry_ctx {
4131 	char						name[128];
4132 	struct spdk_nvme_transport_id			trid;
4133 	struct spdk_nvme_ctrlr_opts			opts;
4134 	struct spdk_nvmf_discovery_log_page_entry	entry;
4135 	TAILQ_ENTRY(discovery_entry_ctx)		tailq;
4136 	struct discovery_ctx				*ctx;
4137 };
4138 
4139 struct discovery_ctx {
4140 	char					*name;
4141 	spdk_bdev_nvme_start_discovery_fn	start_cb_fn;
4142 	spdk_bdev_nvme_stop_discovery_fn	stop_cb_fn;
4143 	void					*cb_ctx;
4144 	struct spdk_nvme_probe_ctx		*probe_ctx;
4145 	struct spdk_nvme_detach_ctx		*detach_ctx;
4146 	struct spdk_nvme_ctrlr			*ctrlr;
4147 	struct spdk_nvme_transport_id		trid;
4148 	struct spdk_poller			*poller;
4149 	struct spdk_nvme_ctrlr_opts		opts;
4150 	struct spdk_nvmf_discovery_log_page	*log_page;
4151 	TAILQ_ENTRY(discovery_ctx)		tailq;
4152 	TAILQ_HEAD(, discovery_entry_ctx)	nvm_entry_ctxs;
4153 	TAILQ_HEAD(, discovery_entry_ctx)	discovery_entry_ctxs;
4154 	int					rc;
4155 	/* Denotes if a discovery is currently in progress for this context.
4156 	 * That includes connecting to newly discovered subsystems.  Used to
4157 	 * ensure we do not start a new discovery until an existing one is
4158 	 * complete.
4159 	 */
4160 	bool					in_progress;
4161 
4162 	/* Denotes if another discovery is needed after the one in progress
4163 	 * completes.  Set when we receive an AER completion while a discovery
4164 	 * is already in progress.
4165 	 */
4166 	bool					pending;
4167 
4168 	/* Signal to the discovery context poller that it should detach from
4169 	 * the discovery controller.
4170 	 */
4171 	bool					detach;
4172 
4173 	struct spdk_thread			*calling_thread;
4174 	uint32_t				index;
4175 	uint32_t				attach_in_progress;
4176 	char					*hostnqn;
4177 };
4178 
4179 TAILQ_HEAD(discovery_ctxs, discovery_ctx);
4180 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs);
4181 
4182 static void get_discovery_log_page(struct discovery_ctx *ctx);
4183 
4184 static void
4185 free_discovery_ctx(struct discovery_ctx *ctx)
4186 {
4187 	free(ctx->hostnqn);
4188 	free(ctx->name);
4189 	free(ctx);
4190 }
4191 
4192 static void
4193 discovery_complete(struct discovery_ctx *ctx)
4194 {
4195 	ctx->in_progress = false;
4196 	if (ctx->pending) {
4197 		ctx->pending = false;
4198 		get_discovery_log_page(ctx);
4199 	}
4200 }
4201 
4202 static void
4203 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid,
4204 			       struct spdk_nvmf_discovery_log_page_entry *entry)
4205 {
4206 	char *space;
4207 
4208 	trid->trtype = entry->trtype;
4209 	trid->adrfam = entry->adrfam;
4210 	memcpy(trid->traddr, entry->traddr, sizeof(trid->traddr));
4211 	memcpy(trid->trsvcid, entry->trsvcid, sizeof(trid->trsvcid));
4212 	memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn));
4213 
4214 	/* We want the traddr, trsvcid and subnqn fields to be NULL-terminated.
4215 	 * But the log page entries typically pad them with spaces, not zeroes.
4216 	 * So add a NULL terminator to each of these fields at the appropriate
4217 	 * location.
4218 	 */
4219 	space = strchr(trid->traddr, ' ');
4220 	if (space) {
4221 		*space = 0;
4222 	}
4223 	space = strchr(trid->trsvcid, ' ');
4224 	if (space) {
4225 		*space = 0;
4226 	}
4227 	space = strchr(trid->subnqn, ' ');
4228 	if (space) {
4229 		*space = 0;
4230 	}
4231 }
4232 
4233 static void
4234 discovery_remove_controllers(struct discovery_ctx *ctx)
4235 {
4236 	struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page;
4237 	struct discovery_entry_ctx *entry_ctx, *tmp;
4238 	struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry;
4239 	struct spdk_nvme_transport_id old_trid;
4240 	uint64_t numrec, i;
4241 	bool found;
4242 
4243 	numrec = from_le64(&log_page->numrec);
4244 	TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) {
4245 		found = false;
4246 		old_entry = &entry_ctx->entry;
4247 		build_trid_from_log_page_entry(&old_trid, old_entry);
4248 		for (i = 0; i < numrec; i++) {
4249 			new_entry = &log_page->entries[i];
4250 			if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) {
4251 				DISCOVERY_DEBUGLOG(ctx, "NVM %s:%s:%s found again\n",
4252 						   old_trid.subnqn, old_trid.traddr, old_trid.trsvcid);
4253 				found = true;
4254 				break;
4255 			}
4256 		}
4257 		if (!found) {
4258 			struct nvme_path_id path = {};
4259 
4260 			DISCOVERY_DEBUGLOG(ctx, "NVM %s:%s:%s not found\n",
4261 					   old_trid.subnqn, old_trid.traddr, old_trid.trsvcid);
4262 
4263 			path.trid = entry_ctx->trid;
4264 			bdev_nvme_delete(entry_ctx->name, &path);
4265 			TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq);
4266 			free(entry_ctx);
4267 		}
4268 	}
4269 	free(log_page);
4270 	ctx->log_page = NULL;
4271 	discovery_complete(ctx);
4272 }
4273 
4274 static void
4275 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc)
4276 {
4277 	struct discovery_entry_ctx *entry_ctx = cb_ctx;
4278 	struct discovery_ctx *ctx = entry_ctx->ctx;
4279 
4280 	DISCOVERY_DEBUGLOG(ctx, "attach %s done\n", entry_ctx->name);
4281 	ctx->attach_in_progress--;
4282 	if (ctx->attach_in_progress == 0) {
4283 		discovery_remove_controllers(ctx);
4284 	}
4285 }
4286 
4287 static void
4288 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl,
4289 		      struct spdk_nvmf_discovery_log_page *log_page)
4290 {
4291 	struct discovery_ctx *ctx = cb_arg;
4292 	struct discovery_entry_ctx *entry_ctx, *tmp;
4293 	struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry;
4294 	uint64_t numrec, i;
4295 	bool found;
4296 
4297 	if (rc || spdk_nvme_cpl_is_error(cpl)) {
4298 		DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n");
4299 		return;
4300 	}
4301 
4302 	ctx->log_page = log_page;
4303 	assert(ctx->attach_in_progress == 0);
4304 	numrec = from_le64(&log_page->numrec);
4305 	TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) {
4306 		TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq);
4307 		free(entry_ctx);
4308 	}
4309 	for (i = 0; i < numrec; i++) {
4310 		found = false;
4311 		new_entry = &log_page->entries[i];
4312 		if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
4313 			struct discovery_entry_ctx *new_ctx;
4314 
4315 			new_ctx = calloc(1, sizeof(*new_ctx));
4316 			if (new_ctx == NULL) {
4317 				DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
4318 				break;
4319 			}
4320 
4321 			new_ctx->ctx = ctx;
4322 			memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry));
4323 			build_trid_from_log_page_entry(&new_ctx->trid, new_entry);
4324 			spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->opts, sizeof(new_ctx->opts));
4325 			snprintf(new_ctx->opts.hostnqn, sizeof(new_ctx->opts.hostnqn), "%s", ctx->hostnqn);
4326 			TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq);
4327 			continue;
4328 		}
4329 		TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) {
4330 			old_entry = &entry_ctx->entry;
4331 			if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) {
4332 				found = true;
4333 				break;
4334 			}
4335 		}
4336 		if (!found) {
4337 			struct discovery_entry_ctx *subnqn_ctx, *new_ctx;
4338 
4339 			TAILQ_FOREACH(subnqn_ctx, &ctx->nvm_entry_ctxs, tailq) {
4340 				if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn,
4341 					    sizeof(new_entry->subnqn))) {
4342 					break;
4343 				}
4344 			}
4345 
4346 			new_ctx = calloc(1, sizeof(*new_ctx));
4347 			if (new_ctx == NULL) {
4348 				DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
4349 				break;
4350 			}
4351 
4352 			new_ctx->ctx = ctx;
4353 			memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry));
4354 			build_trid_from_log_page_entry(&new_ctx->trid, new_entry);
4355 			if (subnqn_ctx) {
4356 				snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name);
4357 				DISCOVERY_DEBUGLOG(ctx, "NVM %s:%s:%s new path for %s\n",
4358 						   new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid,
4359 						   new_ctx->name);
4360 			} else {
4361 				snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++);
4362 				DISCOVERY_DEBUGLOG(ctx, "NVM %s:%s:%s new subsystem %s\n",
4363 						   new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid,
4364 						   new_ctx->name);
4365 			}
4366 			spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->opts, sizeof(new_ctx->opts));
4367 			snprintf(new_ctx->opts.hostnqn, sizeof(new_ctx->opts.hostnqn), "%s", ctx->hostnqn);
4368 			rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 0,
4369 					      discovery_attach_controller_done, new_ctx,
4370 					      &new_ctx->opts, true, 0, 0, 0);
4371 			if (rc == 0) {
4372 				TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq);
4373 				ctx->attach_in_progress++;
4374 			} else {
4375 				DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc));
4376 			}
4377 		}
4378 	}
4379 
4380 	if (ctx->attach_in_progress == 0) {
4381 		discovery_remove_controllers(ctx);
4382 	}
4383 }
4384 
4385 static void
4386 get_discovery_log_page(struct discovery_ctx *ctx)
4387 {
4388 	int rc;
4389 
4390 	assert(ctx->in_progress == false);
4391 	ctx->in_progress = true;
4392 	rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx);
4393 	if (rc != 0) {
4394 		DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n");
4395 	}
4396 	DISCOVERY_DEBUGLOG(ctx, "sent discovery log page command\n");
4397 }
4398 
4399 static void
4400 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
4401 {
4402 	struct discovery_ctx *ctx = arg;
4403 	uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
4404 
4405 	if (spdk_nvme_cpl_is_error(cpl)) {
4406 		DISCOVERY_ERRLOG(ctx, "aer failed\n");
4407 		return;
4408 	}
4409 
4410 	if (log_page_id != SPDK_NVME_LOG_DISCOVERY) {
4411 		DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id);
4412 		return;
4413 	}
4414 
4415 	DISCOVERY_DEBUGLOG(ctx, "got aer\n");
4416 	if (ctx->in_progress) {
4417 		ctx->pending = true;
4418 		return;
4419 	}
4420 
4421 	get_discovery_log_page(ctx);
4422 }
4423 
4424 static void
4425 start_discovery_done(void *cb_ctx)
4426 {
4427 	struct discovery_ctx *ctx = cb_ctx;
4428 
4429 	DISCOVERY_DEBUGLOG(ctx, "start discovery done\n");
4430 	ctx->start_cb_fn(ctx->cb_ctx, ctx->rc);
4431 	if (ctx->rc != 0) {
4432 		DISCOVERY_ERRLOG(ctx, "could not connect to discovery ctrlr\n");
4433 		TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
4434 		free_discovery_ctx(ctx);
4435 	}
4436 }
4437 
4438 static void
4439 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
4440 		    struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
4441 {
4442 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
4443 	struct discovery_ctx *ctx;
4444 
4445 	ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, opts);
4446 
4447 	DISCOVERY_DEBUGLOG(ctx, "discovery ctrlr attached\n");
4448 	ctx->probe_ctx = NULL;
4449 	ctx->ctrlr = ctrlr;
4450 	spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx);
4451 }
4452 
4453 static int
4454 discovery_poller(void *arg)
4455 {
4456 	struct discovery_ctx *ctx = arg;
4457 	int rc;
4458 
4459 	if (ctx->detach) {
4460 		bool detach_done = false;
4461 
4462 		if (ctx->detach_ctx == NULL) {
4463 			rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
4464 			if (rc != 0) {
4465 				DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n");
4466 				detach_done = true;
4467 			}
4468 		} else {
4469 			rc = spdk_nvme_detach_poll_async(ctx->detach_ctx);
4470 			if (rc != -EAGAIN) {
4471 				detach_done = true;
4472 			}
4473 		}
4474 		if (detach_done) {
4475 			spdk_poller_unregister(&ctx->poller);
4476 			TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
4477 			ctx->stop_cb_fn(ctx->cb_ctx);
4478 			free_discovery_ctx(ctx);
4479 		}
4480 	} else if (ctx->probe_ctx) {
4481 		rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
4482 		if (rc != -EAGAIN) {
4483 			DISCOVERY_DEBUGLOG(ctx, "discovery ctrlr connected\n");
4484 			ctx->rc = rc;
4485 			spdk_thread_send_msg(ctx->calling_thread, start_discovery_done, ctx);
4486 			if (rc == 0) {
4487 				get_discovery_log_page(ctx);
4488 			}
4489 		}
4490 	} else {
4491 		spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr);
4492 	}
4493 
4494 	return SPDK_POLLER_BUSY;
4495 }
4496 
4497 static void
4498 start_discovery_poller(void *arg)
4499 {
4500 	struct discovery_ctx *ctx = arg;
4501 
4502 	TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq);
4503 	ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000);
4504 }
4505 
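/* Start the discovery service: connect to the discovery controller described
 * by *trid (subnqn is forced to SPDK_NVMF_DISCOVERY_NQN), register an AER
 * callback, and poll the connection from the bdev_nvme init thread.  cb_fn is
 * invoked on the calling thread once the initial connect completes.
 */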
4506 int
4507 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid,
4508 			  const char *base_name,
4509 			  struct spdk_nvme_ctrlr_opts *opts,
4510 			  spdk_bdev_nvme_start_discovery_fn cb_fn,
4511 			  void *cb_ctx)
4512 {
4513 	struct discovery_ctx *ctx;
4514 
4515 	ctx = calloc(1, sizeof(*ctx));
4516 	if (ctx == NULL) {
4517 		return -ENOMEM;
4518 	}
4519 
4520 	ctx->name = strdup(base_name);
4521 	if (ctx->name == NULL) {
4522 		free_discovery_ctx(ctx);
4523 		return -ENOMEM;
4524 	}
4525 	ctx->start_cb_fn = cb_fn;
4526 	ctx->cb_ctx = cb_ctx;
4527 	memcpy(&ctx->opts, opts, sizeof(*opts));
4528 	ctx->calling_thread = spdk_get_thread();
4529 	TAILQ_INIT(&ctx->nvm_entry_ctxs);
4530 	TAILQ_INIT(&ctx->discovery_entry_ctxs);
4531 	snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
4532 	memcpy(&ctx->trid, trid, sizeof(*trid));
4533 	/* Even if the user did not specify hostnqn, opts.hostnqn is at least an empty string, so strdup() is safe. */
4534 	ctx->hostnqn = strdup(ctx->opts.hostnqn);
4535 	if (ctx->hostnqn == NULL) {
4536 		free_discovery_ctx(ctx);
4537 		return -ENOMEM;
4538 	}
4539 	ctx->probe_ctx = spdk_nvme_connect_async(&ctx->trid, &ctx->opts, discovery_attach_cb);
4540 	if (ctx->probe_ctx == NULL) {
4541 		DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n");
4542 		free_discovery_ctx(ctx);
4543 		return -EIO;
4544 	}
4545 
4546 	spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx);
4547 	return 0;
4548 }
4549 
4550 int
4551 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx)
4552 {
4553 	struct discovery_ctx *ctx;
4554 
4555 	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
4556 		if (strcmp(name, ctx->name) == 0) {
4557 			if (ctx->detach) {
4558 				return -EALREADY;
4559 			}
4560 			ctx->detach = true;
4561 			ctx->stop_cb_fn = cb_fn;
4562 			ctx->cb_ctx = cb_ctx;
4563 			while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) {
4564 				struct discovery_entry_ctx *entry_ctx;
4565 				struct nvme_path_id path = {};
4566 
4567 				entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs);
4568 				path.trid = entry_ctx->trid;
4569 				bdev_nvme_delete(entry_ctx->name, &path);
4570 				TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq);
4571 				free(entry_ctx);
4572 			}
4573 			while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) {
4574 				struct discovery_entry_ctx *entry_ctx;
4575 
4576 				entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs);
4577 				TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq);
4578 				free(entry_ctx);
4579 			}
4580 			return 0;
4581 		}
4582 	}
4583 
4584 	return -ENOENT;
4585 }
4586 
4587 static int
4588 bdev_nvme_library_init(void)
4589 {
4590 	g_bdev_nvme_init_thread = spdk_get_thread();
4591 
4592 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb,
4593 				bdev_nvme_destroy_poll_group_cb,
4594 				sizeof(struct nvme_poll_group),  "nvme_poll_groups");
4595 
4596 	return 0;
4597 }
4598 
4599 static void
4600 bdev_nvme_fini_destruct_ctrlrs(void)
4601 {
4602 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
4603 	struct nvme_ctrlr *nvme_ctrlr;
4604 
4605 	pthread_mutex_lock(&g_bdev_nvme_mutex);
4606 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
4607 		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
4608 			pthread_mutex_lock(&nvme_ctrlr->mutex);
4609 			if (nvme_ctrlr->destruct) {
4610 				/* This controller's destruction was already started
4611 				 * before the application started shutting down
4612 				 */
4613 				pthread_mutex_unlock(&nvme_ctrlr->mutex);
4614 				continue;
4615 			}
4616 			nvme_ctrlr->destruct = true;
4617 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
4618 
4619 			spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
4620 					     nvme_ctrlr);
4621 		}
4622 	}
4623 
4624 	g_bdev_nvme_module_finish = true;
4625 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
4626 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
4627 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
4628 		spdk_bdev_module_fini_done();
4629 		return;
4630 	}
4631 
4632 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
4633 }
4634 
4635 static void
4636 check_discovery_fini(void *arg)
4637 {
4638 	if (TAILQ_EMPTY(&g_discovery_ctxs)) {
4639 		bdev_nvme_fini_destruct_ctrlrs();
4640 	}
4641 }
4642 
4643 static void
4644 bdev_nvme_library_fini(void)
4645 {
4646 	struct nvme_probe_skip_entry *entry, *entry_tmp;
4647 	struct discovery_ctx *ctx;
4648 
4649 	spdk_poller_unregister(&g_hotplug_poller);
4650 	free(g_hotplug_probe_ctx);
4651 	g_hotplug_probe_ctx = NULL;
4652 
4653 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
4654 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
4655 		free(entry);
4656 	}
4657 
4658 	assert(spdk_get_thread() == g_bdev_nvme_init_thread);
4659 	if (TAILQ_EMPTY(&g_discovery_ctxs)) {
4660 		bdev_nvme_fini_destruct_ctrlrs();
4661 	} else {
4662 		TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
4663 			ctx->detach = true;
4664 			ctx->stop_cb_fn = check_discovery_fini;
4665 		}
4666 	}
4667 }
4668 
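/* Called when a completion reported a protection information error.  Re-runs
 * DIF (interleaved metadata) or DIX (separate metadata buffer) verification
 * over the data buffer so that the failing block's error type and offset can
 * be logged.
 */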
4669 static void
4670 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
4671 {
4672 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4673 	struct spdk_bdev *bdev = bdev_io->bdev;
4674 	struct spdk_dif_ctx dif_ctx;
4675 	struct spdk_dif_error err_blk = {};
4676 	int rc;
4677 
4678 	rc = spdk_dif_ctx_init(&dif_ctx,
4679 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
4680 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
4681 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
4682 	if (rc != 0) {
4683 		SPDK_ERRLOG("Initialization of DIF context failed\n");
4684 		return;
4685 	}
4686 
4687 	if (bdev->md_interleave) {
4688 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
4689 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
4690 	} else {
4691 		struct iovec md_iov = {
4692 			.iov_base	= bdev_io->u.bdev.md_buf,
4693 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
4694 		};
4695 
4696 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
4697 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
4698 	}
4699 
4700 	if (rc != 0) {
4701 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
4702 			    err_blk.err_type, err_blk.err_offset);
4703 	} else {
4704 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
4705 	}
4706 }
4707 
4708 static void
4709 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
4710 {
4711 	struct nvme_bdev_io *bio = ref;
4712 
4713 	if (spdk_nvme_cpl_is_success(cpl)) {
4714 		/* Run PI verification for read data buffer. */
4715 		bdev_nvme_verify_pi_error(bio);
4716 	}
4717 
4718 	/* Return original completion status */
4719 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
4720 }
4721 
4722 static void
4723 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
4724 {
4725 	struct nvme_bdev_io *bio = ref;
4726 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4727 	int ret;
4728 
4729 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
4730 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
4731 			    cpl->status.sct, cpl->status.sc);
4732 
4733 		/* Save completion status to use after verifying PI error. */
4734 		bio->cpl = *cpl;
4735 
4736 		if (spdk_likely(nvme_io_path_is_available(bio->io_path))) {
4737 			/* Read without PI checking to verify PI error. */
4738 			ret = bdev_nvme_no_pi_readv(bio,
4739 						    bdev_io->u.bdev.iovs,
4740 						    bdev_io->u.bdev.iovcnt,
4741 						    bdev_io->u.bdev.md_buf,
4742 						    bdev_io->u.bdev.num_blocks,
4743 						    bdev_io->u.bdev.offset_blocks);
4744 			if (ret == 0) {
4745 				return;
4746 			}
4747 		}
4748 	}
4749 
4750 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4751 }
4752 
4753 static void
4754 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
4755 {
4756 	struct nvme_bdev_io *bio = ref;
4757 
4758 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
4759 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
4760 			    cpl->status.sct, cpl->status.sc);
4761 		/* Run PI verification for write data buffer if PI error is detected. */
4762 		bdev_nvme_verify_pi_error(bio);
4763 	}
4764 
4765 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4766 }
4767 
4768 static void
4769 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
4770 {
4771 	struct nvme_bdev_io *bio = ref;
4772 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4773 
4774 	/* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
4775 	 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
4776 	 */
4777 	bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
4778 
4779 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
4780 		SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
4781 			    cpl->status.sct, cpl->status.sc);
4782 		/* Run PI verification for zone append data buffer if PI error is detected. */
4783 		bdev_nvme_verify_pi_error(bio);
4784 	}
4785 
4786 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4787 }
4788 
4789 static void
4790 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
4791 {
4792 	struct nvme_bdev_io *bio = ref;
4793 
4794 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
4795 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
4796 			    cpl->status.sct, cpl->status.sc);
4797 		/* Run PI verification for compare data buffer if PI error is detected. */
4798 		bdev_nvme_verify_pi_error(bio);
4799 	}
4800 
4801 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4802 }
4803 
4804 static void
4805 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
4806 {
4807 	struct nvme_bdev_io *bio = ref;
4808 
4809 	/* Compare operation completion */
4810 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
4811 		/* Save compare result for write callback */
4812 		bio->cpl = *cpl;
4813 		return;
4814 	}
4815 
4816 	/* Write operation completion */
4817 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
4818 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
4819 		 * complete the IO with the compare operation's status.
4820 		 */
4821 		if (!spdk_nvme_cpl_is_error(cpl)) {
4822 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
4823 		}
4824 
4825 		bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
4826 	} else {
4827 		bdev_nvme_io_complete_nvme_status(bio, cpl);
4828 	}
4829 }
4830 
4831 static void
4832 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
4833 {
4834 	struct nvme_bdev_io *bio = ref;
4835 
4836 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4837 }
4838 
4839 static int
4840 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
4841 {
4842 	switch (desc->zs) {
4843 	case SPDK_NVME_ZONE_STATE_EMPTY:
4844 		info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
4845 		break;
4846 	case SPDK_NVME_ZONE_STATE_IOPEN:
4847 		info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
4848 		break;
4849 	case SPDK_NVME_ZONE_STATE_EOPEN:
4850 		info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
4851 		break;
4852 	case SPDK_NVME_ZONE_STATE_CLOSED:
4853 		info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
4854 		break;
4855 	case SPDK_NVME_ZONE_STATE_RONLY:
4856 		info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
4857 		break;
4858 	case SPDK_NVME_ZONE_STATE_FULL:
4859 		info->state = SPDK_BDEV_ZONE_STATE_FULL;
4860 		break;
4861 	case SPDK_NVME_ZONE_STATE_OFFLINE:
4862 		info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
4863 		break;
4864 	default:
4865 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
4866 		return -EIO;
4867 	}
4868 
4869 	info->zone_id = desc->zslba;
4870 	info->write_pointer = desc->wp;
4871 	info->capacity = desc->zcap;
4872 
4873 	return 0;
4874 }
4875 
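/* Completion callback for a REPORT ZONES command.  Copies the returned zone
 * descriptors into the caller's spdk_bdev_zone_info array and, if fewer zones
 * than requested have been handled, reissues REPORT ZONES starting at the next
 * zone.  The bounce buffer in bio->zone_report_buf is freed on completion.
 */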
4876 static void
4877 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
4878 {
4879 	struct nvme_bdev_io *bio = ref;
4880 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4881 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
4882 	uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
4883 	struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
4884 	uint64_t max_zones_per_buf, i;
4885 	uint32_t zone_report_bufsize;
4886 	struct spdk_nvme_ns *ns;
4887 	struct spdk_nvme_qpair *qpair;
4888 	int ret;
4889 
4890 	if (spdk_nvme_cpl_is_error(cpl)) {
4891 		goto out_complete_io_nvme_cpl;
4892 	}
4893 
4894 	if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
4895 		ret = -ENXIO;
4896 		goto out_complete_io_ret;
4897 	}
4898 
4899 	ns = bio->io_path->nvme_ns->ns;
4900 	qpair = bio->io_path->ctrlr_ch->qpair;
4901 
4902 	zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
4903 	max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
4904 			    sizeof(bio->zone_report_buf->descs[0]);
4905 
4906 	if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
4907 		ret = -EINVAL;
4908 		goto out_complete_io_ret;
4909 	}
4910 
4911 	if (!bio->zone_report_buf->nr_zones) {
4912 		ret = -EINVAL;
4913 		goto out_complete_io_ret;
4914 	}
4915 
4916 	for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
4917 		ret = fill_zone_from_report(&info[bio->handled_zones],
4918 					    &bio->zone_report_buf->descs[i]);
4919 		if (ret) {
4920 			goto out_complete_io_ret;
4921 		}
4922 		bio->handled_zones++;
4923 	}
4924 
4925 	if (bio->handled_zones < zones_to_copy) {
4926 		uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
4927 		uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
4928 
4929 		memset(bio->zone_report_buf, 0, zone_report_bufsize);
4930 		ret = spdk_nvme_zns_report_zones(ns, qpair,
4931 						 bio->zone_report_buf, zone_report_bufsize,
4932 						 slba, SPDK_NVME_ZRA_LIST_ALL, true,
4933 						 bdev_nvme_get_zone_info_done, bio);
4934 		if (!ret) {
4935 			return;
4936 		} else {
4937 			goto out_complete_io_ret;
4938 		}
4939 	}
4940 
4941 out_complete_io_nvme_cpl:
4942 	free(bio->zone_report_buf);
4943 	bio->zone_report_buf = NULL;
4944 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4945 	return;
4946 
4947 out_complete_io_ret:
4948 	free(bio->zone_report_buf);
4949 	bio->zone_report_buf = NULL;
4950 	bdev_nvme_io_complete(bio, ret);
4951 }
4952 
4953 static void
4954 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
4955 {
4956 	struct nvme_bdev_io *bio = ref;
4957 
4958 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4959 }
4960 
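/* Complete or retry an admin passthrough command.  A failed command is not
 * retried if DNR is set, it was aborted by request, or the
 * g_opts.bdev_retry_count limit (-1 means unlimited) has been reached.
 * Otherwise, provided some controller on the channel may still become
 * available, it is requeued: immediately for path errors, aborted-SQ-deletion
 * status, or an unavailable controller, and after the controller's CRDT delay
 * (selected by the completion's CRD field) in the remaining cases.
 */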
4961 static void
4962 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx)
4963 {
4964 	struct nvme_bdev_io *bio = ctx;
4965 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4966 	const struct spdk_nvme_cpl *cpl = &bio->cpl;
4967 	struct nvme_bdev_channel *nbdev_ch;
4968 	struct nvme_ctrlr *nvme_ctrlr;
4969 	const struct spdk_nvme_ctrlr_data *cdata;
4970 	uint64_t delay_ms;
4971 
4972 	assert(bdev_nvme_io_type_is_admin(bdev_io->type));
4973 
4974 	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
4975 		goto complete;
4976 	}
4977 
4978 	if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 &&
4979 				     bio->retry_count >= g_opts.bdev_retry_count)) {
4980 		goto complete;
4981 	}
4982 
4983 	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
4984 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(bio->io_path->ctrlr_ch);
4985 
4986 	if (spdk_nvme_cpl_is_path_error(cpl) ||
4987 	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
4988 	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
4989 		delay_ms = 0;
4990 	} else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) {
4991 		goto complete;
4992 	} else {
4993 		bio->retry_count++;
4994 
4995 		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
4996 
4997 		if (cpl->status.crd != 0) {
4998 			delay_ms = cdata->crdt[cpl->status.crd] * 100;
4999 		} else {
5000 			delay_ms = 0;
5001 		}
5002 	}
5003 
5004 	if (any_ctrlr_may_become_available(nbdev_ch)) {
5005 		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
5006 		return;
5007 	}
5008 
5009 complete:
5010 	bio->retry_count = 0;
5011 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
5012 }
5013 
5014 static void
5015 bdev_nvme_abort_complete(void *ctx)
5016 {
5017 	struct nvme_bdev_io *bio = ctx;
5018 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
5019 
5020 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
5021 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
5022 	} else {
5023 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
5024 	}
5025 }
5026 
5027 static void
5028 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
5029 {
5030 	struct nvme_bdev_io *bio = ref;
5031 
5032 	bio->cpl = *cpl;
5033 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio);
5034 }
5035 
5036 static void
5037 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
5038 {
5039 	struct nvme_bdev_io *bio = ref;
5040 
5041 	bio->cpl = *cpl;
5042 	spdk_thread_send_msg(bio->orig_thread,
5043 			     bdev_nvme_admin_passthru_complete_nvme_status, bio);
5044 }
5045 
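/* SGL callbacks handed to the NVMe driver for iovec-based commands.  reset_sgl
 * repositions the cursor (iovpos/iov_offset) at an absolute payload offset and
 * next_sge returns the current segment, advancing the cursor past it.  The
 * fused_* variants below do the same for the write half of a fused
 * compare-and-write.
 */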
5046 static void
5047 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
5048 {
5049 	struct nvme_bdev_io *bio = ref;
5050 	struct iovec *iov;
5051 
5052 	bio->iov_offset = sgl_offset;
5053 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
5054 		iov = &bio->iovs[bio->iovpos];
5055 		if (bio->iov_offset < iov->iov_len) {
5056 			break;
5057 		}
5058 
5059 		bio->iov_offset -= iov->iov_len;
5060 	}
5061 }
5062 
5063 static int
5064 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
5065 {
5066 	struct nvme_bdev_io *bio = ref;
5067 	struct iovec *iov;
5068 
5069 	assert(bio->iovpos < bio->iovcnt);
5070 
5071 	iov = &bio->iovs[bio->iovpos];
5072 
5073 	*address = iov->iov_base;
5074 	*length = iov->iov_len;
5075 
5076 	if (bio->iov_offset) {
5077 		assert(bio->iov_offset <= iov->iov_len);
5078 		*address += bio->iov_offset;
5079 		*length -= bio->iov_offset;
5080 	}
5081 
5082 	bio->iov_offset += *length;
5083 	if (bio->iov_offset == iov->iov_len) {
5084 		bio->iovpos++;
5085 		bio->iov_offset = 0;
5086 	}
5087 
5088 	return 0;
5089 }
5090 
5091 static void
5092 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
5093 {
5094 	struct nvme_bdev_io *bio = ref;
5095 	struct iovec *iov;
5096 
5097 	bio->fused_iov_offset = sgl_offset;
5098 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
5099 		iov = &bio->fused_iovs[bio->fused_iovpos];
5100 		if (bio->fused_iov_offset < iov->iov_len) {
5101 			break;
5102 		}
5103 
5104 		bio->fused_iov_offset -= iov->iov_len;
5105 	}
5106 }
5107 
5108 static int
5109 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
5110 {
5111 	struct nvme_bdev_io *bio = ref;
5112 	struct iovec *iov;
5113 
5114 	assert(bio->fused_iovpos < bio->fused_iovcnt);
5115 
5116 	iov = &bio->fused_iovs[bio->fused_iovpos];
5117 
5118 	*address = iov->iov_base;
5119 	*length = iov->iov_len;
5120 
5121 	if (bio->fused_iov_offset) {
5122 		assert(bio->fused_iov_offset <= iov->iov_len);
5123 		*address += bio->fused_iov_offset;
5124 		*length -= bio->fused_iov_offset;
5125 	}
5126 
5127 	bio->fused_iov_offset += *length;
5128 	if (bio->fused_iov_offset == iov->iov_len) {
5129 		bio->fused_iovpos++;
5130 		bio->fused_iov_offset = 0;
5131 	}
5132 
5133 	return 0;
5134 }
5135 
5136 static int
5137 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
5138 		      void *md, uint64_t lba_count, uint64_t lba)
5139 {
5140 	int rc;
5141 
5142 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
5143 		      lba_count, lba);
5144 
5145 	bio->iovs = iov;
5146 	bio->iovcnt = iovcnt;
5147 	bio->iovpos = 0;
5148 	bio->iov_offset = 0;
5149 
5150 	rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns,
5151 					    bio->io_path->ctrlr_ch->qpair,
5152 					    lba, lba_count,
5153 					    bdev_nvme_no_pi_readv_done, bio, 0,
5154 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5155 					    md, 0, 0);
5156 
5157 	if (rc != 0 && rc != -ENOMEM) {
5158 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
5159 	}
5160 	return rc;
5161 }
5162 
5163 static int
5164 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
5165 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
5166 		struct spdk_bdev_ext_io_opts *ext_opts)
5167 {
5168 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5169 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5170 	int rc;
5171 
5172 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
5173 		      lba_count, lba);
5174 
5175 	bio->iovs = iov;
5176 	bio->iovcnt = iovcnt;
5177 	bio->iovpos = 0;
5178 	bio->iov_offset = 0;
5179 
5180 	if (ext_opts) {
5181 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
5182 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
5183 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
5184 		bio->ext_opts.io_flags = flags;
5185 		bio->ext_opts.metadata = md;
5186 
5187 		rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
5188 						bdev_nvme_readv_done, bio,
5189 						bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5190 						&bio->ext_opts);
5191 	} else if (iovcnt == 1) {
5192 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
5193 						   lba_count,
5194 						   bdev_nvme_readv_done, bio,
5195 						   flags,
5196 						   0, 0);
5197 	} else {
5198 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
5199 						    bdev_nvme_readv_done, bio, flags,
5200 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5201 						    md, 0, 0);
5202 	}
5203 
5204 	if (rc != 0 && rc != -ENOMEM) {
5205 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
5206 	}
5207 	return rc;
5208 }
5209 
5210 static int
5211 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
5212 		 void *md, uint64_t lba_count, uint64_t lba,
5213 		 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts)
5214 {
5215 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5216 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5217 	int rc;
5218 
5219 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
5220 		      lba_count, lba);
5221 
5222 	bio->iovs = iov;
5223 	bio->iovcnt = iovcnt;
5224 	bio->iovpos = 0;
5225 	bio->iov_offset = 0;
5226 
5227 	if (ext_opts) {
5228 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
5229 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
5230 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
5231 		bio->ext_opts.io_flags = flags;
5232 		bio->ext_opts.metadata = md;
5233 
5234 		rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
5235 						 bdev_nvme_writev_done, bio,
5236 						 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5237 						 &bio->ext_opts);
5238 	} else if (iovcnt == 1) {
5239 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
5240 						    lba_count,
5241 						    bdev_nvme_writev_done, bio,
5242 						    flags,
5243 						    0, 0);
5244 	} else {
5245 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
5246 						     bdev_nvme_writev_done, bio, flags,
5247 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5248 						     md, 0, 0);
5249 	}
5250 
5251 	if (rc != 0 && rc != -ENOMEM) {
5252 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
5253 	}
5254 	return rc;
5255 }
5256 
5257 static int
5258 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
5259 		       void *md, uint64_t lba_count, uint64_t zslba,
5260 		       uint32_t flags)
5261 {
5262 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5263 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5264 	int rc;
5265 
5266 	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
5267 		      lba_count, zslba);
5268 
5269 	bio->iovs = iov;
5270 	bio->iovcnt = iovcnt;
5271 	bio->iovpos = 0;
5272 	bio->iov_offset = 0;
5273 
5274 	if (iovcnt == 1) {
5275 		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
5276 						       lba_count,
5277 						       bdev_nvme_zone_appendv_done, bio,
5278 						       flags,
5279 						       0, 0);
5280 	} else {
5281 		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
5282 							bdev_nvme_zone_appendv_done, bio, flags,
5283 							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5284 							md, 0, 0);
5285 	}
5286 
5287 	if (rc != 0 && rc != -ENOMEM) {
5288 		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
5289 	}
5290 	return rc;
5291 }
5292 
5293 static int
5294 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
5295 		   void *md, uint64_t lba_count, uint64_t lba,
5296 		   uint32_t flags)
5297 {
5298 	int rc;
5299 
5300 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
5301 		      lba_count, lba);
5302 
5303 	bio->iovs = iov;
5304 	bio->iovcnt = iovcnt;
5305 	bio->iovpos = 0;
5306 	bio->iov_offset = 0;
5307 
5308 	rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns,
5309 					       bio->io_path->ctrlr_ch->qpair,
5310 					       lba, lba_count,
5311 					       bdev_nvme_comparev_done, bio, flags,
5312 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
5313 					       md, 0, 0);
5314 
5315 	if (rc != 0 && rc != -ENOMEM) {
5316 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
5317 	}
5318 	return rc;
5319 }
5320 
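/* Submit a fused compare-and-write: a COMPARE with SPDK_NVME_IO_FLAGS_FUSE_FIRST
 * immediately followed by a WRITE with SPDK_NVME_IO_FLAGS_FUSE_SECOND.  On a
 * retry (bdev_io->num_retries != 0) the COMPARE is not resubmitted if it was
 * already accepted; only the WRITE half is queued again.
 */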
5321 static int
5322 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
5323 			      struct iovec *write_iov, int write_iovcnt,
5324 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
5325 {
5326 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5327 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5328 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
5329 	int rc;
5330 
5331 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
5332 		      lba_count, lba);
5333 
5334 	bio->iovs = cmp_iov;
5335 	bio->iovcnt = cmp_iovcnt;
5336 	bio->iovpos = 0;
5337 	bio->iov_offset = 0;
5338 	bio->fused_iovs = write_iov;
5339 	bio->fused_iovcnt = write_iovcnt;
5340 	bio->fused_iovpos = 0;
5341 	bio->fused_iov_offset = 0;
5342 
5343 	if (bdev_io->num_retries == 0) {
5344 		bio->first_fused_submitted = false;
5345 	}
5346 
5347 	if (!bio->first_fused_submitted) {
5348 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
5349 		memset(&bio->cpl, 0, sizeof(bio->cpl));
5350 
5351 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
5352 						       bdev_nvme_comparev_and_writev_done, bio, flags,
5353 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
5354 		if (rc == 0) {
5355 			bio->first_fused_submitted = true;
5356 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
5357 		} else {
5358 			if (rc != -ENOMEM) {
5359 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
5360 			}
5361 			return rc;
5362 		}
5363 	}
5364 
5365 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
5366 
5367 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
5368 					     bdev_nvme_comparev_and_writev_done, bio, flags,
5369 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
5370 	if (rc != 0 && rc != -ENOMEM) {
5371 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
5372 		rc = 0;
5373 	}
5374 
5375 	return rc;
5376 }
5377 
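/* Translate an unmap into an NVMe Dataset Management (deallocate) command.
 * The block range is split into ranges of at most
 * SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; requests needing more than
 * SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES ranges are rejected with -EINVAL.
 */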
5378 static int
5379 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
5380 {
5381 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
5382 	struct spdk_nvme_dsm_range *range;
5383 	uint64_t offset, remaining;
5384 	uint64_t num_ranges_u64;
5385 	uint16_t num_ranges;
5386 	int rc;
5387 
5388 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
5389 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
5390 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
5391 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
5392 		return -EINVAL;
5393 	}
5394 	num_ranges = (uint16_t)num_ranges_u64;
5395 
5396 	offset = offset_blocks;
5397 	remaining = num_blocks;
5398 	range = &dsm_ranges[0];
5399 
5400 	/* Fill max-size ranges until the remaining blocks fit into one range */
5401 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
5402 		range->attributes.raw = 0;
5403 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
5404 		range->starting_lba = offset;
5405 
5406 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
5407 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
5408 		range++;
5409 	}
5410 
5411 	/* Final range describes the remaining blocks */
5412 	range->attributes.raw = 0;
5413 	range->length = remaining;
5414 	range->starting_lba = offset;
5415 
5416 	rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
5417 			bio->io_path->ctrlr_ch->qpair,
5418 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
5419 			dsm_ranges, num_ranges,
5420 			bdev_nvme_queued_done, bio);
5421 
5422 	return rc;
5423 }
5424 
5425 static int
5426 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
5427 {
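	/* The Write Zeroes NLB field is a 0-based 16-bit value, so a single
	 * command can cover at most UINT16_MAX + 1 blocks.
	 */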
5428 	if (num_blocks > UINT16_MAX + 1) {
5429 		SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
5430 		return -EINVAL;
5431 	}
5432 
5433 	return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
5434 					     bio->io_path->ctrlr_ch->qpair,
5435 					     offset_blocks, num_blocks,
5436 					     bdev_nvme_queued_done, bio,
5437 					     0);
5438 }
5439 
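/* Validate the requested zone range, allocate a report buffer sized to the
 * namespace's maximum I/O transfer size, and issue a ZNS Report Zones command;
 * completion is handled by bdev_nvme_get_zone_info_done().
 */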
5440 static int
5441 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
5442 			struct spdk_bdev_zone_info *info)
5443 {
5444 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5445 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5446 	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
5447 	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
5448 	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
5449 
5450 	if (zone_id % zone_size != 0) {
5451 		return -EINVAL;
5452 	}
5453 
5454 	if (num_zones > total_zones || !num_zones) {
5455 		return -EINVAL;
5456 	}
5457 
5458 	assert(!bio->zone_report_buf);
5459 	bio->zone_report_buf = calloc(1, zone_report_bufsize);
5460 	if (!bio->zone_report_buf) {
5461 		return -ENOMEM;
5462 	}
5463 
5464 	bio->handled_zones = 0;
5465 
5466 	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
5467 					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
5468 					  bdev_nvme_get_zone_info_done, bio);
5469 }
5470 
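/* Map the generic bdev zone action to the corresponding NVMe ZNS zone
 * management command; unsupported actions fail with -EINVAL.
 */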
5471 static int
5472 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
5473 			  enum spdk_bdev_zone_action action)
5474 {
5475 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5476 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5477 
5478 	switch (action) {
5479 	case SPDK_BDEV_ZONE_CLOSE:
5480 		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
5481 						bdev_nvme_zone_management_done, bio);
5482 	case SPDK_BDEV_ZONE_FINISH:
5483 		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
5484 						 bdev_nvme_zone_management_done, bio);
5485 	case SPDK_BDEV_ZONE_OPEN:
5486 		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
5487 					       bdev_nvme_zone_management_done, bio);
5488 	case SPDK_BDEV_ZONE_RESET:
5489 		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
5490 						bdev_nvme_zone_management_done, bio);
5491 	case SPDK_BDEV_ZONE_OFFLINE:
5492 		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
5493 						  bdev_nvme_zone_management_done, bio);
5494 	default:
5495 		return -EINVAL;
5496 	}
5497 }
5498 
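/* Submit the admin command to the first available ctrlr on the channel's
 * io_path list. If submission fails on every candidate, or no ctrlr is
 * usable at all, the request is completed with the last error
 * (-ENXIO when no ctrlr was available).
 */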
5499 static void
5500 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
5501 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
5502 {
5503 	struct nvme_io_path *io_path;
5504 	struct nvme_ctrlr *nvme_ctrlr;
5505 	uint32_t max_xfer_size;
5506 	int rc = -ENXIO;
5507 
5508 	/* Choose the first ctrlr that has not failed. */
5509 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
5510 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
5511 
5512 		/* Skip any unavailable nvme_ctrlr up front rather than relying on
5513 		 * spdk_nvme_ctrlr_cmd_admin_raw() to return -ENXIO for it.
5514 		 */
5515 		if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
5516 			continue;
5517 		}
5518 
5519 		max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);
5520 
5521 		if (nbytes > max_xfer_size) {
5522 			SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
5523 			rc = -EINVAL;
5524 			goto err;
5525 		}
5526 
5527 		bio->io_path = io_path;
5528 		bio->orig_thread = spdk_get_thread();
5529 
5530 		rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
5531 						   bdev_nvme_admin_passthru_done, bio);
5532 		if (rc == 0) {
5533 			return;
5534 		}
5535 	}
5536 
5537 err:
5538 	bdev_nvme_admin_passthru_complete(bio, rc);
5539 }
5540 
5541 static int
5542 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
5543 		      void *buf, size_t nbytes)
5544 {
5545 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5546 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5547 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
5548 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
5549 
5550 	if (nbytes > max_xfer_size) {
5551 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
5552 		return -EINVAL;
5553 	}
5554 
5555 	/*
5556 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
5557 	 * so fill it out automatically.
5558 	 */
5559 	cmd->nsid = spdk_nvme_ns_get_id(ns);
5560 
5561 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
5562 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
5563 }
5564 
5565 static int
5566 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
5567 			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
5568 {
5569 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
5570 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
5571 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
5572 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
5573 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
5574 
5575 	if (nbytes > max_xfer_size) {
5576 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
5577 		return -EINVAL;
5578 	}
5579 
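	/* The metadata buffer must provide exactly one metadata unit per logical
	 * block covered by the data buffer.
	 */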
5580 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
5581 		SPDK_ERRLOG("invalid meta data buffer size\n");
5582 		return -EINVAL;
5583 	}
5584 
5585 	/*
5586 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
5587 	 * so fill it out automatically.
5588 	 */
5589 	cmd->nsid = spdk_nvme_ns_get_id(ns);
5590 
5591 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
5592 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
5593 }
5594 
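/* Abort a previously submitted bdev I/O. First look for it on the channel's
 * retry queue, where it can be completed as aborted immediately; otherwise ask
 * each ctrlr on the io_path list to abort it, first as an I/O command on the
 * path's qpair and, if it is not found there, as an admin command.
 */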
5595 static void
5596 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
5597 		struct nvme_bdev_io *bio_to_abort)
5598 {
5599 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
5600 	struct spdk_bdev_io *bdev_io_to_abort;
5601 	struct nvme_io_path *io_path;
5602 	struct nvme_ctrlr *nvme_ctrlr;
5603 	int rc = 0;
5604 
5605 	bio->orig_thread = spdk_get_thread();
5606 
5607 	/* Traverse the retry_io_list first. */
5608 	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
5609 		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
5610 			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
5611 			spdk_bdev_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
5612 
5613 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
5614 			return;
5615 		}
5616 	}
5617 
5618 	/* Even admin commands are submitted only to nvme_ctrlrs that are associated
5619 	 * with some io_path. Hence traverse the io_path list not only for I/O
5620 	 * commands but also for admin commands.
5621 	 */
5622 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
5623 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
5624 
5625 		rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
5626 						   io_path->ctrlr_ch->qpair,
5627 						   bio_to_abort,
5628 						   bdev_nvme_abort_done, bio);
5629 		if (rc == -ENOENT) {
5630 			/* If no matching command was found in the I/O qpair, the target
5631 			 * command may be an admin command.
5632 			 */
5633 			rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
5634 							   NULL,
5635 							   bio_to_abort,
5636 							   bdev_nvme_abort_done, bio);
5637 		}
5638 
5639 		if (rc != -ENOENT) {
5640 			break;
5641 		}
5642 	}
5643 
5644 	if (rc != 0) {
5645 		/* If no command was found or there was any error, complete the abort
5646 		 * request with failure.
5647 		 */
5648 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
5649 	}
5650 }
5651 
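/* Emit a bdev_nvme_set_options RPC object that reproduces the current global
 * options when the saved configuration is replayed.
 */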
5652 static void
5653 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
5654 {
5655 	const char	*action;
5656 
5657 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
5658 		action = "reset";
5659 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
5660 		action = "abort";
5661 	} else {
5662 		action = "none";
5663 	}
5664 
5665 	spdk_json_write_object_begin(w);
5666 
5667 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
5668 
5669 	spdk_json_write_named_object_begin(w, "params");
5670 	spdk_json_write_named_string(w, "action_on_timeout", action);
5671 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
5672 	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
5673 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
5674 	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
5675 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
5676 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
5677 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
5678 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
5679 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
5680 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
5681 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
5682 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
5683 	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
5684 	spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
5685 	spdk_json_write_object_end(w);
5686 
5687 	spdk_json_write_object_end(w);
5688 }
5689 
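/* Emit a bdev_nvme_attach_controller RPC object describing this ctrlr's
 * active path and its per-ctrlr options.
 */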
5690 static void
5691 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
5692 		       struct nvme_ctrlr *nvme_ctrlr)
5693 {
5694 	struct spdk_nvme_transport_id	*trid;
5695 
5696 	trid = &nvme_ctrlr->active_path_id->trid;
5697 
5698 	spdk_json_write_object_begin(w);
5699 
5700 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
5701 
5702 	spdk_json_write_named_object_begin(w, "params");
5703 	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
5704 	nvme_bdev_dump_trid_json(trid, w);
5705 	spdk_json_write_named_bool(w, "prchk_reftag",
5706 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
5707 	spdk_json_write_named_bool(w, "prchk_guard",
5708 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
5709 	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->ctrlr_loss_timeout_sec);
5710 	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->reconnect_delay_sec);
5711 	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", nvme_ctrlr->fast_io_fail_timeout_sec);
5712 
5713 	spdk_json_write_object_end(w);
5714 
5715 	spdk_json_write_object_end(w);
5716 }
5717 
5718 static void
5719 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
5720 {
5721 	spdk_json_write_object_begin(w);
5722 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
5723 
5724 	spdk_json_write_named_object_begin(w, "params");
5725 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
5726 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
5727 	spdk_json_write_object_end(w);
5728 
5729 	spdk_json_write_object_end(w);
5730 }
5731 
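/* Write out the subsystem configuration: global options first, then one
 * attach_controller entry per ctrlr, and the hotplug settings last.
 */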
5732 static int
5733 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
5734 {
5735 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
5736 	struct nvme_ctrlr	*nvme_ctrlr;
5737 
5738 	bdev_nvme_opts_config_json(w);
5739 
5740 	pthread_mutex_lock(&g_bdev_nvme_mutex);
5741 
5742 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
5743 		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
5744 			nvme_ctrlr_config_json(w, nvme_ctrlr);
5745 		}
5746 	}
5747 
5748 	/* Dump the hotplug configuration last so that all NVMe bdevs have a chance
5749 	 * to be constructed before the hotplug poller is enabled.
5750 	 */
5751 	bdev_nvme_hotplug_config_json(w);
5752 
5753 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
5754 	return 0;
5755 }
5756 
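/* Return the underlying spdk_nvme_ctrlr for an NVMe bdev, or NULL if the bdev
 * does not belong to this module. For a multipath bdev this returns the ctrlr
 * of the first namespace on the list.
 */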
5757 struct spdk_nvme_ctrlr *
5758 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
5759 {
5760 	struct nvme_bdev *nbdev;
5761 	struct nvme_ns *nvme_ns;
5762 
5763 	if (!bdev || bdev->module != &nvme_if) {
5764 		return NULL;
5765 	}
5766 
5767 	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
5768 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
5769 	assert(nvme_ns != NULL);
5770 
5771 	return nvme_ns->ctrlr->ctrlr;
5772 }
5773 
5774 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
5775