xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision a2815831cc4c36d82d7e4f9938308d38d0f304bd)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
7  *
8  *   Redistribution and use in source and binary forms, with or without
9  *   modification, are permitted provided that the following conditions
10  *   are met:
11  *
12  *     * Redistributions of source code must retain the above copyright
13  *       notice, this list of conditions and the following disclaimer.
14  *     * Redistributions in binary form must reproduce the above copyright
15  *       notice, this list of conditions and the following disclaimer in
16  *       the documentation and/or other materials provided with the
17  *       distribution.
18  *     * Neither the name of Intel Corporation nor the names of its
19  *       contributors may be used to endorse or promote products derived
20  *       from this software without specific prior written permission.
21  *
22  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 #include "spdk/stdinc.h"
36 
37 #include "bdev_nvme.h"
38 
39 #include "spdk/accel_engine.h"
40 #include "spdk/config.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/likely.h"
45 #include "spdk/nvme.h"
46 #include "spdk/nvme_ocssd.h"
47 #include "spdk/nvme_zns.h"
48 #include "spdk/opal.h"
49 #include "spdk/thread.h"
50 #include "spdk/string.h"
51 #include "spdk/util.h"
52 
53 #include "spdk/bdev_module.h"
54 #include "spdk/log.h"
55 
56 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
57 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
58 
59 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
60 
61 struct nvme_bdev_io {
62 	/** Array of iovecs to transfer. */
63 	struct iovec *iovs;
64 
65 	/** Number of iovecs in iovs array. */
66 	int iovcnt;
67 
68 	/** Current iovec position. */
69 	int iovpos;
70 
71 	/** Offset in current iovec. */
72 	uint32_t iov_offset;
73 
74 	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
75 	 *  being reset in a reset I/O.
76 	 */
77 	struct nvme_io_path *io_path;
78 
79 	/** Array of iovecs for the fused command of a compare-and-write. */
80 	struct iovec *fused_iovs;
81 
82 	/** Number of iovecs in the fused_iovs array. */
83 	int fused_iovcnt;
84 
85 	/** Current iovec position. */
86 	int fused_iovpos;
87 
88 	/** Offset in current iovec. */
89 	uint32_t fused_iov_offset;
90 
91 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
92 	struct spdk_nvme_cpl cpl;
93 
94 	/** Extended I/O opts passed by the user to the bdev layer and mapped to the NVMe format. */
95 	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
96 
97 	/** Originating thread */
98 	struct spdk_thread *orig_thread;
99 
100 	/** Tracks whether the first of the fused commands has been submitted. */
101 	bool first_fused_submitted;
102 
103 	/** Temporary pointer to zone report buffer */
104 	struct spdk_nvme_zns_zone_report *zone_report_buf;
105 
106 	/** Number of zones that have been copied to the spdk_bdev_zone_info structs so far. */
107 	uint64_t handled_zones;
108 
109 	/** Expiration value in ticks to retry the current I/O. */
110 	uint64_t retry_ticks;
111 
112 	/* How many times the current I/O was retried. */
113 	int32_t retry_count;
114 };
115 
116 struct nvme_probe_skip_entry {
117 	struct spdk_nvme_transport_id		trid;
118 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
119 };
120 /* All controllers deleted by users via RPC are skipped by the hotplug monitor. */
121 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
122 			g_skipped_nvme_ctrlrs);
123 
124 static struct spdk_bdev_nvme_opts g_opts = {
125 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
126 	.timeout_us = 0,
127 	.timeout_admin_us = 0,
128 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
129 	.transport_retry_count = 4,
130 	.arbitration_burst = 0,
131 	.low_priority_weight = 0,
132 	.medium_priority_weight = 0,
133 	.high_priority_weight = 0,
134 	.nvme_adminq_poll_period_us = 10000ULL,
135 	.nvme_ioq_poll_period_us = 0,
136 	.io_queue_requests = 0,
137 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
138 	.bdev_retry_count = 0,
139 };
140 
141 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
142 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
143 
144 static int g_hot_insert_nvme_controller_index = 0;
145 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
146 static bool g_nvme_hotplug_enabled = false;
147 static struct spdk_thread *g_bdev_nvme_init_thread;
148 static struct spdk_poller *g_hotplug_poller;
149 static struct spdk_poller *g_hotplug_probe_poller;
150 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
151 
152 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
153 		struct nvme_async_probe_ctx *ctx);
154 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
155 		struct nvme_async_probe_ctx *ctx);
156 static int bdev_nvme_library_init(void);
157 static void bdev_nvme_library_fini(void);
158 static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
159 				     struct spdk_bdev_io *bdev_io);
160 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
161 			   void *md, uint64_t lba_count, uint64_t lba,
162 			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
163 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
164 				 void *md, uint64_t lba_count, uint64_t lba);
165 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
166 			    void *md, uint64_t lba_count, uint64_t lba,
167 			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
168 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
169 				  void *md, uint64_t lba_count,
170 				  uint64_t zslba, uint32_t flags);
171 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
172 			      void *md, uint64_t lba_count, uint64_t lba,
173 			      uint32_t flags);
174 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
175 		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
176 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
177 		uint32_t flags);
178 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
179 				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
180 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
181 				     enum spdk_bdev_zone_action action);
182 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
183 				     struct nvme_bdev_io *bio,
184 				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
185 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
186 				 void *buf, size_t nbytes);
187 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
188 				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
189 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
190 			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
191 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
192 static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr);
193 static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
194 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
195 static void nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);
196 
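/* Active namespaces are tracked per ctrlr in a red-black tree keyed by namespace ID;
 * the comparator below drives the RB_FIND/RB_MIN/RB_NEXT lookups used further down.
 */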
197 static int
198 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
199 {
200 	return ns1->id - ns2->id;
201 }
202 
203 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);
204 
205 struct spdk_nvme_qpair *
206 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
207 {
208 	struct nvme_ctrlr_channel *ctrlr_ch;
209 
210 	assert(ctrlr_io_ch != NULL);
211 
212 	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
213 
214 	return ctrlr_ch->qpair;
215 }
216 
217 static int
218 bdev_nvme_get_ctx_size(void)
219 {
220 	return sizeof(struct nvme_bdev_io);
221 }
222 
223 static struct spdk_bdev_module nvme_if = {
224 	.name = "nvme",
225 	.async_fini = true,
226 	.module_init = bdev_nvme_library_init,
227 	.module_fini = bdev_nvme_library_fini,
228 	.config_json = bdev_nvme_config_json,
229 	.get_ctx_size = bdev_nvme_get_ctx_size,
230 
231 };
232 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
233 
234 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
235 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
236 bool g_bdev_nvme_module_finish;
237 
238 struct nvme_bdev_ctrlr *
239 nvme_bdev_ctrlr_get_by_name(const char *name)
240 {
241 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
242 
243 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
244 		if (strcmp(name, nbdev_ctrlr->name) == 0) {
245 			break;
246 		}
247 	}
248 
249 	return nbdev_ctrlr;
250 }
251 
252 static struct nvme_ctrlr *
253 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
254 			  const struct spdk_nvme_transport_id *trid)
255 {
256 	struct nvme_ctrlr *nvme_ctrlr;
257 
258 	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
259 		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
260 			break;
261 		}
262 	}
263 
264 	return nvme_ctrlr;
265 }
266 
267 static struct nvme_bdev *
268 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
269 {
270 	struct nvme_bdev *bdev;
271 
272 	pthread_mutex_lock(&g_bdev_nvme_mutex);
273 	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
274 		if (bdev->nsid == nsid) {
275 			break;
276 		}
277 	}
278 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
279 
280 	return bdev;
281 }
282 
283 struct nvme_ns *
284 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
285 {
286 	struct nvme_ns ns;
287 
288 	assert(nsid > 0);
289 
290 	ns.id = nsid;
291 	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
292 }
293 
294 struct nvme_ns *
295 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
296 {
297 	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
298 }
299 
300 struct nvme_ns *
301 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
302 {
303 	if (ns == NULL) {
304 		return NULL;
305 	}
306 
307 	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
308 }
309 
310 static struct nvme_ctrlr *
311 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
312 {
313 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
314 	struct nvme_ctrlr	*nvme_ctrlr = NULL;
315 
316 	pthread_mutex_lock(&g_bdev_nvme_mutex);
317 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
318 		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
319 		if (nvme_ctrlr != NULL) {
320 			break;
321 		}
322 	}
323 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
324 
325 	return nvme_ctrlr;
326 }
327 
328 struct nvme_ctrlr *
329 nvme_ctrlr_get_by_name(const char *name)
330 {
331 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
332 	struct nvme_ctrlr *nvme_ctrlr = NULL;
333 
334 	if (name == NULL) {
335 		return NULL;
336 	}
337 
338 	pthread_mutex_lock(&g_bdev_nvme_mutex);
339 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
340 	if (nbdev_ctrlr != NULL) {
341 		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
342 	}
343 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
344 
345 	return nvme_ctrlr;
346 }
347 
348 void
349 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
350 {
351 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
352 
353 	pthread_mutex_lock(&g_bdev_nvme_mutex);
354 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
355 		fn(nbdev_ctrlr, ctx);
356 	}
357 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
358 }
359 
360 void
361 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
362 {
363 	const char *trtype_str;
364 	const char *adrfam_str;
365 
366 	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
367 	if (trtype_str) {
368 		spdk_json_write_named_string(w, "trtype", trtype_str);
369 	}
370 
371 	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
372 	if (adrfam_str) {
373 		spdk_json_write_named_string(w, "adrfam", adrfam_str);
374 	}
375 
376 	if (trid->traddr[0] != '\0') {
377 		spdk_json_write_named_string(w, "traddr", trid->traddr);
378 	}
379 
380 	if (trid->trsvcid[0] != '\0') {
381 		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
382 	}
383 
384 	if (trid->subnqn[0] != '\0') {
385 		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
386 	}
387 }
388 
389 static void
390 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
391 		       struct nvme_ctrlr *nvme_ctrlr)
392 {
393 	pthread_mutex_lock(&g_bdev_nvme_mutex);
394 
395 	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
396 	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
397 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
398 
399 		return;
400 	}
401 	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
402 
403 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
404 
405 	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));
406 
407 	free(nbdev_ctrlr->name);
408 	free(nbdev_ctrlr);
409 }
410 
411 static void
412 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
413 {
414 	struct nvme_path_id *path_id, *tmp_path;
415 	struct nvme_ns *ns, *tmp_ns;
416 
417 	free(nvme_ctrlr->copied_ana_desc);
418 	spdk_free(nvme_ctrlr->ana_log_page);
419 
420 	if (nvme_ctrlr->opal_dev) {
421 		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
422 		nvme_ctrlr->opal_dev = NULL;
423 	}
424 
425 	if (nvme_ctrlr->nbdev_ctrlr) {
426 		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
427 	}
428 
429 	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
430 		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
431 		free(ns);
432 	}
433 
434 	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
435 		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
436 		free(path_id);
437 	}
438 
439 	pthread_mutex_destroy(&nvme_ctrlr->mutex);
440 
441 	free(nvme_ctrlr);
442 
443 	pthread_mutex_lock(&g_bdev_nvme_mutex);
444 	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
445 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
446 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
447 		spdk_bdev_module_fini_done();
448 		return;
449 	}
450 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
451 }
452 
453 static int
454 nvme_detach_poller(void *arg)
455 {
456 	struct nvme_ctrlr *nvme_ctrlr = arg;
457 	int rc;
458 
459 	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
460 	if (rc != -EAGAIN) {
461 		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
462 		_nvme_ctrlr_delete(nvme_ctrlr);
463 	}
464 
465 	return SPDK_POLLER_BUSY;
466 }
467 
468 static void
469 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
470 {
471 	int rc;
472 
473 	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
474 	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
475 
476 	/* If we got here, the reset/detach poller cannot be active */
477 	assert(nvme_ctrlr->reset_detach_poller == NULL);
478 	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
479 					  nvme_ctrlr, 1000);
480 	if (nvme_ctrlr->reset_detach_poller == NULL) {
481 		SPDK_ERRLOG("Failed to register detach poller\n");
482 		goto error;
483 	}
484 
485 	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
486 	if (rc != 0) {
487 		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
488 		goto error;
489 	}
490 
491 	return;
492 error:
493 	/* We don't have a good way to handle errors here, so just do what we can and delete the
494 	 * controller without detaching the underlying NVMe device.
495 	 */
496 	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
497 	_nvme_ctrlr_delete(nvme_ctrlr);
498 }
499 
500 static void
501 nvme_ctrlr_unregister_cb(void *io_device)
502 {
503 	struct nvme_ctrlr *nvme_ctrlr = io_device;
504 
505 	nvme_ctrlr_delete(nvme_ctrlr);
506 }
507 
508 static void
509 nvme_ctrlr_unregister(struct nvme_ctrlr *nvme_ctrlr)
510 {
511 	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
512 }
513 
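/* A ctrlr may be unregistered only once destruct has been requested and no references,
 * resets, or ANA log page updates are still outstanding.
 */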
514 static bool
515 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
516 {
517 	if (!nvme_ctrlr->destruct) {
518 		return false;
519 	}
520 
521 	if (nvme_ctrlr->ref > 0) {
522 		return false;
523 	}
524 
525 	if (nvme_ctrlr->resetting) {
526 		return false;
527 	}
528 
529 	if (nvme_ctrlr->ana_log_page_updating) {
530 		return false;
531 	}
532 
533 	return true;
534 }
535 
536 static void
537 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
538 {
539 	pthread_mutex_lock(&nvme_ctrlr->mutex);
540 
541 	assert(nvme_ctrlr->ref > 0);
542 	nvme_ctrlr->ref--;
543 
544 	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
545 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
546 		return;
547 	}
548 
549 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
550 
551 	nvme_ctrlr_unregister(nvme_ctrlr);
552 }
553 
554 static struct nvme_io_path *
555 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
556 {
557 	struct nvme_io_path *io_path;
558 
559 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
560 		if (io_path->nvme_ns == nvme_ns) {
561 			break;
562 		}
563 	}
564 
565 	return io_path;
566 }
567 
568 static int
569 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
570 {
571 	struct nvme_io_path *io_path;
572 	struct spdk_io_channel *ch;
573 
574 	io_path = calloc(1, sizeof(*io_path));
575 	if (io_path == NULL) {
576 		SPDK_ERRLOG("Failed to alloc io_path.\n");
577 		return -ENOMEM;
578 	}
579 
580 	ch = spdk_get_io_channel(nvme_ns->ctrlr);
581 	if (ch == NULL) {
582 		free(io_path);
583 		SPDK_ERRLOG("Failed to alloc io_channel.\n");
584 		return -ENOMEM;
585 	}
586 
587 	io_path->ctrlr_ch = spdk_io_channel_get_ctx(ch);
588 	TAILQ_INSERT_TAIL(&io_path->ctrlr_ch->io_path_list, io_path, tailq);
589 
590 	io_path->nvme_ns = nvme_ns;
591 
592 	io_path->nbdev_ch = nbdev_ch;
593 	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);
594 
595 	nbdev_ch->current_io_path = NULL;
596 
597 	return 0;
598 }
599 
600 static void
601 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
602 {
603 	struct spdk_io_channel *ch;
604 
605 	nbdev_ch->current_io_path = NULL;
606 
607 	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
608 
609 	TAILQ_REMOVE(&io_path->ctrlr_ch->io_path_list, io_path, tailq);
610 	ch = spdk_io_channel_from_ctx(io_path->ctrlr_ch);
611 	spdk_put_io_channel(ch);
612 
613 	free(io_path);
614 }
615 
616 static void
617 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
618 {
619 	struct nvme_io_path *io_path, *tmp_io_path;
620 
621 	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
622 		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
623 	}
624 }
625 
626 static int
627 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
628 {
629 	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
630 	struct nvme_bdev *nbdev = io_device;
631 	struct nvme_ns *nvme_ns;
632 	int rc;
633 
634 	STAILQ_INIT(&nbdev_ch->io_path_list);
635 	TAILQ_INIT(&nbdev_ch->retry_io_list);
636 
637 	pthread_mutex_lock(&nbdev->mutex);
638 	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
639 		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
640 		if (rc != 0) {
641 			pthread_mutex_unlock(&nbdev->mutex);
642 
643 			_bdev_nvme_delete_io_paths(nbdev_ch);
644 			return rc;
645 		}
646 	}
647 	pthread_mutex_unlock(&nbdev->mutex);
648 
649 	return 0;
650 }
651 
652 static void
653 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
654 {
655 	struct spdk_bdev_io *bdev_io, *tmp_io;
656 
657 	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
658 		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
659 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
660 	}
661 
662 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
663 }
664 
665 static void
666 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
667 {
668 	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
669 
670 	bdev_nvme_abort_retry_ios(nbdev_ch);
671 	_bdev_nvme_delete_io_paths(nbdev_ch);
672 }
673 
674 static inline bool
675 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
676 {
677 	switch (io_type) {
678 	case SPDK_BDEV_IO_TYPE_RESET:
679 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
680 	case SPDK_BDEV_IO_TYPE_ABORT:
681 		return true;
682 	default:
683 		break;
684 	}
685 
686 	return false;
687 }
688 
689 static inline bool
690 nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
691 {
692 	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
693 		return false;
694 	}
695 
696 	switch (nvme_ns->ana_state) {
697 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
698 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
699 		return true;
700 	default:
701 		break;
702 	}
703 
704 	return false;
705 }
706 
707 static inline bool
708 nvme_io_path_is_connected(struct nvme_io_path *io_path)
709 {
710 	return io_path->ctrlr_ch->qpair != NULL;
711 }
712 
713 static inline bool
714 nvme_io_path_is_available(struct nvme_io_path *io_path)
715 {
716 	if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
717 		return false;
718 	}
719 
720 	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
721 		return false;
722 	}
723 
724 	return true;
725 }
726 
727 static inline bool
728 nvme_io_path_is_failed(struct nvme_io_path *io_path)
729 {
730 	struct nvme_ctrlr *nvme_ctrlr;
731 
732 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
733 
734 	if (nvme_ctrlr->destruct) {
735 		return true;
736 	}
737 
738 	/* In a full reset sequence, the ctrlr is set to unfailed only after all of its
739 	 * qpairs have been destroyed. Hence the ctrlr may still be marked failed even
740 	 * after a full reset sequence has started, so check the resetting flag first.
741 	 */
742 	if (nvme_ctrlr->resetting) {
743 		return false;
744 	}
745 
746 	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
747 		return true;
748 	} else {
749 		return false;
750 	}
751 }
752 
753 static bool
754 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
755 {
756 	if (nvme_ctrlr->destruct) {
757 		return false;
758 	}
759 
760 	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
761 		return false;
762 	}
763 
764 	if (nvme_ctrlr->resetting) {
765 		return false;
766 	}
767 
768 	return true;
769 }
770 
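/* Select the io_path for submission: reuse the cached path if one is set; otherwise pick
 * the first connected, ANA-optimized path (caching it), skipping paths whose ANA state is
 * being updated, and fall back to a connected non-optimized path if no optimized one exists.
 */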
771 static inline struct nvme_io_path *
772 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
773 {
774 	struct nvme_io_path *io_path, *non_optimized = NULL;
775 
776 	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
777 		return nbdev_ch->current_io_path;
778 	}
779 
780 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
781 		if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
782 			/* The device is currently resetting. */
783 			continue;
784 		}
785 
786 		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
787 			continue;
788 		}
789 
790 		switch (io_path->nvme_ns->ana_state) {
791 		case SPDK_NVME_ANA_OPTIMIZED_STATE:
792 			nbdev_ch->current_io_path = io_path;
793 			return io_path;
794 		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
795 			if (non_optimized == NULL) {
796 				non_optimized = io_path;
797 			}
798 			break;
799 		default:
800 			break;
801 		}
802 	}
803 
804 	return non_optimized;
805 }
806 
807 /* Return true if there is any io_path whose qpair is active or whose ctrlr is not
808  * failed, and false otherwise.
809  *
810  * If an io_path has an active qpair but find_io_path() returned NULL, its namespace
811  * is probably inaccessible now but may become accessible later.
812  *
813  * If an io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr is
814  * probably resetting now and the reset may still succeed. A ctrlr is marked unfailed
815  * when a reset starts and marked failed when the reset fails. Hence an unfailed ctrlr
816  * is either working fine or in the middle of a reset.
817  */
818 static bool
819 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
820 {
821 	struct nvme_io_path *io_path;
822 
823 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
824 		if (nvme_io_path_is_connected(io_path) ||
825 		    !nvme_io_path_is_failed(io_path)) {
826 			return true;
827 		}
828 	}
829 
830 	return false;
831 }
832 
833 static bool
834 any_ctrlr_may_become_available(struct nvme_bdev_channel *nbdev_ch)
835 {
836 	struct nvme_io_path *io_path;
837 
838 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
839 		if (!nvme_io_path_is_failed(io_path)) {
840 			return true;
841 		}
842 	}
843 
844 	return false;
845 }
846 
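/* Retry poller: resubmit every queued I/O whose retry deadline has passed, then re-arm
 * the poller for the next pending deadline, if any.
 */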
847 static int
848 bdev_nvme_retry_ios(void *arg)
849 {
850 	struct nvme_bdev_channel *nbdev_ch = arg;
851 	struct spdk_io_channel *ch = spdk_io_channel_from_ctx(nbdev_ch);
852 	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
853 	struct nvme_bdev_io *bio;
854 	uint64_t now, delay_us;
855 
856 	now = spdk_get_ticks();
857 
858 	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
859 		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
860 		if (bio->retry_ticks > now) {
861 			break;
862 		}
863 
864 		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
865 
866 		bdev_nvme_submit_request(ch, bdev_io);
867 	}
868 
869 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
870 
871 	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
872 	if (bdev_io != NULL) {
873 		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
874 
875 		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
876 
877 		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
878 					    delay_us);
879 	}
880 
881 	return SPDK_POLLER_BUSY;
882 }
883 
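/* Queue a failed I/O for retry after delay_ms, keeping retry_io_list sorted by expiration
 * ticks and re-arming the retry poller when the new I/O becomes the head of the list.
 */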
884 static void
885 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
886 			 struct nvme_bdev_io *bio, uint64_t delay_ms)
887 {
888 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
889 	struct spdk_bdev_io *tmp_bdev_io;
890 	struct nvme_bdev_io *tmp_bio;
891 
892 	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;
893 
894 	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
895 		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;
896 
897 		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
898 			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
899 					   module_link);
900 			return;
901 		}
902 	}
903 
904 	/* No earlier I/Os were found. This I/O must be the new head. */
905 	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);
906 
907 	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
908 
909 	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
910 				    delay_ms * 1000ULL);
911 }
912 
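/* Complete an I/O with its NVMe status, or queue it for retry when another attempt may
 * succeed. A DNR status or an exhausted bdev_retry_count completes immediately. Path and
 * ANA errors clear the cached io_path (and kick off an ANA log page refresh for ANA errors)
 * and retry without delay; other retryable errors use the controller's CRD-based delay.
 */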
913 static inline void
914 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
915 				  const struct spdk_nvme_cpl *cpl)
916 {
917 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
918 	struct nvme_bdev_channel *nbdev_ch;
919 	struct nvme_ctrlr *nvme_ctrlr;
920 	const struct spdk_nvme_ctrlr_data *cdata;
921 	uint64_t delay_ms;
922 
923 	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
924 
925 	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
926 		goto complete;
927 	}
928 
929 	if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 &&
930 				     bio->retry_count >= g_opts.bdev_retry_count)) {
931 		goto complete;
932 	}
933 
934 	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
935 
936 	assert(bio->io_path != NULL);
937 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(bio->io_path->ctrlr_ch);
938 
939 	if (spdk_nvme_cpl_is_path_error(cpl) ||
940 	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
941 	    !nvme_io_path_is_available(bio->io_path) ||
942 	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
943 		nbdev_ch->current_io_path = NULL;
944 		if (spdk_nvme_cpl_is_ana_error(cpl)) {
945 			bio->io_path->nvme_ns->ana_state_updating = true;
946 			nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
947 		}
948 		delay_ms = 0;
949 	} else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) {
950 		goto complete;
951 	} else {
952 		bio->retry_count++;
953 
954 		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
955 
956 		if (cpl->status.crd != 0) {
957 			delay_ms = cdata->crdt[cpl->status.crd] * 100;
958 		} else {
959 			delay_ms = 0;
960 		}
961 	}
962 
963 	if (any_io_path_may_become_available(nbdev_ch)) {
964 		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
965 		return;
966 	}
967 
968 complete:
969 	bio->retry_count = 0;
970 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
971 }
972 
973 static inline void
974 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
975 {
976 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
977 	struct nvme_bdev_channel *nbdev_ch;
978 	enum spdk_bdev_io_status io_status;
979 
980 	switch (rc) {
981 	case 0:
982 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
983 		break;
984 	case -ENOMEM:
985 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
986 		break;
987 	case -ENXIO:
988 		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
989 
990 		nbdev_ch->current_io_path = NULL;
991 
992 		if (any_io_path_may_become_available(nbdev_ch)) {
993 			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
994 			return;
995 		}
996 
997 	/* fallthrough */
998 	default:
999 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
1000 		break;
1001 	}
1002 
1003 	bio->retry_count = 0;
1004 	spdk_bdev_io_complete(bdev_io, io_status);
1005 }
1006 
1007 static inline void
1008 bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc)
1009 {
1010 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1011 	struct nvme_bdev_channel *nbdev_ch;
1012 	enum spdk_bdev_io_status io_status;
1013 
1014 	switch (rc) {
1015 	case 0:
1016 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1017 		break;
1018 	case -ENOMEM:
1019 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1020 		break;
1021 	case -ENXIO:
1022 		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1023 
1024 		if (any_ctrlr_may_become_available(nbdev_ch)) {
1025 			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
1026 			return;
1027 		}
1028 
1029 	/* fallthrough */
1030 	default:
1031 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
1032 		break;
1033 	}
1034 
1035 	bio->retry_count = 0;
1036 	spdk_bdev_io_complete(bdev_io, io_status);
1037 }
1038 
1039 static void
1040 _bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel *ctrlr_ch)
1041 {
1042 	struct nvme_io_path *io_path;
1043 
1044 	TAILQ_FOREACH(io_path, &ctrlr_ch->io_path_list, tailq) {
1045 		io_path->nbdev_ch->current_io_path = NULL;
1046 	}
1047 }
1048 
1049 static struct nvme_ctrlr_channel *
1050 nvme_poll_group_get_ctrlr_channel(struct nvme_poll_group *group,
1051 				  struct spdk_nvme_qpair *qpair)
1052 {
1053 	struct nvme_ctrlr_channel *ctrlr_ch;
1054 
1055 	TAILQ_FOREACH(ctrlr_ch, &group->ctrlr_ch_list, tailq) {
1056 		if (ctrlr_ch->qpair == qpair) {
1057 			break;
1058 		}
1059 	}
1060 
1061 	return ctrlr_ch;
1062 }
1063 
1064 static void
1065 bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
1066 {
1067 	if (ctrlr_ch->qpair != NULL) {
1068 		spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
1069 		ctrlr_ch->qpair = NULL;
1070 	}
1071 
1072 	_bdev_nvme_clear_io_path_cache(ctrlr_ch);
1073 }
1074 
1075 static void
1076 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
1077 {
1078 	struct nvme_poll_group *group = poll_group_ctx;
1079 	struct nvme_ctrlr_channel *ctrlr_ch;
1080 	struct nvme_ctrlr *nvme_ctrlr;
1081 
1082 	SPDK_NOTICELOG("qpair %p is disconnected, free the qpair and reset controller.\n", qpair);
1083 	/*
1084 	 * Free the I/O qpair and reset the nvme_ctrlr.
1085 	 */
1086 	ctrlr_ch = nvme_poll_group_get_ctrlr_channel(group, qpair);
1087 	if (ctrlr_ch != NULL) {
1088 		bdev_nvme_destroy_qpair(ctrlr_ch);
1089 
1090 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1091 		bdev_nvme_reset(nvme_ctrlr);
1092 	}
1093 }
1094 
1095 static int
1096 bdev_nvme_poll(void *arg)
1097 {
1098 	struct nvme_poll_group *group = arg;
1099 	int64_t num_completions;
1100 
1101 	if (group->collect_spin_stat && group->start_ticks == 0) {
1102 		group->start_ticks = spdk_get_ticks();
1103 	}
1104 
1105 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
1106 			  bdev_nvme_disconnected_qpair_cb);
1107 	if (group->collect_spin_stat) {
1108 		if (num_completions > 0) {
1109 			if (group->end_ticks != 0) {
1110 				group->spin_ticks += (group->end_ticks - group->start_ticks);
1111 				group->end_ticks = 0;
1112 			}
1113 			group->start_ticks = 0;
1114 		} else {
1115 			group->end_ticks = spdk_get_ticks();
1116 		}
1117 	}
1118 
1119 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
1120 }
1121 
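/* Periodic admin queue poller; a negative return from the driver indicates a ctrlr
 * failure, so trigger a failover.
 */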
1122 static int
1123 bdev_nvme_poll_adminq(void *arg)
1124 {
1125 	int32_t rc;
1126 	struct nvme_ctrlr *nvme_ctrlr = arg;
1127 
1128 	assert(nvme_ctrlr != NULL);
1129 
1130 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
1131 	if (rc < 0) {
1132 		bdev_nvme_failover(nvme_ctrlr, false);
1133 	}
1134 
1135 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
1136 }
1137 
1138 static void
1139 _bdev_nvme_unregister_dev_cb(void *io_device)
1140 {
1141 	struct nvme_bdev *nvme_disk = io_device;
1142 
1143 	free(nvme_disk->disk.name);
1144 	free(nvme_disk);
1145 }
1146 
1147 static int
1148 bdev_nvme_destruct(void *ctx)
1149 {
1150 	struct nvme_bdev *nvme_disk = ctx;
1151 	struct nvme_ns *nvme_ns, *tmp_nvme_ns;
1152 
1153 	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
1154 		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
1155 
1156 		nvme_ns->bdev = NULL;
1157 
1158 		assert(nvme_ns->id > 0);
1159 
1160 		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
1161 			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1162 
1163 			nvme_ctrlr_release(nvme_ns->ctrlr);
1164 			free(nvme_ns);
1165 		} else {
1166 			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1167 		}
1168 	}
1169 
1170 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1171 	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
1172 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1173 
1174 	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);
1175 
1176 	return 0;
1177 }
1178 
1179 static int
1180 bdev_nvme_flush(struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
1181 {
1182 	bdev_nvme_io_complete(bio, 0);
1183 
1184 	return 0;
1185 }
1186 
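/* Allocate an I/O qpair for this ctrlr channel, add it to the channel's poll group,
 * and connect it. On failure the qpair is freed and an error code is returned.
 */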
1187 static int
1188 bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
1189 {
1190 	struct nvme_ctrlr *nvme_ctrlr;
1191 	struct spdk_nvme_io_qpair_opts opts;
1192 	struct spdk_nvme_qpair *qpair;
1193 	int rc;
1194 
1195 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1196 
1197 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1198 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
1199 	opts.create_only = true;
1200 	opts.async_mode = true;
1201 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
1202 	g_opts.io_queue_requests = opts.io_queue_requests;
1203 
1204 	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1205 	if (qpair == NULL) {
1206 		return -1;
1207 	}
1208 
1209 	assert(ctrlr_ch->group != NULL);
1210 
1211 	rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair);
1212 	if (rc != 0) {
1213 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
1214 		goto err;
1215 	}
1216 
1217 	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
1218 	if (rc != 0) {
1219 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
1220 		goto err;
1221 	}
1222 
1223 	ctrlr_ch->qpair = qpair;
1224 
1225 	return 0;
1226 
1227 err:
1228 	spdk_nvme_ctrlr_free_io_qpair(qpair);
1229 
1230 	return rc;
1231 }
1232 
1233 static void
1234 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
1235 {
1236 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
1237 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
1238 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
1239 	struct spdk_bdev_io *bdev_io;
1240 
1241 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
1242 		status = SPDK_BDEV_IO_STATUS_FAILED;
1243 	}
1244 
1245 	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
1246 		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
1247 		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
1248 		spdk_bdev_io_complete(bdev_io, status);
1249 	}
1250 
1251 	spdk_for_each_channel_continue(i, 0);
1252 }
1253 
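/* Mark the active path as failed and, if an alternate trid exists, fail the ctrlr and
 * switch to the next path. The old trid is moved to the tail of the list for round robin,
 * unless removal was requested, in which case it is freed.
 */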
1254 static void
1255 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove)
1256 {
1257 	struct nvme_path_id *path_id, *next_path;
1258 	int rc __attribute__((unused));
1259 
1260 	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
1261 	assert(path_id);
1262 	assert(path_id == nvme_ctrlr->active_path_id);
1263 	next_path = TAILQ_NEXT(path_id, link);
1264 
1265 	path_id->is_failed = true;
1266 
1267 	if (next_path) {
1268 		assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
1269 
1270 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
1271 			       path_id->trid.trsvcid,	next_path->trid.traddr, next_path->trid.trsvcid);
1272 
1273 		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
1274 		nvme_ctrlr->active_path_id = next_path;
1275 		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
1276 		assert(rc == 0);
1277 		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
1278 		if (!remove) {
1279 			/** Shuffle the old trid to the end of the list and use the new one.
1280 			 * Allows for round robin through multiple connections.
1281 			 */
1282 			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
1283 		} else {
1284 			free(path_id);
1285 		}
1286 	}
1287 }
1288 
1289 static void
1290 _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
1291 {
1292 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1293 	bool success = spdk_io_channel_iter_get_ctx(i) == NULL;
1294 	struct nvme_path_id *path_id;
1295 	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
1296 	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;
1297 	bool complete_pending_destruct = false;
1298 
1299 	assert(nvme_ctrlr->thread == spdk_get_thread());
1300 
1301 	nvme_ctrlr->reset_cb_fn = NULL;
1302 	nvme_ctrlr->reset_cb_arg = NULL;
1303 
1304 	if (!success) {
1305 		SPDK_ERRLOG("Resetting controller failed.\n");
1306 	} else {
1307 		SPDK_NOTICELOG("Resetting controller successful.\n");
1308 	}
1309 
1310 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1311 	nvme_ctrlr->resetting = false;
1312 
1313 	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
1314 	assert(path_id != NULL);
1315 	assert(path_id == nvme_ctrlr->active_path_id);
1316 
1317 	path_id->is_failed = !success;
1318 
1319 	if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
1320 		/* Complete pending destruct after reset completes. */
1321 		complete_pending_destruct = true;
1322 	}
1323 
1324 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1325 
1326 	if (reset_cb_fn) {
1327 		reset_cb_fn(reset_cb_arg, success);
1328 	}
1329 
1330 	if (complete_pending_destruct) {
1331 		nvme_ctrlr_unregister(nvme_ctrlr);
1332 	}
1333 }
1334 
1335 static void
1336 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success)
1337 {
1338 	/* Make sure we clear any pending resets before returning. */
1339 	spdk_for_each_channel(nvme_ctrlr,
1340 			      bdev_nvme_complete_pending_resets,
1341 			      success ? NULL : (void *)0x1,
1342 			      _bdev_nvme_reset_complete);
1343 }
1344 
1345 static void
1346 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status)
1347 {
1348 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1349 
1350 	bdev_nvme_reset_complete(nvme_ctrlr, false);
1351 }
1352 
1353 static void
1354 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
1355 {
1356 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
1357 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
1358 
1359 	bdev_nvme_destroy_qpair(ctrlr_ch);
1360 
1361 	spdk_for_each_channel_continue(i, 0);
1362 }
1363 
1364 static void
1365 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
1366 {
1367 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1368 
1369 	if (status == 0) {
1370 		bdev_nvme_reset_complete(nvme_ctrlr, true);
1371 	} else {
1372 		/* Delete the newly created qpairs and quiesce the ctrlr to leave its state clean. */
1373 		spdk_for_each_channel(nvme_ctrlr,
1374 				      bdev_nvme_reset_destroy_qpair,
1375 				      NULL,
1376 				      bdev_nvme_reset_create_qpairs_failed);
1377 	}
1378 }
1379 
1380 static void
1381 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
1382 {
1383 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
1384 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
1385 	int rc;
1386 
1387 	rc = bdev_nvme_create_qpair(ctrlr_ch);
1388 
1389 	spdk_for_each_channel_continue(i, rc);
1390 }
1391 
1392 static int
1393 bdev_nvme_ctrlr_reset_poll(void *arg)
1394 {
1395 	struct nvme_ctrlr *nvme_ctrlr = arg;
1396 	int rc;
1397 
1398 	rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr);
1399 	if (rc == -EAGAIN) {
1400 		return SPDK_POLLER_BUSY;
1401 	}
1402 
1403 	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
1404 	if (rc == 0) {
1405 		/* Recreate all of the I/O queue pairs */
1406 		spdk_for_each_channel(nvme_ctrlr,
1407 				      bdev_nvme_reset_create_qpair,
1408 				      NULL,
1409 				      bdev_nvme_reset_create_qpairs_done);
1410 	} else {
1411 		bdev_nvme_reset_complete(nvme_ctrlr, false);
1412 	}
1413 	return SPDK_POLLER_BUSY;
1414 }
1415 
1416 static void
1417 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
1418 {
1419 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
1420 	int rc __attribute__((unused));
1421 
1422 	assert(status == 0);
1423 
1424 	/* Disconnect only fails if the ctrlr is already resetting or has been removed.
1425 	 * Neither case is possible here: this reset sequence is controlled, and the
1426 	 * hot-remove callback is invoked when the ctrlr is hot removed.
1427 	 */
1428 	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
1429 	assert(rc == 0);
1430 
1431 	spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr);
1432 
1433 	assert(nvme_ctrlr->reset_detach_poller == NULL);
1434 	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_ctrlr_reset_poll,
1435 					  nvme_ctrlr, 0);
1436 }
1437 
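/* Full reset sequence: prepare the ctrlr, destroy the I/O qpairs on every channel,
 * disconnect and asynchronously reconnect the ctrlr, recreate the qpairs, and finally
 * flush any pending reset requests via bdev_nvme_reset_complete().
 */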
1438 static void
1439 _bdev_nvme_reset(void *ctx)
1440 {
1441 	struct nvme_ctrlr *nvme_ctrlr = ctx;
1442 
1443 	assert(nvme_ctrlr->resetting == true);
1444 	assert(nvme_ctrlr->thread == spdk_get_thread());
1445 
1446 	spdk_nvme_ctrlr_prepare_for_reset(nvme_ctrlr->ctrlr);
1447 
1448 	/* First, delete all NVMe I/O queue pairs. */
1449 	spdk_for_each_channel(nvme_ctrlr,
1450 			      bdev_nvme_reset_destroy_qpair,
1451 			      NULL,
1452 			      bdev_nvme_reset_ctrlr);
1453 }
1454 
1455 static int
1456 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
1457 {
1458 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1459 	if (nvme_ctrlr->destruct) {
1460 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1461 		return -ENXIO;
1462 	}
1463 
1464 	if (nvme_ctrlr->resetting) {
1465 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1466 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
1467 		return -EBUSY;
1468 	}
1469 
1470 	nvme_ctrlr->resetting = true;
1471 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1472 
1473 	spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr);
1474 	return 0;
1475 }
1476 
1477 int
1478 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
1479 {
1480 	int rc;
1481 
1482 	rc = bdev_nvme_reset(nvme_ctrlr);
1483 	if (rc == 0) {
1484 		nvme_ctrlr->reset_cb_fn = cb_fn;
1485 		nvme_ctrlr->reset_cb_arg = cb_arg;
1486 	}
1487 	return rc;
1488 }
1489 
1490 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio);
1491 
1492 static void
1493 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio)
1494 {
1495 	enum spdk_bdev_io_status io_status;
1496 
1497 	if (bio->cpl.cdw0 == 0) {
1498 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1499 	} else {
1500 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
1501 	}
1502 
1503 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
1504 }
1505 
1506 static void
1507 _bdev_nvme_reset_io_continue(void *ctx)
1508 {
1509 	struct nvme_bdev_io *bio = ctx;
1510 	struct nvme_io_path *prev_io_path, *next_io_path;
1511 	int rc;
1512 
1513 	prev_io_path = bio->io_path;
1514 	bio->io_path = NULL;
1515 
1516 	if (bio->cpl.cdw0 != 0) {
1517 		goto complete;
1518 	}
1519 
1520 	next_io_path = STAILQ_NEXT(prev_io_path, stailq);
1521 	if (next_io_path == NULL) {
1522 		goto complete;
1523 	}
1524 
1525 	rc = _bdev_nvme_reset_io(next_io_path, bio);
1526 	if (rc == 0) {
1527 		return;
1528 	}
1529 
1530 	bio->cpl.cdw0 = 1;
1531 
1532 complete:
1533 	bdev_nvme_reset_io_complete(bio);
1534 }
1535 
1536 static void
1537 bdev_nvme_reset_io_continue(void *cb_arg, bool success)
1538 {
1539 	struct nvme_bdev_io *bio = cb_arg;
1540 
1541 	bio->cpl.cdw0 = !success;
1542 
1543 	spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio);
1544 }
1545 
1546 static int
1547 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio)
1548 {
1549 	struct nvme_ctrlr_channel *ctrlr_ch = io_path->ctrlr_ch;
1550 	struct nvme_ctrlr *nvme_ctrlr;
1551 	struct spdk_bdev_io *bdev_io;
1552 	int rc;
1553 
1554 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
1555 
1556 	rc = bdev_nvme_reset(nvme_ctrlr);
1557 	if (rc == 0) {
1558 		assert(bio->io_path == NULL);
1559 		bio->io_path = io_path;
1560 
1561 		assert(nvme_ctrlr->reset_cb_fn == NULL);
1562 		assert(nvme_ctrlr->reset_cb_arg == NULL);
1563 		nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue;
1564 		nvme_ctrlr->reset_cb_arg = bio;
1565 	} else if (rc == -EBUSY) {
1566 		/*
1567 		 * A reset request is queued only when it comes from the app framework. This is deliberate:
1568 		 * we defer to the upper layer's reset strategy rather than interfere with it, so if a reset
1569 		 * is already in progress we do not try to schedule another one.
1570 		 */
1571 		bdev_io = spdk_bdev_io_from_ctx(bio);
1572 		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
1573 	} else {
1574 		return rc;
1575 	}
1576 
1577 	return 0;
1578 }
1579 
1580 static void
1581 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
1582 {
1583 	struct nvme_io_path *io_path;
1584 	int rc;
1585 
1586 	bio->cpl.cdw0 = 0;
1587 	bio->orig_thread = spdk_get_thread();
1588 
1589 	/* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now.
1590 	 *
1591 	 * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially.
1592 	 * This will be done in the following patches.
1593 	 */
1594 	io_path = STAILQ_FIRST(&nbdev_ch->io_path_list);
1595 	assert(io_path != NULL);
1596 
1597 	rc = _bdev_nvme_reset_io(io_path, bio);
1598 	if (rc != 0) {
1599 		bio->cpl.cdw0 = 1;
1600 		bdev_nvme_reset_io_complete(bio);
1601 	}
1602 }
1603 
1604 static int
1605 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
1606 {
1607 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1608 	if (nvme_ctrlr->destruct) {
1609 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1610 		/* Don't bother resetting if the controller is in the process of being destructed. */
1611 		return -ENXIO;
1612 	}
1613 
1614 	if (nvme_ctrlr->resetting) {
1615 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1616 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
1617 		return -EBUSY;
1618 	}
1619 
1620 	bdev_nvme_failover_trid(nvme_ctrlr, remove);
1621 
1622 	nvme_ctrlr->resetting = true;
1623 
1624 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1625 
1626 	spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr);
1627 	return 0;
1628 }
1629 
1630 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks,
1631 			   uint64_t num_blocks);
1632 
1633 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks,
1634 				  uint64_t num_blocks);
1635 
1636 static void
1637 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1638 		     bool success)
1639 {
1640 	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1641 	struct spdk_bdev *bdev = bdev_io->bdev;
1642 	int ret;
1643 
1644 	if (!success) {
1645 		ret = -EINVAL;
1646 		goto exit;
1647 	}
1648 
1649 	if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
1650 		ret = -ENXIO;
1651 		goto exit;
1652 	}
1653 
1654 	ret = bdev_nvme_readv(bio,
1655 			      bdev_io->u.bdev.iovs,
1656 			      bdev_io->u.bdev.iovcnt,
1657 			      bdev_io->u.bdev.md_buf,
1658 			      bdev_io->u.bdev.num_blocks,
1659 			      bdev_io->u.bdev.offset_blocks,
1660 			      bdev->dif_check_flags,
1661 			      bdev_io->internal.ext_opts);
1662 
1663 exit:
1664 	if (spdk_unlikely(ret != 0)) {
1665 		bdev_nvme_io_complete(bio, ret);
1666 	}
1667 }
1668 
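/* Main submission entry point: resolve an io_path for the request (admin-class I/O such
 * as reset, NVMe admin passthru, and abort proceeds without one) and dispatch by bdev I/O
 * type. Any synchronous error is reported through bdev_nvme_io_complete().
 */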
1669 static void
1670 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
1671 {
1672 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
1673 	struct spdk_bdev *bdev = bdev_io->bdev;
1674 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1675 	struct nvme_bdev_io *nbdev_io_to_abort;
1676 	int rc = 0;
1677 
1678 	nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
1679 	if (spdk_unlikely(!nbdev_io->io_path)) {
1680 		if (!bdev_nvme_io_type_is_admin(bdev_io->type)) {
1681 			rc = -ENXIO;
1682 			goto exit;
1683 		}
1684 
1685 		/* Admin commands do not use the optimal I/O path.
1686 		 * Simply fall through even if it is not found.
1687 		 */
1688 	}
1689 
1690 	switch (bdev_io->type) {
1691 	case SPDK_BDEV_IO_TYPE_READ:
1692 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
1693 			rc = bdev_nvme_readv(nbdev_io,
1694 					     bdev_io->u.bdev.iovs,
1695 					     bdev_io->u.bdev.iovcnt,
1696 					     bdev_io->u.bdev.md_buf,
1697 					     bdev_io->u.bdev.num_blocks,
1698 					     bdev_io->u.bdev.offset_blocks,
1699 					     bdev->dif_check_flags,
1700 					     bdev_io->internal.ext_opts);
1701 		} else {
1702 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
1703 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
1704 			rc = 0;
1705 		}
1706 		break;
1707 	case SPDK_BDEV_IO_TYPE_WRITE:
1708 		rc = bdev_nvme_writev(nbdev_io,
1709 				      bdev_io->u.bdev.iovs,
1710 				      bdev_io->u.bdev.iovcnt,
1711 				      bdev_io->u.bdev.md_buf,
1712 				      bdev_io->u.bdev.num_blocks,
1713 				      bdev_io->u.bdev.offset_blocks,
1714 				      bdev->dif_check_flags,
1715 				      bdev_io->internal.ext_opts);
1716 		break;
1717 	case SPDK_BDEV_IO_TYPE_COMPARE:
1718 		rc = bdev_nvme_comparev(nbdev_io,
1719 					bdev_io->u.bdev.iovs,
1720 					bdev_io->u.bdev.iovcnt,
1721 					bdev_io->u.bdev.md_buf,
1722 					bdev_io->u.bdev.num_blocks,
1723 					bdev_io->u.bdev.offset_blocks,
1724 					bdev->dif_check_flags);
1725 		break;
1726 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
1727 		rc = bdev_nvme_comparev_and_writev(nbdev_io,
1728 						   bdev_io->u.bdev.iovs,
1729 						   bdev_io->u.bdev.iovcnt,
1730 						   bdev_io->u.bdev.fused_iovs,
1731 						   bdev_io->u.bdev.fused_iovcnt,
1732 						   bdev_io->u.bdev.md_buf,
1733 						   bdev_io->u.bdev.num_blocks,
1734 						   bdev_io->u.bdev.offset_blocks,
1735 						   bdev->dif_check_flags);
1736 		break;
1737 	case SPDK_BDEV_IO_TYPE_UNMAP:
1738 		rc = bdev_nvme_unmap(nbdev_io,
1739 				     bdev_io->u.bdev.offset_blocks,
1740 				     bdev_io->u.bdev.num_blocks);
1741 		break;
1742 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1743 		rc =  bdev_nvme_write_zeroes(nbdev_io,
1744 					     bdev_io->u.bdev.offset_blocks,
1745 					     bdev_io->u.bdev.num_blocks);
1746 		break;
1747 	case SPDK_BDEV_IO_TYPE_RESET:
1748 		nbdev_io->io_path = NULL;
1749 		bdev_nvme_reset_io(nbdev_ch, nbdev_io);
1750 		break;
1751 	case SPDK_BDEV_IO_TYPE_FLUSH:
1752 		rc = bdev_nvme_flush(nbdev_io,
1753 				     bdev_io->u.bdev.offset_blocks,
1754 				     bdev_io->u.bdev.num_blocks);
1755 		break;
1756 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
1757 		rc = bdev_nvme_zone_appendv(nbdev_io,
1758 					    bdev_io->u.bdev.iovs,
1759 					    bdev_io->u.bdev.iovcnt,
1760 					    bdev_io->u.bdev.md_buf,
1761 					    bdev_io->u.bdev.num_blocks,
1762 					    bdev_io->u.bdev.offset_blocks,
1763 					    bdev->dif_check_flags);
1764 		break;
1765 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
1766 		rc = bdev_nvme_get_zone_info(nbdev_io,
1767 					     bdev_io->u.zone_mgmt.zone_id,
1768 					     bdev_io->u.zone_mgmt.num_zones,
1769 					     bdev_io->u.zone_mgmt.buf);
1770 		break;
1771 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
1772 		rc = bdev_nvme_zone_management(nbdev_io,
1773 					       bdev_io->u.zone_mgmt.zone_id,
1774 					       bdev_io->u.zone_mgmt.zone_action);
1775 		break;
1776 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1777 		nbdev_io->io_path = NULL;
1778 		bdev_nvme_admin_passthru(nbdev_ch,
1779 					 nbdev_io,
1780 					 &bdev_io->u.nvme_passthru.cmd,
1781 					 bdev_io->u.nvme_passthru.buf,
1782 					 bdev_io->u.nvme_passthru.nbytes);
1783 		break;
1784 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1785 		rc = bdev_nvme_io_passthru(nbdev_io,
1786 					   &bdev_io->u.nvme_passthru.cmd,
1787 					   bdev_io->u.nvme_passthru.buf,
1788 					   bdev_io->u.nvme_passthru.nbytes);
1789 		break;
1790 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1791 		rc = bdev_nvme_io_passthru_md(nbdev_io,
1792 					      &bdev_io->u.nvme_passthru.cmd,
1793 					      bdev_io->u.nvme_passthru.buf,
1794 					      bdev_io->u.nvme_passthru.nbytes,
1795 					      bdev_io->u.nvme_passthru.md_buf,
1796 					      bdev_io->u.nvme_passthru.md_len);
1797 		break;
1798 	case SPDK_BDEV_IO_TYPE_ABORT:
1799 		nbdev_io->io_path = NULL;
1800 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
1801 		bdev_nvme_abort(nbdev_ch,
1802 				nbdev_io,
1803 				nbdev_io_to_abort);
1804 		break;
1805 	default:
1806 		rc = -EINVAL;
1807 		break;
1808 	}
1809 
1810 exit:
1811 	if (spdk_unlikely(rc != 0)) {
1812 		bdev_nvme_io_complete(nbdev_io, rc);
1813 	}
1814 }
1815 
1816 static bool
1817 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
1818 {
1819 	struct nvme_bdev *nbdev = ctx;
1820 	struct nvme_ns *nvme_ns;
1821 	struct spdk_nvme_ns *ns;
1822 	struct spdk_nvme_ctrlr *ctrlr;
1823 	const struct spdk_nvme_ctrlr_data *cdata;
1824 
1825 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
1826 	assert(nvme_ns != NULL);
1827 	ns = nvme_ns->ns;
1828 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1829 
1830 	switch (io_type) {
1831 	case SPDK_BDEV_IO_TYPE_READ:
1832 	case SPDK_BDEV_IO_TYPE_WRITE:
1833 	case SPDK_BDEV_IO_TYPE_RESET:
1834 	case SPDK_BDEV_IO_TYPE_FLUSH:
1835 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1836 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1837 	case SPDK_BDEV_IO_TYPE_ABORT:
1838 		return true;
1839 
1840 	case SPDK_BDEV_IO_TYPE_COMPARE:
1841 		return spdk_nvme_ns_supports_compare(ns);
1842 
1843 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1844 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
1845 
1846 	case SPDK_BDEV_IO_TYPE_UNMAP:
1847 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1848 		return cdata->oncs.dsm;
1849 
1850 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1851 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1852 		return cdata->oncs.write_zeroes;
1853 
1854 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
1855 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
1856 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
1857 			return true;
1858 		}
1859 		return false;
1860 
1861 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
1862 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
1863 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
1864 
1865 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
1866 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
1867 		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
1868 
1869 	default:
1870 		return false;
1871 	}
1872 }
1873 
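/* Per-thread channel constructor for a nvme_ctrlr: attach the controller
 * channel to this thread's poll group and create an I/O qpair on it.
 */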
1874 static int
1875 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
1876 {
1877 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
1878 	struct spdk_io_channel *pg_ch;
1879 	int rc;
1880 
1881 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
1882 	if (!pg_ch) {
1883 		return -1;
1884 	}
1885 
1886 	ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch);
1887 	TAILQ_INSERT_TAIL(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq);
1888 
1889 #ifdef SPDK_CONFIG_VTUNE
1890 	ctrlr_ch->group->collect_spin_stat = true;
1891 #else
1892 	ctrlr_ch->group->collect_spin_stat = false;
1893 #endif
1894 
1895 	TAILQ_INIT(&ctrlr_ch->pending_resets);
1896 	TAILQ_INIT(&ctrlr_ch->io_path_list);
1897 
1898 	rc = bdev_nvme_create_qpair(ctrlr_ch);
1899 	if (rc != 0) {
1900 		goto err_qpair;
1901 	}
1902 
1903 	return 0;
1904 
1905 err_qpair:
1906 	spdk_put_io_channel(pg_ch);
1907 
1908 	return rc;
1909 }
1910 
1911 static void
1912 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
1913 {
1914 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
1915 
1916 	assert(ctrlr_ch->group != NULL);
1917 
1918 	bdev_nvme_destroy_qpair(ctrlr_ch);
1919 
1920 	TAILQ_REMOVE(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq);
1921 
1922 	spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group));
1923 }
1924 
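/* Registered in g_bdev_nvme_accel_fn_table so that the NVMe driver can offload
 * CRC-32C calculation (used, for example, for NVMe/TCP data digests) to the
 * accel framework through the poll group's accel channel.
 */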
1925 static void
1926 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
1927 			      uint32_t iov_cnt, uint32_t seed,
1928 			      spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
1929 {
1930 	struct nvme_poll_group *group = ctx;
1931 	int rc;
1932 
1933 	assert(group->accel_channel != NULL);
1934 	assert(cb_fn != NULL);
1935 
1936 	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
1937 	if (rc) {
1938 		/* In these two error cases, spdk_accel_submit_crc32cv() does not invoke the user's cb_fn, so call it here. */
1939 		if (rc == -ENOMEM || rc == -EINVAL) {
1940 			cb_fn(cb_arg, rc);
1941 		}
1942 		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
1943 	}
1944 }
1945 
1946 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
1947 	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
1948 	.submit_accel_crc32c	= bdev_nvme_submit_accel_crc32c,
1949 };
1950 
1951 static int
1952 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
1953 {
1954 	struct nvme_poll_group *group = ctx_buf;
1955 
1956 	TAILQ_INIT(&group->ctrlr_ch_list);
1957 
1958 	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
1959 	if (group->group == NULL) {
1960 		return -1;
1961 	}
1962 
1963 	group->accel_channel = spdk_accel_engine_get_io_channel();
1964 	if (!group->accel_channel) {
1965 		spdk_nvme_poll_group_destroy(group->group);
1966 		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
1967 			    group);
1968 		return -1;
1969 	}
1970 
1971 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
1972 
1973 	if (group->poller == NULL) {
1974 		spdk_put_io_channel(group->accel_channel);
1975 		spdk_nvme_poll_group_destroy(group->group);
1976 		return -1;
1977 	}
1978 
1979 	return 0;
1980 }
1981 
1982 static void
1983 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
1984 {
1985 	struct nvme_poll_group *group = ctx_buf;
1986 
1987 	assert(TAILQ_EMPTY(&group->ctrlr_ch_list));
1988 
1989 	if (group->accel_channel) {
1990 		spdk_put_io_channel(group->accel_channel);
1991 	}
1992 
1993 	spdk_poller_unregister(&group->poller);
1994 	if (spdk_nvme_poll_group_destroy(group->group)) {
1995 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
1996 		assert(false);
1997 	}
1998 }
1999 
2000 static struct spdk_io_channel *
2001 bdev_nvme_get_io_channel(void *ctx)
2002 {
2003 	struct nvme_bdev *nvme_bdev = ctx;
2004 
2005 	return spdk_get_io_channel(nvme_bdev);
2006 }
2007 
2008 static void *
2009 bdev_nvme_get_module_ctx(void *ctx)
2010 {
2011 	struct nvme_bdev *nvme_bdev = ctx;
2012 	struct nvme_ns *nvme_ns;
2013 
2014 	if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) {
2015 		return NULL;
2016 	}
2017 
2018 	nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list);
2019 	if (!nvme_ns) {
2020 		return NULL;
2021 	}
2022 
2023 	return nvme_ns->ns;
2024 }
2025 
2026 static const char *
2027 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
2028 {
2029 	switch (ana_state) {
2030 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
2031 		return "optimized";
2032 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
2033 		return "non_optimized";
2034 	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
2035 		return "inaccessible";
2036 	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
2037 		return "persistent_loss";
2038 	case SPDK_NVME_ANA_CHANGE_STATE:
2039 		return "change";
2040 	default:
2041 		return NULL;
2042 	}
2043 }
2044 
2045 static int
2046 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
2047 {
2048 	struct nvme_bdev *nbdev = ctx;
2049 	struct nvme_ns *nvme_ns;
2050 
2051 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
2052 	assert(nvme_ns != NULL);
2053 
2054 	return spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, domains, array_size);
2055 }
2056 
2057 static void
2058 nvme_namespace_info_json(struct spdk_json_write_ctx *w,
2059 			 struct nvme_ns *nvme_ns)
2060 {
2061 	struct spdk_nvme_ns *ns;
2062 	struct spdk_nvme_ctrlr *ctrlr;
2063 	const struct spdk_nvme_ctrlr_data *cdata;
2064 	const struct spdk_nvme_transport_id *trid;
2065 	union spdk_nvme_vs_register vs;
2066 	char buf[128];
2067 
2068 	ns = nvme_ns->ns;
2069 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2070 
2071 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2072 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
2073 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
2074 
2075 	spdk_json_write_object_begin(w);
2076 
2077 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2078 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
2079 	}
2080 
2081 	spdk_json_write_named_object_begin(w, "trid");
2082 
2083 	nvme_bdev_dump_trid_json(trid, w);
2084 
2085 	spdk_json_write_object_end(w);
2086 
2087 #ifdef SPDK_CONFIG_NVME_CUSE
2088 	size_t cuse_name_size = 128;
2089 	char cuse_name[cuse_name_size];
2090 
2091 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
2092 					    cuse_name, &cuse_name_size);
2093 	if (rc == 0) {
2094 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
2095 	}
2096 #endif
2097 
2098 	spdk_json_write_named_object_begin(w, "ctrlr_data");
2099 
2100 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
2101 
2102 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
2103 	spdk_str_trim(buf);
2104 	spdk_json_write_named_string(w, "model_number", buf);
2105 
2106 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
2107 	spdk_str_trim(buf);
2108 	spdk_json_write_named_string(w, "serial_number", buf);
2109 
2110 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
2111 	spdk_str_trim(buf);
2112 	spdk_json_write_named_string(w, "firmware_revision", buf);
2113 
2114 	if (cdata->subnqn[0] != '\0') {
2115 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
2116 	}
2117 
2118 	spdk_json_write_named_object_begin(w, "oacs");
2119 
2120 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
2121 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
2122 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
2123 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
2124 
2125 	spdk_json_write_object_end(w);
2126 
2127 	spdk_json_write_object_end(w);
2128 
2129 	spdk_json_write_named_object_begin(w, "vs");
2130 
2131 	spdk_json_write_name(w, "nvme_version");
2132 	if (vs.bits.ter) {
2133 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
2134 	} else {
2135 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
2136 	}
2137 
2138 	spdk_json_write_object_end(w);
2139 
2140 	spdk_json_write_named_object_begin(w, "ns_data");
2141 
2142 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
2143 
2144 	if (cdata->cmic.ana_reporting) {
2145 		spdk_json_write_named_string(w, "ana_state",
2146 					     _nvme_ana_state_str(nvme_ns->ana_state));
2147 	}
2148 
2149 	spdk_json_write_object_end(w);
2150 
2151 	if (cdata->oacs.security) {
2152 		spdk_json_write_named_object_begin(w, "security");
2153 
2154 		spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal);
2155 
2156 		spdk_json_write_object_end(w);
2157 	}
2158 
2159 	spdk_json_write_object_end(w);
2160 }
2161 
2162 static int
2163 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
2164 {
2165 	struct nvme_bdev *nvme_bdev = ctx;
2166 	struct nvme_ns *nvme_ns;
2167 
2168 	pthread_mutex_lock(&nvme_bdev->mutex);
2169 	spdk_json_write_named_array_begin(w, "nvme");
2170 	TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) {
2171 		nvme_namespace_info_json(w, nvme_ns);
2172 	}
2173 	spdk_json_write_array_end(w);
2174 	pthread_mutex_unlock(&nvme_bdev->mutex);
2175 
2176 	return 0;
2177 }
2178 
2179 static void
2180 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
2181 {
2182 	/* No config per bdev needed */
2183 }
2184 
2185 static uint64_t
2186 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
2187 {
2188 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
2189 	struct nvme_io_path *io_path;
2190 	struct nvme_poll_group *group;
2191 	uint64_t spin_time = 0;
2192 
2193 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
2194 		group = io_path->ctrlr_ch->group;
2195 
2196 		if (!group || !group->collect_spin_stat) {
2197 			continue;
2198 		}
2199 
2200 		if (group->end_ticks != 0) {
2201 			group->spin_ticks += (group->end_ticks - group->start_ticks);
2202 			group->end_ticks = 0;
2203 		}
2204 
2205 		spin_time += group->spin_ticks;
2206 		group->start_ticks = 0;
2207 		group->spin_ticks = 0;
2208 	}
2209 
2210 	return (spin_time * 1000000ULL) / spdk_get_ticks_hz();
2211 }
2212 
2213 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
2214 	.destruct		= bdev_nvme_destruct,
2215 	.submit_request		= bdev_nvme_submit_request,
2216 	.io_type_supported	= bdev_nvme_io_type_supported,
2217 	.get_io_channel		= bdev_nvme_get_io_channel,
2218 	.dump_info_json		= bdev_nvme_dump_info_json,
2219 	.write_config_json	= bdev_nvme_write_config_json,
2220 	.get_spin_time		= bdev_nvme_get_spin_time,
2221 	.get_module_ctx		= bdev_nvme_get_module_ctx,
2222 	.get_memory_domains	= bdev_nvme_get_memory_domains,
2223 };
2224 
2225 typedef int (*bdev_nvme_parse_ana_log_page_cb)(
2226 	const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);
2227 
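/* Iterate over the cached ANA log page and call cb_fn for each ANA group
 * descriptor.  Each descriptor is copied into copied_ana_desc before use
 * because descriptors in the log page are not guaranteed to be aligned.
 * Iteration stops early if cb_fn returns non-zero.
 */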
2228 static int
2229 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
2230 			     bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
2231 {
2232 	struct spdk_nvme_ana_group_descriptor *copied_desc;
2233 	uint8_t *orig_desc;
2234 	uint32_t i, desc_size, copy_len;
2235 	int rc = 0;
2236 
2237 	if (nvme_ctrlr->ana_log_page == NULL) {
2238 		return -EINVAL;
2239 	}
2240 
2241 	copied_desc = nvme_ctrlr->copied_ana_desc;
2242 
2243 	orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
2244 	copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page);
2245 
2246 	for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
2247 		memcpy(copied_desc, orig_desc, copy_len);
2248 
2249 		rc = cb_fn(copied_desc, cb_arg);
2250 		if (rc != 0) {
2251 			break;
2252 		}
2253 
2254 		desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
2255 			    copied_desc->num_of_nsid * sizeof(uint32_t);
2256 		orig_desc += desc_size;
2257 		copy_len -= desc_size;
2258 	}
2259 
2260 	return rc;
2261 }
2262 
2263 static int
2264 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
2265 {
2266 	struct nvme_ns *nvme_ns = cb_arg;
2267 	uint32_t i;
2268 
2269 	for (i = 0; i < desc->num_of_nsid; i++) {
2270 		if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
2271 			continue;
2272 		}
2273 		nvme_ns->ana_group_id = desc->ana_group_id;
2274 		nvme_ns->ana_state = desc->ana_state;
2275 		return 1;
2276 	}
2277 
2278 	return 0;
2279 }
2280 
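/* Fill in the spdk_bdev structure for a namespace: product name, block
 * geometry, UUID/NGUID, write cache, atomic and physical block sizes,
 * metadata/PI settings, and, for ZNS namespaces, the zone parameters.
 */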
2281 static int
2282 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
2283 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
2284 		 uint32_t prchk_flags, void *ctx)
2285 {
2286 	const struct spdk_uuid		*uuid;
2287 	const uint8_t *nguid;
2288 	const struct spdk_nvme_ctrlr_data *cdata;
2289 	const struct spdk_nvme_ns_data	*nsdata;
2290 	enum spdk_nvme_csi		csi;
2291 	uint32_t atomic_bs, phys_bs, bs;
2292 
2293 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2294 	csi = spdk_nvme_ns_get_csi(ns);
2295 
2296 	switch (csi) {
2297 	case SPDK_NVME_CSI_NVM:
2298 		disk->product_name = "NVMe disk";
2299 		break;
2300 	case SPDK_NVME_CSI_ZNS:
2301 		disk->product_name = "NVMe ZNS disk";
2302 		disk->zoned = true;
2303 		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
2304 		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
2305 					     spdk_nvme_ns_get_extended_sector_size(ns);
2306 		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
2307 		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
2308 		break;
2309 	default:
2310 		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
2311 		return -ENOTSUP;
2312 	}
2313 
2314 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
2315 	if (!disk->name) {
2316 		return -ENOMEM;
2317 	}
2318 
2319 	disk->write_cache = 0;
2320 	if (cdata->vwc.present) {
2321 		/* Enable if the Volatile Write Cache exists */
2322 		disk->write_cache = 1;
2323 	}
2324 	if (cdata->oncs.write_zeroes) {
2325 		disk->max_write_zeroes = UINT16_MAX + 1;
2326 	}
2327 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
2328 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
2329 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
2330 
2331 	nguid = spdk_nvme_ns_get_nguid(ns);
2332 	if (!nguid) {
2333 		uuid = spdk_nvme_ns_get_uuid(ns);
2334 		if (uuid) {
2335 			disk->uuid = *uuid;
2336 		}
2337 	} else {
2338 		memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
2339 	}
2340 
2341 	nsdata = spdk_nvme_ns_get_data(ns);
2342 	bs = spdk_nvme_ns_get_sector_size(ns);
2343 	atomic_bs = bs;
2344 	phys_bs = bs;
2345 	if (nsdata->nabo == 0) {
2346 		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
2347 			atomic_bs = bs * (1 + nsdata->nawupf);
2348 		} else {
2349 			atomic_bs = bs * (1 + cdata->awupf);
2350 		}
2351 	}
2352 	if (nsdata->nsfeat.optperf) {
2353 		phys_bs = bs * (1 + nsdata->npwg);
2354 	}
2355 	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
2356 
2357 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
2358 	if (disk->md_len != 0) {
2359 		disk->md_interleave = nsdata->flbas.extended;
2360 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
2361 		if (disk->dif_type != SPDK_DIF_DISABLE) {
2362 			disk->dif_is_head_of_md = nsdata->dps.md_start;
2363 			disk->dif_check_flags = prchk_flags;
2364 		}
2365 	}
2366 
2367 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
2368 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
2369 		disk->acwu = 0;
2370 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
2371 		disk->acwu = nsdata->nacwu;
2372 	} else {
2373 		disk->acwu = cdata->acwu;
2374 	}
2375 
2376 	disk->ctxt = ctx;
2377 	disk->fn_table = &nvmelib_fn_table;
2378 	disk->module = &nvme_if;
2379 
2380 	return 0;
2381 }
2382 
2383 static int
2384 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2385 {
2386 	struct nvme_bdev *bdev;
2387 	int rc;
2388 
2389 	bdev = calloc(1, sizeof(*bdev));
2390 	if (!bdev) {
2391 		SPDK_ERRLOG("bdev calloc() failed\n");
2392 		return -ENOMEM;
2393 	}
2394 
2395 	rc = pthread_mutex_init(&bdev->mutex, NULL);
2396 	if (rc != 0) {
2397 		free(bdev);
2398 		return rc;
2399 	}
2400 
2401 	bdev->ref = 1;
2402 	TAILQ_INIT(&bdev->nvme_ns_list);
2403 	TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
2404 	bdev->opal = nvme_ctrlr->opal_dev != NULL;
2405 
2406 	rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr,
2407 			      nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev);
2408 	if (rc != 0) {
2409 		SPDK_ERRLOG("Failed to create NVMe disk\n");
2410 		pthread_mutex_destroy(&bdev->mutex);
2411 		free(bdev);
2412 		return rc;
2413 	}
2414 
2415 	spdk_io_device_register(bdev,
2416 				bdev_nvme_create_bdev_channel_cb,
2417 				bdev_nvme_destroy_bdev_channel_cb,
2418 				sizeof(struct nvme_bdev_channel),
2419 				bdev->disk.name);
2420 
2421 	rc = spdk_bdev_register(&bdev->disk);
2422 	if (rc != 0) {
2423 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
2424 		spdk_io_device_unregister(bdev, NULL);
2425 		pthread_mutex_destroy(&bdev->mutex);
2426 		free(bdev->disk.name);
2427 		free(bdev);
2428 		return rc;
2429 	}
2430 
2431 	nvme_ns->bdev = bdev;
2432 	bdev->nsid = nvme_ns->id;
2433 
2434 	bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr;
2435 	TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq);
2436 
2437 	return 0;
2438 }
2439 
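/* Two namespaces are treated as the same namespace, and may be grouped into a
 * single multipath nvme_bdev, only if their NGUID, EUI64, and UUID all match.
 */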
2440 static bool
2441 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
2442 {
2443 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
2444 	const struct spdk_uuid *uuid1, *uuid2;
2445 
2446 	nsdata1 = spdk_nvme_ns_get_data(ns1);
2447 	nsdata2 = spdk_nvme_ns_get_data(ns2);
2448 	uuid1 = spdk_nvme_ns_get_uuid(ns1);
2449 	uuid2 = spdk_nvme_ns_get_uuid(ns2);
2450 
2451 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
2452 	       nsdata1->eui64 == nsdata2->eui64 &&
2453 	       uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0;
2454 }
2455 
2456 static bool
2457 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2458 		 struct spdk_nvme_ctrlr_opts *opts)
2459 {
2460 	struct nvme_probe_skip_entry *entry;
2461 
2462 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
2463 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2464 			return false;
2465 		}
2466 	}
2467 
2468 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
2469 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
2470 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
2471 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
2472 	opts->disable_read_ana_log_page = true;
2473 
2474 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
2475 
2476 	return true;
2477 }
2478 
2479 static void
2480 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
2481 {
2482 	struct nvme_ctrlr *nvme_ctrlr = ctx;
2483 
2484 	if (spdk_nvme_cpl_is_error(cpl)) {
2485 		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
2486 			     cpl->status.sct);
2487 		bdev_nvme_reset(nvme_ctrlr);
2488 	} else if (cpl->cdw0 & 0x1) {
2489 		SPDK_WARNLOG("Specified command could not be aborted.\n");
2490 		bdev_nvme_reset(nvme_ctrlr);
2491 	}
2492 }
2493 
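/* Command timeout handler registered with the NVMe driver.  A controller that
 * reports Controller Fatal Status is reset immediately; otherwise the
 * configured action_on_timeout (abort, reset, or none) is applied.
 */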
2494 static void
2495 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
2496 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
2497 {
2498 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
2499 	union spdk_nvme_csts_register csts;
2500 	int rc;
2501 
2502 	assert(nvme_ctrlr->ctrlr == ctrlr);
2503 
2504 	SPDK_WARNLOG("Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
2505 
2506 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
2507 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
2508 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
2509 	 * completion recursively.
2510 	 */
2511 	if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
2512 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
2513 		if (csts.bits.cfs) {
2514 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
2515 			bdev_nvme_reset(nvme_ctrlr);
2516 			return;
2517 		}
2518 	}
2519 
2520 	switch (g_opts.action_on_timeout) {
2521 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
2522 		if (qpair) {
2523 			/* Don't send abort to ctrlr when ctrlr is not available. */
2524 			pthread_mutex_lock(&nvme_ctrlr->mutex);
2525 			if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
2526 				pthread_mutex_unlock(&nvme_ctrlr->mutex);
2527 				SPDK_NOTICELOG("Skipping abort; the controller is not available.\n");
2528 				return;
2529 			}
2530 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
2531 
2532 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
2533 						       nvme_abort_cpl, nvme_ctrlr);
2534 			if (rc == 0) {
2535 				return;
2536 			}
2537 
2538 			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
2539 		}
2540 
2541 	/* FALLTHROUGH */
2542 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
2543 		bdev_nvme_reset(nvme_ctrlr);
2544 		break;
2545 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
2546 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
2547 		break;
2548 	default:
2549 		SPDK_ERRLOG("An invalid timeout action value is found.\n");
2550 		break;
2551 	}
2552 }
2553 
2554 static void
2555 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc)
2556 {
2557 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
2558 	struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx;
2559 
2560 	if (rc == 0) {
2561 		nvme_ns->probe_ctx = NULL;
2562 		pthread_mutex_lock(&nvme_ctrlr->mutex);
2563 		nvme_ctrlr->ref++;
2564 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2565 	} else {
2566 		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
2567 		free(nvme_ns);
2568 	}
2569 
2570 	if (ctx) {
2571 		ctx->populates_in_progress--;
2572 		if (ctx->populates_in_progress == 0) {
2573 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
2574 		}
2575 	}
2576 }
2577 
2578 static void
2579 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i)
2580 {
2581 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2582 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
2583 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2584 	int rc;
2585 
2586 	rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
2587 	if (rc != 0) {
2588 		SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n");
2589 	}
2590 
2591 	spdk_for_each_channel_continue(i, rc);
2592 }
2593 
2594 static void
2595 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i)
2596 {
2597 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2598 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
2599 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2600 	struct nvme_io_path *io_path;
2601 
2602 	io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns);
2603 	if (io_path != NULL) {
2604 		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
2605 	}
2606 
2607 	spdk_for_each_channel_continue(i, 0);
2608 }
2609 
2610 static void
2611 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status)
2612 {
2613 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2614 
2615 	nvme_ctrlr_populate_namespace_done(nvme_ns, -1);
2616 }
2617 
2618 static void
2619 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status)
2620 {
2621 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2622 	struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i);
2623 
2624 	if (status == 0) {
2625 		nvme_ctrlr_populate_namespace_done(nvme_ns, 0);
2626 	} else {
2627 		/* Delete the added io_paths and fail populating the namespace. */
2628 		spdk_for_each_channel(bdev,
2629 				      bdev_nvme_delete_io_path,
2630 				      nvme_ns,
2631 				      bdev_nvme_add_io_path_failed);
2632 	}
2633 }
2634 
2635 static int
2636 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns)
2637 {
2638 	struct nvme_ns *tmp_ns;
2639 	const struct spdk_nvme_ns_data *nsdata;
2640 
2641 	nsdata = spdk_nvme_ns_get_data(nvme_ns->ns);
2642 	if (!nsdata->nmic.can_share) {
2643 		SPDK_ERRLOG("Namespace cannot be shared.\n");
2644 		return -EINVAL;
2645 	}
2646 
2647 	pthread_mutex_lock(&bdev->mutex);
2648 
2649 	tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list);
2650 	assert(tmp_ns != NULL);
2651 
2652 	if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) {
2653 		pthread_mutex_unlock(&bdev->mutex);
2654 		SPDK_ERRLOG("Namespaces are not identical.\n");
2655 		return -EINVAL;
2656 	}
2657 
2658 	bdev->ref++;
2659 	TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
2660 	nvme_ns->bdev = bdev;
2661 
2662 	pthread_mutex_unlock(&bdev->mutex);
2663 
2664 	/* Add nvme_io_path to nvme_bdev_channels dynamically. */
2665 	spdk_for_each_channel(bdev,
2666 			      bdev_nvme_add_io_path,
2667 			      nvme_ns,
2668 			      bdev_nvme_add_io_path_done);
2669 
2670 	return 0;
2671 }
2672 
2673 static void
2674 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2675 {
2676 	struct spdk_nvme_ns	*ns;
2677 	struct nvme_bdev	*bdev;
2678 	int			rc = 0;
2679 
2680 	ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id);
2681 	if (!ns) {
2682 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
2683 		rc = -EINVAL;
2684 		goto done;
2685 	}
2686 
2687 	nvme_ns->ns = ns;
2688 	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
2689 
2690 	if (nvme_ctrlr->ana_log_page != NULL) {
2691 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
2692 	}
2693 
2694 	bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id);
2695 	if (bdev == NULL) {
2696 		rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
2697 	} else {
2698 		rc = nvme_bdev_add_ns(bdev, nvme_ns);
2699 		if (rc == 0) {
2700 			return;
2701 		}
2702 	}
2703 done:
2704 	nvme_ctrlr_populate_namespace_done(nvme_ns, rc);
2705 }
2706 
2707 static void
2708 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns)
2709 {
2710 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
2711 
2712 	assert(nvme_ctrlr != NULL);
2713 
2714 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2715 
2716 	RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
2717 
2718 	if (nvme_ns->bdev != NULL) {
2719 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2720 		return;
2721 	}
2722 
2723 	free(nvme_ns);
2724 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2725 
2726 	nvme_ctrlr_release(nvme_ctrlr);
2727 }
2728 
2729 static void
2730 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status)
2731 {
2732 	struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i);
2733 
2734 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
2735 }
2736 
2737 static void
2738 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2739 {
2740 	struct nvme_bdev *bdev;
2741 
2742 	bdev = nvme_ns->bdev;
2743 	if (bdev != NULL) {
2744 		pthread_mutex_lock(&bdev->mutex);
2745 
2746 		assert(bdev->ref > 0);
2747 		bdev->ref--;
2748 		if (bdev->ref == 0) {
2749 			pthread_mutex_unlock(&bdev->mutex);
2750 
2751 			spdk_bdev_unregister(&bdev->disk, NULL, NULL);
2752 		} else {
2753 			/* spdk_bdev_unregister() is not called until the last nvme_ns is
2754 			 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list
2755 			 * and clear nvme_ns->bdev here.
2756 			 */
2757 			TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq);
2758 			nvme_ns->bdev = NULL;
2759 
2760 			pthread_mutex_unlock(&bdev->mutex);
2761 
2762 			/* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that,
2763 			 * we call depopulate_namespace_done() to avoid use-after-free.
2764 			 */
2765 			spdk_for_each_channel(bdev,
2766 					      bdev_nvme_delete_io_path,
2767 					      nvme_ns,
2768 					      bdev_nvme_delete_io_path_done);
2769 			return;
2770 		}
2771 	}
2772 
2773 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
2774 }
2775 
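/* Reconcile the module's view of the controller's namespaces with the
 * currently active namespaces: resize bdevs whose capacity changed, depopulate
 * namespaces that were removed, and populate newly reported ones.  If ctx is
 * non-NULL, populates_in_progress tracks when the whole pass has completed.
 */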
2776 static void
2777 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
2778 			       struct nvme_async_probe_ctx *ctx)
2779 {
2780 	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
2781 	struct nvme_ns	*nvme_ns, *next;
2782 	struct spdk_nvme_ns	*ns;
2783 	struct nvme_bdev	*bdev;
2784 	uint32_t		nsid;
2785 	int			rc;
2786 	uint64_t		num_sectors;
2787 
2788 	if (ctx) {
2789 		/* Initialize this count to 1 to handle the populate functions
2790 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
2791 		 */
2792 		ctx->populates_in_progress = 1;
2793 	}
2794 
2795 	/* First loop over our existing namespaces and see if they have been
2796 	 * removed. */
2797 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
2798 	while (nvme_ns != NULL) {
2799 		next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
2800 
2801 		if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
2802 			/* NS is still there but attributes may have changed */
2803 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
2804 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
2805 			bdev = nvme_ns->bdev;
2806 			assert(bdev != NULL);
2807 			if (bdev->disk.blockcnt != num_sectors) {
2808 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
2809 					       nvme_ns->id,
2810 					       bdev->disk.name,
2811 					       bdev->disk.blockcnt,
2812 					       num_sectors);
2813 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
2814 				if (rc != 0) {
2815 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
2816 						    bdev->disk.name, rc);
2817 				}
2818 			}
2819 		} else {
2820 			/* Namespace was removed */
2821 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
2822 		}
2823 
2824 		nvme_ns = next;
2825 	}
2826 
2827 	/* Loop through all of the namespaces at the nvme level and see if any of them are new */
2828 	nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
2829 	while (nsid != 0) {
2830 		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
2831 
2832 		if (nvme_ns == NULL) {
2833 			/* Found a new one */
2834 			nvme_ns = calloc(1, sizeof(struct nvme_ns));
2835 			if (nvme_ns == NULL) {
2836 				SPDK_ERRLOG("Failed to allocate namespace\n");
2837 				/* This just fails to attach the namespace. It may work on a future attempt.
				 * Advance to the next active namespace so the loop does not spin on the
				 * same NSID while the allocation keeps failing.
				 */
				nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
2838 				continue;
2839 			}
2840 
2841 			nvme_ns->id = nsid;
2842 			nvme_ns->ctrlr = nvme_ctrlr;
2843 
2844 			nvme_ns->bdev = NULL;
2845 
2846 			if (ctx) {
2847 				ctx->populates_in_progress++;
2848 			}
2849 			nvme_ns->probe_ctx = ctx;
2850 
2851 			RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
2852 
2853 			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns);
2854 		}
2855 
2856 		nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
2857 	}
2858 
2859 	if (ctx) {
2860 		/* Decrement this count now that the loop is over to account
2861 		 * for the one we started with.  If the count is then 0, we
2862 		 * know any populate_namespace functions completed immediately,
2863 		 * so we'll kick the callback here.
2864 		 */
2865 		ctx->populates_in_progress--;
2866 		if (ctx->populates_in_progress == 0) {
2867 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
2868 		}
2869 	}
2870 
2871 }
2872 
2873 static void
2874 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
2875 {
2876 	struct nvme_ns *nvme_ns, *tmp;
2877 
2878 	RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) {
2879 		nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
2880 	}
2881 }
2882 
2883 static int
2884 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
2885 			  void *cb_arg)
2886 {
2887 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
2888 	struct nvme_ns *nvme_ns;
2889 	uint32_t i, nsid;
2890 
2891 	for (i = 0; i < desc->num_of_nsid; i++) {
2892 		nsid = desc->nsid[i];
2893 		if (nsid == 0) {
2894 			continue;
2895 		}
2896 
2897 		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
2898 
2899 		assert(nvme_ns != NULL);
2900 		if (nvme_ns == NULL) {
2901 			/* Target told us that an inactive namespace had an ANA change */
2902 			continue;
2903 		}
2904 
2905 		nvme_ns->ana_group_id = desc->ana_group_id;
2906 		nvme_ns->ana_state = desc->ana_state;
2907 		nvme_ns->ana_state_updating = false;
2908 	}
2909 
2910 	return 0;
2911 }
2912 
2913 static void
2914 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
2915 {
2916 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2917 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
2918 
2919 	_bdev_nvme_clear_io_path_cache(ctrlr_ch);
2920 
2921 	spdk_for_each_channel_continue(i, 0);
2922 }
2923 
2924 static void
2925 bdev_nvme_clear_io_path_cache_done(struct spdk_io_channel_iter *i, int status)
2926 {
2927 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
2928 
2929 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2930 
2931 	assert(nvme_ctrlr->ana_log_page_updating == true);
2932 	nvme_ctrlr->ana_log_page_updating = false;
2933 
2934 	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
2935 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2936 		return;
2937 	}
2938 
2939 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2940 
2941 	nvme_ctrlr_unregister(nvme_ctrlr);
2942 }
2943 
2944 static void
2945 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
2946 {
2947 	struct nvme_ctrlr *nvme_ctrlr = ctx;
2948 
2949 	if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) {
2950 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
2951 					     nvme_ctrlr);
2952 	}
2953 
2954 	spdk_for_each_channel(nvme_ctrlr,
2955 			      bdev_nvme_clear_io_path_cache,
2956 			      NULL,
2957 			      bdev_nvme_clear_io_path_cache_done);
2958 }
2959 
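/* Re-read the ANA log page asynchronously (e.g. after an ANA change AER) and
 * then clear the cached I/O path on every controller channel so that path
 * selection picks up the new ANA states.
 */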
2960 static void
2961 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
2962 {
2963 	int rc;
2964 
2965 	if (nvme_ctrlr->ana_log_page == NULL) {
2966 		return;
2967 	}
2968 
2969 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2970 	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
2971 	    nvme_ctrlr->ana_log_page_updating) {
2972 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2973 		return;
2974 	}
2975 
2976 	nvme_ctrlr->ana_log_page_updating = true;
2977 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2978 
2979 	rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
2980 					      SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
2981 					      SPDK_NVME_GLOBAL_NS_TAG,
2982 					      nvme_ctrlr->ana_log_page,
2983 					      nvme_ctrlr->ana_log_page_size, 0,
2984 					      nvme_ctrlr_read_ana_log_page_done,
2985 					      nvme_ctrlr);
2986 	if (rc != 0) {
2987 		nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL);
2988 	}
2989 }
2990 
2991 static void
2992 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
2993 {
2994 	struct nvme_ctrlr *nvme_ctrlr		= arg;
2995 	union spdk_nvme_async_event_completion	event;
2996 
2997 	if (spdk_nvme_cpl_is_error(cpl)) {
2998 		SPDK_WARNLOG("AER request execution failed\n");
2999 		return;
3000 	}
3001 
3002 	event.raw = cpl->cdw0;
3003 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
3004 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
3005 		nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
3006 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
3007 		   (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
3008 		nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
3009 	}
3010 }
3011 
3012 static void
3013 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
3014 {
3015 	if (ctx->cb_fn) {
3016 		ctx->cb_fn(ctx->cb_ctx, count, rc);
3017 	}
3018 
3019 	ctx->namespaces_populated = true;
3020 	if (ctx->probe_done) {
3021 		/* The probe was already completed, so we need to free the context
3022 		 * here.  This can happen for cases like OCSSD, where we need to
3023 		 * send additional commands to the SSD after attach.
3024 		 */
3025 		free(ctx);
3026 	}
3027 }
3028 
3029 static void
3030 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
3031 		       struct nvme_async_probe_ctx *ctx)
3032 {
3033 	spdk_io_device_register(nvme_ctrlr,
3034 				bdev_nvme_create_ctrlr_channel_cb,
3035 				bdev_nvme_destroy_ctrlr_channel_cb,
3036 				sizeof(struct nvme_ctrlr_channel),
3037 				nvme_ctrlr->nbdev_ctrlr->name);
3038 
3039 	nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
3040 }
3041 
3042 static void
3043 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
3044 {
3045 	struct nvme_ctrlr *nvme_ctrlr = _ctx;
3046 	struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
3047 
3048 	nvme_ctrlr->probe_ctx = NULL;
3049 
3050 	if (spdk_nvme_cpl_is_error(cpl)) {
3051 		nvme_ctrlr_delete(nvme_ctrlr);
3052 
3053 		if (ctx != NULL) {
3054 			populate_namespaces_cb(ctx, 0, -1);
3055 		}
3056 		return;
3057 	}
3058 
3059 	nvme_ctrlr_create_done(nvme_ctrlr, ctx);
3060 }
3061 
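/* Allocate buffers for the ANA log page (sized for nanagrpid group descriptors
 * plus one NSID entry per namespace) and issue the initial Get Log Page command.
 * nvme_ctrlr_init_ana_log_page_done() completes controller creation.
 */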
3062 static int
3063 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
3064 			     struct nvme_async_probe_ctx *ctx)
3065 {
3066 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
3067 	const struct spdk_nvme_ctrlr_data *cdata;
3068 	uint32_t ana_log_page_size;
3069 
3070 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3071 
3072 	ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
3073 			    sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn *
3074 			    sizeof(uint32_t);
3075 
3076 	nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
3077 						SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3078 	if (nvme_ctrlr->ana_log_page == NULL) {
3079 		SPDK_ERRLOG("could not allocate ANA log page buffer\n");
3080 		return -ENXIO;
3081 	}
3082 
3083 	/* Each descriptor in an ANA log page is not guaranteed to be 8-byte aligned.
3084 	 * Hence copy each descriptor to a temporary area when parsing it.
3085 	 *
3086 	 * Allocate a buffer whose size is as large as ANA log page buffer because
3087 	 * we do not know the size of a descriptor until actually reading it.
3088 	 */
3089 	nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
3090 	if (nvme_ctrlr->copied_ana_desc == NULL) {
3091 		SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
3092 		return -ENOMEM;
3093 	}
3094 
3095 	nvme_ctrlr->ana_log_page_size = ana_log_page_size;
3096 
3097 	nvme_ctrlr->probe_ctx = ctx;
3098 
3099 	return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
3100 						SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
3101 						SPDK_NVME_GLOBAL_NS_TAG,
3102 						nvme_ctrlr->ana_log_page,
3103 						nvme_ctrlr->ana_log_page_size, 0,
3104 						nvme_ctrlr_init_ana_log_page_done,
3105 						nvme_ctrlr);
3106 }
3107 
3108 /* hostnqn and subnqn were already verified before attaching a controller.
3109  * Hence check only the multipath capability and cntlid here.
3110  */
3111 static bool
3112 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
3113 {
3114 	struct nvme_ctrlr *tmp;
3115 	const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;
3116 
3117 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3118 
3119 	if (!cdata->cmic.multi_ctrlr) {
3120 		SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
3121 		return false;
3122 	}
3123 
3124 	TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
3125 		tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);
3126 
3127 		if (!tmp_cdata->cmic.multi_ctrlr) {
3128 			SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", tmp_cdata->cntlid);
3129 			return false;
3130 		}
3131 		if (cdata->cntlid == tmp_cdata->cntlid) {
3132 			SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid);
3133 			return false;
3134 		}
3135 	}
3136 
3137 	return true;
3138 }
3139 
3140 static int
3141 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
3142 {
3143 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
3144 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
3145 	int rc = 0;
3146 
3147 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3148 
3149 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
3150 	if (nbdev_ctrlr != NULL) {
3151 		if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
3152 			rc = -EINVAL;
3153 			goto exit;
3154 		}
3155 	} else {
3156 		nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
3157 		if (nbdev_ctrlr == NULL) {
3158 			SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
3159 			rc = -ENOMEM;
3160 			goto exit;
3161 		}
3162 		nbdev_ctrlr->name = strdup(name);
3163 		if (nbdev_ctrlr->name == NULL) {
3164 			SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
3165 			free(nbdev_ctrlr);
			rc = -ENOMEM;
3166 			goto exit;
3167 		}
3168 		TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
3169 		TAILQ_INIT(&nbdev_ctrlr->bdevs);
3170 		TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
3171 	}
3172 	nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
3173 	TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
3174 exit:
3175 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3176 	return rc;
3177 }
3178 
3179 static int
3180 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
3181 		  const char *name,
3182 		  const struct spdk_nvme_transport_id *trid,
3183 		  struct nvme_async_probe_ctx *ctx)
3184 {
3185 	struct nvme_ctrlr *nvme_ctrlr;
3186 	struct nvme_path_id *path_id;
3187 	const struct spdk_nvme_ctrlr_data *cdata;
3188 	int rc;
3189 
3190 	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
3191 	if (nvme_ctrlr == NULL) {
3192 		SPDK_ERRLOG("Failed to allocate device struct\n");
3193 		return -ENOMEM;
3194 	}
3195 
3196 	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
3197 	if (rc != 0) {
3198 		free(nvme_ctrlr);
3199 		return rc;
3200 	}
3201 
3202 	TAILQ_INIT(&nvme_ctrlr->trids);
3203 
3204 	RB_INIT(&nvme_ctrlr->namespaces);
3205 
3206 	path_id = calloc(1, sizeof(*path_id));
3207 	if (path_id == NULL) {
3208 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
3209 		rc = -ENOMEM;
3210 		goto err;
3211 	}
3212 
3213 	path_id->trid = *trid;
3214 	if (ctx != NULL) {
3215 		memcpy(path_id->hostid.hostaddr, ctx->opts.src_addr, sizeof(path_id->hostid.hostaddr));
3216 		memcpy(path_id->hostid.hostsvcid, ctx->opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
3217 	}
3218 	nvme_ctrlr->active_path_id = path_id;
3219 	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);
3220 
3221 	nvme_ctrlr->thread = spdk_get_thread();
3222 	nvme_ctrlr->ctrlr = ctrlr;
3223 	nvme_ctrlr->ref = 1;
3224 
3225 	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
3226 		SPDK_ERRLOG("OCSSDs are not supported\n");
3227 		rc = -ENOTSUP;
3228 		goto err;
3229 	}
3230 
3231 	if (ctx != NULL) {
3232 		nvme_ctrlr->prchk_flags = ctx->prchk_flags;
3233 	}
3234 
3235 	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
3236 					  g_opts.nvme_adminq_poll_period_us);
3237 
3238 	if (g_opts.timeout_us > 0) {
3239 		/* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
3240 		/* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
3241 		uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
3242 					  g_opts.timeout_us : g_opts.timeout_admin_us;
3243 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
3244 				adm_timeout_us, timeout_cb, nvme_ctrlr);
3245 	}
3246 
3247 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
3248 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
3249 
3250 	if (spdk_nvme_ctrlr_get_flags(ctrlr) &
3251 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
3252 		nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
3253 	}
3254 
3255 	rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr);
3256 	if (rc != 0) {
3257 		goto err;
3258 	}
3259 
3260 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3261 
3262 	if (cdata->cmic.ana_reporting) {
3263 		rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
3264 		if (rc == 0) {
3265 			return 0;
3266 		}
3267 	} else {
3268 		nvme_ctrlr_create_done(nvme_ctrlr, ctx);
3269 		return 0;
3270 	}
3271 
3272 err:
3273 	nvme_ctrlr_delete(nvme_ctrlr);
3274 	return rc;
3275 }
3276 
3277 static void
3278 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3279 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
3280 {
3281 	char *name;
3282 
3283 	name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
3284 	if (!name) {
3285 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
3286 		return;
3287 	}
3288 
3289 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
3290 
3291 	nvme_ctrlr_create(ctrlr, name, trid, NULL);
3292 
3293 	free(name);
3294 }
3295 
3296 static void
3297 _nvme_ctrlr_destruct(void *ctx)
3298 {
3299 	struct nvme_ctrlr *nvme_ctrlr = ctx;
3300 
3301 	nvme_ctrlr_depopulate_namespaces(nvme_ctrlr);
3302 	nvme_ctrlr_release(nvme_ctrlr);
3303 }
3304 
3305 static int
3306 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
3307 {
3308 	struct nvme_probe_skip_entry *entry;
3309 
3310 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3311 
3312 	/* The controller's destruction was already started */
3313 	if (nvme_ctrlr->destruct) {
3314 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
3315 		return 0;
3316 	}
3317 
3318 	if (!hotplug &&
3319 	    nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
3320 		entry = calloc(1, sizeof(*entry));
3321 		if (!entry) {
3322 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
3323 			return -ENOMEM;
3324 		}
3325 		entry->trid = nvme_ctrlr->active_path_id->trid;
3326 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
3327 	}
3328 
3329 	nvme_ctrlr->destruct = true;
3330 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3331 
3332 	_nvme_ctrlr_destruct(nvme_ctrlr);
3333 
3334 	return 0;
3335 }
3336 
3337 static void
3338 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
3339 {
3340 	struct nvme_ctrlr *nvme_ctrlr = cb_ctx;
3341 
3342 	_bdev_nvme_delete(nvme_ctrlr, true);
3343 }
3344 
3345 static int
3346 bdev_nvme_hotplug_probe(void *arg)
3347 {
3348 	if (g_hotplug_probe_ctx == NULL) {
3349 		spdk_poller_unregister(&g_hotplug_probe_poller);
3350 		return SPDK_POLLER_IDLE;
3351 	}
3352 
3353 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
3354 		g_hotplug_probe_ctx = NULL;
3355 		spdk_poller_unregister(&g_hotplug_probe_poller);
3356 	}
3357 
3358 	return SPDK_POLLER_BUSY;
3359 }
3360 
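/* Periodic hotplug poller: if no probe is in progress, start an asynchronous
 * PCIe probe.  Newly found controllers are attached through attach_cb and
 * controllers on the skip list are filtered out by hotplug_probe_cb.
 */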
3361 static int
3362 bdev_nvme_hotplug(void *arg)
3363 {
3364 	struct spdk_nvme_transport_id trid_pcie;
3365 
3366 	if (g_hotplug_probe_ctx) {
3367 		return SPDK_POLLER_BUSY;
3368 	}
3369 
3370 	memset(&trid_pcie, 0, sizeof(trid_pcie));
3371 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
3372 
3373 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
3374 			      hotplug_probe_cb, attach_cb, NULL);
3375 
3376 	if (g_hotplug_probe_ctx) {
3377 		assert(g_hotplug_probe_poller == NULL);
3378 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
3379 	}
3380 
3381 	return SPDK_POLLER_BUSY;
3382 }
3383 
3384 void
3385 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
3386 {
3387 	*opts = g_opts;
3388 }
3389 
3390 static int
3391 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
3392 {
3393 	if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
3394 		/* Can't set timeout_admin_us without also setting timeout_us */
3395 		SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
3396 		return -EINVAL;
3397 	}
3398 
3399 	if (opts->bdev_retry_count < -1) {
3400 		SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n");
3401 		return -EINVAL;
3402 	}
3403 
3404 	return 0;
3405 }
3406 
3407 int
3408 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
3409 {
3410 	int ret = bdev_nvme_validate_opts(opts);
3411 	if (ret) {
3412 		SPDK_WARNLOG("Failed to set nvme opts.\n");
3413 		return ret;
3414 	}
3415 
3416 	if (g_bdev_nvme_init_thread != NULL) {
3417 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
3418 			return -EPERM;
3419 		}
3420 	}
3421 
3422 	g_opts = *opts;
3423 
3424 	return 0;
3425 }
3426 
3427 struct set_nvme_hotplug_ctx {
3428 	uint64_t period_us;
3429 	bool enabled;
3430 	spdk_msg_fn fn;
3431 	void *fn_ctx;
3432 };
3433 
3434 static void
3435 set_nvme_hotplug_period_cb(void *_ctx)
3436 {
3437 	struct set_nvme_hotplug_ctx *ctx = _ctx;
3438 
3439 	spdk_poller_unregister(&g_hotplug_poller);
3440 	if (ctx->enabled) {
3441 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
3442 	}
3443 
3444 	g_nvme_hotplug_poll_period_us = ctx->period_us;
3445 	g_nvme_hotplug_enabled = ctx->enabled;
3446 	if (ctx->fn) {
3447 		ctx->fn(ctx->fn_ctx);
3448 	}
3449 
3450 	free(ctx);
3451 }
3452 
3453 int
3454 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
3455 {
3456 	struct set_nvme_hotplug_ctx *ctx;
3457 
3458 	if (enabled == true && !spdk_process_is_primary()) {
3459 		return -EPERM;
3460 	}
3461 
3462 	ctx = calloc(1, sizeof(*ctx));
3463 	if (ctx == NULL) {
3464 		return -ENOMEM;
3465 	}
3466 
3467 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
3468 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
3469 	ctx->enabled = enabled;
3470 	ctx->fn = cb;
3471 	ctx->fn_ctx = cb_ctx;
3472 
3473 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
3474 	return 0;
3475 }
3476 
3477 static void
3478 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
3479 				    struct nvme_async_probe_ctx *ctx)
3480 {
3481 	struct nvme_ns	*nvme_ns;
3482 	struct nvme_bdev	*nvme_bdev;
3483 	size_t			j;
3484 
3485 	assert(nvme_ctrlr != NULL);
3486 
3487 	/*
3488 	 * Report the new bdevs that were created in this call.
3489 	 * There can be more than one bdev per NVMe controller.
3490 	 */
3491 	j = 0;
3492 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
3493 	while (nvme_ns != NULL) {
3494 		nvme_bdev = nvme_ns->bdev;
3495 		if (j < ctx->count) {
3496 			ctx->names[j] = nvme_bdev->disk.name;
3497 			j++;
3498 		} else {
3499 			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
3500 				    ctx->count);
3501 			populate_namespaces_cb(ctx, 0, -ERANGE);
3502 			return;
3503 		}
3504 
3505 		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
3506 	}
3507 
3508 	populate_namespaces_cb(ctx, j, 0);
3509 }
3510 
3511 static int
3512 bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr,
3513 			struct spdk_nvme_ctrlr *new_ctrlr,
3514 			struct spdk_nvme_transport_id *trid)
3515 {
3516 	struct nvme_path_id *tmp_trid;
3517 
3518 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
3519 		SPDK_ERRLOG("PCIe failover is not supported.\n");
3520 		return -ENOTSUP;
3521 	}
3522 
3523 	/* Currently we only support failover to the same transport type. */
3524 	if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
3525 		return -EINVAL;
3526 	}
3527 
3528 	/* Currently we only support failover to the same NQN. */
3529 	if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
3530 		return -EINVAL;
3531 	}
3532 
3533 	/* Skip all the other checks if we've already registered this path. */
3534 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
3535 		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
3536 			return -EEXIST;
3537 		}
3538 	}
3539 
3540 	return 0;
3541 }
3542 
3543 static int
3544 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr,
3545 			     struct spdk_nvme_ctrlr *new_ctrlr)
3546 {
3547 	struct nvme_ns *nvme_ns;
3548 	struct spdk_nvme_ns *new_ns;
3549 
3550 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
3551 	while (nvme_ns != NULL) {
3552 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
3553 		assert(new_ns != NULL);
3554 
3555 		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
3556 			return -EINVAL;
3557 		}
3558 
3559 		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
3560 	}
3561 
3562 	return 0;
3563 }
3564 
3565 static int
3566 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
3567 			      struct spdk_nvme_transport_id *trid)
3568 {
3569 	struct nvme_path_id *new_trid, *tmp_trid;
3570 
3571 	new_trid = calloc(1, sizeof(*new_trid));
3572 	if (new_trid == NULL) {
3573 		return -ENOMEM;
3574 	}
3575 	new_trid->trid = *trid;
3576 	new_trid->is_failed = false;
3577 
3578 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
3579 		if (tmp_trid->is_failed) {
3580 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
3581 			return 0;
3582 		}
3583 	}
3584 
3585 	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
3586 	return 0;
3587 }
3588 
3589 /* This is the case that a secondary path is added to an existing
3590  * nvme_ctrlr for failover. After checking if it can access the same
3591  * namespaces as the primary path, it is disconnected until failover occurs.
3592  */
3593 static int
3594 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
3595 			     struct spdk_nvme_ctrlr *new_ctrlr,
3596 			     struct spdk_nvme_transport_id *trid)
3597 {
3598 	int rc;
3599 
3600 	assert(nvme_ctrlr != NULL);
3601 
3602 	pthread_mutex_lock(&nvme_ctrlr->mutex);
3603 
3604 	rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid);
3605 	if (rc != 0) {
3606 		goto exit;
3607 	}
3608 
3609 	rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr);
3610 	if (rc != 0) {
3611 		goto exit;
3612 	}
3613 
3614 	rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
3615 
3616 exit:
3617 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
3618 
3619 	spdk_nvme_detach(new_ctrlr);
3620 
3621 	return rc;
3622 }
3623 
3624 static void
3625 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3626 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
3627 {
3628 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
3629 	struct nvme_async_probe_ctx *ctx;
3630 	int rc;
3631 
3632 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
3633 	ctx->ctrlr_attached = true;
3634 
3635 	rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx);
3636 	if (rc != 0) {
3637 		populate_namespaces_cb(ctx, 0, rc);
3638 	}
3639 }
3640 
3641 static void
3642 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
3643 			struct spdk_nvme_ctrlr *ctrlr,
3644 			const struct spdk_nvme_ctrlr_opts *opts)
3645 {
3646 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
3647 	struct nvme_ctrlr *nvme_ctrlr;
3648 	struct nvme_async_probe_ctx *ctx;
3649 	int rc;
3650 
3651 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
3652 	ctx->ctrlr_attached = true;
3653 
3654 	nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
3655 	if (nvme_ctrlr) {
3656 		rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
3657 	} else {
3658 		rc = -ENODEV;
3659 	}
3660 
3661 	populate_namespaces_cb(ctx, 0, rc);
3662 }
3663 
3664 static int
3665 bdev_nvme_async_poll(void *arg)
3666 {
3667 	struct nvme_async_probe_ctx	*ctx = arg;
3668 	int				rc;
3669 
3670 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
3671 	if (spdk_unlikely(rc != -EAGAIN)) {
3672 		ctx->probe_done = true;
3673 		spdk_poller_unregister(&ctx->poller);
3674 		if (!ctx->ctrlr_attached) {
3675 			/* The probe is done, but no controller was attached.
3676 			 * That means we had a failure, so report -EIO back to
3677 			 * the caller (usually the RPC). populate_namespaces_cb()
3678 			 * will take care of freeing the nvme_async_probe_ctx.
3679 			 */
3680 			populate_namespaces_cb(ctx, 0, -EIO);
3681 		} else if (ctx->namespaces_populated) {
3682 			/* The namespaces for the attached controller were all
3683 			 * populated and the response was already sent to the
3684 			 * caller (usually the RPC).  So free the context here.
3685 			 */
3686 			free(ctx);
3687 		}
3688 	}
3689 
3690 	return SPDK_POLLER_BUSY;
3691 }
3692 
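/* Start an asynchronous attach of the controller described by trid and create
 * NVMe bdevs for its namespaces. cb_fn is invoked once namespace population
 * completes (see populate_namespaces_cb()). Illustrative call with hypothetical
 * values (not taken from this file), attaching a PCIe controller as "Nvme0":
 *
 *	struct spdk_nvme_transport_id trid = {};
 *	const char *names[32];
 *
 *	spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
 *	snprintf(trid.traddr, sizeof(trid.traddr), "%s", "0000:00:04.0");
 *	rc = bdev_nvme_create(&trid, "Nvme0", names, 32, 0, cb_fn, cb_ctx, NULL, false);
 */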
3693 int
3694 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
3695 		 const char *base_name,
3696 		 const char **names,
3697 		 uint32_t count,
3698 		 uint32_t prchk_flags,
3699 		 spdk_bdev_create_nvme_fn cb_fn,
3700 		 void *cb_ctx,
3701 		 struct spdk_nvme_ctrlr_opts *opts,
3702 		 bool multipath)
3703 {
3704 	struct nvme_probe_skip_entry	*entry, *tmp;
3705 	struct nvme_async_probe_ctx	*ctx;
3706 	spdk_nvme_attach_cb attach_cb;
3707 
3708 	/* TODO expand this check to include both the host and target TRIDs.
3709 	 * Only if both are the same should we fail.
3710 	 */
3711 	if (nvme_ctrlr_get(trid) != NULL) {
3712 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
3713 		return -EEXIST;
3714 	}
3715 
3716 	ctx = calloc(1, sizeof(*ctx));
3717 	if (!ctx) {
3718 		return -ENOMEM;
3719 	}
3720 	ctx->base_name = base_name;
3721 	ctx->names = names;
3722 	ctx->count = count;
3723 	ctx->cb_fn = cb_fn;
3724 	ctx->cb_ctx = cb_ctx;
3725 	ctx->prchk_flags = prchk_flags;
3726 	ctx->trid = *trid;
3727 
3728 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
3729 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
3730 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
3731 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
3732 				free(entry);
3733 				break;
3734 			}
3735 		}
3736 	}
3737 
3738 	if (opts) {
3739 		memcpy(&ctx->opts, opts, sizeof(*opts));
3740 	} else {
3741 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
3742 	}
3743 
3744 	ctx->opts.transport_retry_count = g_opts.transport_retry_count;
3745 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
3746 	ctx->opts.disable_read_ana_log_page = true;
3747 
3748 	if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) {
3749 		attach_cb = connect_attach_cb;
3750 	} else {
3751 		attach_cb = connect_set_failover_cb;
3752 	}
3753 
3754 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, attach_cb);
3755 	if (ctx->probe_ctx == NULL) {
3756 		SPDK_ERRLOG("No controller was found with the provided trid (traddr: %s)\n", trid->traddr);
3757 		free(ctx);
3758 		return -ENODEV;
3759 	}
3760 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
3761 
3762 	return 0;
3763 }
3764 
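/* Remove the paths of the named NVMe bdev controller that match path_id. A zeroed
 * field in path_id acts as a wildcard, so an all-zero path_id matches every path.
 * Removing the active path triggers a failover if an alternative path exists, or
 * deletes the nvme_ctrlr otherwise. Illustrative call (hypothetical), removing
 * every path of "Nvme0":
 *
 *	struct nvme_path_id path_id = {};
 *
 *	rc = bdev_nvme_delete("Nvme0", &path_id);
 */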
3765 int
3766 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id)
3767 {
3768 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
3769 	struct nvme_ctrlr	*nvme_ctrlr, *tmp_nvme_ctrlr;
3770 	struct nvme_path_id	*p, *t;
3771 	int			rc = -ENXIO;
3772 
3773 	if (name == NULL || path_id == NULL) {
3774 		return -EINVAL;
3775 	}
3776 
3777 	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
3778 	if (nbdev_ctrlr == NULL) {
3779 		SPDK_ERRLOG("Failed to find NVMe bdev controller\n");
3780 		return -ENODEV;
3781 	}
3782 
3783 	TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) {
3784 		TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) {
3785 			if (path_id->trid.trtype != 0) {
3786 				if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) {
3787 					if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) {
3788 						continue;
3789 					}
3790 				} else {
3791 					if (path_id->trid.trtype != p->trid.trtype) {
3792 						continue;
3793 					}
3794 				}
3795 			}
3796 
3797 			if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) {
3798 				if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) {
3799 					continue;
3800 				}
3801 			}
3802 
3803 			if (path_id->trid.adrfam != 0) {
3804 				if (path_id->trid.adrfam != p->trid.adrfam) {
3805 					continue;
3806 				}
3807 			}
3808 
3809 			if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) {
3810 				if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) {
3811 					continue;
3812 				}
3813 			}
3814 
3815 			if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) {
3816 				if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) {
3817 					continue;
3818 				}
3819 			}
3820 
3821 			if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) {
3822 				if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) {
3823 					continue;
3824 				}
3825 			}
3826 
3827 			if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) {
3828 				if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) {
3829 					continue;
3830 				}
3831 			}
3832 
3833 			/* If we made it here, then this path is a match! Now we need to remove it. */
3834 			if (p == nvme_ctrlr->active_path_id) {
3835 				/* This is the active path in use right now. The active path is always the first in the list. */
3836 
3837 				if (!TAILQ_NEXT(p, link)) {
3838 					/* The current path is the only path. */
3839 					rc = _bdev_nvme_delete(nvme_ctrlr, false);
3840 				} else {
3841 					/* There is an alternative path. */
3842 					rc = bdev_nvme_failover(nvme_ctrlr, true);
3843 				}
3844 			} else {
3845 				/* We are not using the specified path. */
3846 				TAILQ_REMOVE(&nvme_ctrlr->trids, p, link);
3847 				free(p);
3848 				rc = 0;
3849 			}
3850 
3851 			if (rc < 0 && rc != -ENXIO) {
3852 				return rc;
3853 			}
3854 
3855 
3856 		}
3857 	}
3858 
3859 	/* Either all nvme_ctrlrs were deleted, or no nvme_ctrlr that had the trid was found. */
3860 	return rc;
3861 }
3862 
3863 static int
3864 bdev_nvme_library_init(void)
3865 {
3866 	g_bdev_nvme_init_thread = spdk_get_thread();
3867 
3868 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb,
3869 				bdev_nvme_destroy_poll_group_cb,
3870 				sizeof(struct nvme_poll_group),  "nvme_poll_groups");
3871 
3872 	return 0;
3873 }
3874 
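/* Module shutdown: stop the hotplug poller, drop the skipped-controller list, and
 * start destruction of every nvme_ctrlr that is not already being destructed. The
 * io_device is unregistered immediately if no controllers remain.
 */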
3875 static void
3876 bdev_nvme_library_fini(void)
3877 {
3878 	struct nvme_bdev_ctrlr *nbdev_ctrlr;
3879 	struct nvme_ctrlr *nvme_ctrlr;
3880 	struct nvme_probe_skip_entry *entry, *entry_tmp;
3881 
3882 	spdk_poller_unregister(&g_hotplug_poller);
3883 	free(g_hotplug_probe_ctx);
3884 	g_hotplug_probe_ctx = NULL;
3885 
3886 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
3887 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
3888 		free(entry);
3889 	}
3890 
3891 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3892 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
3893 		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
3894 			pthread_mutex_lock(&nvme_ctrlr->mutex);
3895 			if (nvme_ctrlr->destruct) {
3896 				/* This controller's destruction was already started
3897 				 * before the application started shutting down.
3898 				 */
3899 				pthread_mutex_unlock(&nvme_ctrlr->mutex);
3900 				continue;
3901 			}
3902 			nvme_ctrlr->destruct = true;
3903 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
3904 
3905 			spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
3906 					     nvme_ctrlr);
3907 		}
3908 	}
3909 
3910 	g_bdev_nvme_module_finish = true;
3911 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
3912 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
3913 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
3914 		spdk_bdev_module_fini_done();
3915 		return;
3916 	}
3917 
3918 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3919 }
3920 
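/* The controller reported a PI error for this I/O. Re-run software DIF/DIX
 * verification over the data (and metadata) buffers to locate and log the
 * offending block, or report that no error could be found.
 */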
3921 static void
3922 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
3923 {
3924 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3925 	struct spdk_bdev *bdev = bdev_io->bdev;
3926 	struct spdk_dif_ctx dif_ctx;
3927 	struct spdk_dif_error err_blk = {};
3928 	int rc;
3929 
3930 	rc = spdk_dif_ctx_init(&dif_ctx,
3931 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
3932 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
3933 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
3934 	if (rc != 0) {
3935 		SPDK_ERRLOG("Initialization of DIF context failed\n");
3936 		return;
3937 	}
3938 
3939 	if (bdev->md_interleave) {
3940 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
3941 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
3942 	} else {
3943 		struct iovec md_iov = {
3944 			.iov_base	= bdev_io->u.bdev.md_buf,
3945 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
3946 		};
3947 
3948 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
3949 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
3950 	}
3951 
3952 	if (rc != 0) {
3953 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
3954 			    err_blk.err_type, err_blk.err_offset);
3955 	} else {
3956 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
3957 	}
3958 }
3959 
3960 static void
3961 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
3962 {
3963 	struct nvme_bdev_io *bio = ref;
3964 
3965 	if (spdk_nvme_cpl_is_success(cpl)) {
3966 		/* Run PI verification for read data buffer. */
3967 		bdev_nvme_verify_pi_error(bio);
3968 	}
3969 
3970 	/* Return original completion status */
3971 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
3972 }
3973 
3974 static void
3975 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
3976 {
3977 	struct nvme_bdev_io *bio = ref;
3978 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3979 	int ret;
3980 
3981 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
3982 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
3983 			    cpl->status.sct, cpl->status.sc);
3984 
3985 		/* Save completion status to use after verifying PI error. */
3986 		bio->cpl = *cpl;
3987 
3988 		if (spdk_likely(nvme_io_path_is_available(bio->io_path))) {
3989 			/* Read without PI checking to verify PI error. */
3990 			ret = bdev_nvme_no_pi_readv(bio,
3991 						    bdev_io->u.bdev.iovs,
3992 						    bdev_io->u.bdev.iovcnt,
3993 						    bdev_io->u.bdev.md_buf,
3994 						    bdev_io->u.bdev.num_blocks,
3995 						    bdev_io->u.bdev.offset_blocks);
3996 			if (ret == 0) {
3997 				return;
3998 			}
3999 		}
4000 	}
4001 
4002 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4003 }
4004 
4005 static void
4006 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
4007 {
4008 	struct nvme_bdev_io *bio = ref;
4009 
4010 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
4011 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
4012 			    cpl->status.sct, cpl->status.sc);
4013 		/* Run PI verification for write data buffer if PI error is detected. */
4014 		bdev_nvme_verify_pi_error(bio);
4015 	}
4016 
4017 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4018 }
4019 
4020 static void
4021 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
4022 {
4023 	struct nvme_bdev_io *bio = ref;
4024 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4025 
4026 	/* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
4027 	 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
4028 	 */
4029 	bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
4030 
4031 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
4032 		SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
4033 			    cpl->status.sct, cpl->status.sc);
4034 		/* Run PI verification for zone append data buffer if PI error is detected. */
4035 		bdev_nvme_verify_pi_error(bio);
4036 	}
4037 
4038 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4039 }
4040 
4041 static void
4042 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
4043 {
4044 	struct nvme_bdev_io *bio = ref;
4045 
4046 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
4047 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
4048 			    cpl->status.sct, cpl->status.sc);
4049 		/* Run PI verification for compare data buffer if PI error is detected. */
4050 		bdev_nvme_verify_pi_error(bio);
4051 	}
4052 
4053 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4054 }
4055 
4056 static void
4057 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
4058 {
4059 	struct nvme_bdev_io *bio = ref;
4060 
4061 	/* Compare operation completion */
4062 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
4063 		/* Save compare result for write callback */
4064 		bio->cpl = *cpl;
4065 		return;
4066 	}
4067 
4068 	/* Write operation completion */
4069 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
4070 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
4071 		 * complete the IO with the compare operation's status.
4072 		 */
4073 		if (!spdk_nvme_cpl_is_error(cpl)) {
4074 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
4075 		}
4076 
4077 		bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
4078 	} else {
4079 		bdev_nvme_io_complete_nvme_status(bio, cpl);
4080 	}
4081 }
4082 
4083 static void
4084 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
4085 {
4086 	struct nvme_bdev_io *bio = ref;
4087 
4088 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4089 }
4090 
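/* Translate one NVMe ZNS zone descriptor into the generic bdev zone info format. */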
4091 static int
4092 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
4093 {
4094 	switch (desc->zs) {
4095 	case SPDK_NVME_ZONE_STATE_EMPTY:
4096 		info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
4097 		break;
4098 	case SPDK_NVME_ZONE_STATE_IOPEN:
4099 		info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
4100 		break;
4101 	case SPDK_NVME_ZONE_STATE_EOPEN:
4102 		info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
4103 		break;
4104 	case SPDK_NVME_ZONE_STATE_CLOSED:
4105 		info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
4106 		break;
4107 	case SPDK_NVME_ZONE_STATE_RONLY:
4108 		info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
4109 		break;
4110 	case SPDK_NVME_ZONE_STATE_FULL:
4111 		info->state = SPDK_BDEV_ZONE_STATE_FULL;
4112 		break;
4113 	case SPDK_NVME_ZONE_STATE_OFFLINE:
4114 		info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
4115 		break;
4116 	default:
4117 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
4118 		return -EIO;
4119 	}
4120 
4121 	info->zone_id = desc->zslba;
4122 	info->write_pointer = desc->wp;
4123 	info->capacity = desc->zcap;
4124 
4125 	return 0;
4126 }
4127 
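/* Completion callback for spdk_nvme_zns_report_zones(). Copies the returned zone
 * descriptors into the caller's buffer and issues further report commands until
 * the requested number of zones has been filled in.
 */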
4128 static void
4129 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
4130 {
4131 	struct nvme_bdev_io *bio = ref;
4132 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4133 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
4134 	uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
4135 	struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
4136 	uint64_t max_zones_per_buf, i;
4137 	uint32_t zone_report_bufsize;
4138 	struct spdk_nvme_ns *ns;
4139 	struct spdk_nvme_qpair *qpair;
4140 	int ret;
4141 
4142 	if (spdk_nvme_cpl_is_error(cpl)) {
4143 		goto out_complete_io_nvme_cpl;
4144 	}
4145 
4146 	if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
4147 		ret = -ENXIO;
4148 		goto out_complete_io_ret;
4149 	}
4150 
4151 	ns = bio->io_path->nvme_ns->ns;
4152 	qpair = bio->io_path->ctrlr_ch->qpair;
4153 
4154 	zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
4155 	max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
4156 			    sizeof(bio->zone_report_buf->descs[0]);
4157 
4158 	if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
4159 		ret = -EINVAL;
4160 		goto out_complete_io_ret;
4161 	}
4162 
4163 	if (!bio->zone_report_buf->nr_zones) {
4164 		ret = -EINVAL;
4165 		goto out_complete_io_ret;
4166 	}
4167 
4168 	for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
4169 		ret = fill_zone_from_report(&info[bio->handled_zones],
4170 					    &bio->zone_report_buf->descs[i]);
4171 		if (ret) {
4172 			goto out_complete_io_ret;
4173 		}
4174 		bio->handled_zones++;
4175 	}
4176 
4177 	if (bio->handled_zones < zones_to_copy) {
4178 		uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
4179 		uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
4180 
4181 		memset(bio->zone_report_buf, 0, zone_report_bufsize);
4182 		ret = spdk_nvme_zns_report_zones(ns, qpair,
4183 						 bio->zone_report_buf, zone_report_bufsize,
4184 						 slba, SPDK_NVME_ZRA_LIST_ALL, true,
4185 						 bdev_nvme_get_zone_info_done, bio);
4186 		if (!ret) {
4187 			return;
4188 		} else {
4189 			goto out_complete_io_ret;
4190 		}
4191 	}
4192 
4193 out_complete_io_nvme_cpl:
4194 	free(bio->zone_report_buf);
4195 	bio->zone_report_buf = NULL;
4196 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4197 	return;
4198 
4199 out_complete_io_ret:
4200 	free(bio->zone_report_buf);
4201 	bio->zone_report_buf = NULL;
4202 	bdev_nvme_io_complete(bio, ret);
4203 }
4204 
4205 static void
4206 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
4207 {
4208 	struct nvme_bdev_io *bio = ref;
4209 
4210 	bdev_nvme_io_complete_nvme_status(bio, cpl);
4211 }
4212 
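/* Runs on the thread that submitted the admin passthrough command. Decides, based
 * on the saved completion status, whether to retry the command (possibly delayed
 * by the controller's CRD retry delay) or to complete it now.
 */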
4213 static void
4214 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx)
4215 {
4216 	struct nvme_bdev_io *bio = ctx;
4217 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4218 	const struct spdk_nvme_cpl *cpl = &bio->cpl;
4219 	struct nvme_bdev_channel *nbdev_ch;
4220 	struct nvme_ctrlr *nvme_ctrlr;
4221 	const struct spdk_nvme_ctrlr_data *cdata;
4222 	uint64_t delay_ms;
4223 
4224 	assert(bdev_nvme_io_type_is_admin(bdev_io->type));
4225 
4226 	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
4227 		goto complete;
4228 	}
4229 
4230 	if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 &&
4231 				     bio->retry_count >= g_opts.bdev_retry_count)) {
4232 		goto complete;
4233 	}
4234 
4235 	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
4236 	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(bio->io_path->ctrlr_ch);
4237 
4238 	if (spdk_nvme_cpl_is_path_error(cpl) ||
4239 	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
4240 	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
4241 		delay_ms = 0;
4242 	} else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) {
4243 		goto complete;
4244 	} else {
4245 		bio->retry_count++;
4246 
4247 		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
4248 
4249 		if (cpl->status.crd != 0) {
4250 			delay_ms = cdata->crdt[cpl->status.crd] * 100;
4251 		} else {
4252 			delay_ms = 0;
4253 		}
4254 	}
4255 
4256 	if (any_ctrlr_may_become_available(nbdev_ch)) {
4257 		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
4258 		return;
4259 	}
4260 
4261 complete:
4262 	bio->retry_count = 0;
4263 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
4264 }
4265 
4266 static void
4267 bdev_nvme_abort_complete(void *ctx)
4268 {
4269 	struct nvme_bdev_io *bio = ctx;
4270 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4271 
4272 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
4273 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
4274 	} else {
4275 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
4276 	}
4277 }
4278 
4279 static void
4280 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
4281 {
4282 	struct nvme_bdev_io *bio = ref;
4283 
4284 	bio->cpl = *cpl;
4285 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio);
4286 }
4287 
4288 static void
4289 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
4290 {
4291 	struct nvme_bdev_io *bio = ref;
4292 
4293 	bio->cpl = *cpl;
4294 	spdk_thread_send_msg(bio->orig_thread,
4295 			     bdev_nvme_admin_passthru_complete_nvme_status, bio);
4296 }
4297 
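/* The two callbacks below implement the SGL iteration interface
 * (spdk_nvme_req_reset_sgl_cb / spdk_nvme_req_next_sge_cb) passed to the vectored
 * NVMe commands in this file: reset_sgl seeks to an absolute offset within
 * bio->iovs and next_sge returns the next contiguous buffer segment.
 */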
4298 static void
4299 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
4300 {
4301 	struct nvme_bdev_io *bio = ref;
4302 	struct iovec *iov;
4303 
4304 	bio->iov_offset = sgl_offset;
4305 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
4306 		iov = &bio->iovs[bio->iovpos];
4307 		if (bio->iov_offset < iov->iov_len) {
4308 			break;
4309 		}
4310 
4311 		bio->iov_offset -= iov->iov_len;
4312 	}
4313 }
4314 
4315 static int
4316 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
4317 {
4318 	struct nvme_bdev_io *bio = ref;
4319 	struct iovec *iov;
4320 
4321 	assert(bio->iovpos < bio->iovcnt);
4322 
4323 	iov = &bio->iovs[bio->iovpos];
4324 
4325 	*address = iov->iov_base;
4326 	*length = iov->iov_len;
4327 
4328 	if (bio->iov_offset) {
4329 		assert(bio->iov_offset <= iov->iov_len);
4330 		*address += bio->iov_offset;
4331 		*length -= bio->iov_offset;
4332 	}
4333 
4334 	bio->iov_offset += *length;
4335 	if (bio->iov_offset == iov->iov_len) {
4336 		bio->iovpos++;
4337 		bio->iov_offset = 0;
4338 	}
4339 
4340 	return 0;
4341 }
4342 
4343 static void
4344 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
4345 {
4346 	struct nvme_bdev_io *bio = ref;
4347 	struct iovec *iov;
4348 
4349 	bio->fused_iov_offset = sgl_offset;
4350 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
4351 		iov = &bio->fused_iovs[bio->fused_iovpos];
4352 		if (bio->fused_iov_offset < iov->iov_len) {
4353 			break;
4354 		}
4355 
4356 		bio->fused_iov_offset -= iov->iov_len;
4357 	}
4358 }
4359 
4360 static int
4361 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
4362 {
4363 	struct nvme_bdev_io *bio = ref;
4364 	struct iovec *iov;
4365 
4366 	assert(bio->fused_iovpos < bio->fused_iovcnt);
4367 
4368 	iov = &bio->fused_iovs[bio->fused_iovpos];
4369 
4370 	*address = iov->iov_base;
4371 	*length = iov->iov_len;
4372 
4373 	if (bio->fused_iov_offset) {
4374 		assert(bio->fused_iov_offset <= iov->iov_len);
4375 		*address += bio->fused_iov_offset;
4376 		*length -= bio->fused_iov_offset;
4377 	}
4378 
4379 	bio->fused_iov_offset += *length;
4380 	if (bio->fused_iov_offset == iov->iov_len) {
4381 		bio->fused_iovpos++;
4382 		bio->fused_iov_offset = 0;
4383 	}
4384 
4385 	return 0;
4386 }
4387 
4388 static int
4389 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
4390 		      void *md, uint64_t lba_count, uint64_t lba)
4391 {
4392 	int rc;
4393 
4394 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
4395 		      lba_count, lba);
4396 
4397 	bio->iovs = iov;
4398 	bio->iovcnt = iovcnt;
4399 	bio->iovpos = 0;
4400 	bio->iov_offset = 0;
4401 
4402 	rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns,
4403 					    bio->io_path->ctrlr_ch->qpair,
4404 					    lba, lba_count,
4405 					    bdev_nvme_no_pi_readv_done, bio, 0,
4406 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4407 					    md, 0, 0);
4408 
4409 	if (rc != 0 && rc != -ENOMEM) {
4410 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
4411 	}
4412 	return rc;
4413 }
4414 
4415 static int
4416 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
4417 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
4418 		struct spdk_bdev_ext_io_opts *ext_opts)
4419 {
4420 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4421 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4422 	int rc;
4423 
4424 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
4425 		      lba_count, lba);
4426 
4427 	bio->iovs = iov;
4428 	bio->iovcnt = iovcnt;
4429 	bio->iovpos = 0;
4430 	bio->iov_offset = 0;
4431 
4432 	if (ext_opts) {
4433 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
4434 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
4435 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
4436 		bio->ext_opts.io_flags = flags;
4437 		bio->ext_opts.metadata = md;
4438 
4439 		rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
4440 						bdev_nvme_readv_done, bio,
4441 						bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4442 						&bio->ext_opts);
4443 	} else if (iovcnt == 1) {
4444 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
4445 						   lba_count,
4446 						   bdev_nvme_readv_done, bio,
4447 						   flags,
4448 						   0, 0);
4449 	} else {
4450 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
4451 						    bdev_nvme_readv_done, bio, flags,
4452 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4453 						    md, 0, 0);
4454 	}
4455 
4456 	if (rc != 0 && rc != -ENOMEM) {
4457 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
4458 	}
4459 	return rc;
4460 }
4461 
4462 static int
4463 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
4464 		 void *md, uint64_t lba_count, uint64_t lba,
4465 		 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts)
4466 {
4467 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4468 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4469 	int rc;
4470 
4471 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
4472 		      lba_count, lba);
4473 
4474 	bio->iovs = iov;
4475 	bio->iovcnt = iovcnt;
4476 	bio->iovpos = 0;
4477 	bio->iov_offset = 0;
4478 
4479 	if (ext_opts) {
4480 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
4481 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
4482 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
4483 		bio->ext_opts.io_flags = flags;
4484 		bio->ext_opts.metadata = md;
4485 
4486 		rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
4487 						 bdev_nvme_writev_done, bio,
4488 						 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4489 						 &bio->ext_opts);
4490 	} else if (iovcnt == 1) {
4491 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
4492 						    lba_count,
4493 						    bdev_nvme_writev_done, bio,
4494 						    flags,
4495 						    0, 0);
4496 	} else {
4497 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
4498 						     bdev_nvme_writev_done, bio, flags,
4499 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4500 						     md, 0, 0);
4501 	}
4502 
4503 	if (rc != 0 && rc != -ENOMEM) {
4504 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
4505 	}
4506 	return rc;
4507 }
4508 
4509 static int
4510 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
4511 		       void *md, uint64_t lba_count, uint64_t zslba,
4512 		       uint32_t flags)
4513 {
4514 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4515 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4516 	int rc;
4517 
4518 	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
4519 		      lba_count, zslba);
4520 
4521 	bio->iovs = iov;
4522 	bio->iovcnt = iovcnt;
4523 	bio->iovpos = 0;
4524 	bio->iov_offset = 0;
4525 
4526 	if (iovcnt == 1) {
4527 		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
4528 						       lba_count,
4529 						       bdev_nvme_zone_appendv_done, bio,
4530 						       flags,
4531 						       0, 0);
4532 	} else {
4533 		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
4534 							bdev_nvme_zone_appendv_done, bio, flags,
4535 							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4536 							md, 0, 0);
4537 	}
4538 
4539 	if (rc != 0 && rc != -ENOMEM) {
4540 		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
4541 	}
4542 	return rc;
4543 }
4544 
4545 static int
4546 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
4547 		   void *md, uint64_t lba_count, uint64_t lba,
4548 		   uint32_t flags)
4549 {
4550 	int rc;
4551 
4552 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
4553 		      lba_count, lba);
4554 
4555 	bio->iovs = iov;
4556 	bio->iovcnt = iovcnt;
4557 	bio->iovpos = 0;
4558 	bio->iov_offset = 0;
4559 
4560 	rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns,
4561 					       bio->io_path->ctrlr_ch->qpair,
4562 					       lba, lba_count,
4563 					       bdev_nvme_comparev_done, bio, flags,
4564 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
4565 					       md, 0, 0);
4566 
4567 	if (rc != 0 && rc != -ENOMEM) {
4568 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
4569 	}
4570 	return rc;
4571 }
4572 
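/* Submit a fused compare-and-write: the COMPARE is sent with FUSE_FIRST and the
 * WRITE with FUSE_SECOND. The shared completion callback keeps the compare status
 * in bio->cpl so that a compare failure overrides the write status.
 */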
4573 static int
4574 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
4575 			      struct iovec *write_iov, int write_iovcnt,
4576 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
4577 {
4578 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4579 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4580 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4581 	int rc;
4582 
4583 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
4584 		      lba_count, lba);
4585 
4586 	bio->iovs = cmp_iov;
4587 	bio->iovcnt = cmp_iovcnt;
4588 	bio->iovpos = 0;
4589 	bio->iov_offset = 0;
4590 	bio->fused_iovs = write_iov;
4591 	bio->fused_iovcnt = write_iovcnt;
4592 	bio->fused_iovpos = 0;
4593 	bio->fused_iov_offset = 0;
4594 
4595 	if (bdev_io->num_retries == 0) {
4596 		bio->first_fused_submitted = false;
4597 	}
4598 
4599 	if (!bio->first_fused_submitted) {
4600 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
4601 		memset(&bio->cpl, 0, sizeof(bio->cpl));
4602 
4603 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
4604 						       bdev_nvme_comparev_and_writev_done, bio, flags,
4605 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
4606 		if (rc == 0) {
4607 			bio->first_fused_submitted = true;
4608 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
4609 		} else {
4610 			if (rc != -ENOMEM) {
4611 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
4612 			}
4613 			return rc;
4614 		}
4615 	}
4616 
4617 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
4618 
4619 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
4620 					     bdev_nvme_comparev_and_writev_done, bio, flags,
4621 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
4622 	if (rc != 0 && rc != -ENOMEM) {
4623 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
4624 		rc = 0;
4625 	}
4626 
4627 	return rc;
4628 }
4629 
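/* Translate an unmap request into a Dataset Management deallocate command,
 * splitting the LBA range into at most SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES
 * ranges of up to SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks each.
 */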
4630 static int
4631 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
4632 {
4633 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
4634 	struct spdk_nvme_dsm_range *range;
4635 	uint64_t offset, remaining;
4636 	uint64_t num_ranges_u64;
4637 	uint16_t num_ranges;
4638 	int rc;
4639 
4640 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
4641 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
4642 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
4643 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
4644 		return -EINVAL;
4645 	}
4646 	num_ranges = (uint16_t)num_ranges_u64;
4647 
4648 	offset = offset_blocks;
4649 	remaining = num_blocks;
4650 	range = &dsm_ranges[0];
4651 
4652 	/* Fill max-size ranges until the remaining blocks fit into one range */
4653 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
4654 		range->attributes.raw = 0;
4655 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
4656 		range->starting_lba = offset;
4657 
4658 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
4659 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
4660 		range++;
4661 	}
4662 
4663 	/* Final range describes the remaining blocks */
4664 	range->attributes.raw = 0;
4665 	range->length = remaining;
4666 	range->starting_lba = offset;
4667 
4668 	rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
4669 			bio->io_path->ctrlr_ch->qpair,
4670 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
4671 			dsm_ranges, num_ranges,
4672 			bdev_nvme_queued_done, bio);
4673 
4674 	return rc;
4675 }
4676 
4677 static int
4678 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
4679 {
4680 	if (num_blocks > UINT16_MAX + 1) {
4681 		SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
4682 		return -EINVAL;
4683 	}
4684 
4685 	return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
4686 					     bio->io_path->ctrlr_ch->qpair,
4687 					     offset_blocks, num_blocks,
4688 					     bdev_nvme_queued_done, bio,
4689 					     0);
4690 }
4691 
4692 static int
4693 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
4694 			struct spdk_bdev_zone_info *info)
4695 {
4696 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4697 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4698 	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
4699 	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
4700 	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
4701 
4702 	if (zone_id % zone_size != 0) {
4703 		return -EINVAL;
4704 	}
4705 
4706 	if (num_zones > total_zones || !num_zones) {
4707 		return -EINVAL;
4708 	}
4709 
4710 	assert(!bio->zone_report_buf);
4711 	bio->zone_report_buf = calloc(1, zone_report_bufsize);
4712 	if (!bio->zone_report_buf) {
4713 		return -ENOMEM;
4714 	}
4715 
4716 	bio->handled_zones = 0;
4717 
4718 	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
4719 					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
4720 					  bdev_nvme_get_zone_info_done, bio);
4721 }
4722 
4723 static int
4724 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
4725 			  enum spdk_bdev_zone_action action)
4726 {
4727 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4728 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4729 
4730 	switch (action) {
4731 	case SPDK_BDEV_ZONE_CLOSE:
4732 		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
4733 						bdev_nvme_zone_management_done, bio);
4734 	case SPDK_BDEV_ZONE_FINISH:
4735 		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
4736 						 bdev_nvme_zone_management_done, bio);
4737 	case SPDK_BDEV_ZONE_OPEN:
4738 		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
4739 					       bdev_nvme_zone_management_done, bio);
4740 	case SPDK_BDEV_ZONE_RESET:
4741 		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
4742 						bdev_nvme_zone_management_done, bio);
4743 	case SPDK_BDEV_ZONE_OFFLINE:
4744 		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
4745 						  bdev_nvme_zone_management_done, bio);
4746 	default:
4747 		return -EINVAL;
4748 	}
4749 }
4750 
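/* Submit an admin passthrough command to the first available nvme_ctrlr reachable
 * from this channel's io_path list. If none can accept the command, the I/O is
 * completed with an error.
 */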
4751 static void
4752 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
4753 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
4754 {
4755 	struct nvme_io_path *io_path;
4756 	struct nvme_ctrlr *nvme_ctrlr;
4757 	uint32_t max_xfer_size;
4758 	int rc = -ENXIO;
4759 
4760 	/* Choose the first ctrlr which is not failed. */
4761 	/* Choose the first nvme_ctrlr which is available. */
4762 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
4763 
4764 		/* We should skip any unavailable nvme_ctrlr rather than checking
4765 		 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
4766 		 */
4767 		if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
4768 			continue;
4769 		}
4770 
4771 		max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);
4772 
4773 		if (nbytes > max_xfer_size) {
4774 			SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
4775 			rc = -EINVAL;
4776 			goto err;
4777 		}
4778 
4779 		bio->io_path = io_path;
4780 		bio->orig_thread = spdk_get_thread();
4781 
4782 		rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
4783 						   bdev_nvme_admin_passthru_done, bio);
4784 		if (rc == 0) {
4785 			return;
4786 		}
4787 	}
4788 
4789 err:
4790 	bdev_nvme_admin_passthru_complete(bio, rc);
4791 }
4792 
4793 static int
4794 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
4795 		      void *buf, size_t nbytes)
4796 {
4797 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4798 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4799 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
4800 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
4801 
4802 	if (nbytes > max_xfer_size) {
4803 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
4804 		return -EINVAL;
4805 	}
4806 
4807 	/*
4808 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
4809 	 * so fill it out automatically.
4810 	 */
4811 	cmd->nsid = spdk_nvme_ns_get_id(ns);
4812 
4813 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
4814 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
4815 }
4816 
4817 static int
4818 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
4819 			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
4820 {
4821 	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
4822 	struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair;
4823 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
4824 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
4825 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
4826 
4827 	if (nbytes > max_xfer_size) {
4828 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
4829 		return -EINVAL;
4830 	}
4831 
4832 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
4833 		SPDK_ERRLOG("invalid meta data buffer size\n");
4834 		return -EINVAL;
4835 	}
4836 
4837 	/*
4838 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
4839 	 * so fill it out automatically.
4840 	 */
4841 	cmd->nsid = spdk_nvme_ns_get_id(ns);
4842 
4843 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
4844 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
4845 }
4846 
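/* Abort bio_to_abort. The channel's retry list is checked first; otherwise an
 * NVMe ABORT command is sent on each io_path, falling back to the admin queue
 * when the target command is not found on the I/O qpair.
 */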
4847 static void
4848 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
4849 		struct nvme_bdev_io *bio_to_abort)
4850 {
4851 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
4852 	struct spdk_bdev_io *bdev_io_to_abort;
4853 	struct nvme_io_path *io_path;
4854 	struct nvme_ctrlr *nvme_ctrlr;
4855 	int rc = 0;
4856 
4857 	bio->orig_thread = spdk_get_thread();
4858 
4859 	/* Traverse the retry_io_list first. */
4860 	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
4861 		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
4862 			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
4863 			spdk_bdev_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4864 
4865 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
4866 			return;
4867 		}
4868 	}
4869 
4870 	/* Even admin commands were submitted only to nvme_ctrlrs which are on
4871 	 * some io_path. So traverse the io_path list not only for I/O commands
4872 	 * but also for admin commands.
4873 	 */
4874 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
4875 		nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
4876 
4877 		rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
4878 						   io_path->ctrlr_ch->qpair,
4879 						   bio_to_abort,
4880 						   bdev_nvme_abort_done, bio);
4881 		if (rc == -ENOENT) {
4882 			/* If no command was found in I/O qpair, the target command may be
4883 			 * an admin command.
4884 			 */
4885 			rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
4886 							   NULL,
4887 							   bio_to_abort,
4888 							   bdev_nvme_abort_done, bio);
4889 		}
4890 
4891 		if (rc != -ENOENT) {
4892 			break;
4893 		}
4894 	}
4895 
4896 	if (rc != 0) {
4897 		/* If no command was found or there was any error, complete the abort
4898 		 * request with failure.
4899 		 */
4900 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
4901 	}
4902 }
4903 
4904 static void
4905 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
4906 {
4907 	const char	*action;
4908 
4909 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
4910 		action = "reset";
4911 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
4912 		action = "abort";
4913 	} else {
4914 		action = "none";
4915 	}
4916 
4917 	spdk_json_write_object_begin(w);
4918 
4919 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
4920 
4921 	spdk_json_write_named_object_begin(w, "params");
4922 	spdk_json_write_named_string(w, "action_on_timeout", action);
4923 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
4924 	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
4925 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
4926 	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
4927 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
4928 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
4929 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
4930 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
4931 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
4932 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
4933 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
4934 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
4935 	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
4936 	spdk_json_write_object_end(w);
4937 
4938 	spdk_json_write_object_end(w);
4939 }
4940 
4941 static void
4942 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
4943 		       struct nvme_ctrlr *nvme_ctrlr)
4944 {
4945 	struct spdk_nvme_transport_id	*trid;
4946 
4947 	trid = &nvme_ctrlr->active_path_id->trid;
4948 
4949 	spdk_json_write_object_begin(w);
4950 
4951 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
4952 
4953 	spdk_json_write_named_object_begin(w, "params");
4954 	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
4955 	nvme_bdev_dump_trid_json(trid, w);
4956 	spdk_json_write_named_bool(w, "prchk_reftag",
4957 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
4958 	spdk_json_write_named_bool(w, "prchk_guard",
4959 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
4960 
4961 	spdk_json_write_object_end(w);
4962 
4963 	spdk_json_write_object_end(w);
4964 }
4965 
4966 static void
4967 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
4968 {
4969 	spdk_json_write_object_begin(w);
4970 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
4971 
4972 	spdk_json_write_named_object_begin(w, "params");
4973 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
4974 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
4975 	spdk_json_write_object_end(w);
4976 
4977 	spdk_json_write_object_end(w);
4978 }
4979 
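/* Write the JSON-RPC calls needed to recreate the current configuration: global
 * bdev_nvme options, every attached controller, and the hotplug settings.
 */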
4980 static int
4981 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
4982 {
4983 	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
4984 	struct nvme_ctrlr	*nvme_ctrlr;
4985 
4986 	bdev_nvme_opts_config_json(w);
4987 
4988 	pthread_mutex_lock(&g_bdev_nvme_mutex);
4989 
4990 	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
4991 		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
4992 			nvme_ctrlr_config_json(w, nvme_ctrlr);
4993 		}
4994 	}
4995 
4996 	/* Dump this last so that all NVMe bdevs have a chance to be constructed
4997 	 * before the hotplug poller is enabled.
4998 	 */
4999 	bdev_nvme_hotplug_config_json(w);
5000 
5001 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
5002 	return 0;
5003 }
5004 
5005 struct spdk_nvme_ctrlr *
5006 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
5007 {
5008 	struct nvme_bdev *nbdev;
5009 	struct nvme_ns *nvme_ns;
5010 
5011 	if (!bdev || bdev->module != &nvme_if) {
5012 		return NULL;
5013 	}
5014 
5015 	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
5016 	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
5017 	assert(nvme_ns != NULL);
5018 
5019 	return nvme_ns->ctrlr->ctrlr;
5020 }
5021 
5022 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
5023