xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 2ee6ab36f9a0e38f0e47e9dab3db40a6ea72cfd5)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
7  *
8  *   Redistribution and use in source and binary forms, with or without
9  *   modification, are permitted provided that the following conditions
10  *   are met:
11  *
12  *     * Redistributions of source code must retain the above copyright
13  *       notice, this list of conditions and the following disclaimer.
14  *     * Redistributions in binary form must reproduce the above copyright
15  *       notice, this list of conditions and the following disclaimer in
16  *       the documentation and/or other materials provided with the
17  *       distribution.
18  *     * Neither the name of Intel Corporation nor the names of its
19  *       contributors may be used to endorse or promote products derived
20  *       from this software without specific prior written permission.
21  *
22  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 #include "spdk/stdinc.h"
36 
37 #include "bdev_nvme.h"
38 
39 #include "spdk/accel_engine.h"
40 #include "spdk/config.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/likely.h"
45 #include "spdk/nvme.h"
46 #include "spdk/nvme_ocssd.h"
47 #include "spdk/nvme_zns.h"
48 #include "spdk/opal.h"
49 #include "spdk/thread.h"
50 #include "spdk/string.h"
51 #include "spdk/util.h"
52 
53 #include "spdk/bdev_module.h"
54 #include "spdk/log.h"
55 
56 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
57 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
58 
59 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
60 
61 struct nvme_bdev_io {
62 	/** Array of iovecs to transfer. */
63 	struct iovec *iovs;
64 
65 	/** Number of iovecs in iovs array. */
66 	int iovcnt;
67 
68 	/** Current iovec position. */
69 	int iovpos;
70 
71 	/** Offset in current iovec. */
72 	uint32_t iov_offset;
73 
74 	/** Array of iovecs for the second command of a fused (compare-and-write) pair. */
75 	struct iovec *fused_iovs;
76 
77 	/** Number of iovecs in the fused_iovs array. */
78 	int fused_iovcnt;
79 
80 	/** Current position in fused_iovs. */
81 	int fused_iovpos;
82 
83 	/** Offset in the current fused iovec. */
84 	uint32_t fused_iov_offset;
85 
86 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
87 	struct spdk_nvme_cpl cpl;
88 	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
89 	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
90 
91 	/** Originating thread */
92 	struct spdk_thread *orig_thread;
93 
94 	/** Tracks whether the first command of the fused pair has been submitted */
95 	bool first_fused_submitted;
96 
97 	/** Temporary pointer to zone report buffer */
98 	struct spdk_nvme_zns_zone_report *zone_report_buf;
99 
100 	/** Number of zones copied so far into the spdk_bdev_zone_info structs */
101 	uint64_t handled_zones;
102 };
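/*
 * Note: nvme_bdev_io is not allocated on its own; it lives in the driver_ctx
 * scratch area of each spdk_bdev_io. bdev_nvme_get_ctx_size() below reports
 * sizeof(struct nvme_bdev_io) so the bdev layer reserves enough room, and the
 * submit paths recover it with a cast:
 *
 *     struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
 */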
103 
104 struct nvme_probe_ctx {
105 	size_t count;
106 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
107 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
108 	const char *names[NVME_MAX_CONTROLLERS];
109 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
110 	const char *hostnqn;
111 };
112 
113 struct nvme_probe_skip_entry {
114 	struct spdk_nvme_transport_id		trid;
115 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
116 };
117 /* Controllers deleted by the user via RPC are skipped by the hotplug monitor. */
118 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
119 			g_skipped_nvme_ctrlrs);
120 
121 static struct spdk_bdev_nvme_opts g_opts = {
122 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
123 	.timeout_us = 0,
124 	.timeout_admin_us = 0,
125 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
126 	.retry_count = 4,
127 	.arbitration_burst = 0,
128 	.low_priority_weight = 0,
129 	.medium_priority_weight = 0,
130 	.high_priority_weight = 0,
131 	.nvme_adminq_poll_period_us = 10000ULL,
132 	.nvme_ioq_poll_period_us = 0,
133 	.io_queue_requests = 0,
134 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
135 };
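/*
 * Process-wide defaults. hotplug_probe_cb() below copies the arbitration and
 * priority-weight fields into the per-controller opts at attach time, and the
 * values can typically be adjusted at runtime (e.g. via the
 * bdev_nvme_set_options RPC).
 */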
136 
137 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
138 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
139 
140 static int g_hot_insert_nvme_controller_index = 0;
141 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
142 static bool g_nvme_hotplug_enabled = false;
143 static struct spdk_thread *g_bdev_nvme_init_thread;
144 static struct spdk_poller *g_hotplug_poller;
145 static struct spdk_poller *g_hotplug_probe_poller;
146 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
147 
148 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
149 		struct nvme_async_probe_ctx *ctx);
150 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
151 		struct nvme_async_probe_ctx *ctx);
152 static int bdev_nvme_library_init(void);
153 static void bdev_nvme_library_fini(void);
154 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
155 			   struct nvme_bdev_io *bio,
156 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
157 			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
158 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
159 				 struct nvme_bdev_io *bio,
160 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
161 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
162 			    struct nvme_bdev_io *bio,
163 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
164 			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
165 static int bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
166 				  struct nvme_bdev_io *bio,
167 				  struct iovec *iov, int iovcnt, void *md, uint64_t lba_count,
168 				  uint64_t zslba, uint32_t flags);
169 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
170 			      struct nvme_bdev_io *bio,
171 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
172 			      uint32_t flags);
173 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
174 		struct spdk_nvme_qpair *qpair,
175 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
176 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
177 		uint32_t flags);
178 static int bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
179 				   struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
180 				   struct spdk_bdev_zone_info *info);
181 static int bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
182 				     struct nvme_bdev_io *bio, uint64_t zone_id,
183 				     enum spdk_bdev_zone_action action);
184 static int bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
185 				    struct nvme_bdev_io *bio,
186 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
187 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
188 				 struct nvme_bdev_io *bio,
189 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
190 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
191 				    struct nvme_bdev_io *bio,
192 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
193 static int bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
194 			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
195 static int bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
196 static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
197 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
198 
199 struct spdk_nvme_qpair *
200 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
201 {
202 	struct nvme_ctrlr_channel *ctrlr_ch;
203 
204 	assert(ctrlr_io_ch != NULL);
205 
206 	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
207 
208 	return ctrlr_ch->qpair;
209 }
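/*
 * Illustrative use (a sketch, not part of this file): a caller holding the
 * controller's I/O channel can fetch the raw qpair for direct submission:
 *
 *     struct spdk_io_channel *ctrlr_io_ch = spdk_get_io_channel(nvme_ctrlr);
 *     struct spdk_nvme_qpair *qpair = bdev_nvme_get_io_qpair(ctrlr_io_ch);
 *
 * The channel must be a controller channel (nvme_ctrlr_channel), not an
 * nvme_bdev channel.
 */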
210 
211 static int
212 bdev_nvme_get_ctx_size(void)
213 {
214 	return sizeof(struct nvme_bdev_io);
215 }
216 
217 static struct spdk_bdev_module nvme_if = {
218 	.name = "nvme",
219 	.async_fini = true,
220 	.module_init = bdev_nvme_library_init,
221 	.module_fini = bdev_nvme_library_fini,
222 	.config_json = bdev_nvme_config_json,
223 	.get_ctx_size = bdev_nvme_get_ctx_size,
224 
225 };
226 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
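/*
 * Module lifecycle: bdev_nvme_library_init() runs when the bdev subsystem
 * starts up. Because .async_fini is set, bdev_nvme_library_fini() signals
 * completion asynchronously via spdk_bdev_module_fini_done(), which happens
 * in nvme_ctrlr_unregister_cb() once the last controller is detached.
 */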
227 
228 struct nvme_ctrlrs g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs);
229 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
230 bool g_bdev_nvme_module_finish;
231 
232 struct nvme_ns *
233 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
234 {
235 	assert(nsid > 0);
236 	assert(nsid <= nvme_ctrlr->num_ns);
237 	if (nsid == 0 || nsid > nvme_ctrlr->num_ns) {
238 		return NULL;
239 	}
240 
241 	return nvme_ctrlr->namespaces[nsid - 1];
242 }
243 
244 struct nvme_ns *
245 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
246 {
247 	uint32_t i;
248 
249 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
250 		if (nvme_ctrlr->namespaces[i] != NULL) {
251 			return nvme_ctrlr->namespaces[i];
252 		}
253 	}
254 
255 	return NULL;
256 }
257 
258 struct nvme_ns *
259 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
260 {
261 	uint32_t i;
262 
263 	if (ns == NULL) {
264 		return NULL;
265 	}
266 
267 	/* ns->id is a 1-based value and we want to start at the next
268 	 * entry in this array, so we start at index ns->id and don't
269 	 * subtract 1 to convert to a 0-based index. */
270 	for (i = ns->id; i < nvme_ctrlr->num_ns; i++) {
271 		if (nvme_ctrlr->namespaces[i] != NULL) {
272 			return nvme_ctrlr->namespaces[i];
273 		}
274 	}
275 
276 	return NULL;
277 }
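/*
 * Together the two helpers above support the usual iteration pattern over a
 * controller's active namespaces:
 *
 *     struct nvme_ns *nvme_ns;
 *
 *     for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
 *          nvme_ns != NULL;
 *          nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
 *             ... use nvme_ns ...
 *     }
 */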
278 
279 static struct nvme_ctrlr *
280 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
281 {
282 	struct nvme_ctrlr	*nvme_ctrlr;
283 
284 	pthread_mutex_lock(&g_bdev_nvme_mutex);
285 	TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) {
286 		if (spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid) == 0) {
287 			break;
288 		}
289 	}
290 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
291 
292 	return nvme_ctrlr;
293 }
294 
295 struct nvme_ctrlr *
296 nvme_ctrlr_get_by_name(const char *name)
297 {
298 	struct nvme_ctrlr *nvme_ctrlr;
299 
300 	if (name == NULL) {
301 		return NULL;
302 	}
303 
304 	pthread_mutex_lock(&g_bdev_nvme_mutex);
305 	TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) {
306 		if (strcmp(name, nvme_ctrlr->name) == 0) {
307 			break;
308 		}
309 	}
310 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
311 
312 	return nvme_ctrlr;
313 }
314 
315 void
316 nvme_ctrlr_for_each(nvme_ctrlr_for_each_fn fn, void *ctx)
317 {
318 	struct nvme_ctrlr *nvme_ctrlr;
319 
320 	pthread_mutex_lock(&g_bdev_nvme_mutex);
321 	TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) {
322 		fn(nvme_ctrlr, ctx);
323 	}
324 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
325 }
326 
327 void
328 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
329 {
330 	const char *trtype_str;
331 	const char *adrfam_str;
332 
333 	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
334 	if (trtype_str) {
335 		spdk_json_write_named_string(w, "trtype", trtype_str);
336 	}
337 
338 	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
339 	if (adrfam_str) {
340 		spdk_json_write_named_string(w, "adrfam", adrfam_str);
341 	}
342 
343 	if (trid->traddr[0] != '\0') {
344 		spdk_json_write_named_string(w, "traddr", trid->traddr);
345 	}
346 
347 	if (trid->trsvcid[0] != '\0') {
348 		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
349 	}
350 
351 	if (trid->subnqn[0] != '\0') {
352 		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
353 	}
354 }
355 
356 static void
357 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
358 {
359 	struct nvme_ctrlr_trid *trid, *tmp_trid;
360 	uint32_t i;
361 
362 	free(nvme_ctrlr->copied_ana_desc);
363 	spdk_free(nvme_ctrlr->ana_log_page);
364 
365 	if (nvme_ctrlr->opal_dev) {
366 		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
367 		nvme_ctrlr->opal_dev = NULL;
368 	}
369 
370 	pthread_mutex_lock(&g_bdev_nvme_mutex);
371 	TAILQ_REMOVE(&g_nvme_ctrlrs, nvme_ctrlr, tailq);
372 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
373 	spdk_nvme_detach(nvme_ctrlr->ctrlr);
374 	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
375 	free(nvme_ctrlr->name);
376 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
377 		free(nvme_ctrlr->namespaces[i]);
378 	}
379 
380 	TAILQ_FOREACH_SAFE(trid, &nvme_ctrlr->trids, link, tmp_trid) {
381 		TAILQ_REMOVE(&nvme_ctrlr->trids, trid, link);
382 		free(trid);
383 	}
384 
385 	pthread_mutex_destroy(&nvme_ctrlr->mutex);
386 
387 	free(nvme_ctrlr->namespaces);
388 	free(nvme_ctrlr);
389 }
390 
391 static void
392 nvme_ctrlr_unregister_cb(void *io_device)
393 {
394 	struct nvme_ctrlr *nvme_ctrlr = io_device;
395 
396 	nvme_ctrlr_delete(nvme_ctrlr);
397 
398 	pthread_mutex_lock(&g_bdev_nvme_mutex);
399 	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_ctrlrs)) {
400 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
401 		spdk_io_device_unregister(&g_nvme_ctrlrs, NULL);
402 		spdk_bdev_module_fini_done();
403 		return;
404 	}
405 
406 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
407 }
408 
409 static void
410 nvme_ctrlr_unregister(void *ctx)
411 {
412 	struct nvme_ctrlr *nvme_ctrlr = ctx;
413 
414 	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
415 }
416 
417 static void
418 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
419 {
420 	pthread_mutex_lock(&nvme_ctrlr->mutex);
421 
422 	assert(nvme_ctrlr->ref > 0);
423 	nvme_ctrlr->ref--;
424 
425 	if (nvme_ctrlr->ref > 0 || !nvme_ctrlr->destruct ||
426 	    nvme_ctrlr->resetting) {
427 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
428 		return;
429 	}
430 
431 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
432 
433 	nvme_ctrlr_unregister(nvme_ctrlr);
434 }
435 
436 static int
437 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
438 {
439 	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
440 	struct nvme_bdev *nbdev = io_device;
441 	struct nvme_ns *nvme_ns;
442 	struct spdk_io_channel *ch;
443 
444 	nvme_ns = nbdev->nvme_ns;
445 
446 	ch = spdk_get_io_channel(nvme_ns->ctrlr);
447 	if (ch == NULL) {
448 		SPDK_ERRLOG("Failed to alloc io_channel.\n");
449 		return -ENOMEM;
450 	}
451 
452 	nbdev_ch->ctrlr_ch = spdk_io_channel_get_ctx(ch);
453 	nbdev_ch->nvme_ns = nvme_ns;
454 
455 	return 0;
456 }
457 
458 static void
459 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
460 {
461 	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
462 	struct spdk_io_channel *ch;
463 
464 	ch = spdk_io_channel_from_ctx(nbdev_ch->ctrlr_ch);
465 	spdk_put_io_channel(ch);
466 }
467 
468 static inline bool
469 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch,
470 		       struct spdk_nvme_ns **_ns, struct spdk_nvme_qpair **_qpair)
471 {
472 	if (spdk_unlikely(nbdev_ch->ctrlr_ch->qpair == NULL)) {
473 		/* The device is currently resetting. */
474 		return false;
475 	}
476 
477 	*_ns = nbdev_ch->nvme_ns->ns;
478 	*_qpair = nbdev_ch->ctrlr_ch->qpair;
479 	return true;
480 }
481 
482 static inline bool
483 bdev_nvme_find_admin_path(struct nvme_bdev_channel *nbdev_ch,
484 			  struct nvme_ctrlr **_nvme_ctrlr)
485 {
486 	*_nvme_ctrlr = nbdev_ch->ctrlr_ch->ctrlr;
487 	return true;
488 }
489 
490 static inline void
491 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
492 				  const struct spdk_nvme_cpl *cpl)
493 {
494 	spdk_bdev_io_complete_nvme_status(spdk_bdev_io_from_ctx(bio), cpl->cdw0,
495 					  cpl->status.sct, cpl->status.sc);
496 }
497 
498 static inline void
499 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
500 {
501 	enum spdk_bdev_io_status io_status;
502 
503 	if (rc == 0) {
504 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
505 	} else if (rc == -ENOMEM) {
506 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
507 	} else {
508 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
509 	}
510 
511 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
512 }
513 
514 static void
515 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
516 {
517 	int rc;
518 
519 	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
520 	/*
521 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
522 	 * reconnect a qpair and we will stop getting a callback for this one.
523 	 */
524 	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
525 	if (rc != 0) {
526 		SPDK_DEBUGLOG(bdev_nvme, "Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
527 	}
528 }
529 
530 static int
531 bdev_nvme_poll(void *arg)
532 {
533 	struct nvme_poll_group *group = arg;
534 	int64_t num_completions;
535 
536 	if (group->collect_spin_stat && group->start_ticks == 0) {
537 		group->start_ticks = spdk_get_ticks();
538 	}
539 
540 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
541 			  bdev_nvme_disconnected_qpair_cb);
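	/*
	 * Spin-stat bookkeeping (collect_spin_stat is set only for VTune
	 * builds; see bdev_nvme_create_ctrlr_channel_cb()): start_ticks marks
	 * where an idle polling streak began and end_ticks the most recent
	 * empty poll. Once completions arrive, the idle window is charged to
	 * spin_ticks and both markers are reset.
	 */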
542 	if (group->collect_spin_stat) {
543 		if (num_completions > 0) {
544 			if (group->end_ticks != 0) {
545 				group->spin_ticks += (group->end_ticks - group->start_ticks);
546 				group->end_ticks = 0;
547 			}
548 			group->start_ticks = 0;
549 		} else {
550 			group->end_ticks = spdk_get_ticks();
551 		}
552 	}
553 
554 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
555 }
556 
557 static int
558 bdev_nvme_poll_adminq(void *arg)
559 {
560 	int32_t rc;
561 	struct nvme_ctrlr *nvme_ctrlr = arg;
562 
563 	assert(nvme_ctrlr != NULL);
564 
565 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
566 	if (rc < 0) {
567 		bdev_nvme_failover(nvme_ctrlr, false);
568 	}
569 
570 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
571 }
572 
573 static void
574 _bdev_nvme_unregister_dev_cb(void *io_device)
575 {
576 	struct nvme_bdev *nvme_disk = io_device;
577 
578 	free(nvme_disk->disk.name);
579 	free(nvme_disk);
580 }
581 
582 static int
583 bdev_nvme_destruct(void *ctx)
584 {
585 	struct nvme_bdev *nvme_disk = ctx;
586 	struct nvme_ns *nvme_ns = nvme_disk->nvme_ns;
587 
588 	pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
589 
590 	nvme_ns->bdev = NULL;
591 
592 	assert(nvme_ns->id > 0);
593 
594 	if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
595 		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
596 
597 		nvme_ctrlr_release(nvme_ns->ctrlr);
598 		free(nvme_ns);
599 	} else {
600 		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
601 	}
602 
603 	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);
604 
605 	return 0;
606 }
607 
608 static int
609 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
610 		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
611 {
612 	bdev_nvme_io_complete(bio, 0);
613 
614 	return 0;
615 }
616 
617 static int
618 bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
619 {
620 	struct spdk_nvme_ctrlr *ctrlr = ctrlr_ch->ctrlr->ctrlr;
621 	struct spdk_nvme_io_qpair_opts opts;
622 	struct spdk_nvme_qpair *qpair;
623 	int rc;
624 
625 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
626 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
627 	opts.create_only = true;
628 	opts.async_mode = true;
629 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
630 	g_opts.io_queue_requests = opts.io_queue_requests;
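	/*
	 * create_only defers the connect: the qpair is allocated first, added
	 * to the poll group, and only then connected with
	 * spdk_nvme_ctrlr_connect_io_qpair() below.
	 */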
631 
632 	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
633 	if (qpair == NULL) {
634 		return -1;
635 	}
636 
637 	assert(ctrlr_ch->group != NULL);
638 
639 	rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair);
640 	if (rc != 0) {
641 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
642 		goto err;
643 	}
644 
645 	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
646 	if (rc != 0) {
647 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
648 		goto err;
649 	}
650 
651 	ctrlr_ch->qpair = qpair;
652 
653 	return 0;
654 
655 err:
656 	spdk_nvme_ctrlr_free_io_qpair(qpair);
657 
658 	return rc;
659 }
660 
661 static void
662 bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
663 {
664 	if (ctrlr_ch->qpair != NULL) {
665 		spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
666 		ctrlr_ch->qpair = NULL;
667 	}
668 }
669 
670 static void
671 _bdev_nvme_check_pending_destruct(struct nvme_ctrlr *nvme_ctrlr)
672 {
673 	pthread_mutex_lock(&nvme_ctrlr->mutex);
674 	if (nvme_ctrlr->destruct_after_reset) {
675 		assert(nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct);
676 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
677 
678 		spdk_thread_send_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister,
679 				     nvme_ctrlr);
680 	} else {
681 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
682 	}
683 }
684 
685 static void
686 bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
687 {
688 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
689 
690 	_bdev_nvme_check_pending_destruct(nvme_ctrlr);
691 }
692 
693 static void
694 _bdev_nvme_complete_pending_resets(struct nvme_ctrlr_channel *ctrlr_ch,
695 				   enum spdk_bdev_io_status status)
696 {
697 	struct spdk_bdev_io *bdev_io;
698 
699 	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
700 		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
701 		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
702 		spdk_bdev_io_complete(bdev_io, status);
703 	}
704 }
705 
706 static void
707 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
708 {
709 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
710 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
711 
712 	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_SUCCESS);
713 
714 	spdk_for_each_channel_continue(i, 0);
715 }
716 
717 static void
718 bdev_nvme_abort_pending_resets(struct spdk_io_channel_iter *i)
719 {
720 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
721 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
722 
723 	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_FAILED);
724 
725 	spdk_for_each_channel_continue(i, 0);
726 }
727 
728 static void
729 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, int rc)
730 {
731 	struct nvme_ctrlr_trid *curr_trid;
732 	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
733 	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;
734 
735 	nvme_ctrlr->reset_cb_fn = NULL;
736 	nvme_ctrlr->reset_cb_arg = NULL;
737 
738 	if (rc) {
739 		SPDK_ERRLOG("Resetting controller failed.\n");
740 	} else {
741 		SPDK_NOTICELOG("Resetting controller successful.\n");
742 	}
743 
744 	pthread_mutex_lock(&nvme_ctrlr->mutex);
745 	nvme_ctrlr->resetting = false;
746 	nvme_ctrlr->failover_in_progress = false;
747 
748 	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
749 	assert(curr_trid != NULL);
750 	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);
751 
752 	curr_trid->is_failed = rc != 0;
753 
754 	if (nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct) {
755 		/* Destruct ctrlr after clearing pending resets. */
756 		nvme_ctrlr->destruct_after_reset = true;
757 	}
758 
759 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
760 
761 	if (reset_cb_fn) {
762 		reset_cb_fn(reset_cb_arg, rc);
763 	}
764 
765 	/* Make sure we clear any pending resets before returning. */
766 	spdk_for_each_channel(nvme_ctrlr,
767 			      rc == 0 ? bdev_nvme_complete_pending_resets :
768 			      bdev_nvme_abort_pending_resets,
769 			      NULL,
770 			      bdev_nvme_check_pending_destruct);
771 }
772 
773 static void
774 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
775 {
776 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
777 
778 	bdev_nvme_reset_complete(nvme_ctrlr, status);
779 }
780 
781 static void
782 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
783 {
784 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
785 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
786 	int rc;
787 
788 	rc = bdev_nvme_create_qpair(ctrlr_ch);
789 
790 	spdk_for_each_channel_continue(i, rc);
791 }
792 
793 static int
794 bdev_nvme_ctrlr_reset_poll(void *arg)
795 {
796 	struct nvme_ctrlr *nvme_ctrlr = arg;
797 	int rc;
798 
799 	rc = spdk_nvme_ctrlr_reset_poll_async(nvme_ctrlr->reset_ctx);
800 	if (rc == -EAGAIN) {
801 		return SPDK_POLLER_BUSY;
802 	}
803 
804 	spdk_poller_unregister(&nvme_ctrlr->reset_poller);
805 	if (rc == 0) {
806 		/* Recreate all of the I/O queue pairs */
807 		spdk_for_each_channel(nvme_ctrlr,
808 				      bdev_nvme_reset_create_qpair,
809 				      NULL,
810 				      bdev_nvme_reset_create_qpairs_done);
811 	} else {
812 		bdev_nvme_reset_complete(nvme_ctrlr, rc);
813 	}
814 	return SPDK_POLLER_BUSY;
815 }
816 
817 static void
818 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
819 {
820 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
821 	int rc;
822 
823 	if (status) {
824 		rc = status;
825 		goto err;
826 	}
827 
828 	rc = spdk_nvme_ctrlr_reset_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->reset_ctx);
829 	if (rc != 0) {
830 		SPDK_ERRLOG("Create controller reset context failed\n");
831 		goto err;
832 	}
833 	assert(nvme_ctrlr->reset_poller == NULL);
834 	nvme_ctrlr->reset_poller = SPDK_POLLER_REGISTER(bdev_nvme_ctrlr_reset_poll,
835 				   nvme_ctrlr, 0);
836 
837 	return;
838 
839 err:
840 	bdev_nvme_reset_complete(nvme_ctrlr, rc);
841 }
842 
843 static void
844 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
845 {
846 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
847 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
848 
849 	bdev_nvme_destroy_qpair(ctrlr_ch);
850 	spdk_for_each_channel_continue(i, 0);
851 }
852 
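/*
 * Controller reset sequence, driven across all channels with
 * spdk_for_each_channel():
 *   1. bdev_nvme_reset() marks the controller as resetting, calls
 *      spdk_nvme_ctrlr_prepare_for_reset(), and destroys the I/O qpair on
 *      every channel (bdev_nvme_reset_destroy_qpair above).
 *   2. bdev_nvme_reset_ctrlr() starts an asynchronous controller reset and
 *      registers bdev_nvme_ctrlr_reset_poll() to drive it to completion.
 *   3. On success, qpairs are recreated on every channel
 *      (bdev_nvme_reset_create_qpair).
 *   4. bdev_nvme_reset_complete() then fires the reset callback, flushes
 *      pending resets, and performs a deferred destruct if one was requested
 *      while the reset was in flight.
 */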
853 static int
854 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
855 {
856 	pthread_mutex_lock(&nvme_ctrlr->mutex);
857 	if (nvme_ctrlr->destruct) {
858 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
859 		return -ENXIO;
860 	}
861 
862 	if (nvme_ctrlr->resetting) {
863 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
864 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
865 		return -EBUSY;
866 	}
867 
868 	nvme_ctrlr->resetting = true;
869 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
870 	spdk_nvme_ctrlr_prepare_for_reset(nvme_ctrlr->ctrlr);
871 
872 	/* First, delete all NVMe I/O queue pairs. */
873 	spdk_for_each_channel(nvme_ctrlr,
874 			      bdev_nvme_reset_destroy_qpair,
875 			      NULL,
876 			      bdev_nvme_reset_ctrlr);
877 
878 	return 0;
879 }
880 
881 int
882 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
883 {
884 	int rc;
885 
886 	rc = bdev_nvme_reset(nvme_ctrlr);
887 	if (rc == 0) {
888 		nvme_ctrlr->reset_cb_fn = cb_fn;
889 		nvme_ctrlr->reset_cb_arg = cb_arg;
890 	}
891 	return rc;
892 }
893 
894 static void
895 bdev_nvme_reset_io_complete(void *cb_arg, int rc)
896 {
897 	struct nvme_bdev_io *bio = cb_arg;
898 
899 	bdev_nvme_io_complete(bio, rc);
900 }
901 
902 static int
903 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
904 {
905 	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
906 	struct spdk_bdev_io *bdev_io;
907 	int rc;
908 
909 	rc = bdev_nvme_reset(ctrlr_ch->ctrlr);
910 	if (rc == 0) {
911 		assert(ctrlr_ch->ctrlr->reset_cb_fn == NULL);
912 		assert(ctrlr_ch->ctrlr->reset_cb_arg == NULL);
913 		ctrlr_ch->ctrlr->reset_cb_fn = bdev_nvme_reset_io_complete;
914 		ctrlr_ch->ctrlr->reset_cb_arg = bio;
915 	} else if (rc == -EBUSY) {
916 		/*
917 		 * The reset is queued only when it comes from the app framework. This is deliberate:
918 		 * we defer to the framework's reset strategy rather than interfere with it. If the
919 		 * framework is already in the middle of a reset, we don't schedule another one.
920 		 */
921 		bdev_io = spdk_bdev_io_from_ctx(bio);
922 		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
923 	} else {
924 		return rc;
925 	}
926 
927 	return 0;
928 }
929 
930 static int
931 bdev_nvme_failover_start(struct nvme_ctrlr *nvme_ctrlr, bool remove)
932 {
933 	struct nvme_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
934 	int rc;
935 
936 	pthread_mutex_lock(&nvme_ctrlr->mutex);
937 	if (nvme_ctrlr->destruct) {
938 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
939 		/* Don't bother resetting if the controller is in the process of being destructed. */
940 		return -ENXIO;
941 	}
942 
943 	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
944 	assert(curr_trid);
945 	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);
946 	next_trid = TAILQ_NEXT(curr_trid, link);
947 
948 	if (nvme_ctrlr->resetting) {
949 		if (next_trid && !nvme_ctrlr->failover_in_progress) {
950 			rc = -EBUSY;
951 		} else {
952 			rc = -EALREADY;
953 		}
954 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
955 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
956 		return rc;
957 	}
958 
959 	nvme_ctrlr->resetting = true;
960 	curr_trid->is_failed = true;
961 
962 	if (next_trid) {
963 		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
964 
965 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
966 			       curr_trid->trid.trsvcid,	next_trid->trid.traddr, next_trid->trid.trsvcid);
967 
968 		nvme_ctrlr->failover_in_progress = true;
969 		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
970 		nvme_ctrlr->connected_trid = &next_trid->trid;
971 		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_trid->trid);
972 		assert(rc == 0);
973 		TAILQ_REMOVE(&nvme_ctrlr->trids, curr_trid, link);
974 		if (!remove) {
975 			/** Shuffle the old trid to the end of the list and use the new one.
976 			 * Allows for round robin through multiple connections.
977 			 */
978 			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, curr_trid, link);
979 		} else {
980 			free(curr_trid);
981 		}
982 	}
983 
984 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
985 	return 0;
986 }
987 
988 static int
989 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
990 {
991 	int rc;
992 
993 	rc = bdev_nvme_failover_start(nvme_ctrlr, remove);
994 	if (rc == 0) {
995 		/* First, delete all NVMe I/O queue pairs. */
996 		spdk_for_each_channel(nvme_ctrlr,
997 				      bdev_nvme_reset_destroy_qpair,
998 				      NULL,
999 				      bdev_nvme_reset_ctrlr);
1000 	} else if (rc != -EALREADY) {
1001 		return rc;
1002 	}
1003 
1004 	return 0;
1005 }
1006 
1007 static int
1008 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
1009 		struct nvme_bdev_io *bio,
1010 		uint64_t offset_blocks,
1011 		uint64_t num_blocks);
1012 
1013 static int
1014 bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
1015 		       struct nvme_bdev_io *bio,
1016 		       uint64_t offset_blocks,
1017 		       uint64_t num_blocks);
1018 
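/*
 * Reads can arrive without a data buffer attached. In that case
 * bdev_nvme_submit_request() calls spdk_bdev_io_get_buf(), and the read is
 * resubmitted from this callback once the bdev layer has provided a buffer.
 */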
1019 static void
1020 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1021 		     bool success)
1022 {
1023 	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1024 	struct spdk_bdev *bdev = bdev_io->bdev;
1025 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
1026 	struct spdk_nvme_ns *ns;
1027 	struct spdk_nvme_qpair *qpair;
1028 	int ret;
1029 
1030 	if (!success) {
1031 		ret = -EINVAL;
1032 		goto exit;
1033 	}
1034 
1035 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
1036 		ret = -ENXIO;
1037 		goto exit;
1038 	}
1039 
1040 	ret = bdev_nvme_readv(ns,
1041 			      qpair,
1042 			      bio,
1043 			      bdev_io->u.bdev.iovs,
1044 			      bdev_io->u.bdev.iovcnt,
1045 			      bdev_io->u.bdev.md_buf,
1046 			      bdev_io->u.bdev.num_blocks,
1047 			      bdev_io->u.bdev.offset_blocks,
1048 			      bdev->dif_check_flags,
1049 			      bdev_io->internal.ext_opts);
1050 
1051 exit:
1052 	if (spdk_unlikely(ret != 0)) {
1053 		bdev_nvme_io_complete(bio, ret);
1054 	}
1055 }
1056 
1057 static void
1058 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
1059 {
1060 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
1061 	struct spdk_bdev *bdev = bdev_io->bdev;
1062 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1063 	struct nvme_bdev_io *nbdev_io_to_abort;
1064 	struct spdk_nvme_ns *ns;
1065 	struct spdk_nvme_qpair *qpair;
1066 	int rc = 0;
1067 
1068 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
1069 		rc = -ENXIO;
1070 		goto exit;
1071 	}
1072 
1073 	switch (bdev_io->type) {
1074 	case SPDK_BDEV_IO_TYPE_READ:
1075 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
1076 			rc = bdev_nvme_readv(ns,
1077 					     qpair,
1078 					     nbdev_io,
1079 					     bdev_io->u.bdev.iovs,
1080 					     bdev_io->u.bdev.iovcnt,
1081 					     bdev_io->u.bdev.md_buf,
1082 					     bdev_io->u.bdev.num_blocks,
1083 					     bdev_io->u.bdev.offset_blocks,
1084 					     bdev->dif_check_flags,
1085 					     bdev_io->internal.ext_opts);
1086 		} else {
1087 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
1088 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
1089 			rc = 0;
1090 		}
1091 		break;
1092 	case SPDK_BDEV_IO_TYPE_WRITE:
1093 		rc = bdev_nvme_writev(ns,
1094 				      qpair,
1095 				      nbdev_io,
1096 				      bdev_io->u.bdev.iovs,
1097 				      bdev_io->u.bdev.iovcnt,
1098 				      bdev_io->u.bdev.md_buf,
1099 				      bdev_io->u.bdev.num_blocks,
1100 				      bdev_io->u.bdev.offset_blocks,
1101 				      bdev->dif_check_flags,
1102 				      bdev_io->internal.ext_opts);
1103 		break;
1104 	case SPDK_BDEV_IO_TYPE_COMPARE:
1105 		rc = bdev_nvme_comparev(ns,
1106 					qpair,
1107 					nbdev_io,
1108 					bdev_io->u.bdev.iovs,
1109 					bdev_io->u.bdev.iovcnt,
1110 					bdev_io->u.bdev.md_buf,
1111 					bdev_io->u.bdev.num_blocks,
1112 					bdev_io->u.bdev.offset_blocks,
1113 					bdev->dif_check_flags);
1114 		break;
1115 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
1116 		rc = bdev_nvme_comparev_and_writev(ns,
1117 						   qpair,
1118 						   nbdev_io,
1119 						   bdev_io->u.bdev.iovs,
1120 						   bdev_io->u.bdev.iovcnt,
1121 						   bdev_io->u.bdev.fused_iovs,
1122 						   bdev_io->u.bdev.fused_iovcnt,
1123 						   bdev_io->u.bdev.md_buf,
1124 						   bdev_io->u.bdev.num_blocks,
1125 						   bdev_io->u.bdev.offset_blocks,
1126 						   bdev->dif_check_flags);
1127 		break;
1128 	case SPDK_BDEV_IO_TYPE_UNMAP:
1129 		rc = bdev_nvme_unmap(ns,
1130 				     qpair,
1131 				     nbdev_io,
1132 				     bdev_io->u.bdev.offset_blocks,
1133 				     bdev_io->u.bdev.num_blocks);
1134 		break;
1135 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1136 		rc = bdev_nvme_write_zeroes(ns, qpair,
1137 					     nbdev_io,
1138 					     bdev_io->u.bdev.offset_blocks,
1139 					     bdev_io->u.bdev.num_blocks);
1140 		break;
1141 	case SPDK_BDEV_IO_TYPE_RESET:
1142 		rc = bdev_nvme_reset_io(nbdev_ch, nbdev_io);
1143 		break;
1144 	case SPDK_BDEV_IO_TYPE_FLUSH:
1145 		rc = bdev_nvme_flush(ns,
1146 				     qpair,
1147 				     nbdev_io,
1148 				     bdev_io->u.bdev.offset_blocks,
1149 				     bdev_io->u.bdev.num_blocks);
1150 		break;
1151 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
1152 		rc = bdev_nvme_zone_appendv(ns,
1153 					    qpair,
1154 					    nbdev_io,
1155 					    bdev_io->u.bdev.iovs,
1156 					    bdev_io->u.bdev.iovcnt,
1157 					    bdev_io->u.bdev.md_buf,
1158 					    bdev_io->u.bdev.num_blocks,
1159 					    bdev_io->u.bdev.offset_blocks,
1160 					    bdev->dif_check_flags);
1161 		break;
1162 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
1163 		rc = bdev_nvme_get_zone_info(ns,
1164 					     qpair,
1165 					     nbdev_io,
1166 					     bdev_io->u.zone_mgmt.zone_id,
1167 					     bdev_io->u.zone_mgmt.num_zones,
1168 					     bdev_io->u.zone_mgmt.buf);
1169 		break;
1170 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
1171 		rc = bdev_nvme_zone_management(ns,
1172 					       qpair,
1173 					       nbdev_io,
1174 					       bdev_io->u.zone_mgmt.zone_id,
1175 					       bdev_io->u.zone_mgmt.zone_action);
1176 		break;
1177 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1178 		rc = bdev_nvme_admin_passthru(nbdev_ch,
1179 					      nbdev_io,
1180 					      &bdev_io->u.nvme_passthru.cmd,
1181 					      bdev_io->u.nvme_passthru.buf,
1182 					      bdev_io->u.nvme_passthru.nbytes);
1183 		break;
1184 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1185 		rc = bdev_nvme_io_passthru(ns,
1186 					   qpair,
1187 					   nbdev_io,
1188 					   &bdev_io->u.nvme_passthru.cmd,
1189 					   bdev_io->u.nvme_passthru.buf,
1190 					   bdev_io->u.nvme_passthru.nbytes);
1191 		break;
1192 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1193 		rc = bdev_nvme_io_passthru_md(ns,
1194 					      qpair,
1195 					      nbdev_io,
1196 					      &bdev_io->u.nvme_passthru.cmd,
1197 					      bdev_io->u.nvme_passthru.buf,
1198 					      bdev_io->u.nvme_passthru.nbytes,
1199 					      bdev_io->u.nvme_passthru.md_buf,
1200 					      bdev_io->u.nvme_passthru.md_len);
1201 		break;
1202 	case SPDK_BDEV_IO_TYPE_ABORT:
1203 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
1204 		rc = bdev_nvme_abort(nbdev_ch,
1205 				     nbdev_io,
1206 				     nbdev_io_to_abort);
1207 		break;
1208 	default:
1209 		rc = -EINVAL;
1210 		break;
1211 	}
1212 
1213 exit:
1214 	if (spdk_unlikely(rc != 0)) {
1215 		bdev_nvme_io_complete(nbdev_io, rc);
1216 	}
1217 }
1218 
1219 static bool
1220 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
1221 {
1222 	struct nvme_bdev *nbdev = ctx;
1223 	struct nvme_ns *nvme_ns;
1224 	struct spdk_nvme_ns *ns;
1225 	struct spdk_nvme_ctrlr *ctrlr;
1226 	const struct spdk_nvme_ctrlr_data *cdata;
1227 
1228 	nvme_ns = nbdev->nvme_ns;
1229 	assert(nvme_ns != NULL);
1230 	ns = nvme_ns->ns;
1231 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1232 
1233 	switch (io_type) {
1234 	case SPDK_BDEV_IO_TYPE_READ:
1235 	case SPDK_BDEV_IO_TYPE_WRITE:
1236 	case SPDK_BDEV_IO_TYPE_RESET:
1237 	case SPDK_BDEV_IO_TYPE_FLUSH:
1238 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1239 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1240 	case SPDK_BDEV_IO_TYPE_ABORT:
1241 		return true;
1242 
1243 	case SPDK_BDEV_IO_TYPE_COMPARE:
1244 		return spdk_nvme_ns_supports_compare(ns);
1245 
1246 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1247 		return spdk_nvme_ns_get_md_size(ns) != 0;
1248 
1249 	case SPDK_BDEV_IO_TYPE_UNMAP:
1250 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1251 		return cdata->oncs.dsm;
1252 
1253 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1254 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1255 		return cdata->oncs.write_zeroes;
1256 
1257 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
1258 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
1259 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
1260 			return true;
1261 		}
1262 		return false;
1263 
1264 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
1265 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
1266 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
1267 
1268 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
1269 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
1270 		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
1271 
1272 	default:
1273 		return false;
1274 	}
1275 }
1276 
1277 static int
1278 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
1279 {
1280 	struct nvme_ctrlr *nvme_ctrlr = io_device;
1281 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
1282 	struct spdk_io_channel *pg_ch;
1283 	int rc;
1284 
1285 	pg_ch = spdk_get_io_channel(&g_nvme_ctrlrs);
1286 	if (!pg_ch) {
1287 		return -1;
1288 	}
1289 
1290 	ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch);
1291 
1292 #ifdef SPDK_CONFIG_VTUNE
1293 	ctrlr_ch->group->collect_spin_stat = true;
1294 #else
1295 	ctrlr_ch->group->collect_spin_stat = false;
1296 #endif
1297 
1298 	TAILQ_INIT(&ctrlr_ch->pending_resets);
1299 
1300 	ctrlr_ch->ctrlr = nvme_ctrlr;
1301 
1302 	rc = bdev_nvme_create_qpair(ctrlr_ch);
1303 	if (rc != 0) {
1304 		goto err_qpair;
1305 	}
1306 
1307 	return 0;
1308 
1309 err_qpair:
1310 	spdk_put_io_channel(pg_ch);
1311 
1312 	return rc;
1313 }
1314 
1315 static void
1316 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
1317 {
1318 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
1319 
1320 	assert(ctrlr_ch->group != NULL);
1321 
1322 	bdev_nvme_destroy_qpair(ctrlr_ch);
1323 
1324 	spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group));
1325 }
1326 
1327 static void
1328 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
1329 			      uint32_t iov_cnt, uint32_t seed,
1330 			      spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
1331 {
1332 	struct nvme_poll_group *group = ctx;
1333 	int rc;
1334 
1335 	assert(group->accel_channel != NULL);
1336 	assert(cb_fn != NULL);
1337 
1338 	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
1339 	if (rc) {
1340 		/* In these two error cases spdk_accel_submit_crc32cv() does not invoke the user's cb_fn, so call it here. */
1341 		if (rc == -ENOMEM || rc == -EINVAL) {
1342 			cb_fn(cb_arg, rc);
1343 		}
1344 		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
1345 	}
1346 }
1347 
1348 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
1349 	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
1350 	.submit_accel_crc32c	= bdev_nvme_submit_accel_crc32c,
1351 };
1352 
1353 static int
1354 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
1355 {
1356 	struct nvme_poll_group *group = ctx_buf;
1357 
1358 	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
1359 	if (group->group == NULL) {
1360 		return -1;
1361 	}
1362 
1363 	group->accel_channel = spdk_accel_engine_get_io_channel();
1364 	if (!group->accel_channel) {
1365 		spdk_nvme_poll_group_destroy(group->group);
1366 		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
1367 			    group);
1368 		return -1;
1369 	}
1370 
1371 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
1372 
1373 	if (group->poller == NULL) {
1374 		spdk_put_io_channel(group->accel_channel);
1375 		spdk_nvme_poll_group_destroy(group->group);
1376 		return -1;
1377 	}
1378 
1379 	return 0;
1380 }
1381 
1382 static void
1383 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
1384 {
1385 	struct nvme_poll_group *group = ctx_buf;
1386 
1387 	if (group->accel_channel) {
1388 		spdk_put_io_channel(group->accel_channel);
1389 	}
1390 
1391 	spdk_poller_unregister(&group->poller);
1392 	if (spdk_nvme_poll_group_destroy(group->group)) {
1393 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
1394 		assert(false);
1395 	}
1396 }
1397 
1398 static struct spdk_io_channel *
1399 bdev_nvme_get_io_channel(void *ctx)
1400 {
1401 	struct nvme_bdev *nvme_bdev = ctx;
1402 
1403 	return spdk_get_io_channel(nvme_bdev);
1404 }
1405 
1406 static void *
1407 bdev_nvme_get_module_ctx(void *ctx)
1408 {
1409 	struct nvme_bdev *nvme_bdev = ctx;
1410 
1411 	if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if || !nvme_bdev->nvme_ns) {
1412 		return NULL;
1413 	}
1414 
1415 	return nvme_bdev->nvme_ns->ns;
1416 }
1417 
1418 static const char *
1419 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
1420 {
1421 	switch (ana_state) {
1422 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
1423 		return "optimized";
1424 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1425 		return "non_optimized";
1426 	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
1427 		return "inaccessible";
1428 	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
1429 		return "persistent_loss";
1430 	case SPDK_NVME_ANA_CHANGE_STATE:
1431 		return "change";
1432 	default:
1433 		return NULL;
1434 	}
1435 }
1436 
1437 static int
1438 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
1439 {
1440 	struct nvme_bdev *nbdev = ctx;
1441 	struct spdk_memory_domain *domain;
1442 
1443 	domain = spdk_nvme_ctrlr_get_memory_domain(nbdev->nvme_ns->ctrlr->ctrlr);
1444 
1445 	if (domain) {
1446 		if (array_size > 0 && domains) {
1447 			domains[0] = domain;
1448 		}
1449 		return 1;
1450 	}
1451 
1452 	return 0;
1453 }
1454 
1455 static int
1456 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
1457 {
1458 	struct nvme_bdev *nvme_bdev = ctx;
1459 	struct nvme_ns *nvme_ns;
1460 	struct spdk_nvme_ns *ns;
1461 	struct spdk_nvme_ctrlr *ctrlr;
1462 	const struct spdk_nvme_ctrlr_data *cdata;
1463 	const struct spdk_nvme_transport_id *trid;
1464 	union spdk_nvme_vs_register vs;
1465 	union spdk_nvme_csts_register csts;
1466 	char buf[128];
1467 
1468 	nvme_ns = nvme_bdev->nvme_ns;
1469 	assert(nvme_ns != NULL);
1470 	ns = nvme_ns->ns;
1471 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1472 
1473 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1474 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
1475 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
1476 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1477 
1478 	spdk_json_write_named_object_begin(w, "nvme");
1479 
1480 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1481 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
1482 	}
1483 
1484 	spdk_json_write_named_object_begin(w, "trid");
1485 
1486 	nvme_bdev_dump_trid_json(trid, w);
1487 
1488 	spdk_json_write_object_end(w);
1489 
1490 #ifdef SPDK_CONFIG_NVME_CUSE
1491 	size_t cuse_name_size = 128;
1492 	char cuse_name[cuse_name_size];
1493 
1494 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
1495 					    cuse_name, &cuse_name_size);
1496 	if (rc == 0) {
1497 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
1498 	}
1499 #endif
1500 
1501 	spdk_json_write_named_object_begin(w, "ctrlr_data");
1502 
1503 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
1504 
1505 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
1506 	spdk_str_trim(buf);
1507 	spdk_json_write_named_string(w, "model_number", buf);
1508 
1509 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
1510 	spdk_str_trim(buf);
1511 	spdk_json_write_named_string(w, "serial_number", buf);
1512 
1513 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
1514 	spdk_str_trim(buf);
1515 	spdk_json_write_named_string(w, "firmware_revision", buf);
1516 
1517 	if (cdata->subnqn[0] != '\0') {
1518 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
1519 	}
1520 
1521 	spdk_json_write_named_object_begin(w, "oacs");
1522 
1523 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
1524 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
1525 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
1526 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
1527 
1528 	spdk_json_write_object_end(w);
1529 
1530 	spdk_json_write_object_end(w);
1531 
1532 	spdk_json_write_named_object_begin(w, "vs");
1533 
1534 	spdk_json_write_name(w, "nvme_version");
1535 	if (vs.bits.ter) {
1536 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
1537 	} else {
1538 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
1539 	}
1540 
1541 	spdk_json_write_object_end(w);
1542 
1543 	spdk_json_write_named_object_begin(w, "csts");
1544 
1545 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
1546 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
1547 
1548 	spdk_json_write_object_end(w);
1549 
1550 	spdk_json_write_named_object_begin(w, "ns_data");
1551 
1552 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
1553 
1554 	if (cdata->cmic.ana_reporting) {
1555 		spdk_json_write_named_string(w, "ana_state",
1556 					     _nvme_ana_state_str(nvme_ns->ana_state));
1557 	}
1558 
1559 	spdk_json_write_object_end(w);
1560 
1561 	if (cdata->oacs.security) {
1562 		spdk_json_write_named_object_begin(w, "security");
1563 
1564 		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);
1565 
1566 		spdk_json_write_object_end(w);
1567 	}
1568 
1569 	spdk_json_write_object_end(w);
1570 
1571 	return 0;
1572 }
1573 
1574 static void
1575 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1576 {
1577 	/* No config per bdev needed */
1578 }
1579 
1580 static uint64_t
1581 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
1582 {
1583 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
1584 	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
1585 	struct nvme_poll_group *group = ctrlr_ch->group;
1586 	uint64_t spin_time;
1587 
1588 	if (!group || !group->collect_spin_stat) {
1589 		return 0;
1590 	}
1591 
1592 	if (group->end_ticks != 0) {
1593 		group->spin_ticks += (group->end_ticks - group->start_ticks);
1594 		group->end_ticks = 0;
1595 	}
1596 
1597 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1598 	group->start_ticks = 0;
1599 	group->spin_ticks = 0;
1600 
1601 	return spin_time;
1602 }
1603 
1604 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1605 	.destruct		= bdev_nvme_destruct,
1606 	.submit_request		= bdev_nvme_submit_request,
1607 	.io_type_supported	= bdev_nvme_io_type_supported,
1608 	.get_io_channel		= bdev_nvme_get_io_channel,
1609 	.dump_info_json		= bdev_nvme_dump_info_json,
1610 	.write_config_json	= bdev_nvme_write_config_json,
1611 	.get_spin_time		= bdev_nvme_get_spin_time,
1612 	.get_module_ctx		= bdev_nvme_get_module_ctx,
1613 	.get_memory_domains	= bdev_nvme_get_memory_domains,
1614 };
1615 
1616 typedef int (*bdev_nvme_parse_ana_log_page_cb)(
1617 	const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);
1618 
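/*
 * The ANA log page is a struct spdk_nvme_ana_page header followed by
 * num_ana_group_desc variable-length descriptors, each of which is
 * sizeof(struct spdk_nvme_ana_group_descriptor) plus num_of_nsid uint32_t
 * NSIDs. Descriptors in the raw log page are not guaranteed to be suitably
 * aligned, so each one is memcpy'd into the pre-allocated copied_ana_desc
 * staging buffer before the callback dereferences it.
 */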
1619 static int
1620 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
1621 			     bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
1622 {
1623 	struct spdk_nvme_ana_group_descriptor *copied_desc;
1624 	uint8_t *orig_desc;
1625 	uint32_t i, desc_size, copy_len;
1626 	int rc = 0;
1627 
1628 	if (nvme_ctrlr->ana_log_page == NULL) {
1629 		return -EINVAL;
1630 	}
1631 
1632 	copied_desc = nvme_ctrlr->copied_ana_desc;
1633 
1634 	orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
1635 	copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page);
1636 
1637 	for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
1638 		memcpy(copied_desc, orig_desc, copy_len);
1639 
1640 		rc = cb_fn(copied_desc, cb_arg);
1641 		if (rc != 0) {
1642 			break;
1643 		}
1644 
1645 		desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
1646 			    copied_desc->num_of_nsid * sizeof(uint32_t);
1647 		orig_desc += desc_size;
1648 		copy_len -= desc_size;
1649 	}
1650 
1651 	return rc;
1652 }
1653 
1654 static int
1655 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
1656 {
1657 	struct nvme_ns *nvme_ns = cb_arg;
1658 	uint32_t i;
1659 
1660 	for (i = 0; i < desc->num_of_nsid; i++) {
1661 		if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
1662 			continue;
1663 		}
1664 		nvme_ns->ana_group_id = desc->ana_group_id;
1665 		nvme_ns->ana_state = desc->ana_state;
1666 		return 1;
1667 	}
1668 
1669 	return 0;
1670 }
1671 
1672 static int
1673 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
1674 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
1675 		 uint32_t prchk_flags, void *ctx)
1676 {
1677 	const struct spdk_uuid		*uuid;
1678 	const uint8_t *nguid;
1679 	const struct spdk_nvme_ctrlr_data *cdata;
1680 	const struct spdk_nvme_ns_data	*nsdata;
1681 	enum spdk_nvme_csi		csi;
1682 	uint32_t atomic_bs, phys_bs, bs;
1683 
1684 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1685 	csi = spdk_nvme_ns_get_csi(ns);
1686 
1687 	switch (csi) {
1688 	case SPDK_NVME_CSI_NVM:
1689 		disk->product_name = "NVMe disk";
1690 		break;
1691 	case SPDK_NVME_CSI_ZNS:
1692 		disk->product_name = "NVMe ZNS disk";
1693 		disk->zoned = true;
1694 		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
1695 		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
1696 					     spdk_nvme_ns_get_extended_sector_size(ns);
1697 		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
1698 		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
1699 		break;
1700 	default:
1701 		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
1702 		return -ENOTSUP;
1703 	}
1704 
1705 	disk->name = spdk_sprintf_alloc("%sn%u", base_name, spdk_nvme_ns_get_id(ns));
1706 	if (!disk->name) {
1707 		return -ENOMEM;
1708 	}
1709 
1710 	disk->write_cache = 0;
1711 	if (cdata->vwc.present) {
1712 		/* Enable if the Volatile Write Cache exists */
1713 		disk->write_cache = 1;
1714 	}
1715 	if (cdata->oncs.write_zeroes) {
1716 		disk->max_write_zeroes = UINT16_MAX + 1;
1717 	}
1718 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1719 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1720 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1721 
1722 	nguid = spdk_nvme_ns_get_nguid(ns);
1723 	if (!nguid) {
1724 		uuid = spdk_nvme_ns_get_uuid(ns);
1725 		if (uuid) {
1726 			disk->uuid = *uuid;
1727 		}
1728 	} else {
1729 		memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
1730 	}
1731 
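	/*
	 * Derive the atomic and preferred physical block sizes from the
	 * namespace data. The NVMe NAWUPF, AWUPF, and NPWG fields are 0-based
	 * (0 means one logical block), hence the "1 +" in the math below.
	 */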
1732 	nsdata = spdk_nvme_ns_get_data(ns);
1733 	bs = spdk_nvme_ns_get_sector_size(ns);
1734 	atomic_bs = bs;
1735 	phys_bs = bs;
1736 	if (nsdata->nabo == 0) {
1737 		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
1738 			atomic_bs = bs * (1 + nsdata->nawupf);
1739 		} else {
1740 			atomic_bs = bs * (1 + cdata->awupf);
1741 		}
1742 	}
1743 	if (nsdata->nsfeat.optperf) {
1744 		phys_bs = bs * (1 + nsdata->npwg);
1745 	}
1746 	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
1747 
1748 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
1749 	if (disk->md_len != 0) {
1750 		disk->md_interleave = nsdata->flbas.extended;
1751 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1752 		if (disk->dif_type != SPDK_DIF_DISABLE) {
1753 			disk->dif_is_head_of_md = nsdata->dps.md_start;
1754 			disk->dif_check_flags = prchk_flags;
1755 		}
1756 	}
1757 
1758 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
1759 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
1760 		disk->acwu = 0;
1761 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1762 		disk->acwu = nsdata->nacwu;
1763 	} else {
1764 		disk->acwu = cdata->acwu;
1765 	}
1766 
1767 	disk->ctxt = ctx;
1768 	disk->fn_table = &nvmelib_fn_table;
1769 	disk->module = &nvme_if;
1770 
1771 	return 0;
1772 }
1773 
1774 static int
1775 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
1776 {
1777 	struct nvme_bdev *bdev;
1778 	int rc;
1779 
1780 	bdev = calloc(1, sizeof(*bdev));
1781 	if (!bdev) {
1782 		SPDK_ERRLOG("bdev calloc() failed\n");
1783 		return -ENOMEM;
1784 	}
1785 
1786 	bdev->nvme_ns = nvme_ns;
1787 	bdev->opal = nvme_ctrlr->opal_dev != NULL;
1788 
1789 	rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->name, nvme_ctrlr->ctrlr,
1790 			      nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev);
1791 	if (rc != 0) {
1792 		SPDK_ERRLOG("Failed to create NVMe disk\n");
1793 		free(bdev);
1794 		return rc;
1795 	}
1796 
1797 	spdk_io_device_register(bdev,
1798 				bdev_nvme_create_bdev_channel_cb,
1799 				bdev_nvme_destroy_bdev_channel_cb,
1800 				sizeof(struct nvme_bdev_channel),
1801 				bdev->disk.name);
1802 
1803 	rc = spdk_bdev_register(&bdev->disk);
1804 	if (rc != 0) {
1805 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
1806 		spdk_io_device_unregister(bdev, NULL);
1807 		free(bdev->disk.name);
1808 		free(bdev);
1809 		return rc;
1810 	}
1811 
1812 	nvme_ns->bdev = bdev;
1813 
1814 	return 0;
1815 }
1816 
1817 static bool
1818 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
1819 {
1820 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
1821 	const struct spdk_uuid *uuid1, *uuid2;
1822 
1823 	nsdata1 = spdk_nvme_ns_get_data(ns1);
1824 	nsdata2 = spdk_nvme_ns_get_data(ns2);
1825 	uuid1 = spdk_nvme_ns_get_uuid(ns1);
1826 	uuid2 = spdk_nvme_ns_get_uuid(ns2);
1827 
1828 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
1829 	       nsdata1->eui64 == nsdata2->eui64 &&
1830 	       uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0;
1831 }
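/* Note: all three identifiers, NGUID, EUI64, and UUID, must match for the
 * namespaces to be treated as the same, and both UUID pointers must be
 * non-NULL.  Two otherwise-identical namespaces that expose no UUID never
 * compare as equal here.
 */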
1832 
1833 static bool
1834 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1835 		 struct spdk_nvme_ctrlr_opts *opts)
1836 {
1837 	struct nvme_probe_skip_entry *entry;
1838 
1839 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1840 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1841 			return false;
1842 		}
1843 	}
1844 
1845 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1846 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1847 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1848 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1849 	opts->disable_read_ana_log_page = true;
1850 
1851 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1852 
1853 	return true;
1854 }
1855 
1856 static void
1857 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1858 {
1859 	struct nvme_ctrlr *nvme_ctrlr = ctx;
1860 
1861 	if (spdk_nvme_cpl_is_error(cpl)) {
1862 		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
1863 			     cpl->status.sct);
1864 		bdev_nvme_reset(nvme_ctrlr);
1865 	} else if (cpl->cdw0 & 0x1) {
1866 		SPDK_WARNLOG("Specified command could not be aborted.\n");
1867 		bdev_nvme_reset(nvme_ctrlr);
1868 	}
1869 }
1870 
1871 static void
1872 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1873 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1874 {
1875 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
1876 	union spdk_nvme_csts_register csts;
1877 	int rc;
1878 
1879 	assert(nvme_ctrlr->ctrlr == ctrlr);
1880 
1881 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1882 
1883 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
1884 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
1885 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
1886 	 * completion recursively.
1887 	 */
1888 	if (nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
1889 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1890 		if (csts.bits.cfs) {
1891 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1892 			bdev_nvme_reset(nvme_ctrlr);
1893 			return;
1894 		}
1895 	}
1896 
1897 	switch (g_opts.action_on_timeout) {
1898 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1899 		if (qpair) {
1900 			/* Don't send abort to ctrlr when reset is running. */
1901 			pthread_mutex_lock(&nvme_ctrlr->mutex);
1902 			if (nvme_ctrlr->resetting) {
1903 				pthread_mutex_unlock(&nvme_ctrlr->mutex);
1904 				SPDK_NOTICELOG("Quit abort. Ctrlr is in the process of reseting.\n");
1905 				return;
1906 			}
1907 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
1908 
1909 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1910 						       nvme_abort_cpl, nvme_ctrlr);
1911 			if (rc == 0) {
1912 				return;
1913 			}
1914 
1915 			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
1916 		}
1917 
1918 	/* FALLTHROUGH */
1919 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1920 		bdev_nvme_reset(nvme_ctrlr);
1921 		break;
1922 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1923 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1924 		break;
1925 	default:
1926 		SPDK_ERRLOG("An invalid timeout action value was found.\n");
1927 		break;
1928 	}
1929 }
1930 
1931 static void
1932 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc)
1933 {
1934 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
1935 	struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx;
1936 
1937 	if (rc == 0) {
1938 		nvme_ns->probe_ctx = NULL;
1939 		pthread_mutex_lock(&nvme_ctrlr->mutex);
1940 		nvme_ctrlr->ref++;
1941 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1942 	} else {
1943 		nvme_ctrlr->namespaces[nvme_ns->id - 1] = NULL;
1944 		free(nvme_ns);
1945 	}
1946 
1947 	if (ctx) {
1948 		ctx->populates_in_progress--;
1949 		if (ctx->populates_in_progress == 0) {
1950 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
1951 		}
1952 	}
1953 }
1954 
1955 static void
1956 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
1957 {
1958 	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
1959 	struct spdk_nvme_ns	*ns;
1960 	int			rc = 0;
1961 
1962 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1963 	if (!ns) {
1964 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %u\n", nvme_ns->id);
1965 		rc = -EINVAL;
1966 		goto done;
1967 	}
1968 
1969 	nvme_ns->ns = ns;
1970 	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
1971 
1972 	if (nvme_ctrlr->ana_log_page != NULL) {
1973 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
1974 	}
1975 
1976 	rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
1977 
1978 done:
1979 	nvme_ctrlr_populate_namespace_done(nvme_ns, rc);
1980 }
1981 
1982 static void
1983 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns)
1984 {
1985 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
1986 
1987 	assert(nvme_ctrlr != NULL);
1988 
1989 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1990 
1991 	nvme_ctrlr->namespaces[nvme_ns->id - 1] = NULL;
1992 
1993 	if (nvme_ns->bdev != NULL) {
1994 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1995 		return;
1996 	}
1997 
1998 	free(nvme_ns);
1999 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2000 
2001 	nvme_ctrlr_release(nvme_ctrlr);
2002 }
2003 
2004 static void
2005 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
2006 {
2007 	struct nvme_bdev *bdev;
2008 
2009 	bdev = nvme_ns->bdev;
2010 	if (bdev != NULL) {
2011 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
2012 	}
2013 
2014 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
2015 }
2016 
2017 static void
2018 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
2019 			       struct nvme_async_probe_ctx *ctx)
2020 {
2021 	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
2022 	struct nvme_ns	*nvme_ns, *next;
2023 	struct spdk_nvme_ns	*ns;
2024 	struct nvme_bdev	*bdev;
2025 	uint32_t		nsid;
2026 	int			rc;
2027 	uint64_t		num_sectors;
2028 
2029 	if (ctx) {
2030 		/* Initialize this count to 1 to handle the populate functions
2031 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
2032 		 */
2033 		ctx->populates_in_progress = 1;
2034 	}
2035 
2036 	/* First loop over our existing namespaces and see if they have been
2037 	 * removed. */
2038 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
2039 	while (nvme_ns != NULL) {
2040 		next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
2041 
2042 		if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
2043 			/* NS is still there but attributes may have changed */
2044 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
2045 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
2046 			bdev = nvme_ns->bdev;
2047 			assert(bdev != NULL);
2048 			if (bdev->disk.blockcnt != num_sectors) {
2049 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
2050 					       nvme_ns->id,
2051 					       bdev->disk.name,
2052 					       bdev->disk.blockcnt,
2053 					       num_sectors);
2054 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
2055 				if (rc != 0) {
2056 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
2057 						    bdev->disk.name, rc);
2058 				}
2059 			}
2060 		} else {
2061 			/* Namespace was removed */
2062 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
2063 		}
2064 
2065 		nvme_ns = next;
2066 	}
2067 
2068 	/* Loop through all of the namespaces at the nvme level and see if any of them are new */
2069 	nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
2070 	while (nsid != 0) {
2071 		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
2072 
2073 		if (nvme_ns == NULL) {
2074 			/* Found a new one */
2075 			nvme_ns = calloc(1, sizeof(struct nvme_ns));
2076 			if (nvme_ns == NULL) {
2077 				SPDK_ERRLOG("Failed to allocate namespace\n");
2078 				/* This just fails to attach the namespace; it may work on a future attempt.
				 * Advance nsid first so the allocation failure cannot loop forever here. */
2079 				nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
				continue;
2080 			}
2081 
2082 			nvme_ctrlr->namespaces[nsid - 1] = nvme_ns;
2083 
2084 			nvme_ns->id = nsid;
2085 			nvme_ns->ctrlr = nvme_ctrlr;
2086 
2087 			nvme_ns->bdev = NULL;
2088 
2089 			if (ctx) {
2090 				ctx->populates_in_progress++;
2091 			}
2092 			nvme_ns->probe_ctx = ctx;
2093 
2094 			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns);
2095 		}
2096 
2097 		nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
2098 	}
2099 
2100 	if (ctx) {
2101 		/* Decrement this count now that the loop is over to account
2102 		 * for the one we started with.  If the count is then 0, we
2103 		 * know any populate_namespace functions completed immediately,
2104 		 * so we'll kick the callback here.
2105 		 */
2106 		ctx->populates_in_progress--;
2107 		if (ctx->populates_in_progress == 0) {
2108 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
2109 		}
2110 	}
2111 
2112 }
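/* The populates_in_progress accounting above is a common asynchronous fan-out
 * pattern: prime the counter with 1, increment it per outstanding operation,
 * and run the final callback only when the last decrement reaches 0.  A
 * minimal sketch of the same idea, using hypothetical names:
 *
 *	ctx->pending = 1;
 *	for (each item) {
 *		ctx->pending++;
 *		start_async(item, on_item_done, ctx);	// may complete inline
 *	}
 *	if (--ctx->pending == 0) {
 *		all_done(ctx);				// nothing left pending
 *	}
 *
 * on_item_done() performs the same decrement-and-test, so whichever side
 * reaches zero last invokes all_done() exactly once.
 */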
2113 
2114 static void
2115 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
2116 {
2117 	uint32_t i;
2118 	struct nvme_ns *nvme_ns;
2119 
2120 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
2121 		uint32_t nsid = i + 1;
2122 
2123 		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
2124 		if (nvme_ns != NULL) {
2125 			assert(nvme_ns->id == nsid);
2126 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
2127 		}
2128 	}
2129 }
2130 
2131 static bool
2132 nvme_ctrlr_acquire(struct nvme_ctrlr *nvme_ctrlr)
2133 {
2134 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2135 	if (nvme_ctrlr->destruct || nvme_ctrlr->resetting) {
2136 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2137 		return false;
2138 	}
2139 	nvme_ctrlr->ref++;
2140 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2141 	return true;
2142 }
2143 
2144 static int
2145 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
2146 			  void *cb_arg)
2147 {
2148 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
2149 	struct nvme_ns *nvme_ns;
2150 	uint32_t i, nsid;
2151 
2152 	for (i = 0; i < desc->num_of_nsid; i++) {
2153 		nsid = desc->nsid[i];
2154 		if (nsid == 0 || nsid > nvme_ctrlr->num_ns) {
2155 			continue;
2156 		}
2157 
2158 		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
2159 
2160 		assert(nvme_ns != NULL);
2161 		if (nvme_ns == NULL) {
2162 			/* Target told us that an inactive namespace had an ANA change */
2163 			continue;
2164 		}
2165 
2166 		nvme_ns->ana_group_id = desc->ana_group_id;
2167 		nvme_ns->ana_state = desc->ana_state;
2168 	}
2169 
2170 	return 0;
2171 }
2172 
2173 static void
2174 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
2175 {
2176 	struct nvme_ctrlr *nvme_ctrlr = ctx;
2177 
2178 	if (spdk_nvme_cpl_is_success(cpl)) {
2179 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
2180 					     nvme_ctrlr);
2181 	}
2182 
2183 	nvme_ctrlr_release(nvme_ctrlr);
2184 }
2185 
2186 static void
2187 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
2188 {
2189 	int rc;
2190 
2191 	if (nvme_ctrlr->ana_log_page == NULL) {
2192 		return;
2193 	}
2194 
2195 	if (!nvme_ctrlr_acquire(nvme_ctrlr)) {
2196 		return;
2197 	}
2198 
2199 	rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
2200 					      SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
2201 					      SPDK_NVME_GLOBAL_NS_TAG,
2202 					      nvme_ctrlr->ana_log_page,
2203 					      nvme_ctrlr->ana_log_page_size, 0,
2204 					      nvme_ctrlr_read_ana_log_page_done,
2205 					      nvme_ctrlr);
2206 	if (rc != 0) {
2207 		nvme_ctrlr_release(nvme_ctrlr);
2208 	}
2209 }
2210 
2211 static void
2212 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
2213 {
2214 	struct nvme_ctrlr *nvme_ctrlr		= arg;
2215 	union spdk_nvme_async_event_completion	event;
2216 
2217 	if (spdk_nvme_cpl_is_error(cpl)) {
2218 		SPDK_WARNLOG("AER request execution failed\n");
2219 		return;
2220 	}
2221 
2222 	event.raw = cpl->cdw0;
2223 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
2224 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
2225 		nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
2226 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
2227 		   (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
2228 		nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
2229 	}
2230 }
2231 
2232 static void
2233 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
2234 {
2235 	if (ctx->cb_fn) {
2236 		ctx->cb_fn(ctx->cb_ctx, count, rc);
2237 	}
2238 
2239 	ctx->namespaces_populated = true;
2240 	if (ctx->probe_done) {
2241 		/* The probe was already completed, so we need to free the context
2242 		 * here.  This can happen for cases like OCSSD, where we need to
2243 		 * send additional commands to the SSD after attach.
2244 		 */
2245 		free(ctx);
2246 	}
2247 }
2248 
2249 static void
2250 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
2251 		       struct nvme_async_probe_ctx *ctx)
2252 {
2253 	spdk_io_device_register(nvme_ctrlr,
2254 				bdev_nvme_create_ctrlr_channel_cb,
2255 				bdev_nvme_destroy_ctrlr_channel_cb,
2256 				sizeof(struct nvme_ctrlr_channel),
2257 				nvme_ctrlr->name);
2258 
2259 	nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
2260 }
2261 
2262 static void
2263 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
2264 {
2265 	struct nvme_ctrlr *nvme_ctrlr = _ctx;
2266 	struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
2267 
2268 	nvme_ctrlr->probe_ctx = NULL;
2269 
2270 	if (spdk_nvme_cpl_is_error(cpl)) {
2271 		nvme_ctrlr_delete(nvme_ctrlr);
2272 
2273 		if (ctx != NULL) {
2274 			populate_namespaces_cb(ctx, 0, -1);
2275 		}
2276 		return;
2277 	}
2278 
2279 	nvme_ctrlr_create_done(nvme_ctrlr, ctx);
2280 }
2281 
2282 static int
2283 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
2284 			     struct nvme_async_probe_ctx *ctx)
2285 {
2286 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
2287 	const struct spdk_nvme_ctrlr_data *cdata;
2288 	uint32_t ana_log_page_size;
2289 
2290 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2291 
2292 	ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
2293 			    sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn *
2294 			    sizeof(uint32_t);
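	/* Illustrative sizing: with nanagrpid == 32 and nn == 256, this reserves
	 * one struct spdk_nvme_ana_page header, 32 group descriptors, and 256
	 * uint32_t NSID entries, which is sufficient no matter how the target
	 * distributes namespaces across ANA groups.
	 */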
2295 
2296 	nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
2297 						SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
2298 	if (nvme_ctrlr->ana_log_page == NULL) {
2299 		SPDK_ERRLOG("could not allocate ANA log page buffer\n");
2300 		return -ENXIO;
2301 	}
2302 
2303 	/* Descriptors in an ANA log page are not guaranteed to be 8-byte aligned.
2304 	 * Hence copy each descriptor to a temporary area when parsing it.
2305 	 *
2306 	 * Allocate a buffer as large as the whole ANA log page because we do not
2307 	 * know the size of a descriptor until actually reading it.
2308 	 */
2309 	nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
2310 	if (nvme_ctrlr->copied_ana_desc == NULL) {
2311 		SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
2312 		return -ENOMEM;
2313 	}
2314 
2315 	nvme_ctrlr->ana_log_page_size = ana_log_page_size;
2316 
2317 	nvme_ctrlr->probe_ctx = ctx;
2318 
2319 	return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
2320 						SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
2321 						SPDK_NVME_GLOBAL_NS_TAG,
2322 						nvme_ctrlr->ana_log_page,
2323 						nvme_ctrlr->ana_log_page_size, 0,
2324 						nvme_ctrlr_init_ana_log_page_done,
2325 						nvme_ctrlr);
2326 }
2327 
2328 static int
2329 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
2330 		  const char *name,
2331 		  const struct spdk_nvme_transport_id *trid,
2332 		  uint32_t prchk_flags,
2333 		  struct nvme_async_probe_ctx *ctx)
2334 {
2335 	struct nvme_ctrlr *nvme_ctrlr;
2336 	struct nvme_ctrlr_trid *trid_entry;
2337 	uint32_t num_ns;
2338 	const struct spdk_nvme_ctrlr_data *cdata;
2339 	int rc;
2340 
2341 	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
2342 	if (nvme_ctrlr == NULL) {
2343 		SPDK_ERRLOG("Failed to allocate device struct\n");
2344 		return -ENOMEM;
2345 	}
2346 
2347 	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
2348 	if (rc != 0) {
2349 		free(nvme_ctrlr);
2350 		return rc;
2351 	}
2352 
2353 	TAILQ_INIT(&nvme_ctrlr->trids);
2354 
2355 	num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
2356 	if (num_ns != 0) {
2357 		nvme_ctrlr->namespaces = calloc(num_ns, sizeof(struct nvme_ns *));
2358 		if (!nvme_ctrlr->namespaces) {
2359 			SPDK_ERRLOG("Failed to allocate namespaces array\n");
2360 			rc = -ENOMEM;
2361 			goto err;
2362 		}
2363 
2364 		nvme_ctrlr->num_ns = num_ns;
2365 	}
2366 
2367 	trid_entry = calloc(1, sizeof(*trid_entry));
2368 	if (trid_entry == NULL) {
2369 		SPDK_ERRLOG("Failed to allocate trid entry\n");
2370 		rc = -ENOMEM;
2371 		goto err;
2372 	}
2373 
2374 	trid_entry->trid = *trid;
2375 	nvme_ctrlr->connected_trid = &trid_entry->trid;
2376 	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, trid_entry, link);
2377 
2378 	nvme_ctrlr->thread = spdk_get_thread();
2379 	nvme_ctrlr->ctrlr = ctrlr;
2380 	nvme_ctrlr->ref = 1;
2381 	nvme_ctrlr->name = strdup(name);
2382 	if (nvme_ctrlr->name == NULL) {
2383 		rc = -ENOMEM;
2384 		goto err;
2385 	}
2386 
2387 	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
2388 		SPDK_ERRLOG("OCSSDs are not supported\n");
2389 		rc = -ENOTSUP;
2390 		goto err;
2391 	}
2392 
2393 	nvme_ctrlr->prchk_flags = prchk_flags;
2394 
2395 	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
2396 					  g_opts.nvme_adminq_poll_period_us);
2397 
2398 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2399 	TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, nvme_ctrlr, tailq);
2400 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2401 
2402 	if (g_opts.timeout_us > 0) {
2403 		/* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
2404 		/* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
2405 		uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
2406 					  g_opts.timeout_us : g_opts.timeout_admin_us;
2407 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
2408 				adm_timeout_us, timeout_cb, nvme_ctrlr);
2409 	}
2410 
2411 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
2412 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
2413 
2414 	if (spdk_nvme_ctrlr_get_flags(ctrlr) &
2415 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
2416 		nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
2417 	}
2418 
2419 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2420 
2421 	if (cdata->cmic.ana_reporting) {
2422 		rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
2423 		if (rc == 0) {
2424 			return 0;
2425 		}
2426 	} else {
2427 		nvme_ctrlr_create_done(nvme_ctrlr, ctx);
2428 		return 0;
2429 	}
2430 
2431 err:
2432 	nvme_ctrlr_delete(nvme_ctrlr);
2433 	return rc;
2434 }
2435 
2436 static void
2437 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2438 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2439 {
2440 	struct nvme_probe_ctx *ctx = cb_ctx;
2441 	char *name = NULL;
2442 	uint32_t prchk_flags = 0;
2443 	size_t i;
2444 
2445 	if (ctx) {
2446 		for (i = 0; i < ctx->count; i++) {
2447 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
2448 				prchk_flags = ctx->prchk_flags[i];
2449 				name = strdup(ctx->names[i]);
2450 				break;
2451 			}
2452 		}
2453 	} else {
2454 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
2455 	}
2456 	if (!name) {
2457 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
2458 		return;
2459 	}
2460 
2461 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
2462 
2463 	nvme_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL);
2464 
2465 	free(name);
2466 }
2467 
2468 static void
2469 _nvme_ctrlr_destruct(void *ctx)
2470 {
2471 	struct nvme_ctrlr *nvme_ctrlr = ctx;
2472 
2473 	nvme_ctrlr_depopulate_namespaces(nvme_ctrlr);
2474 	nvme_ctrlr_release(nvme_ctrlr);
2475 }
2476 
2477 static int
2478 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
2479 {
2480 	struct nvme_probe_skip_entry *entry;
2481 
2482 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2483 
2484 	/* The controller's destruction was already started */
2485 	if (nvme_ctrlr->destruct) {
2486 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2487 		return 0;
2488 	}
2489 
2490 	if (!hotplug &&
2491 	    nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2492 		entry = calloc(1, sizeof(*entry));
2493 		if (!entry) {
2494 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
2495 			return -ENOMEM;
2496 		}
2497 		entry->trid = *nvme_ctrlr->connected_trid;
2498 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
2499 	}
2500 
2501 	nvme_ctrlr->destruct = true;
2502 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2503 
2504 	_nvme_ctrlr_destruct(nvme_ctrlr);
2505 
2506 	return 0;
2507 }
2508 
2509 static void
2510 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
2511 {
2512 	struct nvme_ctrlr *nvme_ctrlr = cb_ctx;
2513 
2514 	_bdev_nvme_delete(nvme_ctrlr, true);
2515 }
2516 
2517 static int
2518 bdev_nvme_hotplug_probe(void *arg)
2519 {
2520 	if (g_hotplug_probe_ctx == NULL) {
2521 		spdk_poller_unregister(&g_hotplug_probe_poller);
2522 		return SPDK_POLLER_IDLE;
2523 	}
2524 
2525 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
2526 		g_hotplug_probe_ctx = NULL;
2527 		spdk_poller_unregister(&g_hotplug_probe_poller);
2528 	}
2529 
2530 	return SPDK_POLLER_BUSY;
2531 }
2532 
2533 static int
2534 bdev_nvme_hotplug(void *arg)
2535 {
2536 	struct spdk_nvme_transport_id trid_pcie;
2537 
2538 	if (g_hotplug_probe_ctx) {
2539 		return SPDK_POLLER_BUSY;
2540 	}
2541 
2542 	memset(&trid_pcie, 0, sizeof(trid_pcie));
2543 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
2544 
2545 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
2546 			      hotplug_probe_cb, attach_cb, NULL);
2547 
2548 	if (g_hotplug_probe_ctx) {
2549 		assert(g_hotplug_probe_poller == NULL);
2550 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
2551 	}
2552 
2553 	return SPDK_POLLER_BUSY;
2554 }
2555 
2556 void
2557 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
2558 {
2559 	*opts = g_opts;
2560 }
2561 
2562 static int
2563 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
2564 {
2565 	if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
2566 		/* Can't set timeout_admin_us without also setting timeout_us */
2567 		SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
2568 		return -EINVAL;
2569 	}
2570 
2571 	return 0;
2572 }
2573 
2574 int
2575 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
2576 {
2577 	int ret = bdev_nvme_validate_opts(opts);
2578 	if (ret) {
2579 		SPDK_WARNLOG("Failed to set nvme opts.\n");
2580 		return ret;
2581 	}
2582 
2583 	if (g_bdev_nvme_init_thread != NULL) {
2584 		if (!TAILQ_EMPTY(&g_nvme_ctrlrs)) {
2585 			return -EPERM;
2586 		}
2587 	}
2588 
2589 	g_opts = *opts;
2590 
2591 	return 0;
2592 }
2593 
2594 struct set_nvme_hotplug_ctx {
2595 	uint64_t period_us;
2596 	bool enabled;
2597 	spdk_msg_fn fn;
2598 	void *fn_ctx;
2599 };
2600 
2601 static void
2602 set_nvme_hotplug_period_cb(void *_ctx)
2603 {
2604 	struct set_nvme_hotplug_ctx *ctx = _ctx;
2605 
2606 	spdk_poller_unregister(&g_hotplug_poller);
2607 	if (ctx->enabled) {
2608 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
2609 	}
2610 
2611 	g_nvme_hotplug_poll_period_us = ctx->period_us;
2612 	g_nvme_hotplug_enabled = ctx->enabled;
2613 	if (ctx->fn) {
2614 		ctx->fn(ctx->fn_ctx);
2615 	}
2616 
2617 	free(ctx);
2618 }
2619 
2620 int
2621 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
2622 {
2623 	struct set_nvme_hotplug_ctx *ctx;
2624 
2625 	if (enabled && !spdk_process_is_primary()) {
2626 		return -EPERM;
2627 	}
2628 
2629 	ctx = calloc(1, sizeof(*ctx));
2630 	if (ctx == NULL) {
2631 		return -ENOMEM;
2632 	}
2633 
2634 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
2635 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
2636 	ctx->enabled = enabled;
2637 	ctx->fn = cb;
2638 	ctx->fn_ctx = cb_ctx;
2639 
2640 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
2641 	return 0;
2642 }
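/* Example usage (illustrative; done_cb is a hypothetical callback): enable
 * hotplug with a 100 ms poll period.
 *
 *	rc = bdev_nvme_set_hotplug(true, 100 * 1000, done_cb, NULL);
 *	if (rc != 0) {
 *		SPDK_ERRLOG("Failed to enable hotplug: %d\n", rc);
 *	}
 */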
2643 
2644 static void
2645 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
2646 				    struct nvme_async_probe_ctx *ctx)
2647 {
2648 	struct nvme_ns	*nvme_ns;
2649 	struct nvme_bdev	*nvme_bdev;
2650 	size_t			j;
2651 
2652 	assert(nvme_ctrlr != NULL);
2653 
2654 	/*
2655 	 * Report the new bdevs that were created in this call.
2656 	 * There can be more than one bdev per NVMe controller.
2657 	 */
2658 	j = 0;
2659 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
2660 	while (nvme_ns != NULL) {
2661 		nvme_bdev = nvme_ns->bdev;
2662 		if (j < ctx->count) {
2663 			ctx->names[j] = nvme_bdev->disk.name;
2664 			j++;
2665 		} else {
2666 			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
2667 				    ctx->count);
2668 			populate_namespaces_cb(ctx, 0, -ERANGE);
2669 			return;
2670 		}
2671 
2672 		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
2673 	}
2674 
2675 	populate_namespaces_cb(ctx, j, 0);
2676 }
2677 
2678 static int
2679 bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr,
2680 			struct spdk_nvme_ctrlr *new_ctrlr,
2681 			struct spdk_nvme_transport_id *trid)
2682 {
2683 	struct nvme_ctrlr_trid *tmp_trid;
2684 
2685 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2686 		SPDK_ERRLOG("PCIe failover is not supported.\n");
2687 		return -ENOTSUP;
2688 	}
2689 
2690 	/* Currently we only support failover to the same transport type. */
2691 	if (nvme_ctrlr->connected_trid->trtype != trid->trtype) {
2692 		return -EINVAL;
2693 	}
2694 
2695 	/* Currently we only support failover to the same NQN. */
2696 	if (strncmp(trid->subnqn, nvme_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
2697 		return -EINVAL;
2698 	}
2699 
2700 	/* Skip all the other checks if we've already registered this path. */
2701 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
2702 		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
2703 			return -EEXIST;
2704 		}
2705 	}
2706 
2707 	return 0;
2708 }
2709 
2710 static int
2711 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr,
2712 			     struct spdk_nvme_ctrlr *new_ctrlr)
2713 {
2714 	struct nvme_ns *nvme_ns;
2715 	struct spdk_nvme_ns *new_ns;
2716 
2717 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_ctrlr->num_ns) {
2718 		return -EINVAL;
2719 	}
2720 
2721 	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
2722 	while (nvme_ns != NULL) {
2723 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
2724 		assert(new_ns != NULL);
2725 
2726 		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
2727 			return -EINVAL;
2728 		}
2729 
2730 		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
2731 	}
2732 
2733 	return 0;
2734 }
2735 
2736 static int
2737 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2738 			      struct spdk_nvme_transport_id *trid)
2739 {
2740 	struct nvme_ctrlr_trid *new_trid, *tmp_trid;
2741 
2742 	new_trid = calloc(1, sizeof(*new_trid));
2743 	if (new_trid == NULL) {
2744 		return -ENOMEM;
2745 	}
2746 	new_trid->trid = *trid;
2747 	new_trid->is_failed = false;
2748 
2749 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
2750 		if (tmp_trid->is_failed) {
2751 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
2752 			return 0;
2753 		}
2754 	}
2755 
2756 	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
2757 	return 0;
2758 }
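/* Ordering note: inserting the new trid just before the first entry marked
 * is_failed keeps the list partitioned, with paths believed healthy ahead of
 * paths that have already failed, so failover tries healthy candidates first.
 */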
2759 
2760 /* This is the case that a secondary path is added to an existing
2761  * nvme_ctrlr for failover. After checking if it can access the same
2762  * namespaces as the primary path, it is disconnected until failover occurs.
2763  */
2764 static int
2765 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2766 			     struct spdk_nvme_ctrlr *new_ctrlr,
2767 			     struct spdk_nvme_transport_id *trid)
2768 {
2769 	int rc;
2770 
2771 	assert(nvme_ctrlr != NULL);
2772 
2773 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2774 
2775 	rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid);
2776 	if (rc != 0) {
2777 		goto exit;
2778 	}
2779 
2780 	rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr);
2781 	if (rc != 0) {
2782 		goto exit;
2783 	}
2784 
2785 	rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
2786 
2787 exit:
2788 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2789 
2790 	spdk_nvme_detach(new_ctrlr);
2791 
2792 	return rc;
2793 }
2794 
2795 static void
2796 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2797 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2798 {
2799 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
2800 	struct nvme_async_probe_ctx *ctx;
2801 	int rc;
2802 
2803 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2804 	ctx->ctrlr_attached = true;
2805 
2806 	rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx);
2807 	if (rc != 0) {
2808 		populate_namespaces_cb(ctx, 0, rc);
2809 	}
2810 }
2811 
2812 static void
2813 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2814 			struct spdk_nvme_ctrlr *ctrlr,
2815 			const struct spdk_nvme_ctrlr_opts *opts)
2816 {
2817 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
2818 	struct nvme_ctrlr *nvme_ctrlr;
2819 	struct nvme_async_probe_ctx *ctx;
2820 	int rc;
2821 
2822 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2823 	ctx->ctrlr_attached = true;
2824 
2825 	nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
2826 	if (nvme_ctrlr) {
2827 		rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
2828 	} else {
2829 		rc = -ENODEV;
2830 	}
2831 
2832 	populate_namespaces_cb(ctx, 0, rc);
2833 }
2834 
2835 static int
2836 bdev_nvme_async_poll(void *arg)
2837 {
2838 	struct nvme_async_probe_ctx	*ctx = arg;
2839 	int				rc;
2840 
2841 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
2842 	if (spdk_unlikely(rc != -EAGAIN)) {
2843 		ctx->probe_done = true;
2844 		spdk_poller_unregister(&ctx->poller);
2845 		if (!ctx->ctrlr_attached) {
2846 			/* The probe is done, but no controller was attached.
2847 			 * That means we had a failure, so report -EIO back to
2848 			 * the caller (usually the RPC). populate_namespaces_cb()
2849 			 * will take care of freeing the nvme_async_probe_ctx.
2850 			 */
2851 			populate_namespaces_cb(ctx, 0, -EIO);
2852 		} else if (ctx->namespaces_populated) {
2853 			/* The namespaces for the attached controller were all
2854 			 * populated and the response was already sent to the
2855 			 * caller (usually the RPC).  So free the context here.
2856 			 */
2857 			free(ctx);
2858 		}
2859 	}
2860 
2861 	return SPDK_POLLER_BUSY;
2862 }
2863 
2864 int
2865 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
2866 		 const char *base_name,
2867 		 const char **names,
2868 		 uint32_t count,
2869 		 uint32_t prchk_flags,
2870 		 spdk_bdev_create_nvme_fn cb_fn,
2871 		 void *cb_ctx,
2872 		 struct spdk_nvme_ctrlr_opts *opts)
2873 {
2874 	struct nvme_probe_skip_entry	*entry, *tmp;
2875 	struct nvme_async_probe_ctx	*ctx;
2876 	spdk_nvme_attach_cb attach_cb;
2877 
2878 	/* TODO expand this check to include both the host and target TRIDs.
2879 	 * Only if both are the same should we fail.
2880 	 */
2881 	if (nvme_ctrlr_get(trid) != NULL) {
2882 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
2883 		return -EEXIST;
2884 	}
2885 
2886 	ctx = calloc(1, sizeof(*ctx));
2887 	if (!ctx) {
2888 		return -ENOMEM;
2889 	}
2890 	ctx->base_name = base_name;
2891 	ctx->names = names;
2892 	ctx->count = count;
2893 	ctx->cb_fn = cb_fn;
2894 	ctx->cb_ctx = cb_ctx;
2895 	ctx->prchk_flags = prchk_flags;
2896 	ctx->trid = *trid;
2897 
2898 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2899 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
2900 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2901 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2902 				free(entry);
2903 				break;
2904 			}
2905 		}
2906 	}
2907 
2908 	if (opts) {
2909 		memcpy(&ctx->opts, opts, sizeof(*opts));
2910 	} else {
2911 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
2912 	}
2913 
2914 	ctx->opts.transport_retry_count = g_opts.retry_count;
2915 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
2916 	ctx->opts.disable_read_ana_log_page = true;
2917 
2918 	if (nvme_ctrlr_get_by_name(base_name) == NULL) {
2919 		attach_cb = connect_attach_cb;
2920 	} else {
2921 		attach_cb = connect_set_failover_cb;
2922 	}
2923 
2924 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, attach_cb);
2925 	if (ctx->probe_ctx == NULL) {
2926 		SPDK_ERRLOG("No controller was found with the provided trid (traddr: %s)\n", trid->traddr);
2927 		free(ctx);
2928 		return -ENODEV;
2929 	}
2930 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
2931 
2932 	return 0;
2933 }
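/* Example usage (illustrative; attach_done_cb and the PCI address are
 * hypothetical): attach one PCIe controller and create bdevs for its
 * namespaces, roughly what the bdev_nvme_attach_controller RPC does.
 *
 *	struct spdk_nvme_transport_id trid = {};
 *	const char *names[32];
 *	int rc;
 *
 *	spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
 *	snprintf(trid.traddr, sizeof(trid.traddr), "%s", "0000:5e:00.0");
 *	rc = bdev_nvme_create(&trid, "Nvme0", names, SPDK_COUNTOF(names), 0,
 *			      attach_done_cb, NULL, NULL);
 */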
2934 
2935 static int
2936 bdev_nvme_delete_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2937 				const struct spdk_nvme_transport_id *trid)
2938 {
2939 	struct nvme_ctrlr_trid	*ctrlr_trid, *tmp_trid;
2940 
2941 	if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) {
2942 		return -EBUSY;
2943 	}
2944 
2945 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_ctrlr->trids, link, tmp_trid) {
2946 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
2947 			TAILQ_REMOVE(&nvme_ctrlr->trids, ctrlr_trid, link);
2948 			free(ctrlr_trid);
2949 			return 0;
2950 		}
2951 	}
2952 
2953 	return -ENXIO;
2954 }
2955 
2956 int
2957 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid)
2958 {
2959 	struct nvme_ctrlr	*nvme_ctrlr;
2960 	struct nvme_ctrlr_trid	*ctrlr_trid;
2961 
2962 	if (name == NULL) {
2963 		return -EINVAL;
2964 	}
2965 
2966 	nvme_ctrlr = nvme_ctrlr_get_by_name(name);
2967 	if (nvme_ctrlr == NULL) {
2968 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2969 		return -ENODEV;
2970 	}
2971 
2972 	/* case 1: remove the controller itself. */
2973 	if (trid == NULL) {
2974 		return _bdev_nvme_delete(nvme_ctrlr, false);
2975 	}
2976 
2977 	/* case 2: we are currently using the path to be removed. */
2978 	if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) {
2979 		ctrlr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
2980 		assert(nvme_ctrlr->connected_trid == &ctrlr_trid->trid);
2981 		/* case 2A: the current path is the only path. */
2982 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
2983 			return _bdev_nvme_delete(nvme_ctrlr, false);
2984 		}
2985 
2986 		/* case 2B: there is an alternative path. */
2987 		return bdev_nvme_failover(nvme_ctrlr, true);
2988 	}
2989 
2990 	/* case 3: We are not using the specified path. */
2991 	return bdev_nvme_delete_secondary_trid(nvme_ctrlr, trid);
2992 }
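/* Example usage (illustrative; the address values are hypothetical): remove
 * only a secondary TCP path from controller "Nvme0" while the active path
 * stays connected (case 3 above).
 *
 *	struct spdk_nvme_transport_id trid = {};
 *
 *	spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_TCP);
 *	snprintf(trid.traddr, sizeof(trid.traddr), "%s", "10.0.0.2");
 *	snprintf(trid.trsvcid, sizeof(trid.trsvcid), "%s", "4420");
 *	snprintf(trid.subnqn, sizeof(trid.subnqn), "%s", "nqn.2016-06.io.spdk:cnode1");
 *	rc = bdev_nvme_delete("Nvme0", &trid);
 */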
2993 
2994 static int
2995 bdev_nvme_library_init(void)
2996 {
2997 	g_bdev_nvme_init_thread = spdk_get_thread();
2998 
2999 	spdk_io_device_register(&g_nvme_ctrlrs, bdev_nvme_create_poll_group_cb,
3000 				bdev_nvme_destroy_poll_group_cb,
3001 				sizeof(struct nvme_poll_group),  "nvme_poll_groups");
3002 
3003 	return 0;
3004 }
3005 
3006 static void
3007 bdev_nvme_library_fini(void)
3008 {
3009 	struct nvme_ctrlr *nvme_ctrlr, *tmp;
3010 	struct nvme_probe_skip_entry *entry, *entry_tmp;
3011 
3012 	spdk_poller_unregister(&g_hotplug_poller);
3013 	free(g_hotplug_probe_ctx);
3014 	g_hotplug_probe_ctx = NULL;
3015 
3016 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
3017 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
3018 		free(entry);
3019 	}
3020 
3021 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3022 	TAILQ_FOREACH_SAFE(nvme_ctrlr, &g_nvme_ctrlrs, tailq, tmp) {
3023 		pthread_mutex_lock(&nvme_ctrlr->mutex);
3024 		if (nvme_ctrlr->destruct) {
3025 			/* This controller's destruction was already started
3026 			 * before the application started shutting down
3027 			 */
3028 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
3029 			continue;
3030 		}
3031 		nvme_ctrlr->destruct = true;
3032 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
3033 
3034 		spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
3035 				     nvme_ctrlr);
3036 	}
3037 
3038 	g_bdev_nvme_module_finish = true;
3039 	if (TAILQ_EMPTY(&g_nvme_ctrlrs)) {
3040 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
3041 		spdk_io_device_unregister(&g_nvme_ctrlrs, NULL);
3042 		spdk_bdev_module_fini_done();
3043 		return;
3044 	}
3045 
3046 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3047 }
3048 
3049 static void
3050 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
3051 {
3052 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3053 	struct spdk_bdev *bdev = bdev_io->bdev;
3054 	struct spdk_dif_ctx dif_ctx;
3055 	struct spdk_dif_error err_blk = {};
3056 	int rc;
3057 
3058 	rc = spdk_dif_ctx_init(&dif_ctx,
3059 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
3060 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
3061 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
3062 	if (rc != 0) {
3063 		SPDK_ERRLOG("Initialization of DIF context failed\n");
3064 		return;
3065 	}
3066 
3067 	if (bdev->md_interleave) {
3068 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
3069 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
3070 	} else {
3071 		struct iovec md_iov = {
3072 			.iov_base	= bdev_io->u.bdev.md_buf,
3073 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
3074 		};
3075 
3076 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
3077 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
3078 	}
3079 
3080 	if (rc != 0) {
3081 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
3082 			    err_blk.err_type, err_blk.err_offset);
3083 	} else {
3084 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
3085 	}
3086 }
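/* Terminology note: the md_interleave branch above verifies DIF (protection
 * information interleaved with each data block), while the other branch
 * verifies DIX (protection information carried in a separate metadata
 * buffer).
 */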
3087 
3088 static void
3089 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
3090 {
3091 	struct nvme_bdev_io *bio = ref;
3092 
3093 	if (spdk_nvme_cpl_is_success(cpl)) {
3094 		/* Run PI verification for read data buffer. */
3095 		bdev_nvme_verify_pi_error(bio);
3096 	}
3097 
3098 	/* Return original completion status */
3099 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
3100 }
3101 
3102 static void
3103 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
3104 {
3105 	struct nvme_bdev_io *bio = ref;
3106 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3107 	struct nvme_bdev_channel *nbdev_ch;
3108 	struct spdk_nvme_ns *ns;
3109 	struct spdk_nvme_qpair *qpair;
3110 	int ret;
3111 
3112 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
3113 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
3114 			    cpl->status.sct, cpl->status.sc);
3115 
3116 		/* Save completion status to use after verifying PI error. */
3117 		bio->cpl = *cpl;
3118 
3119 		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
3120 
3121 		if (spdk_likely(bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
3122 			/* Read without PI checking to verify PI error. */
3123 			ret = bdev_nvme_no_pi_readv(ns,
3124 						    qpair,
3125 						    bio,
3126 						    bdev_io->u.bdev.iovs,
3127 						    bdev_io->u.bdev.iovcnt,
3128 						    bdev_io->u.bdev.md_buf,
3129 						    bdev_io->u.bdev.num_blocks,
3130 						    bdev_io->u.bdev.offset_blocks);
3131 			if (ret == 0) {
3132 				return;
3133 			}
3134 		}
3135 	}
3136 
3137 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3138 }
3139 
3140 static void
3141 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
3142 {
3143 	struct nvme_bdev_io *bio = ref;
3144 
3145 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
3146 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
3147 			    cpl->status.sct, cpl->status.sc);
3148 		/* Run PI verification for write data buffer if PI error is detected. */
3149 		bdev_nvme_verify_pi_error(bio);
3150 	}
3151 
3152 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3153 }
3154 
3155 static void
3156 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
3157 {
3158 	struct nvme_bdev_io *bio = ref;
3159 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3160 
3161 	/* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
3162 	 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
3163 	 */
3164 	bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
3165 
3166 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
3167 		SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
3168 			    cpl->status.sct, cpl->status.sc);
3169 		/* Run PI verification for zone append data buffer if PI error is detected. */
3170 		bdev_nvme_verify_pi_error(bio);
3171 	}
3172 
3173 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3174 }
3175 
3176 static void
3177 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
3178 {
3179 	struct nvme_bdev_io *bio = ref;
3180 
3181 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
3182 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
3183 			    cpl->status.sct, cpl->status.sc);
3184 		/* Run PI verification for compare data buffer if PI error is detected. */
3185 		bdev_nvme_verify_pi_error(bio);
3186 	}
3187 
3188 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3189 }
3190 
3191 static void
3192 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
3193 {
3194 	struct nvme_bdev_io *bio = ref;
3195 
3196 	/* Compare operation completion */
3197 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
3198 		/* Save compare result for write callback */
3199 		bio->cpl = *cpl;
3200 		return;
3201 	}
3202 
3203 	/* Write operation completion */
3204 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
3205 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
3206 		 * complete the IO with the compare operation's status.
3207 		 */
3208 		if (!spdk_nvme_cpl_is_error(cpl)) {
3209 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
3210 		}
3211 
3212 		bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
3213 	} else {
3214 		bdev_nvme_io_complete_nvme_status(bio, cpl);
3215 	}
3216 }
3217 
3218 static void
3219 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
3220 {
3221 	struct nvme_bdev_io *bio = ref;
3222 
3223 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3224 }
3225 
3226 static int
3227 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
3228 {
3229 	switch (desc->zs) {
3230 	case SPDK_NVME_ZONE_STATE_EMPTY:
3231 		info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
3232 		break;
3233 	case SPDK_NVME_ZONE_STATE_IOPEN:
3234 		info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
3235 		break;
3236 	case SPDK_NVME_ZONE_STATE_EOPEN:
3237 		info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
3238 		break;
3239 	case SPDK_NVME_ZONE_STATE_CLOSED:
3240 		info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
3241 		break;
3242 	case SPDK_NVME_ZONE_STATE_RONLY:
3243 		info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
3244 		break;
3245 	case SPDK_NVME_ZONE_STATE_FULL:
3246 		info->state = SPDK_BDEV_ZONE_STATE_FULL;
3247 		break;
3248 	case SPDK_NVME_ZONE_STATE_OFFLINE:
3249 		info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
3250 		break;
3251 	default:
3252 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
3253 		return -EIO;
3254 	}
3255 
3256 	info->zone_id = desc->zslba;
3257 	info->write_pointer = desc->wp;
3258 	info->capacity = desc->zcap;
3259 
3260 	return 0;
3261 }
3262 
3263 static void
3264 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
3265 {
3266 	struct nvme_bdev_io *bio = ref;
3267 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3268 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
3269 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
3270 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
3271 	uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
3272 	struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
3273 	uint64_t max_zones_per_buf, i;
3274 	uint32_t zone_report_bufsize;
3275 	struct spdk_nvme_ns *ns;
3276 	struct spdk_nvme_qpair *qpair;
3277 	int ret;
3278 
3279 	if (spdk_nvme_cpl_is_error(cpl)) {
3280 		goto out_complete_io_nvme_cpl;
3281 	}
3282 
3283 	if (!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair)) {
3284 		ret = -ENXIO;
3285 		goto out_complete_io_ret;
3286 	}
3287 
3288 	zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
3289 	max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
3290 			    sizeof(bio->zone_report_buf->descs[0]);
3291 
3292 	if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
3293 		ret = -EINVAL;
3294 		goto out_complete_io_ret;
3295 	}
3296 
3297 	if (!bio->zone_report_buf->nr_zones) {
3298 		ret = -EINVAL;
3299 		goto out_complete_io_ret;
3300 	}
3301 
3302 	for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
3303 		ret = fill_zone_from_report(&info[bio->handled_zones],
3304 					    &bio->zone_report_buf->descs[i]);
3305 		if (ret) {
3306 			goto out_complete_io_ret;
3307 		}
3308 		bio->handled_zones++;
3309 	}
3310 
3311 	if (bio->handled_zones < zones_to_copy) {
3312 		uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
3313 		uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
3314 
3315 		memset(bio->zone_report_buf, 0, zone_report_bufsize);
3316 		ret = spdk_nvme_zns_report_zones(ns, qpair,
3317 						 bio->zone_report_buf, zone_report_bufsize,
3318 						 slba, SPDK_NVME_ZRA_LIST_ALL, true,
3319 						 bdev_nvme_get_zone_info_done, bio);
3320 		if (!ret) {
3321 			return;
3322 		}
3323 		goto out_complete_io_ret;
3325 	}
3326 
3327 out_complete_io_nvme_cpl:
3328 	free(bio->zone_report_buf);
3329 	bio->zone_report_buf = NULL;
3330 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3331 	return;
3332 
3333 out_complete_io_ret:
3334 	free(bio->zone_report_buf);
3335 	bio->zone_report_buf = NULL;
3336 	bdev_nvme_io_complete(bio, ret);
3337 }
3338 
3339 static void
3340 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
3341 {
3342 	struct nvme_bdev_io *bio = ref;
3343 
3344 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3345 }
3346 
3347 static void
3348 bdev_nvme_admin_passthru_completion(void *ctx)
3349 {
3350 	struct nvme_bdev_io *bio = ctx;
3351 
3352 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
3353 }
3354 
3355 static void
3356 bdev_nvme_abort_completion(void *ctx)
3357 {
3358 	struct nvme_bdev_io *bio = ctx;
3359 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3360 
3361 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
3362 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
3363 	} else {
3364 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3365 	}
3366 }
3367 
3368 static void
3369 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
3370 {
3371 	struct nvme_bdev_io *bio = ref;
3372 
3373 	bio->cpl = *cpl;
3374 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
3375 }
3376 
3377 static void
3378 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
3379 {
3380 	struct nvme_bdev_io *bio = ref;
3381 
3382 	bio->cpl = *cpl;
3383 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
3384 }
3385 
3386 static void
3387 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
3388 {
3389 	struct nvme_bdev_io *bio = ref;
3390 	struct iovec *iov;
3391 
3392 	bio->iov_offset = sgl_offset;
3393 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
3394 		iov = &bio->iovs[bio->iovpos];
3395 		if (bio->iov_offset < iov->iov_len) {
3396 			break;
3397 		}
3398 
3399 		bio->iov_offset -= iov->iov_len;
3400 	}
3401 }
3402 
3403 static int
3404 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
3405 {
3406 	struct nvme_bdev_io *bio = ref;
3407 	struct iovec *iov;
3408 
3409 	assert(bio->iovpos < bio->iovcnt);
3410 
3411 	iov = &bio->iovs[bio->iovpos];
3412 
3413 	*address = iov->iov_base;
3414 	*length = iov->iov_len;
3415 
3416 	if (bio->iov_offset) {
3417 		assert(bio->iov_offset <= iov->iov_len);
3418 		*address += bio->iov_offset;
3419 		*length -= bio->iov_offset;
3420 	}
3421 
3422 	bio->iov_offset += *length;
3423 	if (bio->iov_offset == iov->iov_len) {
3424 		bio->iovpos++;
3425 		bio->iov_offset = 0;
3426 	}
3427 
3428 	return 0;
3429 }
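/* Worked example (illustrative): given two 4 KiB iovecs and sgl_offset == 6144,
 * bdev_nvme_queued_reset_sgl() leaves iovpos == 1 and iov_offset == 2048.  The
 * next bdev_nvme_queued_next_sge() call then returns the second buffer's base
 * address plus 2048 with a length of 2048 and advances past that iovec.
 */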
3430 
3431 static void
3432 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
3433 {
3434 	struct nvme_bdev_io *bio = ref;
3435 	struct iovec *iov;
3436 
3437 	bio->fused_iov_offset = sgl_offset;
3438 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
3439 		iov = &bio->fused_iovs[bio->fused_iovpos];
3440 		if (bio->fused_iov_offset < iov->iov_len) {
3441 			break;
3442 		}
3443 
3444 		bio->fused_iov_offset -= iov->iov_len;
3445 	}
3446 }
3447 
3448 static int
3449 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
3450 {
3451 	struct nvme_bdev_io *bio = ref;
3452 	struct iovec *iov;
3453 
3454 	assert(bio->fused_iovpos < bio->fused_iovcnt);
3455 
3456 	iov = &bio->fused_iovs[bio->fused_iovpos];
3457 
3458 	*address = iov->iov_base;
3459 	*length = iov->iov_len;
3460 
3461 	if (bio->fused_iov_offset) {
3462 		assert(bio->fused_iov_offset <= iov->iov_len);
3463 		*address += bio->fused_iov_offset;
3464 		*length -= bio->fused_iov_offset;
3465 	}
3466 
3467 	bio->fused_iov_offset += *length;
3468 	if (bio->fused_iov_offset == iov->iov_len) {
3469 		bio->fused_iovpos++;
3470 		bio->fused_iov_offset = 0;
3471 	}
3472 
3473 	return 0;
3474 }
3475 
3476 static int
3477 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3478 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
3479 		      void *md, uint64_t lba_count, uint64_t lba)
3480 {
3481 	int rc;
3482 
3483 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
3484 		      lba_count, lba);
3485 
3486 	bio->iovs = iov;
3487 	bio->iovcnt = iovcnt;
3488 	bio->iovpos = 0;
3489 	bio->iov_offset = 0;
3490 
3491 	rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
3492 					    bdev_nvme_no_pi_readv_done, bio, 0,
3493 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3494 					    md, 0, 0);
3495 
3496 	if (rc != 0 && rc != -ENOMEM) {
3497 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
3498 	}
3499 	return rc;
3500 }
3501 
3502 static int
3503 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3504 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
3505 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
3506 		struct spdk_bdev_ext_io_opts *ext_opts)
3507 {
3508 	int rc;
3509 
3510 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3511 		      lba_count, lba);
3512 
3513 	bio->iovs = iov;
3514 	bio->iovcnt = iovcnt;
3515 	bio->iovpos = 0;
3516 	bio->iov_offset = 0;
3517 
3518 	if (ext_opts) {
3519 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
3520 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
3521 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
3522 		bio->ext_opts.io_flags = flags;
3523 		bio->ext_opts.metadata = md;
3524 
3525 		rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
3526 						bdev_nvme_readv_done, bio,
3527 						bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3528 						&bio->ext_opts);
3529 	} else if (iovcnt == 1) {
3530 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
3531 						   lba_count,
3532 						   bdev_nvme_readv_done, bio,
3533 						   flags,
3534 						   0, 0);
3535 	} else {
3536 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
3537 						    bdev_nvme_readv_done, bio, flags,
3538 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3539 						    md, 0, 0);
3540 	}
3541 
3542 	if (rc != 0 && rc != -ENOMEM) {
3543 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
3544 	}
3545 	return rc;
3546 }
3547 
3548 static int
3549 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3550 		 struct nvme_bdev_io *bio,
3551 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
3552 		 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts)
3553 {
3554 	int rc;
3555 
3556 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3557 		      lba_count, lba);
3558 
3559 	bio->iovs = iov;
3560 	bio->iovcnt = iovcnt;
3561 	bio->iovpos = 0;
3562 	bio->iov_offset = 0;
3563 
3564 	if (ext_opts) {
3565 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
3566 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
3567 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
3568 		bio->ext_opts.io_flags = flags;
3569 		bio->ext_opts.metadata = md;
3570 
3571 		rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
3572 						 bdev_nvme_writev_done, bio,
3573 						 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3574 						 &bio->ext_opts);
3575 	} else if (iovcnt == 1) {
3576 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
3577 						    lba_count,
3578 						    bdev_nvme_writev_done, bio,
3579 						    flags,
3580 						    0, 0);
3581 	} else {
3582 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
3583 						     bdev_nvme_writev_done, bio, flags,
3584 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3585 						     md, 0, 0);
3586 	}
3587 
3588 	if (rc != 0 && rc != -ENOMEM) {
3589 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
3590 	}
3591 	return rc;
3592 }
3593 
3594 static int
3595 bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3596 		       struct nvme_bdev_io *bio,
3597 		       struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t zslba,
3598 		       uint32_t flags)
3599 {
3600 	int rc;
3601 
3602 	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
3603 		      lba_count, zslba);
3604 
3605 	bio->iovs = iov;
3606 	bio->iovcnt = iovcnt;
3607 	bio->iovpos = 0;
3608 	bio->iov_offset = 0;
3609 
3610 	if (iovcnt == 1) {
3611 		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
3612 						       lba_count,
3613 						       bdev_nvme_zone_appendv_done, bio,
3614 						       flags,
3615 						       0, 0);
3616 	} else {
3617 		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
3618 							bdev_nvme_zone_appendv_done, bio, flags,
3619 							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3620 							md, 0, 0);
3621 	}
3622 
3623 	if (rc != 0 && rc != -ENOMEM) {
3624 		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
3625 	}
3626 	return rc;
3627 }
3628 
static int
bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		   struct nvme_bdev_io *bio,
		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		   uint32_t flags)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
					       bdev_nvme_comparev_done, bio, flags,
					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
					       md, 0, 0);

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
			      struct iovec *write_iov, int write_iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = cmp_iov;
	bio->iovcnt = cmp_iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;
	bio->fused_iovs = write_iov;
	bio->fused_iovcnt = write_iovcnt;
	bio->fused_iovpos = 0;
	bio->fused_iov_offset = 0;

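	/* A fused compare-and-write may be resubmitted after -ENOMEM.  Reset the
	 * tracking flag only on the first attempt so that a retry does not submit
	 * the compare half a second time.
	 */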
	if (bdev_io->num_retries == 0) {
		bio->first_fused_submitted = false;
	}

	if (!bio->first_fused_submitted) {
		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
		memset(&bio->cpl, 0, sizeof(bio->cpl));

		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
						       bdev_nvme_comparev_and_writev_done, bio, flags,
						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
		if (rc == 0) {
			bio->first_fused_submitted = true;
			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
		} else {
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
			}
			return rc;
		}
	}

	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;

	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
					     bdev_nvme_comparev_and_writev_done, bio, flags,
					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("write failed: rc = %d\n", rc);
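		/* The compare half is already in flight; its completion will fail the
		 * fused I/O, so report success here to avoid completing the bdev_io
		 * twice.
		 */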
		rc = 0;
	}

	return rc;
}

static int
bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		uint64_t offset_blocks,
		uint64_t num_blocks)
{
	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
	struct spdk_nvme_dsm_range *range;
	uint64_t offset, remaining;
	uint64_t num_ranges_u64;
	uint16_t num_ranges;
	int rc;

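	/* Split the request into DSM ranges, rounding up.  Worked example
	 * (illustrative numbers): if RANGE_MAX_BLOCKS were 0x10000, an unmap of
	 * 0x28000 blocks would need ceil(0x28000 / 0x10000) = 3 ranges: two full
	 * ranges followed by a final range of 0x8000 blocks.
	 */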
	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
		return -EINVAL;
	}
	num_ranges = (uint16_t)num_ranges_u64;

	offset = offset_blocks;
	remaining = num_blocks;
	range = &dsm_ranges[0];

	/* Fill max-size ranges until the remaining blocks fit into one range */
	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
		range->attributes.raw = 0;
		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		range->starting_lba = offset;

		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		range++;
	}

	/* Final range describes the remaining blocks */
	range->attributes.raw = 0;
	range->length = remaining;
	range->starting_lba = offset;

	rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair,
			SPDK_NVME_DSM_ATTR_DEALLOCATE,
			dsm_ranges, num_ranges,
			bdev_nvme_queued_done, bio);

	return rc;
}

static int
bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		       struct nvme_bdev_io *bio,
		       uint64_t offset_blocks,
		       uint64_t num_blocks)
{
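	/* The Write Zeroes NLB field is 16 bits and 0's based, so a single command
	 * can cover at most 65536 (UINT16_MAX + 1) blocks.
	 */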
	if (num_blocks > UINT16_MAX + 1) {
		SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
		return -EINVAL;
	}

	return spdk_nvme_ns_cmd_write_zeroes(ns, qpair,
					     offset_blocks, num_blocks,
					     bdev_nvme_queued_done, bio,
					     0);
}

static int
bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
			struct spdk_bdev_zone_info *info)
{
	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);

	if (zone_id % zone_size != 0) {
		return -EINVAL;
	}

	if (num_zones > total_zones || !num_zones) {
		return -EINVAL;
	}

	assert(!bio->zone_report_buf);
	bio->zone_report_buf = calloc(1, zone_report_bufsize);
	if (!bio->zone_report_buf) {
		return -ENOMEM;
	}

	bio->handled_zones = 0;

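	/* One report may not cover all requested zones; the completion callback
	 * re-issues the report command as needed, tracking progress in
	 * handled_zones.
	 */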
	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
					  bdev_nvme_get_zone_info_done, bio);
}

static int
bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			  struct nvme_bdev_io *bio, uint64_t zone_id,
			  enum spdk_bdev_zone_action action)
{
	switch (action) {
	case SPDK_BDEV_ZONE_CLOSE:
		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_FINISH:
		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
						 bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OPEN:
		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
					       bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_RESET:
		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OFFLINE:
		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
						  bdev_nvme_zone_management_done, bio);
	default:
		return -EINVAL;
	}
}

static int
bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
{
	struct nvme_ctrlr *nvme_ctrlr;
	uint32_t max_xfer_size;

	if (!bdev_nvme_find_admin_path(nbdev_ch, &nvme_ctrlr)) {
		return -EINVAL;
	}

	max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

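	/* Admin commands complete on the thread that polls the admin queue, so
	 * record the submitting thread; the completion is passed back to it.
	 */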
	bio->orig_thread = spdk_get_thread();

	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf,
					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
}

static int
bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		      struct nvme_bdev_io *bio,
		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
{
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require an nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			 struct nvme_bdev_io *bio,
			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
{
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid metadata buffer size\n");
		return -EINVAL;
	}

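	/* Example (illustrative values): with a 512-byte data + 8-byte metadata
	 * format, the extended sector size is 520, so nbytes = 520 * n gives
	 * nr_sectors = n and md_len must equal 8 * n.
	 */
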
	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require an nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
		struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
	int rc;

	bio->orig_thread = spdk_get_thread();

	rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr,
					   ctrlr_ch->qpair,
					   bio_to_abort,
					   bdev_nvme_abort_done, bio);
	if (rc == -ENOENT) {
		/* If no command was found in the I/O qpair, the target command may be
		 * an admin command.
		 */
		rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr,
						   NULL,
						   bio_to_abort,
						   bdev_nvme_abort_done, bio);
	}

	if (rc == -ENOENT) {
		/* If no command was found, complete the abort request with failure. */
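		/* Per the NVMe specification, setting bit 0 of the Abort command's
		 * CDW0 to 1 indicates that the command was not aborted.
		 */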
		bio->cpl.cdw0 |= 1U;
		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;

		bdev_nvme_abort_completion(bio);

		rc = 0;
	}

	return rc;
}

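/* Writes one config entry of the form
 *   { "method": "bdev_nvme_set_options", "params": { ... } }
 * which the JSON config loader replays as an RPC call at startup.
 */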
static void
bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
{
	const char	*action;

	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
		action = "reset";
	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
		action = "abort";
	} else {
		action = "none";
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "action_on_timeout", action);
	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_transport_id	*trid;

	trid = nvme_ctrlr->connected_trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->name);
	nvme_bdev_dump_trid_json(trid, w);
	spdk_json_write_named_bool(w, "prchk_reftag",
				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
	spdk_json_write_named_bool(w, "prchk_guard",
				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static int
bdev_nvme_config_json(struct spdk_json_write_ctx *w)
{
	struct nvme_ctrlr	*nvme_ctrlr;

	bdev_nvme_opts_config_json(w);

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) {
		nvme_ctrlr_config_json(w, nvme_ctrlr);
	}

	/* Dump this entry last, so that all NVMe bdevs get a chance to be
	 * constructed before the hotplug poller is enabled.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
}

SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)