/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel_engine.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** array of iovecs for the fused command to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position for the fused command. */
	int fused_iovpos;

	/** Offset in current iovec for the fused command. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Keeps track of whether the first of the fused commands was submitted */
	bool first_fused_submitted;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;
};

struct nvme_probe_ctx {
	size_t count;
	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
	const char *names[NVME_MAX_CONTROLLERS];
	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
	const char *hostnqn;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id		trid;
	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
};
/* All the controllers deleted by users via RPC are skipped by the hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

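/*
 * Module-wide defaults for controller attach and polling behavior. These are
 * typically overridden at runtime (for example via the bdev_nvme_set_options
 * RPC) before any controllers are attached.
 */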
static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
static struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			   struct nvme_bdev_io *bio,
			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				 struct nvme_bdev_io *bio,
				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			    struct nvme_bdev_io *bio,
			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				  struct nvme_bdev_io *bio,
				  struct iovec *iov, int iovcnt, void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			      struct nvme_bdev_io *bio,
			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
		struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				   struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
				   struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				     struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static int bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				 struct nvme_bdev_io *bio,
				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static int bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);

typedef void (*populate_namespace_fn)(struct nvme_ctrlr *nvme_ctrlr,
				      struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_standard_namespace(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);

static populate_namespace_fn g_populate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_populate_standard_namespace,
};

typedef void (*depopulate_namespace_fn)(struct nvme_ns *nvme_ns);
static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_ns *nvme_ns);

static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_depopulate_standard_namespace,
};

typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
		struct nvme_ns *nvme_ns);
static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
		struct nvme_ns *nvme_ns);

static config_json_namespace_fn g_config_json_namespace_fn[] = {
	NULL,
	nvme_ctrlr_config_json_standard_namespace,
};

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,
};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

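/*
 * Look up the namespace handle and I/O qpair for a bdev channel. While the
 * controller is being reset the qpair is NULL, so this returns false and the
 * caller fails the I/O with -ENXIO.
 */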
static inline bool
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch,
		       struct spdk_nvme_ns **_ns, struct spdk_nvme_qpair **_qpair)
{
	if (spdk_unlikely(nbdev_ch->ctrlr_ch->qpair == NULL)) {
		/* The device is currently resetting. */
		return false;
	}

	*_ns = nbdev_ch->nvme_ns->ns;
	*_qpair = nbdev_ch->ctrlr_ch->qpair;
	return true;
}

static inline bool
bdev_nvme_find_admin_path(struct nvme_bdev_channel *nbdev_ch,
			  struct nvme_ctrlr **_nvme_ctrlr)
{
	*_nvme_ctrlr = nbdev_ch->ctrlr_ch->ctrlr;
	return true;
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	spdk_bdev_io_complete_nvme_status(spdk_bdev_io_from_ctx(bio), cpl->cdw0,
					  cpl->status.sct, cpl->status.sc);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	enum spdk_bdev_io_status io_status;

	if (rc == 0) {
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else if (rc == -ENOMEM) {
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
	} else {
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
}

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
	/*
	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
	 * reconnect a qpair and we will stop getting a callback for this one.
	 */
	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
	if (rc != 0) {
		SPDK_DEBUGLOG(bdev_nvme, "Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
	}
}

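/*
 * Per-poll-group poller that reaps completions for all I/O qpairs in the
 * group. The spin-stat accounting below tracks time spent polling without
 * finding completions; it is only enabled when SPDK is built with VTune
 * support (see bdev_nvme_create_ctrlr_channel_cb).
 */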
static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

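/*
 * Admin queue poller. A negative return value from
 * spdk_nvme_ctrlr_process_admin_completions() indicates a failed controller,
 * in which case a failover is started (which degenerates to a plain reset
 * when no alternate trid is registered).
 */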
static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		bdev_nvme_failover(nvme_ctrlr, false);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
_bdev_nvme_unregister_dev_cb(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	free(nvme_disk->disk.name);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns = nvme_disk->nvme_ns;

	pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

	nvme_ns->bdev = NULL;

	if (!nvme_ns->populated) {
		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

		nvme_ctrlr_release(nvme_ns->ctrlr);
	} else {
		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
	}

	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);

	return 0;
}

static int
bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
{
	bdev_nvme_io_complete(bio, 0);

	return 0;
}

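/*
 * Allocate and connect an I/O qpair for a controller channel. The qpair is
 * created with create_only set so that it can be added to the channel's poll
 * group before it is connected; that way its completions are always
 * processed through the poll group.
 */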
static int
bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
{
	struct spdk_nvme_ctrlr *ctrlr = ctrlr_ch->ctrlr->ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	assert(ctrlr_ch->group != NULL);

	rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	ctrlr_ch->qpair = qpair;

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void
bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
{
	if (ctrlr_ch->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
		ctrlr_ch->qpair = NULL;
	}
}

static void
_bdev_nvme_check_pending_destruct(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct_after_reset) {
		assert(nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct);
		pthread_mutex_unlock(&nvme_ctrlr->mutex);

		spdk_thread_send_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister,
				     nvme_ctrlr);
	} else {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
	}
}

static void
bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	_bdev_nvme_check_pending_destruct(nvme_ctrlr);
}

static void
_bdev_nvme_complete_pending_resets(struct nvme_ctrlr_channel *ctrlr_ch,
				   enum spdk_bdev_io_status status)
{
	struct spdk_bdev_io *bdev_io;

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_SUCCESS);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_abort_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_FAILED);

	spdk_for_each_channel_continue(i, 0);
}

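/*
 * Final step of a controller reset: record the outcome, invoke the caller's
 * reset callback, then complete (on success) or fail (on error) any resets
 * that were queued while this one was in flight. A deferred destruct is
 * handled once every channel has been visited.
 */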
static void
bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, int rc)
{
	struct nvme_ctrlr_trid *curr_trid;
	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;

	nvme_ctrlr->reset_cb_fn = NULL;
	nvme_ctrlr->reset_cb_arg = NULL;

	if (rc) {
		SPDK_ERRLOG("Resetting controller failed.\n");
	} else {
		SPDK_NOTICELOG("Resetting controller succeeded.\n");
	}

	pthread_mutex_lock(&nvme_ctrlr->mutex);
	nvme_ctrlr->resetting = false;
	nvme_ctrlr->failover_in_progress = false;

	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(curr_trid != NULL);
	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);

	curr_trid->is_failed = (rc != 0);

	if (nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct) {
		/* Destruct ctrlr after clearing pending resets. */
		nvme_ctrlr->destruct_after_reset = true;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	if (reset_cb_fn) {
		reset_cb_fn(reset_cb_arg, rc);
	}

	/* Make sure we clear any pending resets before returning. */
	spdk_for_each_channel(nvme_ctrlr,
			      rc == 0 ? bdev_nvme_complete_pending_resets :
			      bdev_nvme_abort_pending_resets,
			      NULL,
			      bdev_nvme_check_pending_destruct);
}

static void
bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	bdev_nvme_reset_complete(nvme_ctrlr, status);
}

static void
bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	int rc;

	rc = bdev_nvme_create_qpair(ctrlr_ch);

	spdk_for_each_channel_continue(i, rc);
}

static int
bdev_nvme_ctrlr_reset_poll(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_ctrlr_reset_poll_async(nvme_ctrlr->reset_ctx);
	if (rc == -EAGAIN) {
		return SPDK_POLLER_BUSY;
	}

	spdk_poller_unregister(&nvme_ctrlr->reset_poller);
	if (rc == 0) {
		/* Recreate all of the I/O queue pairs */
		spdk_for_each_channel(nvme_ctrlr,
				      bdev_nvme_reset_create_qpair,
				      NULL,
				      bdev_nvme_reset_create_qpairs_done);
	} else {
		bdev_nvme_reset_complete(nvme_ctrlr, rc);
	}
	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
	int rc;

	if (status) {
		rc = status;
		goto err;
	}

	rc = spdk_nvme_ctrlr_reset_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->reset_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create controller reset context\n");
		goto err;
	}
	assert(nvme_ctrlr->reset_poller == NULL);
	nvme_ctrlr->reset_poller = SPDK_POLLER_REGISTER(bdev_nvme_ctrlr_reset_poll,
				   nvme_ctrlr, 0);

	return;

err:
	bdev_nvme_reset_complete(nvme_ctrlr, rc);
}

static void
bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);

	bdev_nvme_destroy_qpair(ctrlr_ch);
	spdk_for_each_channel_continue(i, 0);
}

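/*
 * Start a full controller reset: tear down every channel's I/O qpair, then
 * reset the controller asynchronously (bdev_nvme_reset_ctrlr) and recreate
 * the qpairs. Returns -EBUSY if the controller is being destructed and
 * -EAGAIN if a reset is already in progress.
 */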
static int
bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return -EBUSY;
	}

	if (nvme_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return -EAGAIN;
	}

	nvme_ctrlr->resetting = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	/* First, delete all NVMe I/O queue pairs. */
	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_reset_destroy_qpair,
			      NULL,
			      bdev_nvme_reset_ctrlr);

	return 0;
}

int
bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
{
	int rc;

	rc = bdev_nvme_reset(nvme_ctrlr);
	if (rc == 0) {
		nvme_ctrlr->reset_cb_fn = cb_fn;
		nvme_ctrlr->reset_cb_arg = cb_arg;
	}
	return rc;
}

static void
bdev_nvme_reset_io_complete(void *cb_arg, int rc)
{
	struct nvme_bdev_io *bio = cb_arg;

	bdev_nvme_io_complete(bio, rc);
}

static int
bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
{
	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
	struct spdk_bdev_io *bdev_io;
	int rc;

	rc = bdev_nvme_reset(ctrlr_ch->ctrlr);
	if (rc == 0) {
		assert(ctrlr_ch->ctrlr->reset_cb_fn == NULL);
		assert(ctrlr_ch->ctrlr->reset_cb_arg == NULL);
		ctrlr_ch->ctrlr->reset_cb_fn = bdev_nvme_reset_io_complete;
		ctrlr_ch->ctrlr->reset_cb_arg = bio;
	} else if (rc == -EAGAIN) {
		/*
		 * Reset call is queued only if it is from the app framework. This is on purpose so that
		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
		 */
		bdev_io = spdk_bdev_io_from_ctx(bio);
		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
	} else {
		return rc;
	}

	return 0;
}

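/*
 * Begin a failover. The currently connected trid is marked failed and, if an
 * alternate trid is registered, the controller is switched to it. When
 * "remove" is false the old trid is moved to the tail of the list so repeated
 * failovers round-robin through all known paths; otherwise it is freed.
 */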
static int
bdev_nvme_failover_start(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	struct nvme_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
	int rc;

	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		/* Don't bother resetting if the controller is in the process of being destructed. */
		return -EBUSY;
	}

	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(curr_trid);
	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);
	next_trid = TAILQ_NEXT(curr_trid, link);

	if (nvme_ctrlr->resetting) {
		if (next_trid && !nvme_ctrlr->failover_in_progress) {
			rc = -EAGAIN;
		} else {
			rc = -EBUSY;
		}
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return rc;
	}

	nvme_ctrlr->resetting = true;
	curr_trid->is_failed = true;

	if (next_trid) {
		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
			       curr_trid->trid.trsvcid, next_trid->trid.traddr, next_trid->trid.trsvcid);

		nvme_ctrlr->failover_in_progress = true;
		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
		nvme_ctrlr->connected_trid = &next_trid->trid;
		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_trid->trid);
		assert(rc == 0);
		TAILQ_REMOVE(&nvme_ctrlr->trids, curr_trid, link);
		if (!remove) {
			/** Shuffle the old trid to the end of the list and use the new one.
			 * Allows for round robin through multiple connections.
			 */
			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, curr_trid, link);
		} else {
			free(curr_trid);
		}
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);
	return 0;
}

static int
bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	int rc;

	rc = bdev_nvme_failover_start(nvme_ctrlr, remove);
	if (rc == 0) {
		/* First, delete all NVMe I/O queue pairs. */
		spdk_for_each_channel(nvme_ctrlr,
				      bdev_nvme_reset_destroy_qpair,
				      NULL,
				      bdev_nvme_reset_ctrlr);
	} else if (rc != -EBUSY) {
		return rc;
	}

	return 0;
}

static int
bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		uint64_t offset_blocks,
		uint64_t num_blocks);

static int
bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		       struct nvme_bdev_io *bio,
		       uint64_t offset_blocks,
		       uint64_t num_blocks);

static void
bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		     bool success)
{
	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_qpair *qpair;
	int ret;

	if (!success) {
		ret = -EINVAL;
		goto exit;
	}

	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
		ret = -ENXIO;
		goto exit;
	}

	ret = bdev_nvme_readv(ns,
			      qpair,
			      bio,
			      bdev_io->u.bdev.iovs,
			      bdev_io->u.bdev.iovcnt,
			      bdev_io->u.bdev.md_buf,
			      bdev_io->u.bdev.num_blocks,
			      bdev_io->u.bdev.offset_blocks,
			      bdev->dif_check_flags,
			      bdev_io->internal.ext_opts);

exit:
	if (spdk_unlikely(ret != 0)) {
		bdev_nvme_io_complete(bio, ret);
	}
}

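/*
 * Central dispatch point from the generic bdev layer: translate each bdev I/O
 * type into the matching NVMe command submission. Any nonzero return code is
 * mapped to a bdev status in bdev_nvme_io_complete(); -ENOMEM in particular
 * becomes SPDK_BDEV_IO_STATUS_NOMEM so the bdev layer retries later.
 */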
static void
bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct nvme_bdev_io *nbdev_io_to_abort;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_qpair *qpair;
	int rc = 0;

	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
		rc = -ENXIO;
		goto exit;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
			rc = bdev_nvme_readv(ns,
					     qpair,
					     nbdev_io,
					     bdev_io->u.bdev.iovs,
					     bdev_io->u.bdev.iovcnt,
					     bdev_io->u.bdev.md_buf,
					     bdev_io->u.bdev.num_blocks,
					     bdev_io->u.bdev.offset_blocks,
					     bdev->dif_check_flags,
					     bdev_io->internal.ext_opts);
		} else {
			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
			rc = 0;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		rc = bdev_nvme_writev(ns,
				      qpair,
				      nbdev_io,
				      bdev_io->u.bdev.iovs,
				      bdev_io->u.bdev.iovcnt,
				      bdev_io->u.bdev.md_buf,
				      bdev_io->u.bdev.num_blocks,
				      bdev_io->u.bdev.offset_blocks,
				      bdev->dif_check_flags,
				      bdev_io->internal.ext_opts);
		break;
	case SPDK_BDEV_IO_TYPE_COMPARE:
		rc = bdev_nvme_comparev(ns,
					qpair,
					nbdev_io,
					bdev_io->u.bdev.iovs,
					bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.md_buf,
					bdev_io->u.bdev.num_blocks,
					bdev_io->u.bdev.offset_blocks,
					bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		rc = bdev_nvme_comparev_and_writev(ns,
						   qpair,
						   nbdev_io,
						   bdev_io->u.bdev.iovs,
						   bdev_io->u.bdev.iovcnt,
						   bdev_io->u.bdev.fused_iovs,
						   bdev_io->u.bdev.fused_iovcnt,
						   bdev_io->u.bdev.md_buf,
						   bdev_io->u.bdev.num_blocks,
						   bdev_io->u.bdev.offset_blocks,
						   bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		rc = bdev_nvme_unmap(ns,
				     qpair,
				     nbdev_io,
				     bdev_io->u.bdev.offset_blocks,
				     bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		rc = bdev_nvme_write_zeroes(ns, qpair,
					    nbdev_io,
					    bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_RESET:
		rc = bdev_nvme_reset_io(nbdev_ch, nbdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_FLUSH:
		rc = bdev_nvme_flush(ns,
				     qpair,
				     nbdev_io,
				     bdev_io->u.bdev.offset_blocks,
				     bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		rc = bdev_nvme_zone_appendv(ns,
					    qpair,
					    nbdev_io,
					    bdev_io->u.bdev.iovs,
					    bdev_io->u.bdev.iovcnt,
					    bdev_io->u.bdev.md_buf,
					    bdev_io->u.bdev.num_blocks,
					    bdev_io->u.bdev.offset_blocks,
					    bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
		rc = bdev_nvme_get_zone_info(ns,
					     qpair,
					     nbdev_io,
					     bdev_io->u.zone_mgmt.zone_id,
					     bdev_io->u.zone_mgmt.num_zones,
					     bdev_io->u.zone_mgmt.buf);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		rc = bdev_nvme_zone_management(ns,
					       qpair,
					       nbdev_io,
					       bdev_io->u.zone_mgmt.zone_id,
					       bdev_io->u.zone_mgmt.zone_action);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
		rc = bdev_nvme_admin_passthru(nbdev_ch,
					      nbdev_io,
					      &bdev_io->u.nvme_passthru.cmd,
					      bdev_io->u.nvme_passthru.buf,
					      bdev_io->u.nvme_passthru.nbytes);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_IO:
		rc = bdev_nvme_io_passthru(ns,
					   qpair,
					   nbdev_io,
					   &bdev_io->u.nvme_passthru.cmd,
					   bdev_io->u.nvme_passthru.buf,
					   bdev_io->u.nvme_passthru.nbytes);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		rc = bdev_nvme_io_passthru_md(ns,
					      qpair,
					      nbdev_io,
					      &bdev_io->u.nvme_passthru.cmd,
					      bdev_io->u.nvme_passthru.buf,
					      bdev_io->u.nvme_passthru.nbytes,
					      bdev_io->u.nvme_passthru.md_buf,
					      bdev_io->u.nvme_passthru.md_len);
		break;
	case SPDK_BDEV_IO_TYPE_ABORT:
		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
		rc = bdev_nvme_abort(nbdev_ch,
				     nbdev_io,
				     nbdev_io_to_abort);
		break;
	default:
		rc = -EINVAL;
		break;
	}

exit:
	if (spdk_unlikely(rc != 0)) {
		bdev_nvme_io_complete(nbdev_io, rc);
	}
}

static bool
bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct nvme_bdev *nbdev = ctx;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_ctrlr *ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	nvme_ns = nbdev->nvme_ns;
	assert(nvme_ns != NULL);
	ns = nvme_ns->ns;
	ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;

	case SPDK_BDEV_IO_TYPE_COMPARE:
		return spdk_nvme_ns_supports_compare(ns);

	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return spdk_nvme_ns_get_md_size(ns) ? true : false;

	case SPDK_BDEV_IO_TYPE_UNMAP:
		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
		return cdata->oncs.dsm;

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
		return cdata->oncs.write_zeroes;

	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
			return true;
		}
		return false;

	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;

	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;

	default:
		return false;
	}
}

static int
bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;
	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
	struct spdk_io_channel *pg_ch;
	int rc;

	pg_ch = spdk_get_io_channel(&g_nvme_ctrlrs);
	if (!pg_ch) {
		return -1;
	}

	ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch);

#ifdef SPDK_CONFIG_VTUNE
	ctrlr_ch->group->collect_spin_stat = true;
#else
	ctrlr_ch->group->collect_spin_stat = false;
#endif

	TAILQ_INIT(&ctrlr_ch->pending_resets);

	ctrlr_ch->ctrlr = nvme_ctrlr;

	rc = bdev_nvme_create_qpair(ctrlr_ch);
	if (rc != 0) {
		goto err_qpair;
	}

	return 0;

err_qpair:
	spdk_put_io_channel(pg_ch);

	return rc;
}

static void
bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;

	assert(ctrlr_ch->group != NULL);

	bdev_nvme_destroy_qpair(ctrlr_ch);

	spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group));
}

static void
bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
			      uint32_t iov_cnt, uint32_t seed,
			      spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
{
	struct nvme_poll_group *group = ctx;
	int rc;

	assert(group->accel_channel != NULL);
	assert(cb_fn != NULL);

	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
	if (rc) {
		/* In these two error cases, spdk_accel_submit_crc32cv() does not call
		 * the user's cb_fn, so invoke it here. */
		if (rc == -ENOMEM || rc == -EINVAL) {
			cb_fn(cb_arg, rc);
		}
		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
	}
}

static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
	.submit_accel_crc32c	= bdev_nvme_submit_accel_crc32c,
};

static int
bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
{
	struct nvme_poll_group *group = ctx_buf;

	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
	if (group->group == NULL) {
		return -1;
	}

	group->accel_channel = spdk_accel_engine_get_io_channel();
	if (!group->accel_channel) {
		spdk_nvme_poll_group_destroy(group->group);
		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
			    group);
		return -1;
	}

	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);

	if (group->poller == NULL) {
		spdk_put_io_channel(group->accel_channel);
		spdk_nvme_poll_group_destroy(group->group);
		return -1;
	}

	return 0;
}

static void
bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
{
	struct nvme_poll_group *group = ctx_buf;

	if (group->accel_channel) {
		spdk_put_io_channel(group->accel_channel);
	}

	spdk_poller_unregister(&group->poller);
	if (spdk_nvme_poll_group_destroy(group->group)) {
		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
		assert(false);
	}
}

static struct spdk_io_channel *
bdev_nvme_get_io_channel(void *ctx)
{
	struct nvme_bdev *nvme_bdev = ctx;

	return spdk_get_io_channel(nvme_bdev);
}

static void *
bdev_nvme_get_module_ctx(void *ctx)
{
	struct nvme_bdev *nvme_bdev = ctx;

	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
}

static const char *
_nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
{
	switch (ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
		return "optimized";
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return "non_optimized";
	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
		return "inaccessible";
	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
		return "persistent_loss";
	case SPDK_NVME_ANA_CHANGE_STATE:
		return "change";
	default:
		return NULL;
	}
}

static int
bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
{
	struct nvme_bdev *nbdev = ctx;
	struct spdk_memory_domain *domain;

	domain = spdk_nvme_ctrlr_get_memory_domain(nbdev->nvme_ns->ctrlr->ctrlr);

	if (domain) {
		if (array_size > 0 && domains) {
			domains[0] = domain;
		}
		return 1;
	}

	return 0;
}

static int
bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct nvme_bdev *nvme_bdev = ctx;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_ctrlr *ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	union spdk_nvme_vs_register vs;
	union spdk_nvme_csts_register csts;
	char buf[128];

	nvme_ns = nvme_bdev->nvme_ns;
	assert(nvme_ns != NULL);
	ns = nvme_ns->ns;
	ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);

	spdk_json_write_named_object_begin(w, "nvme");

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		spdk_json_write_named_string(w, "pci_address", trid->traddr);
	}

	spdk_json_write_named_object_begin(w, "trid");

	nvme_bdev_dump_trid_json(trid, w);

	spdk_json_write_object_end(w);

#ifdef SPDK_CONFIG_NVME_CUSE
	size_t cuse_name_size = 128;
	char cuse_name[cuse_name_size];

	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
					    cuse_name, &cuse_name_size);
	if (rc == 0) {
		spdk_json_write_named_string(w, "cuse_device", cuse_name);
	}
#endif

	spdk_json_write_named_object_begin(w, "ctrlr_data");

	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);

	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "model_number", buf);

	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "serial_number", buf);

	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "firmware_revision", buf);

	if (cdata->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
	}

	spdk_json_write_named_object_begin(w, "oacs");

	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "vs");

	spdk_json_write_name(w, "nvme_version");
	if (vs.bits.ter) {
		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
	} else {
		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
	}

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "csts");

	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "ns_data");

	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));

	if (cdata->cmic.ana_reporting) {
		spdk_json_write_named_string(w, "ana_state",
					     _nvme_ana_state_str(nvme_ns->ana_state));
	}

	spdk_json_write_object_end(w);

	if (cdata->oacs.security) {
		spdk_json_write_named_object_begin(w, "security");

		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);

		spdk_json_write_object_end(w);
	}

	spdk_json_write_object_end(w);

	return 0;
}

static void
bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* No config per bdev needed */
}

static uint64_t
bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
{
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
	struct nvme_poll_group *group = ctrlr_ch->group;
	uint64_t spin_time;

	if (!group || !group->collect_spin_stat) {
		return 0;
	}

	if (group->end_ticks != 0) {
		group->spin_ticks += (group->end_ticks - group->start_ticks);
		group->end_ticks = 0;
	}

	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
	group->start_ticks = 0;
	group->spin_ticks = 0;

	return spin_time;
}

static const struct spdk_bdev_fn_table nvmelib_fn_table = {
	.destruct		= bdev_nvme_destruct,
	.submit_request		= bdev_nvme_submit_request,
	.io_type_supported	= bdev_nvme_io_type_supported,
	.get_io_channel		= bdev_nvme_get_io_channel,
	.dump_info_json		= bdev_nvme_dump_info_json,
	.write_config_json	= bdev_nvme_write_config_json,
	.get_spin_time		= bdev_nvme_get_spin_time,
	.get_module_ctx		= bdev_nvme_get_module_ctx,
	.get_memory_domains	= bdev_nvme_get_memory_domains,
};

typedef int (*bdev_nvme_parse_ana_log_page_cb)(
	const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);

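/*
 * Walk the ANA log page and invoke cb_fn once per ANA group descriptor.
 * Descriptors are variable length (a fixed header followed by num_of_nsid
 * 32-bit NSIDs), so each one is copied into the pre-allocated scratch buffer
 * copied_ana_desc before it is handed to the callback. A nonzero return from
 * the callback stops the walk.
 */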
static int
bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
			     bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
{
	struct spdk_nvme_ana_group_descriptor *copied_desc;
	uint8_t *orig_desc;
	uint32_t i, desc_size, copy_len;
	int rc = 0;

	if (nvme_ctrlr->ana_log_page == NULL) {
		return -EINVAL;
	}

	copied_desc = nvme_ctrlr->copied_ana_desc;

	orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
	copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page);

	for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
		memcpy(copied_desc, orig_desc, copy_len);

		rc = cb_fn(copied_desc, cb_arg);
		if (rc != 0) {
			break;
		}

		desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
			    copied_desc->num_of_nsid * sizeof(uint32_t);
		orig_desc += desc_size;
		copy_len -= desc_size;
	}

	return rc;
}

static int
nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
{
	struct nvme_ns *nvme_ns = cb_arg;
	uint32_t i;

	for (i = 0; i < desc->num_of_nsid; i++) {
		if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
			continue;
		}
		nvme_ns->ana_group_id = desc->ana_group_id;
		nvme_ns->ana_state = desc->ana_state;
		return 1;
	}

	return 0;
}

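/*
 * Fill in a bdev structure from the namespace and controller identify data:
 * the product name and zoned attributes come from the command set (NVM vs.
 * ZNS), the logical block size from the extended sector size, and the
 * physical/atomic block sizes from the NPWG, NAWUPF, and AWUPF fields as
 * described in the NVMe specification.
 */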
static int
nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
		 uint32_t prchk_flags, void *ctx)
{
	const struct spdk_uuid		*uuid;
	const uint8_t *nguid;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_ns_data	*nsdata;
	enum spdk_nvme_csi		csi;
	uint32_t atomic_bs, phys_bs, bs;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
	csi = spdk_nvme_ns_get_csi(ns);

	switch (csi) {
	case SPDK_NVME_CSI_NVM:
		disk->product_name = "NVMe disk";
		break;
	case SPDK_NVME_CSI_ZNS:
		disk->product_name = "NVMe ZNS disk";
		disk->zoned = true;
		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
					     spdk_nvme_ns_get_extended_sector_size(ns);
		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
		break;
	default:
		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
		return -ENOTSUP;
	}

	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
	if (!disk->name) {
		return -ENOMEM;
	}

	disk->write_cache = 0;
	if (cdata->vwc.present) {
		/* Enable if the Volatile Write Cache exists */
		disk->write_cache = 1;
	}
	if (cdata->oncs.write_zeroes) {
		disk->max_write_zeroes = UINT16_MAX + 1;
	}
	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);

	nguid = spdk_nvme_ns_get_nguid(ns);
	if (!nguid) {
		uuid = spdk_nvme_ns_get_uuid(ns);
		if (uuid) {
			disk->uuid = *uuid;
		}
	} else {
		memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
	}

	nsdata = spdk_nvme_ns_get_data(ns);
	bs = spdk_nvme_ns_get_sector_size(ns);
	atomic_bs = bs;
	phys_bs = bs;
	if (nsdata->nabo == 0) {
		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
			atomic_bs = bs * (1 + nsdata->nawupf);
		} else {
			atomic_bs = bs * (1 + cdata->awupf);
		}
	}
	if (nsdata->nsfeat.optperf) {
		phys_bs = bs * (1 + nsdata->npwg);
	}
	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);

	disk->md_len = spdk_nvme_ns_get_md_size(ns);
	if (disk->md_len != 0) {
		disk->md_interleave = nsdata->flbas.extended;
		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
		if (disk->dif_type != SPDK_DIF_DISABLE) {
			disk->dif_is_head_of_md = nsdata->dps.md_start;
			disk->dif_check_flags = prchk_flags;
		}
	}

	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
		disk->acwu = 0;
	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
		disk->acwu = nsdata->nacwu;
	} else {
		disk->acwu = cdata->acwu;
	}

	disk->ctxt = ctx;
	disk->fn_table = &nvmelib_fn_table;
	disk->module = &nvme_if;

	return 0;
}

static int
nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
{
	struct nvme_bdev *bdev;
	int rc;

	bdev = calloc(1, sizeof(*bdev));
	if (!bdev) {
		SPDK_ERRLOG("bdev calloc() failed\n");
		return -ENOMEM;
	}

	bdev->nvme_ns = nvme_ns;
	bdev->opal = nvme_ctrlr->opal_dev != NULL;

	rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->name, nvme_ctrlr->ctrlr,
			      nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create NVMe disk\n");
		free(bdev);
		return rc;
	}

	spdk_io_device_register(bdev,
				bdev_nvme_create_bdev_channel_cb,
				bdev_nvme_destroy_bdev_channel_cb,
				sizeof(struct nvme_bdev_channel),
				bdev->disk.name);

	rc = spdk_bdev_register(&bdev->disk);
	if (rc != 0) {
		SPDK_ERRLOG("spdk_bdev_register() failed\n");
		spdk_io_device_unregister(bdev, NULL);
		free(bdev->disk.name);
		free(bdev);
		return rc;
	}

	nvme_ns->bdev = bdev;

	return 0;
}

static bool
bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
{
	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
	const struct spdk_uuid *uuid1, *uuid2;

	nsdata1 = spdk_nvme_ns_get_data(ns1);
	nsdata2 = spdk_nvme_ns_get_data(ns2);
	uuid1 = spdk_nvme_ns_get_uuid(ns1);
	uuid2 = spdk_nvme_ns_get_uuid(ns2);

	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
	       nsdata1->eui64 == nsdata2->eui64 &&
	       uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0;
}

static void
nvme_ctrlr_populate_standard_namespace(struct nvme_ctrlr *nvme_ctrlr,
				       struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
{
	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
	struct spdk_nvme_ns	*ns;
	int			rc = 0;

	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
	if (!ns) {
		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
		rc = -EINVAL;
		goto done;
	}

	nvme_ns->ns = ns;
	nvme_ns->populated = true;
	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;

	if (nvme_ctrlr->ana_log_page != NULL) {
		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
	}

	rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
done:
	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
}

static bool
hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
		 struct spdk_nvme_ctrlr_opts *opts)
{
	struct nvme_probe_skip_entry *entry;

	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
			return false;
		}
	}

	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
	opts->disable_read_ana_log_page = true;

	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);

	return true;
}

static void
nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	if (spdk_nvme_cpl_is_error(cpl)) {
		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
			     cpl->status.sct);
		bdev_nvme_reset(nvme_ctrlr);
	} else if (cpl->cdw0 & 0x1) {
		SPDK_WARNLOG("Specified command could not be aborted.\n");
		bdev_nvme_reset(nvme_ctrlr);
	}
}

static void
timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
	   struct spdk_nvme_qpair *qpair, uint16_t cid)
{
	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
	union spdk_nvme_csts_register csts;
	int rc;

	assert(nvme_ctrlr->ctrlr == ctrlr);

	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);

	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
	 * completion recursively.
	 */
	if (nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
		if (csts.bits.cfs) {
			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
			bdev_nvme_reset(nvme_ctrlr);
			return;
		}
	}

	switch (g_opts.action_on_timeout) {
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
		if (qpair) {
			/* Don't send abort to ctrlr when reset is running. */
			pthread_mutex_lock(&nvme_ctrlr->mutex);
			if (nvme_ctrlr->resetting) {
				pthread_mutex_unlock(&nvme_ctrlr->mutex);
				SPDK_NOTICELOG("Quit abort. Ctrlr is in the process of resetting.\n");
				return;
			}
			pthread_mutex_unlock(&nvme_ctrlr->mutex);

			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
						       nvme_abort_cpl, nvme_ctrlr);
			if (rc == 0) {
				return;
			}

			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
		}

	/* FALLTHROUGH */
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
		bdev_nvme_reset(nvme_ctrlr);
		break;
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
		break;
	default:
		SPDK_ERRLOG("An invalid timeout action value is found.\n");
		break;
	}
}

1737 static void
1738 nvme_ctrlr_depopulate_standard_namespace(struct nvme_ns *nvme_ns)
1739 {
1740 	struct nvme_bdev *bdev;
1741 
1742 	bdev = nvme_ns->bdev;
1743 	if (bdev != NULL) {
1744 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1745 	}
1746 
1747 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
1748 }
1749 
1750 static void
1751 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *ctrlr, struct nvme_ns *nvme_ns,
1752 			      struct nvme_async_probe_ctx *ctx)
1753 {
1754 	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
1755 }
1756 
1757 static void
1758 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *ctrlr, struct nvme_ns *nvme_ns)
1759 {
1760 	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
1761 }
1762 
1763 void
1764 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1765 				   struct nvme_ns *nvme_ns, int rc)
1766 {
1767 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
1768 
1769 	assert(nvme_ctrlr != NULL);
1770 
1771 	if (rc == 0) {
1772 		pthread_mutex_lock(&nvme_ctrlr->mutex);
1773 		nvme_ctrlr->ref++;
1774 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1775 	} else {
1776 		memset(nvme_ns, 0, sizeof(*nvme_ns));
1777 	}
1778 
1779 	if (ctx) {
1780 		ctx->populates_in_progress--;
1781 		if (ctx->populates_in_progress == 0) {
1782 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
1783 		}
1784 	}
1785 }
1786 
1787 static void
1788 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
1789 			       struct nvme_async_probe_ctx *ctx)
1790 {
1791 	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
1792 	struct nvme_ns	*nvme_ns;
1793 	struct spdk_nvme_ns	*ns;
1794 	struct nvme_bdev	*bdev;
1795 	uint32_t		i;
1796 	int			rc;
1797 	uint64_t		num_sectors;
1798 	bool			ns_is_active;
1799 
1800 	if (ctx) {
1801 		/* Initialize this count to 1 to handle the populate functions
1802 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1803 		 */
1804 		ctx->populates_in_progress = 1;
1805 	}
1806 
1807 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
1808 		uint32_t	nsid = i + 1;
1809 
1810 		nvme_ns = nvme_ctrlr->namespaces[i];
1811 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1812 
1813 		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_NS_STANDARD) {
1814 			/* NS is still there but attributes may have changed */
1815 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1816 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
1817 			bdev = nvme_ns->bdev;
1818 			assert(bdev != NULL);
1819 			if (bdev->disk.blockcnt != num_sectors) {
1820 				SPDK_NOTICELOG("NSID %u was resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
1821 					       nsid,
1822 					       bdev->disk.name,
1823 					       bdev->disk.blockcnt,
1824 					       num_sectors);
1825 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1826 				if (rc != 0) {
1827 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1828 						    bdev->disk.name, rc);
1829 				}
1830 			}
1831 		}
1832 
1833 		if (!nvme_ns->populated && ns_is_active) {
1834 			nvme_ns->id = nsid;
1835 			nvme_ns->ctrlr = nvme_ctrlr;
1836 			nvme_ns->type = NVME_NS_STANDARD;
1837 
1838 			nvme_ns->bdev = NULL;
1839 
1840 			if (ctx) {
1841 				ctx->populates_in_progress++;
1842 			}
1843 			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns, ctx);
1844 		}
1845 
1846 		if (nvme_ns->populated && !ns_is_active) {
1847 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
1848 		}
1849 	}
1850 
1851 	if (ctx) {
1852 		/* Decrement this count now that the loop is over to account
1853 		 * for the one we started with.  If the count is then 0, we
1854 		 * know any populate_namespace functions completed immediately,
1855 		 * so we'll kick the callback here.
1856 		 */
1857 		ctx->populates_in_progress--;
1858 		if (ctx->populates_in_progress == 0) {
1859 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
1860 		}
1861 	}
1862 
1863 }
1864 
1865 static void
1866 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
1867 {
1868 	uint32_t i;
1869 	struct nvme_ns *nvme_ns;
1870 
1871 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
1872 		uint32_t nsid = i + 1;
1873 
1874 		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
1875 		if (nvme_ns->populated) {
1876 			assert(nvme_ns->id == nsid);
1877 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
1878 		}
1879 	}
1880 }
1881 
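/* Take a reference on the controller, failing if destruction or a reset is in
 * progress.  A successful acquire must be paired with nvme_ctrlr_release();
 * see nvme_ctrlr_read_ana_log_page() for a caller that releases the reference
 * from its completion path.
 */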
1882 static bool
1883 nvme_ctrlr_acquire(struct nvme_ctrlr *nvme_ctrlr)
1884 {
1885 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1886 	if (nvme_ctrlr->destruct || nvme_ctrlr->resetting) {
1887 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1888 		return false;
1889 	}
1890 	nvme_ctrlr->ref++;
1891 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1892 	return true;
1893 }
1894 
1895 static int
1896 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
1897 			  void *cb_arg)
1898 {
1899 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
1900 	struct nvme_ns *nvme_ns;
1901 	uint32_t i, nsid;
1902 
1903 	for (i = 0; i < desc->num_of_nsid; i++) {
1904 		nsid = desc->nsid[i];
1905 		if (nsid == 0 || nsid > nvme_ctrlr->num_ns) {
1906 			continue;
1907 		}
1908 
1909 		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
1910 		assert(nvme_ns != NULL);
1911 
1912 		if (!nvme_ns->populated) {
1913 			continue;
1914 		}
1915 
1916 		nvme_ns->ana_group_id = desc->ana_group_id;
1917 		nvme_ns->ana_state = desc->ana_state;
1918 	}
1919 
1920 	return 0;
1921 }
1922 
1923 static void
1924 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
1925 {
1926 	struct nvme_ctrlr *nvme_ctrlr = ctx;
1927 
1928 	if (spdk_nvme_cpl_is_success(cpl)) {
1929 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
1930 					     nvme_ctrlr);
1931 	}
1932 
1933 	nvme_ctrlr_release(nvme_ctrlr);
1934 }
1935 
1936 static void
1937 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
1938 {
1939 	int rc;
1940 
1941 	if (nvme_ctrlr->ana_log_page == NULL) {
1942 		return;
1943 	}
1944 
1945 	if (!nvme_ctrlr_acquire(nvme_ctrlr)) {
1946 		return;
1947 	}
1948 
1949 	rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
1950 					      SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
1951 					      SPDK_NVME_GLOBAL_NS_TAG,
1952 					      nvme_ctrlr->ana_log_page,
1953 					      nvme_ctrlr->ana_log_page_size, 0,
1954 					      nvme_ctrlr_read_ana_log_page_done,
1955 					      nvme_ctrlr);
1956 	if (rc != 0) {
1957 		nvme_ctrlr_release(nvme_ctrlr);
1958 	}
1959 }
1960 
1961 static void
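/* Asynchronous Event callback: a namespace-attribute-change notice triggers
 * namespace repopulation, and an ANA-change notice triggers a re-read of the
 * ANA log page.
 */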
1962 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1963 {
1964 	struct nvme_ctrlr *nvme_ctrlr		= arg;
1965 	union spdk_nvme_async_event_completion	event;
1966 
1967 	if (spdk_nvme_cpl_is_error(cpl)) {
1968 		SPDK_WARNLOG("AER request execution failed\n");
1969 		return;
1970 	}
1971 
1972 	event.raw = cpl->cdw0;
1973 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1974 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1975 		nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
1976 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1977 		   (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
1978 		nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
1979 	}
1980 }
1981 
1982 static void
1983 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1984 {
1985 	if (ctx->cb_fn) {
1986 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1987 	}
1988 
1989 	ctx->namespaces_populated = true;
1990 	if (ctx->probe_done) {
1991 		/* The probe was already completed, so we need to free the context
1992 		 * here.  This can happen for cases like OCSSD, where we need to
1993 		 * send additional commands to the SSD after attach.
1994 		 */
1995 		free(ctx);
1996 	}
1997 }
1998 
1999 static void
2000 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
2001 		       struct nvme_async_probe_ctx *ctx)
2002 {
2003 	spdk_io_device_register(nvme_ctrlr,
2004 				bdev_nvme_create_ctrlr_channel_cb,
2005 				bdev_nvme_destroy_ctrlr_channel_cb,
2006 				sizeof(struct nvme_ctrlr_channel),
2007 				nvme_ctrlr->name);
2008 
2009 	nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
2010 }
2011 
2012 static void
2013 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
2014 {
2015 	struct nvme_ctrlr *nvme_ctrlr = _ctx;
2016 	struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
2017 
2018 	nvme_ctrlr->probe_ctx = NULL;
2019 
2020 	if (spdk_nvme_cpl_is_error(cpl)) {
2021 		nvme_ctrlr_delete(nvme_ctrlr);
2022 
2023 		if (ctx != NULL) {
2024 			populate_namespaces_cb(ctx, 0, -1);
2025 		}
2026 		return;
2027 	}
2028 
2029 	nvme_ctrlr_create_done(nvme_ctrlr, ctx);
2030 }
2031 
2032 static int
2033 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
2034 			     struct nvme_async_probe_ctx *ctx)
2035 {
2036 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
2037 	const struct spdk_nvme_ctrlr_data *cdata;
2038 	uint32_t ana_log_page_size;
2039 
2040 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2041 
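	/* Worst-case ANA log page size per the NVMe specification: the fixed
	 * header, one group descriptor per ANA group (NANAGRPID), and one
	 * 32-bit NSID entry per namespace (NN).
	 */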
2042 	ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
2043 			    sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn *
2044 			    sizeof(uint32_t);
2045 
2046 	nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
2047 						SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
2048 	if (nvme_ctrlr->ana_log_page == NULL) {
2049 		SPDK_ERRLOG("could not allocate ANA log page buffer\n");
2050 		return -ENXIO;
2051 	}
2052 
2053 	/* Descriptors in an ANA log page are not guaranteed to be 8-byte aligned.
2054 	 * Hence copy each descriptor to a temporary area when parsing it.
2055 	 *
2056 	 * Allocate a buffer as large as the ANA log page buffer itself because
2057 	 * we do not know the size of a descriptor until actually reading it.
2058 	 */
2059 	nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
2060 	if (nvme_ctrlr->copied_ana_desc == NULL) {
2061 		SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
2062 		return -ENOMEM;
2063 	}
2064 
2065 	nvme_ctrlr->ana_log_page_size = ana_log_page_size;
2066 
2067 	nvme_ctrlr->probe_ctx = ctx;
2068 
2069 	return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
2070 						SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
2071 						SPDK_NVME_GLOBAL_NS_TAG,
2072 						nvme_ctrlr->ana_log_page,
2073 						nvme_ctrlr->ana_log_page_size, 0,
2074 						nvme_ctrlr_init_ana_log_page_done,
2075 						nvme_ctrlr);
2076 }
2077 
2078 static int
2079 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
2080 		  const char *name,
2081 		  const struct spdk_nvme_transport_id *trid,
2082 		  uint32_t prchk_flags,
2083 		  struct nvme_async_probe_ctx *ctx)
2084 {
2085 	struct nvme_ctrlr *nvme_ctrlr;
2086 	struct nvme_ctrlr_trid *trid_entry;
2087 	uint32_t i, num_ns;
2088 	const struct spdk_nvme_ctrlr_data *cdata;
2089 	int rc;
2090 
2091 	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
2092 	if (nvme_ctrlr == NULL) {
2093 		SPDK_ERRLOG("Failed to allocate device struct\n");
2094 		return -ENOMEM;
2095 	}
2096 
2097 	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
2098 	if (rc != 0) {
2099 		free(nvme_ctrlr);
2100 		return rc;
2101 	}
2102 
2103 	TAILQ_INIT(&nvme_ctrlr->trids);
2104 
2105 	num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
2106 	if (num_ns != 0) {
2107 		nvme_ctrlr->namespaces = calloc(num_ns, sizeof(struct nvme_ns *));
2108 		if (!nvme_ctrlr->namespaces) {
2109 			SPDK_ERRLOG("Failed to allocate namespace pointer array\n");
2110 			rc = -ENOMEM;
2111 			goto err;
2112 		}
2113 
2114 		for (i = 0; i < num_ns; i++) {
2115 			nvme_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_ns));
2116 			if (nvme_ctrlr->namespaces[i] == NULL) {
2117 				SPDK_ERRLOG("Failed to allocate namespace struct\n");
2118 				rc = -ENOMEM;
2119 				goto err;
2120 			}
2121 			nvme_ctrlr->num_ns++;
2122 		}
2123 
2124 		assert(num_ns == nvme_ctrlr->num_ns);
2125 	}
2126 
2127 	trid_entry = calloc(1, sizeof(*trid_entry));
2128 	if (trid_entry == NULL) {
2129 		SPDK_ERRLOG("Failed to allocate trid entry\n");
2130 		rc = -ENOMEM;
2131 		goto err;
2132 	}
2133 
2134 	trid_entry->trid = *trid;
2135 	nvme_ctrlr->connected_trid = &trid_entry->trid;
2136 	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, trid_entry, link);
2137 
2138 	nvme_ctrlr->thread = spdk_get_thread();
2139 	nvme_ctrlr->ctrlr = ctrlr;
2140 	nvme_ctrlr->ref = 1;
2141 	nvme_ctrlr->name = strdup(name);
2142 	if (nvme_ctrlr->name == NULL) {
2143 		rc = -ENOMEM;
2144 		goto err;
2145 	}
2146 
2147 	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
2148 		SPDK_ERRLOG("OCSSDs are not supported\n");
2149 		rc = -ENOTSUP;
2150 		goto err;
2151 	}
2152 
2153 	nvme_ctrlr->prchk_flags = prchk_flags;
2154 
2155 	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
2156 					  g_opts.nvme_adminq_poll_period_us);
2157 
2158 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2159 	TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, nvme_ctrlr, tailq);
2160 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2161 
2162 	if (g_opts.timeout_us > 0) {
2163 		/* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
2164 		/* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
2165 		uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
2166 					  g_opts.timeout_us : g_opts.timeout_admin_us;
2167 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
2168 				adm_timeout_us, timeout_cb, nvme_ctrlr);
2169 	}
2170 
2171 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
2172 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
2173 
2174 	if (spdk_nvme_ctrlr_get_flags(ctrlr) &
2175 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
2176 		nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
2177 	}
2178 
2179 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2180 
2181 	if (cdata->cmic.ana_reporting) {
2182 		rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
2183 		if (rc == 0) {
2184 			return 0;
2185 		}
2186 	} else {
2187 		nvme_ctrlr_create_done(nvme_ctrlr, ctx);
2188 		return 0;
2189 	}
2190 
2191 err:
2192 	nvme_ctrlr_delete(nvme_ctrlr);
2193 	return rc;
2194 }
2195 
2196 static void
2197 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2198 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2199 {
2200 	struct nvme_probe_ctx *ctx = cb_ctx;
2201 	char *name = NULL;
2202 	uint32_t prchk_flags = 0;
2203 	size_t i;
2204 
2205 	if (ctx) {
2206 		for (i = 0; i < ctx->count; i++) {
2207 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
2208 				prchk_flags = ctx->prchk_flags[i];
2209 				name = strdup(ctx->names[i]);
2210 				break;
2211 			}
2212 		}
2213 	} else {
2214 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
2215 	}
2216 	if (!name) {
2217 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
2218 		return;
2219 	}
2220 
2221 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
2222 
2223 	nvme_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL);
2224 
2225 	free(name);
2226 }
2227 
2228 static void
2229 _nvme_ctrlr_destruct(void *ctx)
2230 {
2231 	struct nvme_ctrlr *nvme_ctrlr = ctx;
2232 
2233 	nvme_ctrlr_depopulate_namespaces(nvme_ctrlr);
2234 	nvme_ctrlr_release(nvme_ctrlr);
2235 }
2236 
2237 static int
2238 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
2239 {
2240 	struct nvme_probe_skip_entry *entry;
2241 
2242 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2243 
2244 	/* The controller's destruction was already started */
2245 	if (nvme_ctrlr->destruct) {
2246 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2247 		return 0;
2248 	}
2249 
2250 	if (!hotplug &&
2251 	    nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2252 		entry = calloc(1, sizeof(*entry));
2253 		if (!entry) {
2254 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
2255 			return -ENOMEM;
2256 		}
2257 		entry->trid = *nvme_ctrlr->connected_trid;
2258 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
2259 	}
2260 
2261 	nvme_ctrlr->destruct = true;
2262 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2263 
2264 	_nvme_ctrlr_destruct(nvme_ctrlr);
2265 
2266 	return 0;
2267 }
2268 
2269 static void
2270 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
2271 {
2272 	struct nvme_ctrlr *nvme_ctrlr = cb_ctx;
2273 
2274 	_bdev_nvme_delete(nvme_ctrlr, true);
2275 }
2276 
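/* PCIe hotplug uses two pollers: bdev_nvme_hotplug() periodically starts an
 * asynchronous probe, and bdev_nvme_hotplug_probe() (registered at a 1 ms
 * period) drives that probe to completion before unregistering itself.
 */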
2277 static int
2278 bdev_nvme_hotplug_probe(void *arg)
2279 {
2280 	if (g_hotplug_probe_ctx == NULL) {
2281 		spdk_poller_unregister(&g_hotplug_probe_poller);
2282 		return SPDK_POLLER_IDLE;
2283 	}
2284 
2285 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
2286 		g_hotplug_probe_ctx = NULL;
2287 		spdk_poller_unregister(&g_hotplug_probe_poller);
2288 	}
2289 
2290 	return SPDK_POLLER_BUSY;
2291 }
2292 
2293 static int
2294 bdev_nvme_hotplug(void *arg)
2295 {
2296 	struct spdk_nvme_transport_id trid_pcie;
2297 
2298 	if (g_hotplug_probe_ctx) {
2299 		return SPDK_POLLER_BUSY;
2300 	}
2301 
2302 	memset(&trid_pcie, 0, sizeof(trid_pcie));
2303 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
2304 
2305 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
2306 			      hotplug_probe_cb, attach_cb, NULL);
2307 
2308 	if (g_hotplug_probe_ctx) {
2309 		assert(g_hotplug_probe_poller == NULL);
2310 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
2311 	}
2312 
2313 	return SPDK_POLLER_BUSY;
2314 }
2315 
2316 void
2317 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
2318 {
2319 	*opts = g_opts;
2320 }
2321 
2322 static int
2323 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
2324 {
2325 	if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
2326 		/* Can't set timeout_admin_us without also setting timeout_us */
2327 		SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
2328 		return -EINVAL;
2329 	}
2330 
2331 	return 0;
2332 }
2333 
2334 int
2335 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
2336 {
2337 	int ret = bdev_nvme_validate_opts(opts);
2338 	if (ret) {
2339 		SPDK_WARNLOG("Failed to set nvme opts.\n");
2340 		return ret;
2341 	}
2342 
2343 	if (g_bdev_nvme_init_thread != NULL) {
2344 		if (!TAILQ_EMPTY(&g_nvme_ctrlrs)) {
2345 			return -EPERM;
2346 		}
2347 	}
2348 
2349 	g_opts = *opts;
2350 
2351 	return 0;
2352 }
2353 
2354 struct set_nvme_hotplug_ctx {
2355 	uint64_t period_us;
2356 	bool enabled;
2357 	spdk_msg_fn fn;
2358 	void *fn_ctx;
2359 };
2360 
2361 static void
2362 set_nvme_hotplug_period_cb(void *_ctx)
2363 {
2364 	struct set_nvme_hotplug_ctx *ctx = _ctx;
2365 
2366 	spdk_poller_unregister(&g_hotplug_poller);
2367 	if (ctx->enabled) {
2368 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
2369 	}
2370 
2371 	g_nvme_hotplug_poll_period_us = ctx->period_us;
2372 	g_nvme_hotplug_enabled = ctx->enabled;
2373 	if (ctx->fn) {
2374 		ctx->fn(ctx->fn_ctx);
2375 	}
2376 
2377 	free(ctx);
2378 }
2379 
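/* Enable or disable the PCIe hotplug poller on the bdev_nvme init thread.
 * A minimal usage sketch; the callback name and its argument are hypothetical,
 * not part of this module:
 *
 *   static void on_hotplug_set(void *ctx) { ... }
 *   ...
 *   rc = bdev_nvme_set_hotplug(true, 100 * 1000, on_hotplug_set, NULL);
 *
 * A period_us of 0 selects NVME_HOTPLUG_POLL_PERIOD_DEFAULT, and the period
 * is capped at NVME_HOTPLUG_POLL_PERIOD_MAX.
 */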
2380 int
2381 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
2382 {
2383 	struct set_nvme_hotplug_ctx *ctx;
2384 
2385 	if (enabled == true && !spdk_process_is_primary()) {
2386 		return -EPERM;
2387 	}
2388 
2389 	ctx = calloc(1, sizeof(*ctx));
2390 	if (ctx == NULL) {
2391 		return -ENOMEM;
2392 	}
2393 
2394 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
2395 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
2396 	ctx->enabled = enabled;
2397 	ctx->fn = cb;
2398 	ctx->fn_ctx = cb_ctx;
2399 
2400 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
2401 	return 0;
2402 }
2403 
2404 static void
2405 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
2406 				    struct nvme_async_probe_ctx *ctx)
2407 {
2408 	struct nvme_ns	*nvme_ns;
2409 	struct nvme_bdev	*nvme_bdev;
2410 	uint32_t		i, nsid;
2411 	size_t			j;
2412 
2413 	assert(nvme_ctrlr != NULL);
2414 
2415 	/*
2416 	 * Report the new bdevs that were created in this call.
2417 	 * There can be more than one bdev per NVMe controller.
2418 	 */
2419 	j = 0;
2420 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
2421 		nsid = i + 1;
2422 		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
2423 		if (!nvme_ns->populated) {
2424 			continue;
2425 		}
2426 		assert(nvme_ns->id == nsid);
2427 		nvme_bdev = nvme_ns->bdev;
2428 		if (nvme_bdev == NULL) {
2429 			assert(nvme_ns->type == NVME_NS_OCSSD);
2430 			continue;
2431 		}
2432 		if (j < ctx->count) {
2433 			ctx->names[j] = nvme_bdev->disk.name;
2434 			j++;
2435 		} else {
2436 			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
2437 				    ctx->count);
2438 			populate_namespaces_cb(ctx, 0, -ERANGE);
2439 			return;
2440 		}
2441 	}
2442 
2443 	populate_namespaces_cb(ctx, j, 0);
2444 }
2445 
2446 static int
2447 bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr,
2448 			struct spdk_nvme_ctrlr *new_ctrlr,
2449 			struct spdk_nvme_transport_id *trid)
2450 {
2451 	struct nvme_ctrlr_trid *tmp_trid;
2452 
2453 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2454 		SPDK_ERRLOG("PCIe failover is not supported.\n");
2455 		return -ENOTSUP;
2456 	}
2457 
2458 	/* Currently we only support failover to the same transport type. */
2459 	if (nvme_ctrlr->connected_trid->trtype != trid->trtype) {
2460 		return -EINVAL;
2461 	}
2462 
2463 	/* Currently we only support failover to the same NQN. */
2464 	if (strncmp(trid->subnqn, nvme_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
2465 		return -EINVAL;
2466 	}
2467 
2468 	/* Skip all the other checks if we've already registered this path. */
2469 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
2470 		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
2471 			return -EEXIST;
2472 		}
2473 	}
2474 
2475 	return 0;
2476 }
2477 
2478 static int
2479 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr,
2480 			     struct spdk_nvme_ctrlr *new_ctrlr)
2481 {
2482 	uint32_t i, nsid;
2483 	struct nvme_ns *nvme_ns;
2484 	struct spdk_nvme_ns *new_ns;
2485 
2486 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_ctrlr->num_ns) {
2487 		return -EINVAL;
2488 	}
2489 
2490 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
2491 		nsid = i + 1;
2492 
2493 		nvme_ns = nvme_ctrlr->namespaces[i];
2494 		if (!nvme_ns->populated) {
2495 			continue;
2496 		}
2497 
2498 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
2499 		assert(new_ns != NULL);
2500 
2501 		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
2502 			return -EINVAL;
2503 		}
2504 	}
2505 
2506 	return 0;
2507 }
2508 
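/* Insert the new path before the first path already marked failed so that
 * failover tries paths that are not known to be bad first.
 */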
2509 static int
2510 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2511 			      struct spdk_nvme_transport_id *trid)
2512 {
2513 	struct nvme_ctrlr_trid *new_trid, *tmp_trid;
2514 
2515 	new_trid = calloc(1, sizeof(*new_trid));
2516 	if (new_trid == NULL) {
2517 		return -ENOMEM;
2518 	}
2519 	new_trid->trid = *trid;
2520 	new_trid->is_failed = false;
2521 
2522 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
2523 		if (tmp_trid->is_failed) {
2524 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
2525 			return 0;
2526 		}
2527 	}
2528 
2529 	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
2530 	return 0;
2531 }
2532 
2533 /* This is the case that a secondary path is added to an existing
2534  * nvme_ctrlr for failover. After checking if it can access the same
2535  * namespaces as the primary path, it is disconnected until failover occurs.
2536  */
2537 static int
2538 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2539 			     struct spdk_nvme_ctrlr *new_ctrlr,
2540 			     struct spdk_nvme_transport_id *trid)
2541 {
2542 	int rc;
2543 
2544 	assert(nvme_ctrlr != NULL);
2545 
2546 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2547 
2548 	rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid);
2549 	if (rc != 0) {
2550 		goto exit;
2551 	}
2552 
2553 	rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr);
2554 	if (rc != 0) {
2555 		goto exit;
2556 	}
2557 
2558 	rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
2559 
2560 exit:
2561 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2562 
2563 	spdk_nvme_detach(new_ctrlr);
2564 
2565 	return rc;
2566 }
2567 
2568 static void
2569 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2570 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2571 {
2572 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
2573 	struct nvme_async_probe_ctx *ctx;
2574 	int rc;
2575 
2576 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2577 	ctx->ctrlr_attached = true;
2578 
2579 	rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx);
2580 	if (rc != 0) {
2581 		populate_namespaces_cb(ctx, 0, rc);
2582 	}
2583 }
2584 
2585 static void
2586 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2587 			struct spdk_nvme_ctrlr *ctrlr,
2588 			const struct spdk_nvme_ctrlr_opts *opts)
2589 {
2590 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
2591 	struct nvme_ctrlr *nvme_ctrlr;
2592 	struct nvme_async_probe_ctx *ctx;
2593 	int rc;
2594 
2595 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2596 	ctx->ctrlr_attached = true;
2597 
2598 	nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
2599 	if (nvme_ctrlr) {
2600 		rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
2601 	} else {
2602 		rc = -ENODEV;
2603 	}
2604 
2605 	populate_namespaces_cb(ctx, 0, rc);
2606 }
2607 
2608 static int
2609 bdev_nvme_async_poll(void *arg)
2610 {
2611 	struct nvme_async_probe_ctx	*ctx = arg;
2612 	int				rc;
2613 
2614 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
2615 	if (spdk_unlikely(rc != -EAGAIN)) {
2616 		ctx->probe_done = true;
2617 		spdk_poller_unregister(&ctx->poller);
2618 		if (!ctx->ctrlr_attached) {
2619 			/* The probe is done, but no controller was attached.
2620 			 * That means we had a failure, so report -EIO back to
2621 			 * the caller (usually the RPC). populate_namespaces_cb()
2622 			 * will take care of freeing the nvme_async_probe_ctx.
2623 			 */
2624 			populate_namespaces_cb(ctx, 0, -EIO);
2625 		} else if (ctx->namespaces_populated) {
2626 			/* The namespaces for the attached controller were all
2627 			 * populated and the response was already sent to the
2628 			 * caller (usually the RPC).  So free the context here.
2629 			 */
2630 			free(ctx);
2631 		}
2632 	}
2633 
2634 	return SPDK_POLLER_BUSY;
2635 }
2636 
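/* Attach a controller and create bdevs for its active namespaces.  A minimal
 * caller sketch, assuming a populated trid/hostid; the callback name and the
 * array size are illustrative only:
 *
 *   const char *names[32];
 *   rc = bdev_nvme_create(&trid, &hostid, "Nvme0", names, 32, 0,
 *                         create_done_cb, cb_ctx, NULL);
 *
 * The call is asynchronous: create_done_cb() receives the number of bdevs
 * created once namespace population finishes.
 */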
2637 int
2638 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
2639 		 struct spdk_nvme_host_id *hostid,
2640 		 const char *base_name,
2641 		 const char **names,
2642 		 uint32_t count,
2643 		 uint32_t prchk_flags,
2644 		 spdk_bdev_create_nvme_fn cb_fn,
2645 		 void *cb_ctx,
2646 		 struct spdk_nvme_ctrlr_opts *opts)
2647 {
2648 	struct nvme_probe_skip_entry	*entry, *tmp;
2649 	struct nvme_async_probe_ctx	*ctx;
2650 	spdk_nvme_attach_cb attach_cb;
2651 
2652 	/* TODO expand this check to include both the host and target TRIDs.
2653 	 * Only if both are the same should we fail.
2654 	 */
2655 	if (nvme_ctrlr_get(trid) != NULL) {
2656 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
2657 		return -EEXIST;
2658 	}
2659 
2660 	ctx = calloc(1, sizeof(*ctx));
2661 	if (!ctx) {
2662 		return -ENOMEM;
2663 	}
2664 	ctx->base_name = base_name;
2665 	ctx->names = names;
2666 	ctx->count = count;
2667 	ctx->cb_fn = cb_fn;
2668 	ctx->cb_ctx = cb_ctx;
2669 	ctx->prchk_flags = prchk_flags;
2670 	ctx->trid = *trid;
2671 
2672 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2673 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
2674 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2675 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2676 				free(entry);
2677 				break;
2678 			}
2679 		}
2680 	}
2681 
2682 	if (opts) {
2683 		memcpy(&ctx->opts, opts, sizeof(*opts));
2684 	} else {
2685 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
2686 	}
2687 
2688 	ctx->opts.transport_retry_count = g_opts.retry_count;
2689 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
2690 	ctx->opts.disable_read_ana_log_page = true;
2691 
2692 	if (hostid->hostaddr[0] != '\0') {
2693 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
2694 	}
2695 
2696 	if (hostid->hostsvcid[0] != '\0') {
2697 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
2698 	}
2699 
2700 	if (nvme_ctrlr_get_by_name(base_name) == NULL) {
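	/* If a controller with this name already exists, treat this call as
	 * adding a secondary (failover) path rather than creating a new
	 * controller.
	 */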
2701 		attach_cb = connect_attach_cb;
2702 	} else {
2703 		attach_cb = connect_set_failover_cb;
2704 	}
2705 
2706 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, attach_cb);
2707 	if (ctx->probe_ctx == NULL) {
2708 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
2709 		free(ctx);
2710 		return -ENODEV;
2711 	}
2712 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
2713 
2714 	return 0;
2715 }
2716 
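/* Remove a registered alternate path.  The currently connected path cannot be
 * deleted directly; bdev_nvme_delete() fails over first in that case.
 */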
2717 static int
2718 bdev_nvme_delete_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2719 				const struct spdk_nvme_transport_id *trid)
2720 {
2721 	struct nvme_ctrlr_trid	*ctrlr_trid, *tmp_trid;
2722 
2723 	if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) {
2724 		return -EBUSY;
2725 	}
2726 
2727 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_ctrlr->trids, link, tmp_trid) {
2728 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
2729 			TAILQ_REMOVE(&nvme_ctrlr->trids, ctrlr_trid, link);
2730 			free(ctrlr_trid);
2731 			return 0;
2732 		}
2733 	}
2734 
2735 	return -ENXIO;
2736 }
2737 
2738 int
2739 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid)
2740 {
2741 	struct nvme_ctrlr	*nvme_ctrlr;
2742 	struct nvme_ctrlr_trid	*ctrlr_trid;
2743 
2744 	if (name == NULL) {
2745 		return -EINVAL;
2746 	}
2747 
2748 	nvme_ctrlr = nvme_ctrlr_get_by_name(name);
2749 	if (nvme_ctrlr == NULL) {
2750 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2751 		return -ENODEV;
2752 	}
2753 
2754 	/* case 1: remove the controller itself. */
2755 	if (trid == NULL) {
2756 		return _bdev_nvme_delete(nvme_ctrlr, false);
2757 	}
2758 
2759 	/* case 2: we are currently using the path to be removed. */
2760 	if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) {
2761 		ctrlr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
2762 		assert(nvme_ctrlr->connected_trid == &ctrlr_trid->trid);
2763 		/* case 2A: the current path is the only path. */
2764 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
2765 			return _bdev_nvme_delete(nvme_ctrlr, false);
2766 		}
2767 
2768 		/* case 2B: there is an alternative path. */
2769 		return bdev_nvme_failover(nvme_ctrlr, true);
2770 	}
2771 
2772 	/* case 3: We are not using the specified path. */
2773 	return bdev_nvme_delete_secondary_trid(nvme_ctrlr, trid);
2774 }
2775 
2776 static int
2777 bdev_nvme_library_init(void)
2778 {
2779 	g_bdev_nvme_init_thread = spdk_get_thread();
2780 
2781 	spdk_io_device_register(&g_nvme_ctrlrs, bdev_nvme_create_poll_group_cb,
2782 				bdev_nvme_destroy_poll_group_cb,
2783 				sizeof(struct nvme_poll_group),  "nvme_poll_groups");
2784 
2785 	return 0;
2786 }
2787 
2788 static void
2789 bdev_nvme_library_fini(void)
2790 {
2791 	struct nvme_ctrlr *nvme_ctrlr, *tmp;
2792 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2793 
2794 	spdk_poller_unregister(&g_hotplug_poller);
2795 	free(g_hotplug_probe_ctx);
2796 	g_hotplug_probe_ctx = NULL;
2797 
2798 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2799 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2800 		free(entry);
2801 	}
2802 
2803 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2804 	TAILQ_FOREACH_SAFE(nvme_ctrlr, &g_nvme_ctrlrs, tailq, tmp) {
2805 		pthread_mutex_lock(&nvme_ctrlr->mutex);
2806 		if (nvme_ctrlr->destruct) {
2807 			/* This controller's destruction was already started
2808 			 * before the application started shutting down
2809 			 */
2810 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
2811 			continue;
2812 		}
2813 		nvme_ctrlr->destruct = true;
2814 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2815 
2816 		spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
2817 				     nvme_ctrlr);
2818 	}
2819 
2820 	g_bdev_nvme_module_finish = true;
2821 	if (TAILQ_EMPTY(&g_nvme_ctrlrs)) {
2822 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2823 		spdk_io_device_unregister(&g_nvme_ctrlrs, NULL);
2824 		spdk_bdev_module_fini_done();
2825 		return;
2826 	}
2827 
2828 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2829 }
2830 
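/* When the controller reports a PI (protection information) error, re-verify
 * the payload on the host with spdk_dif_verify()/spdk_dix_verify() so the log
 * shows which block and which check type actually failed.
 */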
2831 static void
2832 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
2833 {
2834 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2835 	struct spdk_bdev *bdev = bdev_io->bdev;
2836 	struct spdk_dif_ctx dif_ctx;
2837 	struct spdk_dif_error err_blk = {};
2838 	int rc;
2839 
2840 	rc = spdk_dif_ctx_init(&dif_ctx,
2841 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2842 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2843 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2844 	if (rc != 0) {
2845 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2846 		return;
2847 	}
2848 
2849 	if (bdev->md_interleave) {
2850 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2851 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2852 	} else {
2853 		struct iovec md_iov = {
2854 			.iov_base	= bdev_io->u.bdev.md_buf,
2855 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2856 		};
2857 
2858 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2859 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2860 	}
2861 
2862 	if (rc != 0) {
2863 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2864 			    err_blk.err_type, err_blk.err_offset);
2865 	} else {
2866 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2867 	}
2868 }
2869 
2870 static void
2871 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2872 {
2873 	struct nvme_bdev_io *bio = ref;
2874 
2875 	if (spdk_nvme_cpl_is_success(cpl)) {
2876 		/* Run PI verification for read data buffer. */
2877 		bdev_nvme_verify_pi_error(bio);
2878 	}
2879 
2880 	/* Return original completion status */
2881 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
2882 }
2883 
2884 static void
2885 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2886 {
2887 	struct nvme_bdev_io *bio = ref;
2888 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2889 	struct nvme_bdev_channel *nbdev_ch;
2890 	struct spdk_nvme_ns *ns;
2891 	struct spdk_nvme_qpair *qpair;
2892 	int ret;
2893 
2894 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2895 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2896 			    cpl->status.sct, cpl->status.sc);
2897 
2898 		/* Save completion status to use after verifying PI error. */
2899 		bio->cpl = *cpl;
2900 
2901 		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2902 
2903 		if (spdk_likely(bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
2904 			/* Read without PI checking to verify PI error. */
2905 			ret = bdev_nvme_no_pi_readv(ns,
2906 						    qpair,
2907 						    bio,
2908 						    bdev_io->u.bdev.iovs,
2909 						    bdev_io->u.bdev.iovcnt,
2910 						    bdev_io->u.bdev.md_buf,
2911 						    bdev_io->u.bdev.num_blocks,
2912 						    bdev_io->u.bdev.offset_blocks);
2913 			if (ret == 0) {
2914 				return;
2915 			}
2916 		}
2917 	}
2918 
2919 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2920 }
2921 
2922 static void
2923 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2924 {
2925 	struct nvme_bdev_io *bio = ref;
2926 
2927 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2928 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2929 			    cpl->status.sct, cpl->status.sc);
2930 		/* Run PI verification for write data buffer if PI error is detected. */
2931 		bdev_nvme_verify_pi_error(bio);
2932 	}
2933 
2934 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2935 }
2936 
2937 static void
2938 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2939 {
2940 	struct nvme_bdev_io *bio = ref;
2941 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2942 
2943 	/* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
2944 	 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
2945 	 */
2946 	bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
2947 
2948 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2949 		SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
2950 			    cpl->status.sct, cpl->status.sc);
2951 		/* Run PI verification for zone append data buffer if PI error is detected. */
2952 		bdev_nvme_verify_pi_error(bio);
2953 	}
2954 
2955 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2956 }
2957 
2958 static void
2959 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2960 {
2961 	struct nvme_bdev_io *bio = ref;
2962 
2963 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2964 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2965 			    cpl->status.sct, cpl->status.sc);
2966 		/* Run PI verification for compare data buffer if PI error is detected. */
2967 		bdev_nvme_verify_pi_error(bio);
2968 	}
2969 
2970 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2971 }
2972 
2973 static void
2974 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2975 {
2976 	struct nvme_bdev_io *bio = ref;
2977 
2978 	/* Compare operation completion */
2979 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2980 		/* Save compare result for write callback */
2981 		bio->cpl = *cpl;
2982 		return;
2983 	}
2984 
2985 	/* Write operation completion */
2986 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2987 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2988 		 * complete the IO with the compare operation's status.
2989 		 */
2990 		if (!spdk_nvme_cpl_is_error(cpl)) {
2991 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2992 		}
2993 
2994 		bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
2995 	} else {
2996 		bdev_nvme_io_complete_nvme_status(bio, cpl);
2997 	}
2998 }
2999 
3000 static void
3001 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
3002 {
3003 	struct nvme_bdev_io *bio = ref;
3004 
3005 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3006 }
3007 
3008 static int
3009 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
3010 {
3011 	switch (desc->zs) {
3012 	case SPDK_NVME_ZONE_STATE_EMPTY:
3013 		info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
3014 		break;
3015 	case SPDK_NVME_ZONE_STATE_IOPEN:
3016 		info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
3017 		break;
3018 	case SPDK_NVME_ZONE_STATE_EOPEN:
3019 		info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
3020 		break;
3021 	case SPDK_NVME_ZONE_STATE_CLOSED:
3022 		info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
3023 		break;
3024 	case SPDK_NVME_ZONE_STATE_RONLY:
3025 		info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
3026 		break;
3027 	case SPDK_NVME_ZONE_STATE_FULL:
3028 		info->state = SPDK_BDEV_ZONE_STATE_FULL;
3029 		break;
3030 	case SPDK_NVME_ZONE_STATE_OFFLINE:
3031 		info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
3032 		break;
3033 	default:
3034 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
3035 		return -EIO;
3036 	}
3037 
3038 	info->zone_id = desc->zslba;
3039 	info->write_pointer = desc->wp;
3040 	info->capacity = desc->zcap;
3041 
3042 	return 0;
3043 }
3044 
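/* Completion for one zone-report chunk.  Copy the returned descriptors into
 * the caller's buffer and, if more zones were requested than fit in a single
 * report, re-issue spdk_nvme_zns_report_zones() for the next chunk.
 */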
3045 static void
3046 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
3047 {
3048 	struct nvme_bdev_io *bio = ref;
3049 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3050 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
3051 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
3052 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
3053 	uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
3054 	struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
3055 	uint64_t max_zones_per_buf, i;
3056 	uint32_t zone_report_bufsize;
3057 	struct spdk_nvme_ns *ns;
3058 	struct spdk_nvme_qpair *qpair;
3059 	int ret;
3060 
3061 	if (spdk_nvme_cpl_is_error(cpl)) {
3062 		goto out_complete_io_nvme_cpl;
3063 	}
3064 
3065 	if (!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair)) {
3066 		ret = -ENXIO;
3067 		goto out_complete_io_ret;
3068 	}
3069 
3070 	zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
3071 	max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
3072 			    sizeof(bio->zone_report_buf->descs[0]);
3073 
3074 	if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
3075 		ret = -EINVAL;
3076 		goto out_complete_io_ret;
3077 	}
3078 
3079 	if (!bio->zone_report_buf->nr_zones) {
3080 		ret = -EINVAL;
3081 		goto out_complete_io_ret;
3082 	}
3083 
3084 	for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
3085 		ret = fill_zone_from_report(&info[bio->handled_zones],
3086 					    &bio->zone_report_buf->descs[i]);
3087 		if (ret) {
3088 			goto out_complete_io_ret;
3089 		}
3090 		bio->handled_zones++;
3091 	}
3092 
3093 	if (bio->handled_zones < zones_to_copy) {
3094 		uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
3095 		uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
3096 
3097 		memset(bio->zone_report_buf, 0, zone_report_bufsize);
3098 		ret = spdk_nvme_zns_report_zones(ns, qpair,
3099 						 bio->zone_report_buf, zone_report_bufsize,
3100 						 slba, SPDK_NVME_ZRA_LIST_ALL, true,
3101 						 bdev_nvme_get_zone_info_done, bio);
3102 		if (ret) {
3103 			goto out_complete_io_ret;
3104 		}
3105 
3106 		return;
3107 	}
3108 
3109 out_complete_io_nvme_cpl:
3110 	free(bio->zone_report_buf);
3111 	bio->zone_report_buf = NULL;
3112 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3113 	return;
3114 
3115 out_complete_io_ret:
3116 	free(bio->zone_report_buf);
3117 	bio->zone_report_buf = NULL;
3118 	bdev_nvme_io_complete(bio, ret);
3119 }
3120 
3121 static void
3122 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
3123 {
3124 	struct nvme_bdev_io *bio = ref;
3125 
3126 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3127 }
3128 
3129 static void
3130 bdev_nvme_admin_passthru_completion(void *ctx)
3131 {
3132 	struct nvme_bdev_io *bio = ctx;
3133 
3134 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
3135 }
3136 
3137 static void
3138 bdev_nvme_abort_completion(void *ctx)
3139 {
3140 	struct nvme_bdev_io *bio = ctx;
3141 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3142 
3143 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
3144 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
3145 	} else {
3146 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3147 	}
3148 }
3149 
3150 static void
3151 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
3152 {
3153 	struct nvme_bdev_io *bio = ref;
3154 
3155 	bio->cpl = *cpl;
3156 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
3157 }
3158 
3159 static void
3160 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
3161 {
3162 	struct nvme_bdev_io *bio = ref;
3163 
3164 	bio->cpl = *cpl;
3165 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
3166 }
3167 
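/* The reset_sgl/next_sge callbacks let the NVMe driver walk the bdev iovec
 * array: reset_sgl() positions (iovpos, iov_offset) at an absolute payload
 * offset, and next_sge() hands back one contiguous segment at a time.
 */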
3168 static void
3169 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
3170 {
3171 	struct nvme_bdev_io *bio = ref;
3172 	struct iovec *iov;
3173 
3174 	bio->iov_offset = sgl_offset;
3175 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
3176 		iov = &bio->iovs[bio->iovpos];
3177 		if (bio->iov_offset < iov->iov_len) {
3178 			break;
3179 		}
3180 
3181 		bio->iov_offset -= iov->iov_len;
3182 	}
3183 }
3184 
3185 static int
3186 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
3187 {
3188 	struct nvme_bdev_io *bio = ref;
3189 	struct iovec *iov;
3190 
3191 	assert(bio->iovpos < bio->iovcnt);
3192 
3193 	iov = &bio->iovs[bio->iovpos];
3194 
3195 	*address = iov->iov_base;
3196 	*length = iov->iov_len;
3197 
3198 	if (bio->iov_offset) {
3199 		assert(bio->iov_offset <= iov->iov_len);
3200 		*address += bio->iov_offset;
3201 		*length -= bio->iov_offset;
3202 	}
3203 
3204 	bio->iov_offset += *length;
3205 	if (bio->iov_offset == iov->iov_len) {
3206 		bio->iovpos++;
3207 		bio->iov_offset = 0;
3208 	}
3209 
3210 	return 0;
3211 }
3212 
3213 static void
3214 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
3215 {
3216 	struct nvme_bdev_io *bio = ref;
3217 	struct iovec *iov;
3218 
3219 	bio->fused_iov_offset = sgl_offset;
3220 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
3221 		iov = &bio->fused_iovs[bio->fused_iovpos];
3222 		if (bio->fused_iov_offset < iov->iov_len) {
3223 			break;
3224 		}
3225 
3226 		bio->fused_iov_offset -= iov->iov_len;
3227 	}
3228 }
3229 
3230 static int
3231 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
3232 {
3233 	struct nvme_bdev_io *bio = ref;
3234 	struct iovec *iov;
3235 
3236 	assert(bio->fused_iovpos < bio->fused_iovcnt);
3237 
3238 	iov = &bio->fused_iovs[bio->fused_iovpos];
3239 
3240 	*address = iov->iov_base;
3241 	*length = iov->iov_len;
3242 
3243 	if (bio->fused_iov_offset) {
3244 		assert(bio->fused_iov_offset <= iov->iov_len);
3245 		*address += bio->fused_iov_offset;
3246 		*length -= bio->fused_iov_offset;
3247 	}
3248 
3249 	bio->fused_iov_offset += *length;
3250 	if (bio->fused_iov_offset == iov->iov_len) {
3251 		bio->fused_iovpos++;
3252 		bio->fused_iov_offset = 0;
3253 	}
3254 
3255 	return 0;
3256 }
3257 
3258 static int
3259 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3260 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
3261 		      void *md, uint64_t lba_count, uint64_t lba)
3262 {
3263 	int rc;
3264 
3265 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
3266 		      lba_count, lba);
3267 
3268 	bio->iovs = iov;
3269 	bio->iovcnt = iovcnt;
3270 	bio->iovpos = 0;
3271 	bio->iov_offset = 0;
3272 
3273 	rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
3274 					    bdev_nvme_no_pi_readv_done, bio, 0,
3275 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3276 					    md, 0, 0);
3277 
3278 	if (rc != 0 && rc != -ENOMEM) {
3279 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
3280 	}
3281 	return rc;
3282 }
3283 
3284 static int
3285 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3286 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
3287 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
3288 		struct spdk_bdev_ext_io_opts *ext_opts)
3289 {
3290 	int rc;
3291 
3292 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3293 		      lba_count, lba);
3294 
3295 	bio->iovs = iov;
3296 	bio->iovcnt = iovcnt;
3297 	bio->iovpos = 0;
3298 	bio->iov_offset = 0;
3299 
3300 	if (ext_opts) {
3301 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
3302 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
3303 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
3304 		bio->ext_opts.io_flags = flags;
3305 		bio->ext_opts.metadata = md;
3306 
3307 		rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
3308 						bdev_nvme_readv_done, bio,
3309 						bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3310 						&bio->ext_opts);
3311 	} else if (iovcnt == 1) {
3312 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
3313 						   lba_count,
3314 						   bdev_nvme_readv_done, bio,
3315 						   flags,
3316 						   0, 0);
3317 	} else {
3318 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
3319 						    bdev_nvme_readv_done, bio, flags,
3320 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3321 						    md, 0, 0);
3322 	}
3323 
3324 	if (rc != 0 && rc != -ENOMEM) {
3325 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
3326 	}
3327 	return rc;
3328 }
3329 
3330 static int
3331 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3332 		 struct nvme_bdev_io *bio,
3333 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
3334 		 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts)
3335 {
3336 	int rc;
3337 
3338 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3339 		      lba_count, lba);
3340 
3341 	bio->iovs = iov;
3342 	bio->iovcnt = iovcnt;
3343 	bio->iovpos = 0;
3344 	bio->iov_offset = 0;
3345 
3346 	if (ext_opts) {
3347 		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
3348 		bio->ext_opts.memory_domain = ext_opts->memory_domain;
3349 		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
3350 		bio->ext_opts.io_flags = flags;
3351 		bio->ext_opts.metadata = md;
3352 
3353 		rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
3354 						 bdev_nvme_writev_done, bio,
3355 						 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3356 						 &bio->ext_opts);
3357 	} else if (iovcnt == 1) {
3358 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
3359 						    lba_count,
3360 						    bdev_nvme_writev_done, bio,
3361 						    flags,
3362 						    0, 0);
3363 	} else {
3364 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
3365 						     bdev_nvme_writev_done, bio, flags,
3366 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3367 						     md, 0, 0);
3368 	}
3369 
3370 	if (rc != 0 && rc != -ENOMEM) {
3371 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
3372 	}
3373 	return rc;
3374 }
3375 
3376 static int
3377 bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3378 		       struct nvme_bdev_io *bio,
3379 		       struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t zslba,
3380 		       uint32_t flags)
3381 {
3382 	int rc;
3383 
3384 	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
3385 		      lba_count, zslba);
3386 
3387 	bio->iovs = iov;
3388 	bio->iovcnt = iovcnt;
3389 	bio->iovpos = 0;
3390 	bio->iov_offset = 0;
3391 
3392 	if (iovcnt == 1) {
3393 		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
3394 						       lba_count,
3395 						       bdev_nvme_zone_appendv_done, bio,
3396 						       flags,
3397 						       0, 0);
3398 	} else {
3399 		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
3400 							bdev_nvme_zone_appendv_done, bio, flags,
3401 							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3402 							md, 0, 0);
3403 	}
3404 
3405 	if (rc != 0 && rc != -ENOMEM) {
3406 		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
3407 	}
3408 	return rc;
3409 }
3410 
3411 static int
3412 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3413 		   struct nvme_bdev_io *bio,
3414 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
3415 		   uint32_t flags)
3416 {
3417 	int rc;
3418 
3419 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3420 		      lba_count, lba);
3421 
3422 	bio->iovs = iov;
3423 	bio->iovcnt = iovcnt;
3424 	bio->iovpos = 0;
3425 	bio->iov_offset = 0;
3426 
3427 	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
3428 					       bdev_nvme_comparev_done, bio, flags,
3429 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3430 					       md, 0, 0);
3431 
3432 	if (rc != 0 && rc != -ENOMEM) {
3433 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
3434 	}
3435 	return rc;
3436 }
3437 
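/* Submit an NVMe fused compare-and-write: the compare carries
 * SPDK_NVME_IO_FLAGS_FUSE_FIRST and the write SPDK_NVME_IO_FLAGS_FUSE_SECOND.
 * first_fused_submitted records that the compare already went out so that a
 * retry after -ENOMEM on the write does not resubmit it.
 */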
3438 static int
3439 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3440 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
3441 			      struct iovec *write_iov, int write_iovcnt,
3442 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
3443 {
3444 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3445 	int rc;
3446 
3447 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3448 		      lba_count, lba);
3449 
3450 	bio->iovs = cmp_iov;
3451 	bio->iovcnt = cmp_iovcnt;
3452 	bio->iovpos = 0;
3453 	bio->iov_offset = 0;
3454 	bio->fused_iovs = write_iov;
3455 	bio->fused_iovcnt = write_iovcnt;
3456 	bio->fused_iovpos = 0;
3457 	bio->fused_iov_offset = 0;
3458 
3459 	if (bdev_io->num_retries == 0) {
3460 		bio->first_fused_submitted = false;
3461 	}
3462 
3463 	if (!bio->first_fused_submitted) {
3464 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
3465 		memset(&bio->cpl, 0, sizeof(bio->cpl));
3466 
3467 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
3468 						       bdev_nvme_comparev_and_writev_done, bio, flags,
3469 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
3470 		if (rc == 0) {
3471 			bio->first_fused_submitted = true;
3472 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
3473 		} else {
3474 			if (rc != -ENOMEM) {
3475 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
3476 			}
3477 			return rc;
3478 		}
3479 	}
3480 
3481 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
3482 
3483 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
3484 					     bdev_nvme_comparev_and_writev_done, bio, flags,
3485 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
3486 	if (rc != 0 && rc != -ENOMEM) {
3487 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
3488 		rc = 0;
3489 	}
3490 
3491 	return rc;
3492 }
3493 
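/* Translate a bdev unmap into an NVMe Dataset Management (deallocate) command.
 * One DSM range covers at most SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS
 * blocks, so the request is split into up to
 * SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES ranges: full-size ranges first,
 * then a final range for the remainder.
 */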
3494 static int
3495 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3496 		struct nvme_bdev_io *bio,
3497 		uint64_t offset_blocks,
3498 		uint64_t num_blocks)
3499 {
3500 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
3501 	struct spdk_nvme_dsm_range *range;
3502 	uint64_t offset, remaining;
3503 	uint64_t num_ranges_u64;
3504 	uint16_t num_ranges;
3505 	int rc;
3506 
3507 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
3508 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3509 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
3510 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
3511 		return -EINVAL;
3512 	}
3513 	num_ranges = (uint16_t)num_ranges_u64;
3514 
3515 	offset = offset_blocks;
3516 	remaining = num_blocks;
3517 	range = &dsm_ranges[0];
3518 
3519 	/* Fill max-size ranges until the remaining blocks fit into one range */
3520 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
3521 		range->attributes.raw = 0;
3522 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3523 		range->starting_lba = offset;
3524 
3525 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3526 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3527 		range++;
3528 	}
3529 
3530 	/* Final range describes the remaining blocks */
3531 	range->attributes.raw = 0;
3532 	range->length = remaining;
3533 	range->starting_lba = offset;
3534 
3535 	rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair,
3536 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
3537 			dsm_ranges, num_ranges,
3538 			bdev_nvme_queued_done, bio);
3539 
3540 	return rc;
3541 }
3542 
3543 static int
3544 bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3545 		       struct nvme_bdev_io *bio,
3546 		       uint64_t offset_blocks,
3547 		       uint64_t num_blocks)
3548 {
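	/* The NLB field of Write Zeroes is a 0's-based 16-bit value, so a
	 * single command covers at most 65536 blocks.
	 */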
3549 	if (num_blocks > UINT16_MAX + 1) {
3550 		SPDK_ERRLOG("NVMe write zeroes is limited to 65536 blocks (0's-based 16-bit NLB field)\n");
3551 		return -EINVAL;
3552 	}
3553 
3554 	return spdk_nvme_ns_cmd_write_zeroes(ns, qpair,
3555 					     offset_blocks, num_blocks,
3556 					     bdev_nvme_queued_done, bio,
3557 					     0);
3558 }
3559 
3560 static int
3561 bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3562 			struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
3563 			struct spdk_bdev_zone_info *info)
3564 {
3565 	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
3566 	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
3567 	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
3568 
3569 	if (zone_id % zone_size != 0) {
3570 		return -EINVAL;
3571 	}
3572 
3573 	if (num_zones > total_zones || !num_zones) {
3574 		return -EINVAL;
3575 	}
3576 
3577 	assert(!bio->zone_report_buf);
3578 	bio->zone_report_buf = calloc(1, zone_report_bufsize);
3579 	if (!bio->zone_report_buf) {
3580 		return -ENOMEM;
3581 	}
3582 
3583 	bio->handled_zones = 0;
3584 
3585 	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
3586 					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
3587 					  bdev_nvme_get_zone_info_done, bio);
3588 }
3589 
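/*
 * Map a generic bdev zone action onto the corresponding ZNS zone management command.
 * select_all is passed as false, so only the zone starting at zone_id is affected.
 */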
3590 static int
3591 bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3592 			  struct nvme_bdev_io *bio, uint64_t zone_id,
3593 			  enum spdk_bdev_zone_action action)
3594 {
3595 	switch (action) {
3596 	case SPDK_BDEV_ZONE_CLOSE:
3597 		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
3598 						bdev_nvme_zone_management_done, bio);
3599 	case SPDK_BDEV_ZONE_FINISH:
3600 		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
3601 						 bdev_nvme_zone_management_done, bio);
3602 	case SPDK_BDEV_ZONE_OPEN:
3603 		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
3604 					       bdev_nvme_zone_management_done, bio);
3605 	case SPDK_BDEV_ZONE_RESET:
3606 		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
3607 						bdev_nvme_zone_management_done, bio);
3608 	case SPDK_BDEV_ZONE_OFFLINE:
3609 		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
3610 						  bdev_nvme_zone_management_done, bio);
3611 	default:
3612 		return -EINVAL;
3613 	}
3614 }
3615 
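/*
 * Admin commands are submitted on the controller's admin queue rather than on an I/O
 * qpair. The submitting thread is recorded so the completion can be delivered back on
 * that thread.
 */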
3616 static int
3617 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
3618 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
3619 {
3620 	struct nvme_ctrlr *nvme_ctrlr;
3621 	uint32_t max_xfer_size;
3622 
3623 	if (!bdev_nvme_find_admin_path(nbdev_ch, &nvme_ctrlr)) {
3624 		return -EINVAL;
3625 	}
3626 
3627 	max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);
3628 
3629 	if (nbytes > max_xfer_size) {
3630 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3631 		return -EINVAL;
3632 	}
3633 
3634 	bio->orig_thread = spdk_get_thread();
3635 
3636 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf,
3637 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
3638 }
3639 
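/*
 * Pass a raw NVMe I/O command through on the namespace's qpair. The payload must fit
 * within the controller's maximum data transfer size (MDTS).
 */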
3640 static int
3641 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3642 		      struct nvme_bdev_io *bio,
3643 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
3644 {
3645 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
3646 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3647 
3648 	if (nbytes > max_xfer_size) {
3649 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3650 		return -EINVAL;
3651 	}
3652 
3653 	/*
3654 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require an nsid,
3655 	 * so fill it out automatically.
3656 	 */
3657 	cmd->nsid = spdk_nvme_ns_get_id(ns);
3658 
3659 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
3660 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
3661 }
3662 
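/*
 * Same as bdev_nvme_io_passthru(), but with a separate metadata buffer. md_len must
 * equal the namespace's per-block metadata size times the number of logical blocks
 * implied by nbytes (computed from the extended sector size).
 */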
3663 static int
3664 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3665 			 struct nvme_bdev_io *bio,
3666 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
3667 {
3668 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
3669 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
3670 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3671 
3672 	if (nbytes > max_xfer_size) {
3673 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3674 		return -EINVAL;
3675 	}
3676 
3677 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
3678 		SPDK_ERRLOG("invalid metadata buffer size\n");
3679 		return -EINVAL;
3680 	}
3681 
3682 	/*
3683 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require an nsid,
3684 	 * so fill it out automatically.
3685 	 */
3686 	cmd->nsid = spdk_nvme_ns_get_id(ns);
3687 
3688 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
3689 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
3690 }
3691 
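/*
 * Attempt to abort bio_to_abort. The abort is tried against the I/O qpair first; if
 * the target command is not found there, it is retried against the admin queue
 * (qpair == NULL) in case the target is an admin command.
 */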
3692 static int
3693 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
3694 		struct nvme_bdev_io *bio_to_abort)
3695 {
3696 	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
3697 	int rc;
3698 
3699 	bio->orig_thread = spdk_get_thread();
3700 
3701 	rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr,
3702 					   ctrlr_ch->qpair,
3703 					   bio_to_abort,
3704 					   bdev_nvme_abort_done, bio);
3705 	if (rc == -ENOENT) {
3706 		/* If no command was found in the I/O qpair, the target command may be
3707 		 * an admin command.
3708 		 */
3709 		rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr,
3710 						   NULL,
3711 						   bio_to_abort,
3712 						   bdev_nvme_abort_done, bio);
3713 	}
3714 
3715 	if (rc == -ENOENT) {
3716 		/* If no command was found, complete the abort request with failure. */
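		/* Per the NVMe spec, bit 0 of abort completion dword 0 set to 1 means the
		 * command was not aborted.
		 */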
3717 		bio->cpl.cdw0 |= 1U;
3718 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
3719 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
3720 
3721 		bdev_nvme_abort_completion(bio);
3722 
3723 		rc = 0;
3724 	}
3725 
3726 	return rc;
3727 }
3728 
3729 static void
3730 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
3731 		struct nvme_ns *nvme_ns)
3732 {
3733 	/* nop - standard namespaces are recreated by the controller attach RPC. */
3734 }
3735 
3736 static void
3737 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_ns *nvme_ns)
3738 {
3739 	g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns);
3740 }
3741 
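/*
 * Emit the module-level options as a "bdev_nvme_set_options" RPC object. The output
 * has the following shape; the values shown are only illustrative defaults:
 *
 * {
 *   "method": "bdev_nvme_set_options",
 *   "params": {
 *     "action_on_timeout": "none",
 *     "timeout_us": 0,
 *     "keep_alive_timeout_ms": 10000,
 *     "delay_cmd_submit": true,
 *     ...
 *   }
 * }
 */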
3742 static void
3743 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
3744 {
3745 	const char	*action;
3746 
3747 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
3748 		action = "reset";
3749 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
3750 		action = "abort";
3751 	} else {
3752 		action = "none";
3753 	}
3754 
3755 	spdk_json_write_object_begin(w);
3756 
3757 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
3758 
3759 	spdk_json_write_named_object_begin(w, "params");
3760 	spdk_json_write_named_string(w, "action_on_timeout", action);
3761 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
3762 	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
3763 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
3764 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
3765 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
3766 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
3767 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
3768 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
3769 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
3770 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
3771 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
3772 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
3773 	spdk_json_write_object_end(w);
3774 
3775 	spdk_json_write_object_end(w);
3776 }
3777 
3778 static void
3779 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
3780 		       struct nvme_ctrlr *nvme_ctrlr)
3781 {
3782 	struct spdk_nvme_transport_id	*trid;
3783 
3784 	trid = nvme_ctrlr->connected_trid;
3785 
3786 	spdk_json_write_object_begin(w);
3787 
3788 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
3789 
3790 	spdk_json_write_named_object_begin(w, "params");
3791 	spdk_json_write_named_string(w, "name", nvme_ctrlr->name);
3792 	nvme_bdev_dump_trid_json(trid, w);
3793 	spdk_json_write_named_bool(w, "prchk_reftag",
3794 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
3795 	spdk_json_write_named_bool(w, "prchk_guard",
3796 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
3797 
3798 	spdk_json_write_object_end(w);
3799 
3800 	spdk_json_write_object_end(w);
3801 }
3802 
3803 static void
3804 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
3805 {
3806 	spdk_json_write_object_begin(w);
3807 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
3808 
3809 	spdk_json_write_named_object_begin(w, "params");
3810 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
3811 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
3812 	spdk_json_write_object_end(w);
3813 
3814 	spdk_json_write_object_end(w);
3815 }
3816 
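/*
 * Save the running configuration as a sequence of RPC calls: the global options
 * first, then one attach RPC per controller (plus any per-namespace config), and the
 * hotplug settings last.
 */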
3817 static int
3818 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
3819 {
3820 	struct nvme_ctrlr	*nvme_ctrlr;
3821 	uint32_t		nsid;
3822 
3823 	bdev_nvme_opts_config_json(w);
3824 
3825 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3826 
3827 	TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) {
3828 		nvme_ctrlr_config_json(w, nvme_ctrlr);
3829 
3830 		for (nsid = 0; nsid < nvme_ctrlr->num_ns; ++nsid) {
3831 			if (!nvme_ctrlr->namespaces[nsid]->populated) {
3832 				continue;
3833 			}
3834 
3835 			nvme_namespace_config_json(w, nvme_ctrlr->namespaces[nsid]);
3836 		}
3837 	}
3838 
3839 	/* Dump the hotplug configuration last to give all NVMe bdevs a chance to be
3840 	 * constructed before the hotplug poller is enabled.
3841 	 */
3842 	bdev_nvme_hotplug_config_json(w);
3843 
3844 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3845 	return 0;
3846 }
3847 
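/*
 * Return the underlying spdk_nvme_ctrlr for a bdev, or NULL if the bdev was not
 * created by this module.
 */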
3848 struct spdk_nvme_ctrlr *
3849 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
3850 {
3851 	if (!bdev || bdev->module != &nvme_if) {
3852 		return NULL;
3853 	}
3854 
3855 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
3856 }
3857 
3858 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
3859