xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision d491e7ea33f0f52fd9abbfc4fbfff6a7f3cf2ec2)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/accel_engine.h"
40 #include "spdk/config.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/nvme.h"
45 #include "spdk/nvme_ocssd.h"
46 #include "spdk/nvme_zns.h"
47 #include "spdk/thread.h"
48 #include "spdk/string.h"
49 #include "spdk/util.h"
50 
51 #include "spdk/bdev_module.h"
52 #include "spdk/log.h"
53 
54 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
55 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
56 
57 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
58 
59 struct nvme_bdev_io {
60 	/** array of iovecs to transfer. */
61 	struct iovec *iovs;
62 
63 	/** Number of iovecs in iovs array. */
64 	int iovcnt;
65 
66 	/** Current iovec position. */
67 	int iovpos;
68 
69 	/** Offset in current iovec. */
70 	uint32_t iov_offset;
71 
72 	/** array of iovecs for the second command of a fused operation to transfer. */
73 	struct iovec *fused_iovs;
74 
75 	/** Number of iovecs in fused_iovs array. */
76 	int fused_iovcnt;
77 
78 	/** Current iovec position. */
79 	int fused_iovpos;
80 
81 	/** Offset in current iovec. */
82 	uint32_t fused_iov_offset;
83 
84 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
85 	struct spdk_nvme_cpl cpl;
86 
87 	/** Originating thread */
88 	struct spdk_thread *orig_thread;
89 
90 	/** Tracks whether the first of the fused commands has been submitted */
91 	bool first_fused_submitted;
92 
93 	/** Temporary pointer to zone report buffer */
94 	struct spdk_nvme_zns_zone_report *zone_report_buf;
95 
96 	/** Number of zones copied so far into the spdk_bdev_zone_info structs */
97 	uint64_t handled_zones;
98 };
99 
100 struct nvme_probe_ctx {
101 	size_t count;
102 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
103 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
104 	const char *names[NVME_MAX_CONTROLLERS];
105 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
106 	const char *hostnqn;
107 };
108 
109 struct nvme_probe_skip_entry {
110 	struct spdk_nvme_transport_id		trid;
111 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
112 };
113 /* All the controllers deleted by users via RPC are skipped by the hotplug monitor */
114 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
115 			g_skipped_nvme_ctrlrs);
116 
117 static struct spdk_bdev_nvme_opts g_opts = {
118 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
119 	.timeout_us = 0,
120 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
121 	.retry_count = 4,
122 	.arbitration_burst = 0,
123 	.low_priority_weight = 0,
124 	.medium_priority_weight = 0,
125 	.high_priority_weight = 0,
126 	.nvme_adminq_poll_period_us = 10000ULL,
127 	.nvme_ioq_poll_period_us = 0,
128 	.io_queue_requests = 0,
129 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
130 };
131 
132 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
133 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
134 
135 static int g_hot_insert_nvme_controller_index = 0;
136 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
137 static bool g_nvme_hotplug_enabled = false;
138 static struct spdk_thread *g_bdev_nvme_init_thread;
139 static struct spdk_poller *g_hotplug_poller;
140 static struct spdk_poller *g_hotplug_probe_poller;
141 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
142 
143 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
144 		struct nvme_async_probe_ctx *ctx);
145 static void nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
146 		struct nvme_async_probe_ctx *ctx);
147 static int bdev_nvme_library_init(void);
148 static void bdev_nvme_library_fini(void);
149 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
150 			   struct nvme_bdev_io *bio,
151 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
152 			   uint32_t flags);
153 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
154 				 struct nvme_bdev_io *bio,
155 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
156 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
157 			    struct nvme_bdev_io *bio,
158 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
159 			    uint32_t flags);
160 static int bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
161 				  struct nvme_bdev_io *bio,
162 				  struct iovec *iov, int iovcnt, void *md, uint64_t lba_count,
163 				  uint64_t zslba, uint32_t flags);
164 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
165 			      struct nvme_bdev_io *bio,
166 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
167 			      uint32_t flags);
168 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
169 		struct spdk_nvme_qpair *qpair,
170 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
171 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
172 		uint32_t flags);
173 static int bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
174 				   struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
175 				   struct spdk_bdev_zone_info *info);
176 static int bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
177 				     struct nvme_bdev_io *bio, uint64_t zone_id,
178 				     enum spdk_bdev_zone_action action);
179 static int bdev_nvme_admin_passthru(struct nvme_io_path *io_path,
180 				    struct nvme_bdev_io *bio,
181 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
182 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
183 				 struct nvme_bdev_io *bio,
184 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
185 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
186 				    struct nvme_bdev_io *bio,
187 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
188 static int bdev_nvme_abort(struct nvme_io_path *io_path,
189 			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
190 static int bdev_nvme_reset(struct nvme_io_path *io_path, struct spdk_bdev_io *bdev_io);
191 static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove);
192 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
193 
194 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
195 				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
196 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
197 		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
198 
199 static populate_namespace_fn g_populate_namespace_fn[] = {
200 	NULL,
201 	nvme_ctrlr_populate_standard_namespace,
202 	bdev_ocssd_populate_namespace,
203 };
204 
205 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns);
206 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns);
207 
208 static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
209 	NULL,
210 	nvme_ctrlr_depopulate_standard_namespace,
211 	bdev_ocssd_depopulate_namespace,
212 };
213 
214 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
215 		struct nvme_bdev_ns *nvme_ns);
216 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
217 		struct nvme_bdev_ns *nvme_ns);
218 
219 static config_json_namespace_fn g_config_json_namespace_fn[] = {
220 	NULL,
221 	nvme_ctrlr_config_json_standard_namespace,
222 	bdev_ocssd_namespace_config_json,
223 };
224 
225 struct spdk_nvme_qpair *
226 bdev_nvme_get_io_qpair(struct spdk_io_channel *io_path_ch)
227 {
228 	struct nvme_io_path *io_path;
229 
230 	assert(io_path_ch != NULL);
231 
232 	io_path = spdk_io_channel_get_ctx(io_path_ch);
233 
234 	return io_path->qpair;
235 }
236 
237 static int
238 bdev_nvme_get_ctx_size(void)
239 {
240 	return sizeof(struct nvme_bdev_io);
241 }
242 
243 static struct spdk_bdev_module nvme_if = {
244 	.name = "nvme",
245 	.async_fini = true,
246 	.module_init = bdev_nvme_library_init,
247 	.module_fini = bdev_nvme_library_fini,
248 	.config_json = bdev_nvme_config_json,
249 	.get_ctx_size = bdev_nvme_get_ctx_size,
250 
251 };
252 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
253 
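/*
 * Resolve the namespace and qpair to use for I/O on this channel. Returns false
 * (no usable path) while the qpair is torn down during a controller reset.
 */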
254 static inline bool
255 bdev_nvme_find_io_path(struct nvme_bdev *nbdev, struct nvme_io_path *io_path,
256 		       struct spdk_nvme_ns **_ns, struct spdk_nvme_qpair **_qpair)
257 {
258 	if (spdk_unlikely(io_path->qpair == NULL)) {
259 		/* The device is currently resetting. */
260 		return false;
261 	}
262 
263 	*_ns = nbdev->nvme_ns->ns;
264 	*_qpair = io_path->qpair;
265 	return true;
266 }
267 
268 static inline bool
269 bdev_nvme_find_admin_path(struct nvme_io_path *io_path,
270 			  struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr)
271 {
272 	*_nvme_bdev_ctrlr = io_path->ctrlr;
273 	return true;
274 }
275 
276 static inline void
277 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
278 				  const struct spdk_nvme_cpl *cpl)
279 {
280 	spdk_bdev_io_complete_nvme_status(spdk_bdev_io_from_ctx(bio), cpl->cdw0,
281 					  cpl->status.sct, cpl->status.sc);
282 }
283 
284 static inline void
285 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
286 {
287 	enum spdk_bdev_io_status io_status;
288 
289 	if (rc == 0) {
290 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
291 	} else if (rc == -ENOMEM) {
292 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
293 	} else {
294 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
295 	}
296 
297 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
298 }
299 
300 static void
301 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
302 {
303 	int rc;
304 
305 	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
306 	/*
307 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
308 	 * reconnect a qpair and we will stop getting a callback for this one.
309 	 */
310 	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
311 	if (rc != 0) {
312 		SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
313 	}
314 }
315 
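/*
 * I/O poller for a poll group: processes completions on every qpair in the group
 * and, when spin-stat collection is enabled, accounts the time spent polling
 * without completions.
 */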
316 static int
317 bdev_nvme_poll(void *arg)
318 {
319 	struct nvme_bdev_poll_group *group = arg;
320 	int64_t num_completions;
321 
322 	if (group->collect_spin_stat && group->start_ticks == 0) {
323 		group->start_ticks = spdk_get_ticks();
324 	}
325 
326 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
327 			  bdev_nvme_disconnected_qpair_cb);
328 	if (group->collect_spin_stat) {
329 		if (num_completions > 0) {
330 			if (group->end_ticks != 0) {
331 				group->spin_ticks += (group->end_ticks - group->start_ticks);
332 				group->end_ticks = 0;
333 			}
334 			group->start_ticks = 0;
335 		} else {
336 			group->end_ticks = spdk_get_ticks();
337 		}
338 	}
339 
340 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
341 }
342 
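/*
 * Admin queue poller. A negative return from admin completion processing
 * triggers a failover/reset of the controller.
 */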
343 static int
344 bdev_nvme_poll_adminq(void *arg)
345 {
346 	int32_t rc;
347 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
348 
349 	assert(nvme_bdev_ctrlr != NULL);
350 
351 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr);
352 	if (rc < 0) {
353 		bdev_nvme_failover(nvme_bdev_ctrlr, false);
354 	}
355 
356 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
357 }
358 
359 static int
360 bdev_nvme_destruct(void *ctx)
361 {
362 	struct nvme_bdev *nvme_disk = ctx;
363 	struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns;
364 
365 	pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
366 
367 	nvme_ns->bdev = NULL;
368 
369 	if (!nvme_ns->populated) {
370 		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
371 
372 		nvme_bdev_ctrlr_destruct(nvme_ns->ctrlr);
373 	} else {
374 		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
375 	}
376 
377 	free(nvme_disk->disk.name);
378 	free(nvme_disk);
379 
380 	return 0;
381 }
382 
383 static int
384 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
385 		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
386 {
387 	bdev_nvme_io_complete(bio, 0);
388 
389 	return 0;
390 }
391 
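/*
 * Allocate an I/O qpair using the module-level options, add it to the channel's
 * poll group, then connect it. On failure the qpair is freed again.
 */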
392 static int
393 bdev_nvme_create_qpair(struct nvme_io_path *io_path)
394 {
395 	struct spdk_nvme_ctrlr *ctrlr = io_path->ctrlr->ctrlr;
396 	struct spdk_nvme_io_qpair_opts opts;
397 	struct spdk_nvme_qpair *qpair;
398 	int rc;
399 
400 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
401 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
402 	opts.create_only = true;
403 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
404 	g_opts.io_queue_requests = opts.io_queue_requests;
405 
406 	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
407 	if (qpair == NULL) {
408 		return -1;
409 	}
410 
411 	assert(io_path->group != NULL);
412 
413 	rc = spdk_nvme_poll_group_add(io_path->group->group, qpair);
414 	if (rc != 0) {
415 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
416 		goto err;
417 	}
418 
419 	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
420 	if (rc != 0) {
421 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
422 		goto err;
423 	}
424 
425 	io_path->qpair = qpair;
426 
427 	return 0;
428 
429 err:
430 	spdk_nvme_ctrlr_free_io_qpair(qpair);
431 
432 	return rc;
433 }
434 
435 static int
436 bdev_nvme_destroy_qpair(struct nvme_io_path *io_path)
437 {
438 	int rc;
439 
440 	if (io_path->qpair == NULL) {
441 		return 0;
442 	}
443 
444 	rc = spdk_nvme_ctrlr_free_io_qpair(io_path->qpair);
445 	if (!rc) {
446 		io_path->qpair = NULL;
447 	}
448 	return rc;
449 }
450 
451 static void
452 _bdev_nvme_check_pending_destruct(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
453 {
454 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
455 	if (nvme_bdev_ctrlr->destruct_after_reset) {
456 		assert(nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct);
457 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
458 
459 		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_unregister,
460 				     nvme_bdev_ctrlr);
461 	} else {
462 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
463 	}
464 }
465 
466 static void
467 bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
468 {
469 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i);
470 
471 	_bdev_nvme_check_pending_destruct(nvme_bdev_ctrlr);
472 }
473 
474 static void
475 _bdev_nvme_complete_pending_resets(struct nvme_io_path *io_path,
476 				   enum spdk_bdev_io_status status)
477 {
478 	struct spdk_bdev_io *bdev_io;
479 
480 	while (!TAILQ_EMPTY(&io_path->pending_resets)) {
481 		bdev_io = TAILQ_FIRST(&io_path->pending_resets);
482 		TAILQ_REMOVE(&io_path->pending_resets, bdev_io, module_link);
483 		spdk_bdev_io_complete(bdev_io, status);
484 	}
485 }
486 
487 static void
488 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
489 {
490 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
491 	struct nvme_io_path *io_path = spdk_io_channel_get_ctx(_ch);
492 
493 	_bdev_nvme_complete_pending_resets(io_path, SPDK_BDEV_IO_STATUS_SUCCESS);
494 
495 	spdk_for_each_channel_continue(i, 0);
496 }
497 
498 static void
499 bdev_nvme_abort_pending_resets(struct spdk_io_channel_iter *i)
500 {
501 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
502 	struct nvme_io_path *io_path = spdk_io_channel_get_ctx(_ch);
503 
504 	_bdev_nvme_complete_pending_resets(io_path, SPDK_BDEV_IO_STATUS_FAILED);
505 
506 	spdk_for_each_channel_continue(i, 0);
507 }
508 
509 static void
510 bdev_nvme_reset_io_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
511 			    struct spdk_bdev_io *bdev_io, int rc)
512 {
513 	enum spdk_bdev_io_status io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
514 
515 	if (rc) {
516 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
517 	}
518 
519 	spdk_bdev_io_complete(bdev_io, io_status);
520 
521 	/* Make sure we clear any pending resets before returning. */
522 	spdk_for_each_channel(nvme_bdev_ctrlr,
523 			      rc == 0 ? bdev_nvme_complete_pending_resets :
524 			      bdev_nvme_abort_pending_resets,
525 			      nvme_bdev_ctrlr,
526 			      bdev_nvme_check_pending_destruct);
527 }
528 
529 static void
530 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
531 {
532 	struct nvme_bdev_ctrlr_trid *curr_trid;
533 	struct spdk_bdev_io *bdev_io = nvme_bdev_ctrlr->reset_bdev_io;
534 
535 	nvme_bdev_ctrlr->reset_bdev_io = NULL;
536 
537 	if (rc) {
538 		SPDK_ERRLOG("Resetting controller failed.\n");
539 	} else {
540 		SPDK_NOTICELOG("Controller reset was successful.\n");
541 	}
542 
543 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
544 	nvme_bdev_ctrlr->resetting = false;
545 	nvme_bdev_ctrlr->failover_in_progress = false;
546 
547 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
548 	assert(curr_trid != NULL);
549 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
550 
551 	curr_trid->is_failed = rc != 0;
552 
553 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
554 		/* Destruct ctrlr after clearing pending resets. */
555 		nvme_bdev_ctrlr->destruct_after_reset = true;
556 	}
557 
558 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
559 
560 	if (bdev_io) {
561 		bdev_nvme_reset_io_complete(nvme_bdev_ctrlr, bdev_io, rc);
562 	} else {
563 		/* Make sure we clear any pending resets before returning. */
564 		spdk_for_each_channel(nvme_bdev_ctrlr,
565 				      rc == 0 ? bdev_nvme_complete_pending_resets :
566 				      bdev_nvme_abort_pending_resets,
567 				      nvme_bdev_ctrlr,
568 				      bdev_nvme_check_pending_destruct);
569 	}
570 }
571 
572 static void
573 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
574 {
575 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i);
576 
577 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
578 }
579 
580 static void
581 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
582 {
583 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
584 	struct nvme_io_path *io_path = spdk_io_channel_get_ctx(_ch);
585 	int rc;
586 
587 	rc = bdev_nvme_create_qpair(io_path);
588 
589 	spdk_for_each_channel_continue(i, rc);
590 }
591 
592 static void
593 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
594 {
595 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i);
596 	int rc;
597 
598 	if (status) {
599 		rc = status;
600 		goto err;
601 	}
602 
603 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
604 	if (rc != 0) {
605 		goto err;
606 	}
607 
608 	/* Recreate all of the I/O queue pairs */
609 	spdk_for_each_channel(nvme_bdev_ctrlr,
610 			      _bdev_nvme_reset_create_qpair,
611 			      nvme_bdev_ctrlr,
612 			      _bdev_nvme_reset_create_qpairs_done);
613 	return;
614 
615 err:
616 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
617 }
618 
619 static void
620 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
621 {
622 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
623 	struct nvme_io_path *io_path = spdk_io_channel_get_ctx(ch);
624 	int rc;
625 
626 	rc = bdev_nvme_destroy_qpair(io_path);
627 
628 	spdk_for_each_channel_continue(i, rc);
629 }
630 
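/*
 * Start a controller reset: mark the controller as resetting, then destroy every
 * I/O qpair via spdk_for_each_channel(). _bdev_nvme_reset_ctrlr() subsequently
 * resets the controller and recreates the qpairs.
 */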
631 static int
632 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
633 {
634 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
635 	if (nvme_bdev_ctrlr->destruct) {
636 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
637 		return -EBUSY;
638 	}
639 
640 	if (nvme_bdev_ctrlr->resetting) {
641 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
642 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
643 		return -EAGAIN;
644 	}
645 
646 	nvme_bdev_ctrlr->resetting = true;
647 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
648 
649 	/* First, delete all NVMe I/O queue pairs. */
650 	spdk_for_each_channel(nvme_bdev_ctrlr,
651 			      _bdev_nvme_reset_destroy_qpair,
652 			      nvme_bdev_ctrlr,
653 			      _bdev_nvme_reset_ctrlr);
654 
655 	return 0;
656 }
657 
658 static int
659 bdev_nvme_reset(struct nvme_io_path *io_path, struct spdk_bdev_io *bdev_io)
660 {
661 	int rc;
662 
663 	rc = _bdev_nvme_reset(io_path->ctrlr);
664 	if (rc == 0) {
665 		assert(io_path->ctrlr->reset_bdev_io == NULL);
666 		io_path->ctrlr->reset_bdev_io = bdev_io;
667 	} else if (rc == -EAGAIN) {
668 		/*
669 		 * Reset call is queued only if it is from the app framework. This is on purpose so that
670 		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
671 		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
672 		 */
673 		TAILQ_INSERT_TAIL(&io_path->pending_resets, bdev_io, module_link);
674 	} else {
675 		return rc;
676 	}
677 
678 	return 0;
679 }
680 
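/*
 * Begin a failover: mark the currently connected trid as failed and, if an
 * alternate trid is available, fail the controller, switch to the next trid,
 * and either rotate the old trid to the end of the list or free it.
 */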
681 static int
682 _bdev_nvme_failover_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
683 {
684 	struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
685 	int rc;
686 
687 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
688 	if (nvme_bdev_ctrlr->destruct) {
689 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
690 		/* Don't bother resetting if the controller is in the process of being destructed. */
691 		return -EBUSY;
692 	}
693 
694 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
695 	assert(curr_trid);
696 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
697 	next_trid = TAILQ_NEXT(curr_trid, link);
698 
699 	if (nvme_bdev_ctrlr->resetting) {
700 		if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) {
701 			rc = -EAGAIN;
702 		} else {
703 			rc = -EBUSY;
704 		}
705 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
706 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
707 		return rc;
708 	}
709 
710 	nvme_bdev_ctrlr->resetting = true;
711 	curr_trid->is_failed = true;
712 
713 	if (next_trid) {
714 		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
715 
716 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
717 			       curr_trid->trid.trsvcid,	next_trid->trid.traddr, next_trid->trid.trsvcid);
718 
719 		nvme_bdev_ctrlr->failover_in_progress = true;
720 		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
721 		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
722 		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
723 		assert(rc == 0);
724 		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link);
725 		if (!remove) {
726 			/** Shuffle the old trid to the end of the list and use the new one.
727 			 * Allows for round robin through multiple connections.
728 			 */
729 			TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link);
730 		} else {
731 			free(curr_trid);
732 		}
733 	}
734 
735 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
736 	return 0;
737 }
738 
739 static int
740 bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
741 {
742 	int rc;
743 
744 	rc = _bdev_nvme_failover_start(nvme_bdev_ctrlr, remove);
745 	if (rc == 0) {
746 		/* First, delete all NVMe I/O queue pairs. */
747 		spdk_for_each_channel(nvme_bdev_ctrlr,
748 				      _bdev_nvme_reset_destroy_qpair,
749 				      nvme_bdev_ctrlr,
750 				      _bdev_nvme_reset_ctrlr);
751 	} else if (rc != -EBUSY) {
752 		return rc;
753 	}
754 
755 	return 0;
756 }
757 
758 static int
759 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
760 		struct nvme_bdev_io *bio,
761 		uint64_t offset_blocks,
762 		uint64_t num_blocks);
763 
764 static void
765 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
766 		     bool success)
767 {
768 	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
769 	struct spdk_bdev *bdev = bdev_io->bdev;
770 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
771 	struct nvme_io_path *io_path = spdk_io_channel_get_ctx(ch);
772 	struct spdk_nvme_ns *ns;
773 	struct spdk_nvme_qpair *qpair;
774 	int ret;
775 
776 	if (!success) {
777 		ret = -EINVAL;
778 		goto exit;
779 	}
780 
781 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, io_path, &ns, &qpair))) {
782 		ret = -ENXIO;
783 		goto exit;
784 	}
785 
786 	ret = bdev_nvme_readv(ns,
787 			      qpair,
788 			      bio,
789 			      bdev_io->u.bdev.iovs,
790 			      bdev_io->u.bdev.iovcnt,
791 			      bdev_io->u.bdev.md_buf,
792 			      bdev_io->u.bdev.num_blocks,
793 			      bdev_io->u.bdev.offset_blocks,
794 			      bdev->dif_check_flags);
795 
796 exit:
797 	if (spdk_unlikely(ret != 0)) {
798 		bdev_nvme_io_complete(bio, ret);
799 	}
800 }
801 
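/*
 * Dispatch a bdev I/O to the matching NVMe helper. Reads without a data buffer
 * first obtain one through spdk_bdev_io_get_buf() and are resubmitted from
 * bdev_nvme_get_buf_cb().
 */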
802 static void
803 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
804 {
805 	struct nvme_io_path *io_path = spdk_io_channel_get_ctx(ch);
806 	struct spdk_bdev *bdev = bdev_io->bdev;
807 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
808 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
809 	struct nvme_bdev_io *nbdev_io_to_abort;
810 	struct spdk_nvme_ns *ns;
811 	struct spdk_nvme_qpair *qpair;
812 	int rc = 0;
813 
814 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, io_path, &ns, &qpair))) {
815 		rc = -ENXIO;
816 		goto exit;
817 	}
818 
819 	switch (bdev_io->type) {
820 	case SPDK_BDEV_IO_TYPE_READ:
821 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
822 			rc = bdev_nvme_readv(ns,
823 					     qpair,
824 					     nbdev_io,
825 					     bdev_io->u.bdev.iovs,
826 					     bdev_io->u.bdev.iovcnt,
827 					     bdev_io->u.bdev.md_buf,
828 					     bdev_io->u.bdev.num_blocks,
829 					     bdev_io->u.bdev.offset_blocks,
830 					     bdev->dif_check_flags);
831 		} else {
832 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
833 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
834 			rc = 0;
835 		}
836 		break;
837 	case SPDK_BDEV_IO_TYPE_WRITE:
838 		rc = bdev_nvme_writev(ns,
839 				      qpair,
840 				      nbdev_io,
841 				      bdev_io->u.bdev.iovs,
842 				      bdev_io->u.bdev.iovcnt,
843 				      bdev_io->u.bdev.md_buf,
844 				      bdev_io->u.bdev.num_blocks,
845 				      bdev_io->u.bdev.offset_blocks,
846 				      bdev->dif_check_flags);
847 		break;
848 	case SPDK_BDEV_IO_TYPE_COMPARE:
849 		rc = bdev_nvme_comparev(ns,
850 					qpair,
851 					nbdev_io,
852 					bdev_io->u.bdev.iovs,
853 					bdev_io->u.bdev.iovcnt,
854 					bdev_io->u.bdev.md_buf,
855 					bdev_io->u.bdev.num_blocks,
856 					bdev_io->u.bdev.offset_blocks,
857 					bdev->dif_check_flags);
858 		break;
859 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
860 		rc = bdev_nvme_comparev_and_writev(ns,
861 						   qpair,
862 						   nbdev_io,
863 						   bdev_io->u.bdev.iovs,
864 						   bdev_io->u.bdev.iovcnt,
865 						   bdev_io->u.bdev.fused_iovs,
866 						   bdev_io->u.bdev.fused_iovcnt,
867 						   bdev_io->u.bdev.md_buf,
868 						   bdev_io->u.bdev.num_blocks,
869 						   bdev_io->u.bdev.offset_blocks,
870 						   bdev->dif_check_flags);
871 		break;
872 	case SPDK_BDEV_IO_TYPE_UNMAP:
873 		rc = bdev_nvme_unmap(ns,
874 				     qpair,
875 				     nbdev_io,
876 				     bdev_io->u.bdev.offset_blocks,
877 				     bdev_io->u.bdev.num_blocks);
878 		break;
879 	case SPDK_BDEV_IO_TYPE_RESET:
880 		rc = bdev_nvme_reset(io_path, bdev_io);
881 		break;
882 	case SPDK_BDEV_IO_TYPE_FLUSH:
883 		rc = bdev_nvme_flush(ns,
884 				     qpair,
885 				     nbdev_io,
886 				     bdev_io->u.bdev.offset_blocks,
887 				     bdev_io->u.bdev.num_blocks);
888 		break;
889 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
890 		rc = bdev_nvme_zone_appendv(ns,
891 					    qpair,
892 					    nbdev_io,
893 					    bdev_io->u.bdev.iovs,
894 					    bdev_io->u.bdev.iovcnt,
895 					    bdev_io->u.bdev.md_buf,
896 					    bdev_io->u.bdev.num_blocks,
897 					    bdev_io->u.bdev.offset_blocks,
898 					    bdev->dif_check_flags);
899 		break;
900 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
901 		rc = bdev_nvme_get_zone_info(ns,
902 					     qpair,
903 					     nbdev_io,
904 					     bdev_io->u.zone_mgmt.zone_id,
905 					     bdev_io->u.zone_mgmt.num_zones,
906 					     bdev_io->u.zone_mgmt.buf);
907 		break;
908 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
909 		rc = bdev_nvme_zone_management(ns,
910 					       qpair,
911 					       nbdev_io,
912 					       bdev_io->u.zone_mgmt.zone_id,
913 					       bdev_io->u.zone_mgmt.zone_action);
914 		break;
915 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
916 		rc = bdev_nvme_admin_passthru(io_path,
917 					      nbdev_io,
918 					      &bdev_io->u.nvme_passthru.cmd,
919 					      bdev_io->u.nvme_passthru.buf,
920 					      bdev_io->u.nvme_passthru.nbytes);
921 		break;
922 	case SPDK_BDEV_IO_TYPE_NVME_IO:
923 		rc = bdev_nvme_io_passthru(ns,
924 					   qpair,
925 					   nbdev_io,
926 					   &bdev_io->u.nvme_passthru.cmd,
927 					   bdev_io->u.nvme_passthru.buf,
928 					   bdev_io->u.nvme_passthru.nbytes);
929 		break;
930 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
931 		rc = bdev_nvme_io_passthru_md(ns,
932 					      qpair,
933 					      nbdev_io,
934 					      &bdev_io->u.nvme_passthru.cmd,
935 					      bdev_io->u.nvme_passthru.buf,
936 					      bdev_io->u.nvme_passthru.nbytes,
937 					      bdev_io->u.nvme_passthru.md_buf,
938 					      bdev_io->u.nvme_passthru.md_len);
939 		break;
940 	case SPDK_BDEV_IO_TYPE_ABORT:
941 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
942 		rc = bdev_nvme_abort(io_path,
943 				     nbdev_io,
944 				     nbdev_io_to_abort);
945 		break;
946 	default:
947 		rc = -EINVAL;
948 		break;
949 	}
950 
951 exit:
952 	if (spdk_unlikely(rc != 0)) {
953 		bdev_nvme_io_complete(nbdev_io, rc);
954 	}
955 }
956 
957 static bool
958 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
959 {
960 	struct nvme_bdev *nbdev = ctx;
961 	struct nvme_bdev_ns *nvme_ns;
962 	struct spdk_nvme_ns *ns;
963 	struct spdk_nvme_ctrlr *ctrlr;
964 	const struct spdk_nvme_ctrlr_data *cdata;
965 
966 	nvme_ns = nbdev->nvme_ns;
967 	assert(nvme_ns != NULL);
968 	ns = nvme_ns->ns;
969 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
970 
971 	switch (io_type) {
972 	case SPDK_BDEV_IO_TYPE_READ:
973 	case SPDK_BDEV_IO_TYPE_WRITE:
974 	case SPDK_BDEV_IO_TYPE_RESET:
975 	case SPDK_BDEV_IO_TYPE_FLUSH:
976 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
977 	case SPDK_BDEV_IO_TYPE_NVME_IO:
978 	case SPDK_BDEV_IO_TYPE_ABORT:
979 		return true;
980 
981 	case SPDK_BDEV_IO_TYPE_COMPARE:
982 		return spdk_nvme_ns_supports_compare(ns);
983 
984 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
985 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
986 
987 	case SPDK_BDEV_IO_TYPE_UNMAP:
988 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
989 		return cdata->oncs.dsm;
990 
991 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
992 		/*
993 		 * The NVMe controller write_zeroes function is currently not used by our driver.
994 		 * NVMe write zeroes is limited to 16-bit block count, and the bdev layer currently
995 		 * has no mechanism for reporting a max write zeroes block count, nor ability to
996 		 * split a write zeroes request.
997 		 */
998 		return false;
999 
1000 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
1001 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
1002 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
1003 			return true;
1004 		}
1005 		return false;
1006 
1007 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
1008 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
1009 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
1010 
1011 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
1012 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
1013 		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
1014 
1015 	default:
1016 		return false;
1017 	}
1018 }
1019 
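/*
 * I/O channel (nvme_io_path) constructor: takes a reference on the poll group
 * channel, sets up the optional OCSSD channel, and creates and connects the
 * I/O qpair for this controller.
 */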
1020 static int
1021 bdev_nvme_create_path_cb(void *io_device, void *ctx_buf)
1022 {
1023 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
1024 	struct nvme_io_path *io_path = ctx_buf;
1025 	struct spdk_io_channel *pg_ch;
1026 	int rc;
1027 
1028 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
1029 	if (!pg_ch) {
1030 		return -1;
1031 	}
1032 
1033 	io_path->group = spdk_io_channel_get_ctx(pg_ch);
1034 
1035 #ifdef SPDK_CONFIG_VTUNE
1036 	io_path->group->collect_spin_stat = true;
1037 #else
1038 	io_path->group->collect_spin_stat = false;
1039 #endif
1040 
1041 	TAILQ_INIT(&io_path->pending_resets);
1042 
1043 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1044 		rc = bdev_ocssd_create_io_channel(io_path);
1045 		if (rc != 0) {
1046 			goto err_ocssd_ch;
1047 		}
1048 	}
1049 
1050 	io_path->ctrlr = nvme_bdev_ctrlr;
1051 
1052 	rc = bdev_nvme_create_qpair(io_path);
1053 	if (rc != 0) {
1054 		goto err_qpair;
1055 	}
1056 
1057 	return 0;
1058 
1059 err_qpair:
1060 	if (io_path->ocssd_ch) {
1061 		bdev_ocssd_destroy_io_channel(io_path);
1062 	}
1063 err_ocssd_ch:
1064 	spdk_put_io_channel(pg_ch);
1065 
1066 	return rc;
1067 }
1068 
1069 static void
1070 bdev_nvme_destroy_path_cb(void *io_device, void *ctx_buf)
1071 {
1072 	struct nvme_io_path *io_path = ctx_buf;
1073 
1074 	assert(io_path->group != NULL);
1075 
1076 	if (io_path->ocssd_ch != NULL) {
1077 		bdev_ocssd_destroy_io_channel(io_path);
1078 	}
1079 
1080 	bdev_nvme_destroy_qpair(io_path);
1081 
1082 	spdk_put_io_channel(spdk_io_channel_from_ctx(io_path->group));
1083 }
1084 
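/*
 * Accel framework hook registered with the NVMe poll group so the driver can
 * offload CRC-32C calculation (used, for example, for data digests on fabrics
 * transports).
 */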
1085 static void
1086 bdev_nvme_poll_group_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
1087 		uint32_t iov_cnt, uint32_t seed,
1088 		spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
1089 {
1090 	struct nvme_bdev_poll_group *group = ctx;
1091 	int rc;
1092 
1093 	assert(group->accel_channel != NULL);
1094 	assert(cb_fn != NULL);
1095 
1096 	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
1097 	if (rc) {
1098 		/* In these two error cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */
1099 		if (rc == -ENOMEM || rc == -EINVAL) {
1100 			cb_fn(cb_arg, rc);
1101 		}
1102 		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
1103 	}
1104 }
1105 
1106 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
1107 	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
1108 	.submit_accel_crc32c	= bdev_nvme_poll_group_submit_accel_crc32c,
1109 };
1110 
1111 static int
1112 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
1113 {
1114 	struct nvme_bdev_poll_group *group = ctx_buf;
1115 
1116 	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
1117 	if (group->group == NULL) {
1118 		return -1;
1119 	}
1120 
1121 	group->accel_channel = spdk_accel_engine_get_io_channel();
1122 	if (!group->accel_channel) {
1123 		spdk_nvme_poll_group_destroy(group->group);
1124 		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
1125 			    group);
1126 		return -1;
1127 	}
1128 
1129 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
1130 
1131 	if (group->poller == NULL) {
1132 		spdk_put_io_channel(group->accel_channel);
1133 		spdk_nvme_poll_group_destroy(group->group);
1134 		return -1;
1135 	}
1136 
1137 	return 0;
1138 }
1139 
1140 static void
1141 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
1142 {
1143 	struct nvme_bdev_poll_group *group = ctx_buf;
1144 
1145 	if (group->accel_channel) {
1146 		spdk_put_io_channel(group->accel_channel);
1147 	}
1148 
1149 	spdk_poller_unregister(&group->poller);
1150 	if (spdk_nvme_poll_group_destroy(group->group)) {
1151 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
1152 		assert(false);
1153 	}
1154 }
1155 
1156 static struct spdk_io_channel *
1157 bdev_nvme_get_io_channel(void *ctx)
1158 {
1159 	struct nvme_bdev *nvme_bdev = ctx;
1160 
1161 	return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr);
1162 }
1163 
1164 static void *
1165 bdev_nvme_get_module_ctx(void *ctx)
1166 {
1167 	struct nvme_bdev *nvme_bdev = ctx;
1168 
1169 	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
1170 }
1171 
1172 static const char *
1173 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
1174 {
1175 	switch (ana_state) {
1176 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
1177 		return "optimized";
1178 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1179 		return "non_optimized";
1180 	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
1181 		return "inaccessible";
1182 	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
1183 		return "persistent_loss";
1184 	case SPDK_NVME_ANA_CHANGE_STATE:
1185 		return "change";
1186 	default:
1187 		return NULL;
1188 	}
1189 }
1190 
1191 static int
1192 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
1193 {
1194 	struct nvme_bdev *nvme_bdev = ctx;
1195 	struct nvme_bdev_ns *nvme_ns;
1196 	struct spdk_nvme_ns *ns;
1197 	struct spdk_nvme_ctrlr *ctrlr;
1198 	const struct spdk_nvme_ctrlr_data *cdata;
1199 	const struct spdk_nvme_transport_id *trid;
1200 	union spdk_nvme_vs_register vs;
1201 	union spdk_nvme_csts_register csts;
1202 	char buf[128];
1203 
1204 	nvme_ns = nvme_bdev->nvme_ns;
1205 	assert(nvme_ns != NULL);
1206 	ns = nvme_ns->ns;
1207 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1208 
1209 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1210 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
1211 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
1212 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1213 
1214 	spdk_json_write_named_object_begin(w, "nvme");
1215 
1216 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1217 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
1218 	}
1219 
1220 	spdk_json_write_named_object_begin(w, "trid");
1221 
1222 	nvme_bdev_dump_trid_json(trid, w);
1223 
1224 	spdk_json_write_object_end(w);
1225 
1226 #ifdef SPDK_CONFIG_NVME_CUSE
1227 	size_t cuse_name_size = 128;
1228 	char cuse_name[cuse_name_size];
1229 
1230 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
1231 					    cuse_name, &cuse_name_size);
1232 	if (rc == 0) {
1233 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
1234 	}
1235 #endif
1236 
1237 	spdk_json_write_named_object_begin(w, "ctrlr_data");
1238 
1239 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
1240 
1241 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
1242 	spdk_str_trim(buf);
1243 	spdk_json_write_named_string(w, "model_number", buf);
1244 
1245 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
1246 	spdk_str_trim(buf);
1247 	spdk_json_write_named_string(w, "serial_number", buf);
1248 
1249 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
1250 	spdk_str_trim(buf);
1251 	spdk_json_write_named_string(w, "firmware_revision", buf);
1252 
1253 	if (cdata->subnqn[0] != '\0') {
1254 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
1255 	}
1256 
1257 	spdk_json_write_named_object_begin(w, "oacs");
1258 
1259 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
1260 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
1261 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
1262 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
1263 
1264 	spdk_json_write_object_end(w);
1265 
1266 	spdk_json_write_object_end(w);
1267 
1268 	spdk_json_write_named_object_begin(w, "vs");
1269 
1270 	spdk_json_write_name(w, "nvme_version");
1271 	if (vs.bits.ter) {
1272 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
1273 	} else {
1274 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
1275 	}
1276 
1277 	spdk_json_write_object_end(w);
1278 
1279 	spdk_json_write_named_object_begin(w, "csts");
1280 
1281 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
1282 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
1283 
1284 	spdk_json_write_object_end(w);
1285 
1286 	spdk_json_write_named_object_begin(w, "ns_data");
1287 
1288 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
1289 
1290 	if (cdata->cmic.ana_reporting) {
1291 		spdk_json_write_named_string(w, "ana_state",
1292 					     _nvme_ana_state_str(spdk_nvme_ns_get_ana_state(ns)));
1293 	}
1294 
1295 	spdk_json_write_object_end(w);
1296 
1297 	if (cdata->oacs.security) {
1298 		spdk_json_write_named_object_begin(w, "security");
1299 
1300 		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);
1301 
1302 		spdk_json_write_object_end(w);
1303 	}
1304 
1305 	spdk_json_write_object_end(w);
1306 
1307 	return 0;
1308 }
1309 
1310 static void
1311 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1312 {
1313 	/* No config per bdev needed */
1314 }
1315 
1316 static uint64_t
1317 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
1318 {
1319 	struct nvme_io_path *io_path = spdk_io_channel_get_ctx(ch);
1320 	struct nvme_bdev_poll_group *group = io_path->group;
1321 	uint64_t spin_time;
1322 
1323 	if (!group || !group->collect_spin_stat) {
1324 		return 0;
1325 	}
1326 
1327 	if (group->end_ticks != 0) {
1328 		group->spin_ticks += (group->end_ticks - group->start_ticks);
1329 		group->end_ticks = 0;
1330 	}
1331 
1332 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1333 	group->start_ticks = 0;
1334 	group->spin_ticks = 0;
1335 
1336 	return spin_time;
1337 }
1338 
1339 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1340 	.destruct		= bdev_nvme_destruct,
1341 	.submit_request		= bdev_nvme_submit_request,
1342 	.io_type_supported	= bdev_nvme_io_type_supported,
1343 	.get_io_channel		= bdev_nvme_get_io_channel,
1344 	.dump_info_json		= bdev_nvme_dump_info_json,
1345 	.write_config_json	= bdev_nvme_write_config_json,
1346 	.get_spin_time		= bdev_nvme_get_spin_time,
1347 	.get_module_ctx		= bdev_nvme_get_module_ctx,
1348 };
1349 
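/*
 * Fill in the spdk_bdev fields (product name, block size and count, write cache,
 * UUID, PI/DIF settings, atomic/physical block sizes, and ZNS limits) from the
 * namespace and controller data, then register the bdev.
 */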
1350 static int
1351 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
1352 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
1353 		 uint32_t prchk_flags, void *ctx)
1354 {
1355 	const struct spdk_uuid		*uuid;
1356 	const struct spdk_nvme_ctrlr_data *cdata;
1357 	const struct spdk_nvme_ns_data	*nsdata;
1358 	int				rc;
1359 	enum spdk_nvme_csi		csi;
1360 	uint32_t atomic_bs, phys_bs, bs;
1361 
1362 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1363 	csi = spdk_nvme_ns_get_csi(ns);
1364 
1365 	switch (csi) {
1366 	case SPDK_NVME_CSI_NVM:
1367 		disk->product_name = "NVMe disk";
1368 		break;
1369 	case SPDK_NVME_CSI_ZNS:
1370 		disk->product_name = "NVMe ZNS disk";
1371 		disk->zoned = true;
1372 		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
1373 		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
1374 					     spdk_nvme_ns_get_extended_sector_size(ns);
1375 		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
1376 		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
1377 		break;
1378 	default:
1379 		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
1380 		return -ENOTSUP;
1381 	}
1382 
1383 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
1384 	if (!disk->name) {
1385 		return -ENOMEM;
1386 	}
1387 
1388 	disk->write_cache = 0;
1389 	if (cdata->vwc.present) {
1390 		/* Enable if the Volatile Write Cache exists */
1391 		disk->write_cache = 1;
1392 	}
1393 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1394 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1395 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1396 
1397 	uuid = spdk_nvme_ns_get_uuid(ns);
1398 	if (uuid != NULL) {
1399 		disk->uuid = *uuid;
1400 	}
1401 
1402 	nsdata = spdk_nvme_ns_get_data(ns);
1403 	bs = spdk_nvme_ns_get_sector_size(ns);
1404 	atomic_bs = bs;
1405 	phys_bs = bs;
1406 	if (nsdata->nabo == 0) {
1407 		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
1408 			atomic_bs = bs * (1 + nsdata->nawupf);
1409 		} else {
1410 			atomic_bs = bs * (1 + cdata->awupf);
1411 		}
1412 	}
1413 	if (nsdata->nsfeat.optperf) {
1414 		phys_bs = bs * (1 + nsdata->npwg);
1415 	}
1416 	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
1417 
1418 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
1419 	if (disk->md_len != 0) {
1420 		disk->md_interleave = nsdata->flbas.extended;
1421 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1422 		if (disk->dif_type != SPDK_DIF_DISABLE) {
1423 			disk->dif_is_head_of_md = nsdata->dps.md_start;
1424 			disk->dif_check_flags = prchk_flags;
1425 		}
1426 	}
1427 
1428 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
1429 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
1430 		disk->acwu = 0;
1431 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1432 		disk->acwu = nsdata->nacwu;
1433 	} else {
1434 		disk->acwu = cdata->acwu;
1435 	}
1436 
1437 	disk->ctxt = ctx;
1438 	disk->fn_table = &nvmelib_fn_table;
1439 	disk->module = &nvme_if;
1440 	rc = spdk_bdev_register(disk);
1441 	if (rc) {
1442 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
1443 		free(disk->name);
1444 		return rc;
1445 	}
1446 
1447 	return 0;
1448 }
1449 
1450 static int
1451 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns)
1452 {
1453 	struct nvme_bdev *bdev;
1454 	int rc;
1455 
1456 	bdev = calloc(1, sizeof(*bdev));
1457 	if (!bdev) {
1458 		SPDK_ERRLOG("bdev calloc() failed\n");
1459 		return -ENOMEM;
1460 	}
1461 
1462 	bdev->nvme_ns = nvme_ns;
1463 	bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL;
1464 
1465 	rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr,
1466 			      nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev);
1467 	if (rc != 0) {
1468 		SPDK_ERRLOG("Failed to create NVMe disk\n");
1469 		free(bdev);
1470 		return rc;
1471 	}
1472 
1473 	nvme_ns->bdev = bdev;
1474 
1475 	return 0;
1476 }
1477 
1478 static bool
1479 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
1480 {
1481 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
1482 	const struct spdk_uuid *uuid1, *uuid2;
1483 
1484 	nsdata1 = spdk_nvme_ns_get_data(ns1);
1485 	nsdata2 = spdk_nvme_ns_get_data(ns2);
1486 	uuid1 = spdk_nvme_ns_get_uuid(ns1);
1487 	uuid2 = spdk_nvme_ns_get_uuid(ns2);
1488 
1489 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
1490 	       nsdata1->eui64 == nsdata2->eui64 &&
1491 	       uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0;
1492 }
1493 
1494 static void
1495 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1496 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
1497 {
1498 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1499 	struct spdk_nvme_ns	*ns;
1500 	int			rc = 0;
1501 
1502 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1503 	if (!ns) {
1504 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
1505 		rc = -EINVAL;
1506 		goto done;
1507 	}
1508 
1509 	nvme_ns->ns = ns;
1510 	nvme_ns->populated = true;
1511 
1512 	rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns);
1513 done:
1514 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1515 }
1516 
1517 static bool
1518 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1519 		 struct spdk_nvme_ctrlr_opts *opts)
1520 {
1521 	struct nvme_probe_skip_entry *entry;
1522 
1523 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1524 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1525 			return false;
1526 		}
1527 	}
1528 
1529 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1530 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1531 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1532 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1533 
1534 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1535 
1536 	return true;
1537 }
1538 
1539 static void
1540 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1541 {
1542 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx;
1543 
1544 	if (spdk_nvme_cpl_is_error(cpl)) {
1545 		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
1546 			     cpl->status.sct);
1547 		_bdev_nvme_reset(nvme_bdev_ctrlr);
1548 	}
1549 }
1550 
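/*
 * Command timeout handler: checks for controller fatal status where it is safe
 * to read CSTS, and otherwise acts according to g_opts.action_on_timeout
 * (abort, reset, or none).
 */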
1551 static void
1552 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1553 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1554 {
1555 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg;
1556 	union spdk_nvme_csts_register csts;
1557 	int rc;
1558 
1559 	assert(nvme_bdev_ctrlr->ctrlr == ctrlr);
1560 
1561 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1562 
1563 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
1564 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
1565 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
1566 	 * completion recursively.
1567 	 */
1568 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
1569 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1570 		if (csts.bits.cfs) {
1571 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1572 			_bdev_nvme_reset(nvme_bdev_ctrlr);
1573 			return;
1574 		}
1575 	}
1576 
1577 	switch (g_opts.action_on_timeout) {
1578 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1579 		if (qpair) {
1580 			/* Don't send abort to ctrlr when reset is running. */
1581 			pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1582 			if (nvme_bdev_ctrlr->resetting) {
1583 				pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1584 				SPDK_NOTICELOG("Quit abort. Ctrlr is in the process of resetting.\n");
1585 				return;
1586 			}
1587 			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1588 
1589 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1590 						       nvme_abort_cpl, nvme_bdev_ctrlr);
1591 			if (rc == 0) {
1592 				return;
1593 			}
1594 
1595 			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
1596 		}
1597 
1598 	/* FALLTHROUGH */
1599 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1600 		_bdev_nvme_reset(nvme_bdev_ctrlr);
1601 		break;
1602 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1603 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1604 		break;
1605 	default:
1606 		SPDK_ERRLOG("An invalid timeout action value was found.\n");
1607 		break;
1608 	}
1609 }
1610 
1611 static void
1612 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns)
1613 {
1614 	struct nvme_bdev *bdev;
1615 
1616 	bdev = nvme_ns->bdev;
1617 	if (bdev != NULL) {
1618 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1619 	}
1620 
1621 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
1622 }
1623 
1624 static void
1625 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns,
1626 			      struct nvme_async_probe_ctx *ctx)
1627 {
1628 	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
1629 }
1630 
1631 static void
1632 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns)
1633 {
1634 	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
1635 }
1636 
1637 void
1638 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1639 				   struct nvme_bdev_ns *nvme_ns, int rc)
1640 {
1641 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_ns->ctrlr;
1642 
1643 	assert(nvme_bdev_ctrlr != NULL);
1644 
1645 	if (rc == 0) {
1646 		pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1647 		nvme_bdev_ctrlr->ref++;
1648 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1649 	} else {
1650 		memset(nvme_ns, 0, sizeof(*nvme_ns));
1651 	}
1652 
1653 	if (ctx) {
1654 		ctx->populates_in_progress--;
1655 		if (ctx->populates_in_progress == 0) {
1656 			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
1657 		}
1658 	}
1659 }
1660 
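/*
 * Reconcile the bdev view of the controller's namespaces: resize bdevs whose
 * namespace changed size, populate namespaces that became active, and
 * depopulate namespaces that are no longer active.
 */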
1661 static void
1662 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1663 			       struct nvme_async_probe_ctx *ctx)
1664 {
1665 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1666 	struct nvme_bdev_ns	*nvme_ns;
1667 	struct spdk_nvme_ns	*ns;
1668 	struct nvme_bdev	*bdev;
1669 	uint32_t		i;
1670 	int			rc;
1671 	uint64_t		num_sectors;
1672 	bool			ns_is_active;
1673 
1674 	if (ctx) {
1675 		/* Initialize this count to 1 to handle the populate functions
1676 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1677 		 */
1678 		ctx->populates_in_progress = 1;
1679 	}
1680 
1681 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1682 		uint32_t	nsid = i + 1;
1683 
1684 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
1685 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1686 
1687 		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) {
1688 			/* NS is still there but attributes may have changed */
1689 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1690 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
1691 			bdev = nvme_ns->bdev;
1692 			assert(bdev != NULL);
1693 			if (bdev->disk.blockcnt != num_sectors) {
1694 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
1695 					       nsid,
1696 					       bdev->disk.name,
1697 					       bdev->disk.blockcnt,
1698 					       num_sectors);
1699 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1700 				if (rc != 0) {
1701 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1702 						    bdev->disk.name, rc);
1703 				}
1704 			}
1705 		}
1706 
1707 		if (!nvme_ns->populated && ns_is_active) {
1708 			nvme_ns->id = nsid;
1709 			nvme_ns->ctrlr = nvme_bdev_ctrlr;
1710 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1711 				nvme_ns->type = NVME_BDEV_NS_OCSSD;
1712 			} else {
1713 				nvme_ns->type = NVME_BDEV_NS_STANDARD;
1714 			}
1715 
1716 			nvme_ns->bdev = NULL;
1717 
1718 			if (ctx) {
1719 				ctx->populates_in_progress++;
1720 			}
1721 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx);
1722 		}
1723 
1724 		if (nvme_ns->populated && !ns_is_active) {
1725 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1726 		}
1727 	}
1728 
1729 	if (ctx) {
1730 		/* Decrement this count now that the loop is over to account
1731 		 * for the one we started with.  If the count is then 0, we
1732 		 * know any populate_namespace functions completed immediately,
1733 		 * so we'll kick the callback here.
1734 		 */
1735 		ctx->populates_in_progress--;
1736 		if (ctx->populates_in_progress == 0) {
1737 			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
1738 		}
1739 	}
1740 
1741 }
1742 
1743 static void
1744 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1745 {
1746 	uint32_t i;
1747 	struct nvme_bdev_ns *nvme_ns;
1748 
1749 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1750 		uint32_t nsid = i + 1;
1751 
1752 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1753 		if (nvme_ns->populated) {
1754 			assert(nvme_ns->id == nsid);
1755 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1756 		}
1757 	}
1758 }
1759 
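/* Asynchronous Event Request completion callback. A namespace attribute
 * change notice triggers a namespace rescan, and an OCSSD chunk
 * notification is forwarded to the OCSSD layer.
 */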
1760 static void
1761 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1762 {
1763 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1764 	union spdk_nvme_async_event_completion	event;
1765 
1766 	if (spdk_nvme_cpl_is_error(cpl)) {
1767 		SPDK_WARNLOG("AER request execution failed\n");
1768 		return;
1769 	}
1770 
1771 	event.raw = cpl->cdw0;
1772 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1773 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1774 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1775 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1776 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1777 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1778 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1779 	}
1780 }
1781 
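/* Report the result of namespace population back to the caller and free
 * the probe context if the asynchronous probe has already finished.
 */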
1782 static void
1783 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1784 {
1785 	if (ctx->cb_fn) {
1786 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1787 	}
1788 
1789 	ctx->namespaces_populated = true;
1790 	if (ctx->probe_done) {
1791 		/* The probe was already completed, so we need to free the context
1792 		 * here.  This can happen for cases like OCSSD, where we need to
1793 		 * send additional commands to the SSD after attach.
1794 		 */
1795 		free(ctx);
1796 	}
1797 }
1798 
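/* Allocate and register an nvme_bdev_ctrlr for a newly attached controller:
 * set up the per-namespace structures, register the I/O device, start the
 * admin queue poller, and install the timeout, AER, and remove callbacks.
 */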
1799 static int
1800 _nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
1801 			const char *name,
1802 			const struct spdk_nvme_transport_id *trid,
1803 			uint32_t prchk_flags,
1804 			struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr)
1805 {
1806 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1807 	struct nvme_bdev_ctrlr_trid *trid_entry;
1808 	uint32_t i;
1809 	int rc;
1810 
1811 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1812 	if (nvme_bdev_ctrlr == NULL) {
1813 		SPDK_ERRLOG("Failed to allocate device struct\n");
1814 		return -ENOMEM;
1815 	}
1816 
1817 	rc = pthread_mutex_init(&nvme_bdev_ctrlr->mutex, NULL);
1818 	if (rc != 0) {
1819 		goto err_init_mutex;
1820 	}
1821 
1822 	TAILQ_INIT(&nvme_bdev_ctrlr->trids);
1823 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1824 	if (nvme_bdev_ctrlr->num_ns != 0) {
1825 		nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1826 		if (!nvme_bdev_ctrlr->namespaces) {
1827 			SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
1828 			rc = -ENOMEM;
1829 			goto err_alloc_namespaces;
1830 		}
1831 	}
1832 
1833 	trid_entry = calloc(1, sizeof(*trid_entry));
1834 	if (trid_entry == NULL) {
1835 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
1836 		rc = -ENOMEM;
1837 		goto err_alloc_trid;
1838 	}
1839 
1840 	trid_entry->trid = *trid;
1841 
1842 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1843 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1844 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1845 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1846 			rc = -ENOMEM;
1847 			goto err_alloc_namespace;
1848 		}
1849 	}
1850 
1851 	nvme_bdev_ctrlr->thread = spdk_get_thread();
1852 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1853 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1854 	nvme_bdev_ctrlr->ref = 1;
1855 	nvme_bdev_ctrlr->connected_trid = &trid_entry->trid;
1856 	nvme_bdev_ctrlr->name = strdup(name);
1857 	if (nvme_bdev_ctrlr->name == NULL) {
1858 		rc = -ENOMEM;
1859 		goto err_alloc_name;
1860 	}
1861 
1862 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1863 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1864 		if (spdk_unlikely(rc != 0)) {
1865 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1866 			goto err_init_ocssd;
1867 		}
1868 	}
1869 
1870 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1871 
1872 	spdk_io_device_register(nvme_bdev_ctrlr,
1873 				bdev_nvme_create_path_cb,
1874 				bdev_nvme_destroy_path_cb,
1875 				sizeof(struct nvme_io_path),
1876 				name);
1877 
1878 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr,
1879 					       g_opts.nvme_adminq_poll_period_us);
1880 
1881 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1882 
1883 	if (g_opts.timeout_us > 0) {
1884 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1885 				timeout_cb, nvme_bdev_ctrlr);
1886 	}
1887 
1888 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1889 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr);
1890 
1891 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1892 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1893 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1894 	}
1895 
1896 	TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link);
1897 
1898 	if (_nvme_bdev_ctrlr != NULL) {
1899 		*_nvme_bdev_ctrlr = nvme_bdev_ctrlr;
1900 	}
1901 	return 0;
1902 
1903 err_init_ocssd:
1904 	free(nvme_bdev_ctrlr->name);
1905 err_alloc_name:
1906 err_alloc_namespace:
1907 	for (; i > 0; i--) {
1908 		free(nvme_bdev_ctrlr->namespaces[i - 1]);
1909 	}
1910 	free(trid_entry);
1911 err_alloc_trid:
1912 	free(nvme_bdev_ctrlr->namespaces);
1913 err_alloc_namespaces:
1914 	pthread_mutex_destroy(&nvme_bdev_ctrlr->mutex);
1915 err_init_mutex:
1916 	free(nvme_bdev_ctrlr);
1917 	return rc;
1918 }
1919 
1920 static void
1921 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
1922 		       const char *name,
1923 		       const struct spdk_nvme_transport_id *trid,
1924 		       uint32_t prchk_flags,
1925 		       struct nvme_async_probe_ctx *ctx)
1926 {
1927 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL;
1928 	int rc;
1929 
1930 	rc = _nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, &nvme_bdev_ctrlr);
1931 	if (rc != 0) {
1932 		SPDK_ERRLOG("Failed to create new NVMe controller\n");
1933 		goto err;
1934 	}
1935 
1936 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
1937 	return;
1938 
1939 err:
1940 	if (ctx != NULL) {
1941 		populate_namespaces_cb(ctx, 0, rc);
1942 	}
1943 }
1944 
1945 static void
1946 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1947 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1948 {
1949 	struct nvme_probe_ctx *ctx = cb_ctx;
1950 	char *name = NULL;
1951 	uint32_t prchk_flags = 0;
1952 	size_t i;
1953 
1954 	if (ctx) {
1955 		for (i = 0; i < ctx->count; i++) {
1956 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1957 				prchk_flags = ctx->prchk_flags[i];
1958 				name = strdup(ctx->names[i]);
1959 				break;
1960 			}
1961 		}
1962 	} else {
1963 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1964 	}
1965 	if (!name) {
1966 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1967 		return;
1968 	}
1969 
1970 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
1971 
1972 	nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL);
1973 
1974 	free(name);
1975 }
1976 
1977 static void
1978 _nvme_bdev_ctrlr_destruct(void *ctx)
1979 {
1980 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx;
1981 
1982 	nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr);
1983 	nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1984 }
1985 
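/* Mark the controller for destruction and start tearing it down. For a
 * non-hotplug delete of a PCIe controller, its trid is recorded in the
 * skip list so that later probes do not re-attach it.
 */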
1986 static int
1987 _bdev_nvme_delete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool hotplug)
1988 {
1989 	struct nvme_probe_skip_entry *entry;
1990 
1991 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1992 
1993 	/* The controller's destruction was already started */
1994 	if (nvme_bdev_ctrlr->destruct) {
1995 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1996 		return 0;
1997 	}
1998 
1999 	if (!hotplug &&
2000 	    nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2001 		entry = calloc(1, sizeof(*entry));
2002 		if (!entry) {
2003 			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2004 			return -ENOMEM;
2005 		}
2006 		entry->trid = *nvme_bdev_ctrlr->connected_trid;
2007 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
2008 	}
2009 
2010 	nvme_bdev_ctrlr->destruct = true;
2011 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2012 
2013 	_nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
2014 
2015 	return 0;
2016 }
2017 
2018 static void
2019 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
2020 {
2021 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx;
2022 
2023 	_bdev_nvme_delete(nvme_bdev_ctrlr, true);
2024 }
2025 
2026 static int
2027 bdev_nvme_hotplug_probe(void *arg)
2028 {
2029 	if (g_hotplug_probe_ctx == NULL) {
2030 		spdk_poller_unregister(&g_hotplug_probe_poller);
2031 		return SPDK_POLLER_IDLE;
2032 	}
2033 
2034 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
2035 		g_hotplug_probe_ctx = NULL;
2036 		spdk_poller_unregister(&g_hotplug_probe_poller);
2037 	}
2038 
2039 	return SPDK_POLLER_BUSY;
2040 }
2041 
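/* Periodic hotplug poller: start an asynchronous PCIe probe if one is not
 * already in flight and let bdev_nvme_hotplug_probe() drive it to
 * completion.
 */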
2042 static int
2043 bdev_nvme_hotplug(void *arg)
2044 {
2045 	struct spdk_nvme_transport_id trid_pcie;
2046 
2047 	if (g_hotplug_probe_ctx) {
2048 		return SPDK_POLLER_BUSY;
2049 	}
2050 
2051 	memset(&trid_pcie, 0, sizeof(trid_pcie));
2052 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
2053 
2054 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
2055 			      hotplug_probe_cb, attach_cb, NULL);
2056 
2057 	if (g_hotplug_probe_ctx) {
2058 		assert(g_hotplug_probe_poller == NULL);
2059 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
2060 	}
2061 
2062 	return SPDK_POLLER_BUSY;
2063 }
2064 
2065 void
2066 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
2067 {
2068 	*opts = g_opts;
2069 }
2070 
2071 int
2072 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
2073 {
2074 	if (g_bdev_nvme_init_thread != NULL) {
2075 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2076 			return -EPERM;
2077 		}
2078 	}
2079 
2080 	g_opts = *opts;
2081 
2082 	return 0;
2083 }
2084 
2085 struct set_nvme_hotplug_ctx {
2086 	uint64_t period_us;
2087 	bool enabled;
2088 	spdk_msg_fn fn;
2089 	void *fn_ctx;
2090 };
2091 
2092 static void
2093 set_nvme_hotplug_period_cb(void *_ctx)
2094 {
2095 	struct set_nvme_hotplug_ctx *ctx = _ctx;
2096 
2097 	spdk_poller_unregister(&g_hotplug_poller);
2098 	if (ctx->enabled) {
2099 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
2100 	}
2101 
2102 	g_nvme_hotplug_poll_period_us = ctx->period_us;
2103 	g_nvme_hotplug_enabled = ctx->enabled;
2104 	if (ctx->fn) {
2105 		ctx->fn(ctx->fn_ctx);
2106 	}
2107 
2108 	free(ctx);
2109 }
2110 
2111 int
2112 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
2113 {
2114 	struct set_nvme_hotplug_ctx *ctx;
2115 
2116 	if (enabled == true && !spdk_process_is_primary()) {
2117 		return -EPERM;
2118 	}
2119 
2120 	ctx = calloc(1, sizeof(*ctx));
2121 	if (ctx == NULL) {
2122 		return -ENOMEM;
2123 	}
2124 
2125 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
2126 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
2127 	ctx->enabled = enabled;
2128 	ctx->fn = cb;
2129 	ctx->fn_ctx = cb_ctx;
2130 
2131 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
2132 	return 0;
2133 }
2134 
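/* All namespace populate callbacks have completed. Collect the names of
 * the bdevs that were created and report them back to the original caller.
 */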
2135 static void
2136 nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2137 				    struct nvme_async_probe_ctx *ctx)
2138 {
2139 	struct nvme_bdev_ns	*nvme_ns;
2140 	struct nvme_bdev	*nvme_bdev;
2141 	uint32_t		i, nsid;
2142 	size_t			j;
2143 
2144 	assert(nvme_bdev_ctrlr != NULL);
2145 
2146 	/*
2147 	 * Report the new bdevs that were created in this call.
2148 	 * There can be more than one bdev per NVMe controller.
2149 	 */
2150 	j = 0;
2151 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
2152 		nsid = i + 1;
2153 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
2154 		if (!nvme_ns->populated) {
2155 			continue;
2156 		}
2157 		assert(nvme_ns->id == nsid);
2158 		nvme_bdev = nvme_ns->bdev;
2159 		if (nvme_bdev == NULL) {
2160 			assert(nvme_ns->type == NVME_BDEV_NS_OCSSD);
2161 			continue;
2162 		}
2163 		if (j < ctx->count) {
2164 			ctx->names[j] = nvme_bdev->disk.name;
2165 			j++;
2166 		} else {
2167 			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
2168 				    ctx->count);
2169 			populate_namespaces_cb(ctx, 0, -ERANGE);
2170 			return;
2171 		}
2172 	}
2173 
2174 	populate_namespaces_cb(ctx, j, 0);
2175 }
2176 
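/* Check whether a new path may be added for failover: PCIe failover is
 * rejected, the transport type and subsystem NQN must match the current
 * connection, and the trid must not already be registered.
 */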
2177 static int
2178 bdev_nvme_compare_trids(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2179 			struct spdk_nvme_ctrlr *new_ctrlr,
2180 			struct spdk_nvme_transport_id *trid)
2181 {
2182 	struct nvme_bdev_ctrlr_trid *tmp_trid;
2183 
2184 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2185 		SPDK_ERRLOG("PCIe failover is not supported.\n");
2186 		return -ENOTSUP;
2187 	}
2188 
2189 	/* Currently we only support failover to the same transport type. */
2190 	if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) {
2191 		return -EINVAL;
2192 	}
2193 
2194 	/* Currently we only support failover to the same NQN. */
2195 	if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
2196 		return -EINVAL;
2197 	}
2198 
2199 	/* Skip all the other checks if we've already registered this path. */
2200 	TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) {
2201 		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
2202 			return -EEXIST;
2203 		}
2204 	}
2205 
2206 	return 0;
2207 }
2208 
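/* Verify that the controller reached through the new path exposes the same
 * namespaces as the existing controller before accepting it as a failover
 * target.
 */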
2209 static int
2210 bdev_nvme_compare_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2211 			     struct spdk_nvme_ctrlr *new_ctrlr)
2212 {
2213 	uint32_t i, nsid;
2214 	struct nvme_bdev_ns *nvme_ns;
2215 	struct spdk_nvme_ns *new_ns;
2216 
2217 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) {
2218 		return -EINVAL;
2219 	}
2220 
2221 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
2222 		nsid = i + 1;
2223 
2224 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
2225 		if (!nvme_ns->populated) {
2226 			continue;
2227 		}
2228 
2229 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
2230 		assert(new_ns != NULL);
2231 
2232 		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
2233 			return -EINVAL;
2234 		}
2235 	}
2236 
2237 	return 0;
2238 }
2239 
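/* Insert the new trid into the failover list, keeping it ahead of any paths
 * that have already failed.
 */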
2240 static int
2241 _bdev_nvme_add_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2242 			      struct spdk_nvme_transport_id *trid)
2243 {
2244 	struct nvme_bdev_ctrlr_trid *new_trid, *tmp_trid;
2245 
2246 	new_trid = calloc(1, sizeof(*new_trid));
2247 	if (new_trid == NULL) {
2248 		return -ENOMEM;
2249 	}
2250 	new_trid->trid = *trid;
2251 	new_trid->is_failed = false;
2252 
2253 	TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) {
2254 		if (tmp_trid->is_failed) {
2255 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
2256 			return 0;
2257 		}
2258 	}
2259 
2260 	TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link);
2261 	return 0;
2262 }
2263 
2264 /* This is the case that a secondary path is added to an existing
2265  * nvme_bdev_ctrlr for failover. After checking if it can access the same
2266  * namespaces as the primary path, it is disconnected until failover occurs.
2267  */
2268 static void
2269 bdev_nvme_add_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2270 			     struct spdk_nvme_ctrlr *new_ctrlr,
2271 			     struct spdk_nvme_transport_id *trid,
2272 			     struct nvme_async_probe_ctx *ctx)
2273 {
2274 	int rc;
2275 
2276 	assert(nvme_bdev_ctrlr != NULL);
2277 
2278 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
2279 
2280 	rc = bdev_nvme_compare_trids(nvme_bdev_ctrlr, new_ctrlr, trid);
2281 	if (rc != 0) {
2282 		goto exit;
2283 	}
2284 
2285 	rc = bdev_nvme_compare_namespaces(nvme_bdev_ctrlr, new_ctrlr);
2286 	if (rc != 0) {
2287 		goto exit;
2288 	}
2289 
2290 	rc = _bdev_nvme_add_secondary_trid(nvme_bdev_ctrlr, trid);
2291 
2292 exit:
2293 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2294 
2295 	spdk_nvme_detach(new_ctrlr);
2296 
2297 	if (ctx != NULL) {
2298 		populate_namespaces_cb(ctx, 0, rc);
2299 	}
2300 }
2301 
2302 static void
2303 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2304 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2305 {
2306 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
2307 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
2308 	struct nvme_async_probe_ctx *ctx;
2309 
2310 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2311 	ctx->ctrlr_attached = true;
2312 
2313 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
2314 	if (nvme_bdev_ctrlr) {
2315 		bdev_nvme_add_secondary_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid, ctx);
2316 		return;
2317 	}
2318 
2319 	nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx);
2320 }
2321 
2322 static int
2323 bdev_nvme_async_poll(void *arg)
2324 {
2325 	struct nvme_async_probe_ctx	*ctx = arg;
2326 	int				rc;
2327 
2328 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
2329 	if (spdk_unlikely(rc != -EAGAIN)) {
2330 		ctx->probe_done = true;
2331 		spdk_poller_unregister(&ctx->poller);
2332 		if (!ctx->ctrlr_attached) {
2333 			/* The probe is done, but no controller was attached.
2334 			 * That means we had a failure, so report -EIO back to
2335 			 * the caller (usually the RPC). populate_namespaces_cb()
2336 			 * will take care of freeing the nvme_async_probe_ctx.
2337 			 */
2338 			populate_namespaces_cb(ctx, 0, -EIO);
2339 		} else if (ctx->namespaces_populated) {
2340 			/* The namespaces for the attached controller were all
2341 			 * populated and the response was already sent to the
2342 			 * caller (usually the RPC).  So free the context here.
2343 			 */
2344 			free(ctx);
2345 		}
2346 	}
2347 
2348 	return SPDK_POLLER_BUSY;
2349 }
2350 
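/* Attach a controller asynchronously and create bdevs for its namespaces.
 * If a controller with the same name already exists, connect_attach_cb()
 * instead tries to add the new trid as a secondary failover path.
 */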
2351 int
2352 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
2353 		 struct spdk_nvme_host_id *hostid,
2354 		 const char *base_name,
2355 		 const char **names,
2356 		 uint32_t count,
2357 		 const char *hostnqn,
2358 		 uint32_t prchk_flags,
2359 		 spdk_bdev_create_nvme_fn cb_fn,
2360 		 void *cb_ctx,
2361 		 struct spdk_nvme_ctrlr_opts *opts)
2362 {
2363 	struct nvme_probe_skip_entry	*entry, *tmp;
2364 	struct nvme_async_probe_ctx	*ctx;
2365 
2366 	/* TODO expand this check to include both the host and target TRIDs.
2367 	 * Only if both are the same should we fail.
2368 	 */
2369 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
2370 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
2371 		return -EEXIST;
2372 	}
2373 
2374 	ctx = calloc(1, sizeof(*ctx));
2375 	if (!ctx) {
2376 		return -ENOMEM;
2377 	}
2378 	ctx->base_name = base_name;
2379 	ctx->names = names;
2380 	ctx->count = count;
2381 	ctx->cb_fn = cb_fn;
2382 	ctx->cb_ctx = cb_ctx;
2383 	ctx->prchk_flags = prchk_flags;
2384 	ctx->trid = *trid;
2385 
2386 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2387 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
2388 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2389 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2390 				free(entry);
2391 				break;
2392 			}
2393 		}
2394 	}
2395 
2396 	if (opts) {
2397 		memcpy(&ctx->opts, opts, sizeof(*opts));
2398 	} else {
2399 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
2400 	}
2401 
2402 	ctx->opts.transport_retry_count = g_opts.retry_count;
2403 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
2404 
2405 	if (hostnqn) {
2406 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
2407 	}
2408 
2409 	if (hostid->hostaddr[0] != '\0') {
2410 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
2411 	}
2412 
2413 	if (hostid->hostsvcid[0] != '\0') {
2414 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
2415 	}
2416 
2417 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
2418 	if (ctx->probe_ctx == NULL) {
2419 		SPDK_ERRLOG("No controller was found with the provided trid (traddr: %s)\n", trid->traddr);
2420 		free(ctx);
2421 		return -ENODEV;
2422 	}
2423 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
2424 
2425 	return 0;
2426 }
2427 
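/* Remove a path that is not currently in use. The connected trid can only
 * be removed by failing over or by deleting the whole controller.
 */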
2428 static int
2429 bdev_nvme_delete_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2430 				const struct spdk_nvme_transport_id *trid)
2431 {
2432 	struct nvme_bdev_ctrlr_trid	*ctrlr_trid, *tmp_trid;
2433 
2434 	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
2435 		return -EBUSY;
2436 	}
2437 
2438 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) {
2439 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
2440 			TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
2441 			free(ctrlr_trid);
2442 			return 0;
2443 		}
2444 	}
2445 
2446 	return -ENXIO;
2447 }
2448 
2449 int
2450 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid)
2451 {
2452 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
2453 	struct nvme_bdev_ctrlr_trid	*ctrlr_trid;
2454 
2455 	if (name == NULL) {
2456 		return -EINVAL;
2457 	}
2458 
2459 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
2460 	if (nvme_bdev_ctrlr == NULL) {
2461 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2462 		return -ENODEV;
2463 	}
2464 
2465 	/* case 1: remove the controller itself. */
2466 	if (trid == NULL) {
2467 		return _bdev_nvme_delete(nvme_bdev_ctrlr, false);
2468 	}
2469 
2470 	/* case 2: we are currently using the path to be removed. */
2471 	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
2472 		ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
2473 		assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid);
2474 		/* case 2A: the current path is the only path. */
2475 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
2476 			return _bdev_nvme_delete(nvme_bdev_ctrlr, false);
2477 		}
2478 
2479 		/* case 2B: there is an alternative path. */
2480 		return bdev_nvme_failover(nvme_bdev_ctrlr, true);
2481 	}
2482 
2483 	/* case 3: We are not using the specified path. */
2484 	return bdev_nvme_delete_secondary_trid(nvme_bdev_ctrlr, trid);
2485 }
2486 
2487 static int
2488 bdev_nvme_library_init(void)
2489 {
2490 	g_bdev_nvme_init_thread = spdk_get_thread();
2491 
2492 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
2493 				bdev_nvme_poll_group_destroy_cb,
2494 				sizeof(struct nvme_bdev_poll_group),  "bdev_nvme_poll_groups");
2495 
2496 	return 0;
2497 }
2498 
2499 static void
2500 bdev_nvme_library_fini(void)
2501 {
2502 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
2503 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2504 
2505 	spdk_poller_unregister(&g_hotplug_poller);
2506 	free(g_hotplug_probe_ctx);
2507 	g_hotplug_probe_ctx = NULL;
2508 
2509 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2510 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2511 		free(entry);
2512 	}
2513 
2514 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2515 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
2516 		pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
2517 		if (nvme_bdev_ctrlr->destruct) {
2518 			/* This controller's destruction was already started
2519 			 * before the application started shutting down
2520 			 */
2521 			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2522 			continue;
2523 		}
2524 		nvme_bdev_ctrlr->destruct = true;
2525 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2526 
2527 		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct,
2528 				     nvme_bdev_ctrlr);
2529 	}
2530 
2531 	g_bdev_nvme_module_finish = true;
2532 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2533 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2534 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
2535 		spdk_bdev_module_finish_done();
2536 		return;
2537 	}
2538 
2539 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2540 }
2541 
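/* The device reported a protection information error. Re-verify the data
 * buffer with the software DIF/DIX engine to locate (or rule out) the
 * offending block and log the result.
 */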
2542 static void
2543 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
2544 {
2545 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2546 	struct spdk_bdev *bdev = bdev_io->bdev;
2547 	struct spdk_dif_ctx dif_ctx;
2548 	struct spdk_dif_error err_blk = {};
2549 	int rc;
2550 
2551 	rc = spdk_dif_ctx_init(&dif_ctx,
2552 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2553 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2554 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2555 	if (rc != 0) {
2556 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2557 		return;
2558 	}
2559 
2560 	if (bdev->md_interleave) {
2561 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2562 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2563 	} else {
2564 		struct iovec md_iov = {
2565 			.iov_base	= bdev_io->u.bdev.md_buf,
2566 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2567 		};
2568 
2569 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2570 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2571 	}
2572 
2573 	if (rc != 0) {
2574 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2575 			    err_blk.err_type, err_blk.err_offset);
2576 	} else {
2577 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2578 	}
2579 }
2580 
2581 static void
2582 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2583 {
2584 	struct nvme_bdev_io *bio = ref;
2585 
2586 	if (spdk_nvme_cpl_is_success(cpl)) {
2587 		/* Run PI verification for read data buffer. */
2588 		bdev_nvme_verify_pi_error(bio);
2589 	}
2590 
2591 	/* Return original completion status */
2592 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
2593 }
2594 
2595 static void
2596 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2597 {
2598 	struct nvme_bdev_io *bio = ref;
2599 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2600 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2601 	struct nvme_io_path *io_path;
2602 	struct spdk_nvme_ns *ns;
2603 	struct spdk_nvme_qpair *qpair;
2604 	int ret;
2605 
2606 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2607 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2608 			    cpl->status.sct, cpl->status.sc);
2609 
2610 		/* Save completion status to use after verifying PI error. */
2611 		bio->cpl = *cpl;
2612 
2613 		io_path = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2614 
2615 		if (spdk_likely(bdev_nvme_find_io_path(nbdev, io_path, &ns, &qpair))) {
2616 			/* Read without PI checking to verify PI error. */
2617 			ret = bdev_nvme_no_pi_readv(ns,
2618 						    qpair,
2619 						    bio,
2620 						    bdev_io->u.bdev.iovs,
2621 						    bdev_io->u.bdev.iovcnt,
2622 						    bdev_io->u.bdev.md_buf,
2623 						    bdev_io->u.bdev.num_blocks,
2624 						    bdev_io->u.bdev.offset_blocks);
2625 			if (ret == 0) {
2626 				return;
2627 			}
2628 		}
2629 	}
2630 
2631 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2632 }
2633 
2634 static void
2635 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2636 {
2637 	struct nvme_bdev_io *bio = ref;
2638 
2639 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2640 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2641 			    cpl->status.sct, cpl->status.sc);
2642 		/* Run PI verification for write data buffer if PI error is detected. */
2643 		bdev_nvme_verify_pi_error(bio);
2644 	}
2645 
2646 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2647 }
2648 
2649 static void
2650 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2651 {
2652 	struct nvme_bdev_io *bio = ref;
2653 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2654 
2655 	/* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
2656 	 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
2657 	 */
2658 	bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
2659 
2660 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2661 		SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
2662 			    cpl->status.sct, cpl->status.sc);
2663 		/* Run PI verification for zone append data buffer if PI error is detected. */
2664 		bdev_nvme_verify_pi_error(bio);
2665 	}
2666 
2667 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2668 }
2669 
2670 static void
2671 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2672 {
2673 	struct nvme_bdev_io *bio = ref;
2674 
2675 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2676 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2677 			    cpl->status.sct, cpl->status.sc);
2678 		/* Run PI verification for compare data buffer if PI error is detected. */
2679 		bdev_nvme_verify_pi_error(bio);
2680 	}
2681 
2682 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2683 }
2684 
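/* Completion callback shared by both halves of a fused compare-and-write.
 * The compare completion is stashed in bio->cpl; the write completion
 * determines the final status reported for the I/O.
 */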
2685 static void
2686 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2687 {
2688 	struct nvme_bdev_io *bio = ref;
2689 
2690 	/* Compare operation completion */
2691 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2692 		/* Save compare result for write callback */
2693 		bio->cpl = *cpl;
2694 		return;
2695 	}
2696 
2697 	/* Write operation completion */
2698 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2699 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2700 		 * complete the IO with the compare operation's status.
2701 		 */
2702 		if (!spdk_nvme_cpl_is_error(cpl)) {
2703 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2704 		}
2705 
2706 		bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
2707 	} else {
2708 		bdev_nvme_io_complete_nvme_status(bio, cpl);
2709 	}
2710 }
2711 
2712 static void
2713 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2714 {
2715 	struct nvme_bdev_io *bio = ref;
2716 
2717 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2718 }
2719 
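/* Translate one NVMe ZNS zone descriptor into the generic bdev zone info
 * representation.
 */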
2720 static int
2721 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
2722 {
2723 	switch (desc->zs) {
2724 	case SPDK_NVME_ZONE_STATE_EMPTY:
2725 		info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
2726 		break;
2727 	case SPDK_NVME_ZONE_STATE_IOPEN:
2728 		info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
2729 		break;
2730 	case SPDK_NVME_ZONE_STATE_EOPEN:
2731 		info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
2732 		break;
2733 	case SPDK_NVME_ZONE_STATE_CLOSED:
2734 		info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
2735 		break;
2736 	case SPDK_NVME_ZONE_STATE_RONLY:
2737 		info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
2738 		break;
2739 	case SPDK_NVME_ZONE_STATE_FULL:
2740 		info->state = SPDK_BDEV_ZONE_STATE_FULL;
2741 		break;
2742 	case SPDK_NVME_ZONE_STATE_OFFLINE:
2743 		info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
2744 		break;
2745 	default:
2746 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
2747 		return -EIO;
2748 	}
2749 
2750 	info->zone_id = desc->zslba;
2751 	info->write_pointer = desc->wp;
2752 	info->capacity = desc->zcap;
2753 
2754 	return 0;
2755 }
2756 
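/* Completion of a Report Zones command. Copy the returned descriptors into
 * the caller's buffer and, if more zones were requested than fit in a single
 * report, issue the next Report Zones starting at the following zone.
 */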
2757 static void
2758 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
2759 {
2760 	struct nvme_bdev_io *bio = ref;
2761 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2762 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2763 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
2764 	struct nvme_io_path *io_path = spdk_io_channel_get_ctx(ch);
2765 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
2766 	uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
2767 	struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
2768 	uint64_t max_zones_per_buf, i;
2769 	uint32_t zone_report_bufsize;
2770 	struct spdk_nvme_ns *ns;
2771 	struct spdk_nvme_qpair *qpair;
2772 	int ret;
2773 
2774 	if (spdk_nvme_cpl_is_error(cpl)) {
2775 		goto out_complete_io_nvme_cpl;
2776 	}
2777 
2778 	if (!bdev_nvme_find_io_path(nbdev, io_path, &ns, &qpair)) {
2779 		ret = -ENXIO;
2780 		goto out_complete_io_ret;
2781 	}
2782 
2783 	zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
2784 	max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
2785 			    sizeof(bio->zone_report_buf->descs[0]);
2786 
2787 	if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
2788 		ret = -EINVAL;
2789 		goto out_complete_io_ret;
2790 	}
2791 
2792 	if (!bio->zone_report_buf->nr_zones) {
2793 		ret = -EINVAL;
2794 		goto out_complete_io_ret;
2795 	}
2796 
2797 	for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
2798 		ret = fill_zone_from_report(&info[bio->handled_zones],
2799 					    &bio->zone_report_buf->descs[i]);
2800 		if (ret) {
2801 			goto out_complete_io_ret;
2802 		}
2803 		bio->handled_zones++;
2804 	}
2805 
2806 	if (bio->handled_zones < zones_to_copy) {
2807 		uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
2808 		uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
2809 
2810 		memset(bio->zone_report_buf, 0, zone_report_bufsize);
2811 		ret = spdk_nvme_zns_report_zones(ns, qpair,
2812 						 bio->zone_report_buf, zone_report_bufsize,
2813 						 slba, SPDK_NVME_ZRA_LIST_ALL, true,
2814 						 bdev_nvme_get_zone_info_done, bio);
2815 		if (!ret) {
2816 			return;
2817 		} else {
2818 			goto out_complete_io_ret;
2819 		}
2820 	}
2821 
2822 out_complete_io_nvme_cpl:
2823 	free(bio->zone_report_buf);
2824 	bio->zone_report_buf = NULL;
2825 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2826 	return;
2827 
2828 out_complete_io_ret:
2829 	free(bio->zone_report_buf);
2830 	bio->zone_report_buf = NULL;
2831 	bdev_nvme_io_complete(bio, ret);
2832 }
2833 
2834 static void
2835 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
2836 {
2837 	struct nvme_bdev_io *bio = ref;
2838 
2839 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2840 }
2841 
2842 static void
2843 bdev_nvme_admin_passthru_completion(void *ctx)
2844 {
2845 	struct nvme_bdev_io *bio = ctx;
2846 
2847 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
2848 }
2849 
2850 static void
2851 bdev_nvme_abort_completion(void *ctx)
2852 {
2853 	struct nvme_bdev_io *bio = ctx;
2854 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2855 
2856 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
2857 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2858 	} else {
2859 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2860 	}
2861 }
2862 
2863 static void
2864 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
2865 {
2866 	struct nvme_bdev_io *bio = ref;
2867 
2868 	bio->cpl = *cpl;
2869 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2870 }
2871 
2872 static void
2873 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2874 {
2875 	struct nvme_bdev_io *bio = ref;
2876 
2877 	bio->cpl = *cpl;
2878 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2879 }
2880 
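/* SGL callbacks used by the queued read/write/compare paths: reset_sgl seeks
 * to an absolute offset within the iovec array and next_sge hands back one
 * contiguous segment at a time.
 */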
2881 static void
2882 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2883 {
2884 	struct nvme_bdev_io *bio = ref;
2885 	struct iovec *iov;
2886 
2887 	bio->iov_offset = sgl_offset;
2888 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2889 		iov = &bio->iovs[bio->iovpos];
2890 		if (bio->iov_offset < iov->iov_len) {
2891 			break;
2892 		}
2893 
2894 		bio->iov_offset -= iov->iov_len;
2895 	}
2896 }
2897 
2898 static int
2899 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2900 {
2901 	struct nvme_bdev_io *bio = ref;
2902 	struct iovec *iov;
2903 
2904 	assert(bio->iovpos < bio->iovcnt);
2905 
2906 	iov = &bio->iovs[bio->iovpos];
2907 
2908 	*address = iov->iov_base;
2909 	*length = iov->iov_len;
2910 
2911 	if (bio->iov_offset) {
2912 		assert(bio->iov_offset <= iov->iov_len);
2913 		*address += bio->iov_offset;
2914 		*length -= bio->iov_offset;
2915 	}
2916 
2917 	bio->iov_offset += *length;
2918 	if (bio->iov_offset == iov->iov_len) {
2919 		bio->iovpos++;
2920 		bio->iov_offset = 0;
2921 	}
2922 
2923 	return 0;
2924 }
2925 
2926 static void
2927 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2928 {
2929 	struct nvme_bdev_io *bio = ref;
2930 	struct iovec *iov;
2931 
2932 	bio->fused_iov_offset = sgl_offset;
2933 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2934 		iov = &bio->fused_iovs[bio->fused_iovpos];
2935 		if (bio->fused_iov_offset < iov->iov_len) {
2936 			break;
2937 		}
2938 
2939 		bio->fused_iov_offset -= iov->iov_len;
2940 	}
2941 }
2942 
2943 static int
2944 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2945 {
2946 	struct nvme_bdev_io *bio = ref;
2947 	struct iovec *iov;
2948 
2949 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2950 
2951 	iov = &bio->fused_iovs[bio->fused_iovpos];
2952 
2953 	*address = iov->iov_base;
2954 	*length = iov->iov_len;
2955 
2956 	if (bio->fused_iov_offset) {
2957 		assert(bio->fused_iov_offset <= iov->iov_len);
2958 		*address += bio->fused_iov_offset;
2959 		*length -= bio->fused_iov_offset;
2960 	}
2961 
2962 	bio->fused_iov_offset += *length;
2963 	if (bio->fused_iov_offset == iov->iov_len) {
2964 		bio->fused_iovpos++;
2965 		bio->fused_iov_offset = 0;
2966 	}
2967 
2968 	return 0;
2969 }
2970 
2971 static int
2972 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2973 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2974 		      void *md, uint64_t lba_count, uint64_t lba)
2975 {
2976 	int rc;
2977 
2978 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
2979 		      lba_count, lba);
2980 
2981 	bio->iovs = iov;
2982 	bio->iovcnt = iovcnt;
2983 	bio->iovpos = 0;
2984 	bio->iov_offset = 0;
2985 
2986 	rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2987 					    bdev_nvme_no_pi_readv_done, bio, 0,
2988 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2989 					    md, 0, 0);
2990 
2991 	if (rc != 0 && rc != -ENOMEM) {
2992 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2993 	}
2994 	return rc;
2995 }
2996 
2997 static int
2998 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2999 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
3000 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
3001 {
3002 	int rc;
3003 
3004 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3005 		      lba_count, lba);
3006 
3007 	bio->iovs = iov;
3008 	bio->iovcnt = iovcnt;
3009 	bio->iovpos = 0;
3010 	bio->iov_offset = 0;
3011 
3012 	if (iovcnt == 1) {
3013 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
3014 						   lba_count,
3015 						   bdev_nvme_readv_done, bio,
3016 						   flags,
3017 						   0, 0);
3018 	} else {
3019 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
3020 						    bdev_nvme_readv_done, bio, flags,
3021 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3022 						    md, 0, 0);
3023 	}
3024 
3025 	if (rc != 0 && rc != -ENOMEM) {
3026 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
3027 	}
3028 	return rc;
3029 }
3030 
3031 static int
3032 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3033 		 struct nvme_bdev_io *bio,
3034 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
3035 		 uint32_t flags)
3036 {
3037 	int rc;
3038 
3039 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3040 		      lba_count, lba);
3041 
3042 	bio->iovs = iov;
3043 	bio->iovcnt = iovcnt;
3044 	bio->iovpos = 0;
3045 	bio->iov_offset = 0;
3046 
3047 	if (iovcnt == 1) {
3048 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
3049 						    lba_count,
3050 						    bdev_nvme_writev_done, bio,
3051 						    flags,
3052 						    0, 0);
3053 	} else {
3054 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
3055 						     bdev_nvme_writev_done, bio, flags,
3056 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3057 						     md, 0, 0);
3058 	}
3059 
3060 	if (rc != 0 && rc != -ENOMEM) {
3061 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
3062 	}
3063 	return rc;
3064 }
3065 
3066 static int
3067 bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3068 		       struct nvme_bdev_io *bio,
3069 		       struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t zslba,
3070 		       uint32_t flags)
3071 {
3072 	int rc;
3073 
3074 	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
3075 		      lba_count, zslba);
3076 
3077 	bio->iovs = iov;
3078 	bio->iovcnt = iovcnt;
3079 	bio->iovpos = 0;
3080 	bio->iov_offset = 0;
3081 
3082 	if (iovcnt == 1) {
3083 		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
3084 						       lba_count,
3085 						       bdev_nvme_zone_appendv_done, bio,
3086 						       flags,
3087 						       0, 0);
3088 	} else {
3089 		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
3090 							bdev_nvme_zone_appendv_done, bio, flags,
3091 							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3092 							md, 0, 0);
3093 	}
3094 
3095 	if (rc != 0 && rc != -ENOMEM) {
3096 		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
3097 	}
3098 	return rc;
3099 }
3100 
3101 static int
3102 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3103 		   struct nvme_bdev_io *bio,
3104 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
3105 		   uint32_t flags)
3106 {
3107 	int rc;
3108 
3109 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3110 		      lba_count, lba);
3111 
3112 	bio->iovs = iov;
3113 	bio->iovcnt = iovcnt;
3114 	bio->iovpos = 0;
3115 	bio->iov_offset = 0;
3116 
3117 	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
3118 					       bdev_nvme_comparev_done, bio, flags,
3119 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3120 					       md, 0, 0);
3121 
3122 	if (rc != 0 && rc != -ENOMEM) {
3123 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
3124 	}
3125 	return rc;
3126 }
3127 
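/* Submit a fused compare-and-write: the compare is sent with FUSE_FIRST and,
 * once it has been queued, the write is sent with FUSE_SECOND. On a retry
 * the compare is only resubmitted if it was not submitted before.
 */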
3128 static int
3129 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3130 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
3131 			      struct iovec *write_iov, int write_iovcnt,
3132 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
3133 {
3134 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3135 	int rc;
3136 
3137 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3138 		      lba_count, lba);
3139 
3140 	bio->iovs = cmp_iov;
3141 	bio->iovcnt = cmp_iovcnt;
3142 	bio->iovpos = 0;
3143 	bio->iov_offset = 0;
3144 	bio->fused_iovs = write_iov;
3145 	bio->fused_iovcnt = write_iovcnt;
3146 	bio->fused_iovpos = 0;
3147 	bio->fused_iov_offset = 0;
3148 
3149 	if (bdev_io->num_retries == 0) {
3150 		bio->first_fused_submitted = false;
3151 	}
3152 
3153 	if (!bio->first_fused_submitted) {
3154 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
3155 		memset(&bio->cpl, 0, sizeof(bio->cpl));
3156 
3157 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
3158 						       bdev_nvme_comparev_and_writev_done, bio, flags,
3159 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
3160 		if (rc == 0) {
3161 			bio->first_fused_submitted = true;
3162 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
3163 		} else {
3164 			if (rc != -ENOMEM) {
3165 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
3166 			}
3167 			return rc;
3168 		}
3169 	}
3170 
3171 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
3172 
3173 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
3174 					     bdev_nvme_comparev_and_writev_done, bio, flags,
3175 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
3176 	if (rc != 0 && rc != -ENOMEM) {
3177 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
3178 		rc = 0;
3179 	}
3180 
3181 	return rc;
3182 }
3183 
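/* Deallocate a block range using Dataset Management. The range is split
 * into chunks of at most SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS,
 * and the request is rejected if it would need more ranges than a single
 * command can carry.
 */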
3184 static int
3185 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3186 		struct nvme_bdev_io *bio,
3187 		uint64_t offset_blocks,
3188 		uint64_t num_blocks)
3189 {
3190 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
3191 	struct spdk_nvme_dsm_range *range;
3192 	uint64_t offset, remaining;
3193 	uint64_t num_ranges_u64;
3194 	uint16_t num_ranges;
3195 	int rc;
3196 
3197 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
3198 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3199 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
3200 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
3201 		return -EINVAL;
3202 	}
3203 	num_ranges = (uint16_t)num_ranges_u64;
3204 
3205 	offset = offset_blocks;
3206 	remaining = num_blocks;
3207 	range = &dsm_ranges[0];
3208 
3209 	/* Fill max-size ranges until the remaining blocks fit into one range */
3210 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
3211 		range->attributes.raw = 0;
3212 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3213 		range->starting_lba = offset;
3214 
3215 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3216 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3217 		range++;
3218 	}
3219 
3220 	/* Final range describes the remaining blocks */
3221 	range->attributes.raw = 0;
3222 	range->length = remaining;
3223 	range->starting_lba = offset;
3224 
3225 	rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair,
3226 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
3227 			dsm_ranges, num_ranges,
3228 			bdev_nvme_queued_done, bio);
3229 
3230 	return rc;
3231 }
3232 
3233 static int
3234 bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3235 			struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
3236 			struct spdk_bdev_zone_info *info)
3237 {
3238 	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
3239 	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
3240 	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
3241 
3242 	if (zone_id % zone_size != 0) {
3243 		return -EINVAL;
3244 	}
3245 
3246 	if (num_zones > total_zones || !num_zones) {
3247 		return -EINVAL;
3248 	}
3249 
3250 	assert(!bio->zone_report_buf);
3251 	bio->zone_report_buf = calloc(1, zone_report_bufsize);
3252 	if (!bio->zone_report_buf) {
3253 		return -ENOMEM;
3254 	}
3255 
3256 	bio->handled_zones = 0;
3257 
3258 	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
3259 					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
3260 					  bdev_nvme_get_zone_info_done, bio);
3261 }
3262 
3263 static int
3264 bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3265 			  struct nvme_bdev_io *bio, uint64_t zone_id,
3266 			  enum spdk_bdev_zone_action action)
3267 {
3268 	switch (action) {
3269 	case SPDK_BDEV_ZONE_CLOSE:
3270 		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
3271 						bdev_nvme_zone_management_done, bio);
3272 	case SPDK_BDEV_ZONE_FINISH:
3273 		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
3274 						 bdev_nvme_zone_management_done, bio);
3275 	case SPDK_BDEV_ZONE_OPEN:
3276 		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
3277 					       bdev_nvme_zone_management_done, bio);
3278 	case SPDK_BDEV_ZONE_RESET:
3279 		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
3280 						bdev_nvme_zone_management_done, bio);
3281 	case SPDK_BDEV_ZONE_OFFLINE:
3282 		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
3283 						  bdev_nvme_zone_management_done, bio);
3284 	default:
3285 		return -EINVAL;
3286 	}
3287 }
3288 
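/* Submit an admin passthrough command. The completion arrives on the thread
 * that polls the admin queue, so the originating thread is remembered and
 * the completion is bounced back to it with spdk_thread_send_msg().
 */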
3289 static int
3290 bdev_nvme_admin_passthru(struct nvme_io_path *io_path, struct nvme_bdev_io *bio,
3291 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
3292 {
3293 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
3294 	uint32_t max_xfer_size;
3295 
3296 	if (!bdev_nvme_find_admin_path(io_path, &nvme_bdev_ctrlr)) {
3297 		return -EINVAL;
3298 	}
3299 
3300 	max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_bdev_ctrlr->ctrlr);
3301 
3302 	if (nbytes > max_xfer_size) {
3303 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3304 		return -EINVAL;
3305 	}
3306 
3307 	bio->orig_thread = spdk_get_thread();
3308 
3309 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_bdev_ctrlr->ctrlr, cmd, buf,
3310 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
3311 }
3312 
3313 static int
3314 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3315 		      struct nvme_bdev_io *bio,
3316 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
3317 {
3318 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
3319 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3320 
3321 	if (nbytes > max_xfer_size) {
3322 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3323 		return -EINVAL;
3324 	}
3325 
3326 	/*
3327 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
3328 	 * so fill it out automatically.
3329 	 */
3330 	cmd->nsid = spdk_nvme_ns_get_id(ns);
3331 
3332 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
3333 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
3334 }
3335 
3336 static int
3337 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3338 			 struct nvme_bdev_io *bio,
3339 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
3340 {
3341 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
3342 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
3343 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3344 
3345 	if (nbytes > max_xfer_size) {
3346 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3347 		return -EINVAL;
3348 	}
3349 
3350 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
3351 		SPDK_ERRLOG("invalid metadata buffer size\n");
3352 		return -EINVAL;
3353 	}
3354 
3355 	/*
3356 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
3357 	 * so fill it out automatically.
3358 	 */
3359 	cmd->nsid = spdk_nvme_ns_get_id(ns);
3360 
3361 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
3362 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
3363 }
3364 
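/* Abort a previously submitted command, trying the I/O qpair first and then
 * the admin queue before giving up and failing the abort locally.
 */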
3365 static int
3366 bdev_nvme_abort(struct nvme_io_path *io_path, struct nvme_bdev_io *bio,
3367 		struct nvme_bdev_io *bio_to_abort)
3368 {
3369 	int rc;
3370 
3371 	bio->orig_thread = spdk_get_thread();
3372 
3373 	rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->ctrlr->ctrlr,
3374 					   io_path->qpair,
3375 					   bio_to_abort,
3376 					   bdev_nvme_abort_done, bio);
3377 	if (rc == -ENOENT) {
3378 		/* If no command was found in I/O qpair, the target command may be
3379 		/* If no command was found in the I/O qpair, the target command may be
3380 		 * an admin command.
3381 		rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->ctrlr->ctrlr,
3382 						   NULL,
3383 						   bio_to_abort,
3384 						   bdev_nvme_abort_done, bio);
3385 	}
3386 
3387 	if (rc == -ENOENT) {
3388 		/* If no command was found, complete the abort request with failure. */
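		/* Setting bit 0 of cdw0 indicates the command was not aborted, while the
		 * successful status means the abort command itself completed, so the
		 * completion path reports a failed abort rather than an abort error.
		 */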
3389 		bio->cpl.cdw0 |= 1U;
3390 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
3391 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
3392 
3393 		bdev_nvme_abort_completion(bio);
3394 
3395 		rc = 0;
3396 	}
3397 
3398 	return rc;
3399 }
3400 
3401 static void
3402 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
3403 		struct nvme_bdev_ns *nvme_ns)
3404 {
3405 	/* nop */
3406 }
3407 
3408 static void
3409 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns)
3410 {
3411 	g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns);
3412 }
3413 
3414 static void
3415 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
3416 {
3417 	const char	*action;
3418 
3419 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
3420 		action = "reset";
3421 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
3422 		action = "abort";
3423 	} else {
3424 		action = "none";
3425 	}
3426 
3427 	spdk_json_write_object_begin(w);
3428 
3429 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
3430 
3431 	spdk_json_write_named_object_begin(w, "params");
3432 	spdk_json_write_named_string(w, "action_on_timeout", action);
3433 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
3434 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
3435 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
3436 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
3437 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
3438 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
3439 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
3440 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
3441 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
3442 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
3443 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
3444 	spdk_json_write_object_end(w);
3445 
3446 	spdk_json_write_object_end(w);
3447 }
3448 
3449 static void
3450 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w,
3451 			    struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
3452 {
3453 	struct spdk_nvme_transport_id	*trid;
3454 
3455 	trid = nvme_bdev_ctrlr->connected_trid;
3456 
3457 	spdk_json_write_object_begin(w);
3458 
3459 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
3460 
3461 	spdk_json_write_named_object_begin(w, "params");
3462 	spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
3463 	nvme_bdev_dump_trid_json(trid, w);
3464 	spdk_json_write_named_bool(w, "prchk_reftag",
3465 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
3466 	spdk_json_write_named_bool(w, "prchk_guard",
3467 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
3468 
3469 	spdk_json_write_object_end(w);
3470 
3471 	spdk_json_write_object_end(w);
3472 }
3473 
3474 static void
3475 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
3476 {
3477 	spdk_json_write_object_begin(w);
3478 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
3479 
3480 	spdk_json_write_named_object_begin(w, "params");
3481 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
3482 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
3483 	spdk_json_write_object_end(w);
3484 
3485 	spdk_json_write_object_end(w);
3486 }
3487 
3488 static int
3489 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
3490 {
3491 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
3492 	uint32_t		nsid;
3493 
3494 	bdev_nvme_opts_config_json(w);
3495 
3496 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3497 
3498 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
3499 		nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr);
3500 
3501 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
3502 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
3503 				continue;
3504 			}
3505 
3506 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
3507 		}
3508 	}
3509 
3510 	/* Dump this last so that all NVMe bdevs have a chance to be constructed
3511 	 * before the hotplug poller is enabled.
3512 	 */
3513 	bdev_nvme_hotplug_config_json(w);
3514 
3515 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3516 	return 0;
3517 }
3518 
3519 struct spdk_nvme_ctrlr *
3520 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
3521 {
3522 	if (!bdev || bdev->module != &nvme_if) {
3523 		return NULL;
3524 	}
3525 
3526 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
3527 }
3528 
3529 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
3530