xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 975852a079578816478a906717d1cf45fc97ddf3)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/accel_engine.h"
40 #include "spdk/config.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/nvme.h"
45 #include "spdk/nvme_ocssd.h"
46 #include "spdk/nvme_zns.h"
47 #include "spdk/thread.h"
48 #include "spdk/string.h"
49 #include "spdk/util.h"
50 
51 #include "spdk/bdev_module.h"
52 #include "spdk/log.h"
53 
54 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
55 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
56 
57 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
58 
59 struct nvme_bdev_io {
60 	/** array of iovecs to transfer. */
61 	struct iovec *iovs;
62 
63 	/** Number of iovecs in iovs array. */
64 	int iovcnt;
65 
66 	/** Current iovec position. */
67 	int iovpos;
68 
69 	/** Offset in current iovec. */
70 	uint32_t iov_offset;
71 
72 	/** array of iovecs to transfer for the fused command. */
73 	struct iovec *fused_iovs;
74 
75 	/** Number of iovecs in fused_iovs array. */
76 	int fused_iovcnt;
77 
78 	/** Current iovec position. */
79 	int fused_iovpos;
80 
81 	/** Offset in current iovec. */
82 	uint32_t fused_iov_offset;
83 
84 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
85 	struct spdk_nvme_cpl cpl;
86 
87 	/** Originating thread */
88 	struct spdk_thread *orig_thread;
89 
90 	/** Keeps track of whether the first of the fused commands was submitted */
91 	bool first_fused_submitted;
92 
93 	/** Temporary pointer to zone report buffer */
94 	struct spdk_nvme_zns_zone_report *zone_report_buf;
95 
96 	/** Keep track of how many zones have been copied to the spdk_bdev_zone_info struct */
97 	uint64_t handled_zones;
98 };
99 
100 struct nvme_probe_ctx {
101 	size_t count;
102 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
103 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
104 	const char *names[NVME_MAX_CONTROLLERS];
105 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
106 	const char *hostnqn;
107 };
108 
109 struct nvme_probe_skip_entry {
110 	struct spdk_nvme_transport_id		trid;
111 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
112 };
113 /* All the controllers deleted by users via RPC are skipped by the hotplug monitor */
114 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
115 			g_skipped_nvme_ctrlrs);
116 
117 static struct spdk_bdev_nvme_opts g_opts = {
118 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
119 	.timeout_us = 0,
120 	.timeout_admin_us = 0,
121 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
122 	.retry_count = 4,
123 	.arbitration_burst = 0,
124 	.low_priority_weight = 0,
125 	.medium_priority_weight = 0,
126 	.high_priority_weight = 0,
127 	.nvme_adminq_poll_period_us = 10000ULL,
128 	.nvme_ioq_poll_period_us = 0,
129 	.io_queue_requests = 0,
130 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
131 };
132 
133 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
134 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
135 
136 static int g_hot_insert_nvme_controller_index = 0;
137 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
138 static bool g_nvme_hotplug_enabled = false;
139 static struct spdk_thread *g_bdev_nvme_init_thread;
140 static struct spdk_poller *g_hotplug_poller;
141 static struct spdk_poller *g_hotplug_probe_poller;
142 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
143 
144 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
145 		struct nvme_async_probe_ctx *ctx);
146 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
147 		struct nvme_async_probe_ctx *ctx);
148 static int bdev_nvme_library_init(void);
149 static void bdev_nvme_library_fini(void);
150 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
151 			   struct nvme_bdev_io *bio,
152 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
153 			   uint32_t flags);
154 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
155 				 struct nvme_bdev_io *bio,
156 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
157 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
158 			    struct nvme_bdev_io *bio,
159 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
160 			    uint32_t flags);
161 static int bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
162 				  struct nvme_bdev_io *bio,
163 				  struct iovec *iov, int iovcnt, void *md, uint64_t lba_count,
164 				  uint64_t zslba, uint32_t flags);
165 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
166 			      struct nvme_bdev_io *bio,
167 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
168 			      uint32_t flags);
169 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
170 		struct spdk_nvme_qpair *qpair,
171 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
172 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
173 		uint32_t flags);
174 static int bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
175 				   struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
176 				   struct spdk_bdev_zone_info *info);
177 static int bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
178 				     struct nvme_bdev_io *bio, uint64_t zone_id,
179 				     enum spdk_bdev_zone_action action);
180 static int bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
181 				    struct nvme_bdev_io *bio,
182 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
183 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
184 				 struct nvme_bdev_io *bio,
185 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
186 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
187 				    struct nvme_bdev_io *bio,
188 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
189 static int bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
190 			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
191 static int bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
192 static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
193 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
194 
195 typedef void (*populate_namespace_fn)(struct nvme_ctrlr *nvme_ctrlr,
196 				      struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
197 static void nvme_ctrlr_populate_standard_namespace(struct nvme_ctrlr *nvme_ctrlr,
198 		struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
199 
200 static populate_namespace_fn g_populate_namespace_fn[] = {
201 	NULL,
202 	nvme_ctrlr_populate_standard_namespace,
203 	bdev_ocssd_populate_namespace,
204 };
205 
206 typedef void (*depopulate_namespace_fn)(struct nvme_ns *nvme_ns);
207 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_ns *nvme_ns);
208 
209 static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
210 	NULL,
211 	nvme_ctrlr_depopulate_standard_namespace,
212 	bdev_ocssd_depopulate_namespace,
213 };
214 
215 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
216 		struct nvme_ns *nvme_ns);
217 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
218 		struct nvme_ns *nvme_ns);
219 
220 static config_json_namespace_fn g_config_json_namespace_fn[] = {
221 	NULL,
222 	nvme_ctrlr_config_json_standard_namespace,
223 	bdev_ocssd_namespace_config_json,
224 };
225 
226 struct spdk_nvme_qpair *
227 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
228 {
229 	struct nvme_ctrlr_channel *ctrlr_ch;
230 
231 	assert(ctrlr_io_ch != NULL);
232 
233 	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
234 
235 	return ctrlr_ch->qpair;
236 }
237 
238 static int
239 bdev_nvme_get_ctx_size(void)
240 {
241 	return sizeof(struct nvme_bdev_io);
242 }
243 
244 static struct spdk_bdev_module nvme_if = {
245 	.name = "nvme",
246 	.async_fini = true,
247 	.module_init = bdev_nvme_library_init,
248 	.module_fini = bdev_nvme_library_fini,
249 	.config_json = bdev_nvme_config_json,
250 	.get_ctx_size = bdev_nvme_get_ctx_size,
251 
252 };
253 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
254 
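/* Look up the namespace and I/O qpair for this bdev channel. Returns false if the
 * qpair is currently gone, e.g. while the controller is being reset. */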
255 static inline bool
256 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch,
257 		       struct spdk_nvme_ns **_ns, struct spdk_nvme_qpair **_qpair)
258 {
259 	if (spdk_unlikely(nbdev_ch->ctrlr_ch->qpair == NULL)) {
260 		/* The device is currently resetting. */
261 		return false;
262 	}
263 
264 	*_ns = nbdev_ch->nvme_ns->ns;
265 	*_qpair = nbdev_ch->ctrlr_ch->qpair;
266 	return true;
267 }
268 
269 static inline bool
270 bdev_nvme_find_admin_path(struct nvme_bdev_channel *nbdev_ch,
271 			  struct nvme_ctrlr **_nvme_ctrlr)
272 {
273 	*_nvme_ctrlr = nbdev_ch->ctrlr_ch->ctrlr;
274 	return true;
275 }
276 
277 static inline void
278 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
279 				  const struct spdk_nvme_cpl *cpl)
280 {
281 	spdk_bdev_io_complete_nvme_status(spdk_bdev_io_from_ctx(bio), cpl->cdw0,
282 					  cpl->status.sct, cpl->status.sc);
283 }
284 
285 static inline void
286 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
287 {
288 	enum spdk_bdev_io_status io_status;
289 
290 	if (rc == 0) {
291 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
292 	} else if (rc == -ENOMEM) {
293 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
294 	} else {
295 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
296 	}
297 
298 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
299 }
300 
301 static void
302 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
303 {
304 	int rc;
305 
306 	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
307 	/*
308 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
309 	 * reconnect a qpair and we will stop getting a callback for this one.
310 	 */
311 	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
312 	if (rc != 0) {
313 		SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
314 	}
315 }
316 
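/* I/O poller for a poll group: process completions on all qpairs in the group and,
 * when enabled, collect spin-time statistics. */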
317 static int
318 bdev_nvme_poll(void *arg)
319 {
320 	struct nvme_poll_group *group = arg;
321 	int64_t num_completions;
322 
323 	if (group->collect_spin_stat && group->start_ticks == 0) {
324 		group->start_ticks = spdk_get_ticks();
325 	}
326 
327 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
328 			  bdev_nvme_disconnected_qpair_cb);
329 	if (group->collect_spin_stat) {
330 		if (num_completions > 0) {
331 			if (group->end_ticks != 0) {
332 				group->spin_ticks += (group->end_ticks - group->start_ticks);
333 				group->end_ticks = 0;
334 			}
335 			group->start_ticks = 0;
336 		} else {
337 			group->end_ticks = spdk_get_ticks();
338 		}
339 	}
340 
341 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
342 }
343 
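/* Admin queue poller: process admin completions and trigger a failover if processing fails. */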
344 static int
345 bdev_nvme_poll_adminq(void *arg)
346 {
347 	int32_t rc;
348 	struct nvme_ctrlr *nvme_ctrlr = arg;
349 
350 	assert(nvme_ctrlr != NULL);
351 
352 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
353 	if (rc < 0) {
354 		bdev_nvme_failover(nvme_ctrlr, false);
355 	}
356 
357 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
358 }
359 
360 static void
361 _bdev_nvme_unregister_dev_cb(void *io_device)
362 {
363 	struct nvme_bdev *nvme_disk = io_device;
364 
365 	free(nvme_disk->disk.name);
366 	free(nvme_disk);
367 }
368 
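/* bdev destruct callback: detach the bdev from its namespace, drop the controller
 * reference if the namespace was already depopulated, and unregister the io_device. */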
369 static int
370 bdev_nvme_destruct(void *ctx)
371 {
372 	struct nvme_bdev *nvme_disk = ctx;
373 	struct nvme_ns *nvme_ns = nvme_disk->nvme_ns;
374 
375 	pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
376 
377 	nvme_ns->bdev = NULL;
378 
379 	if (!nvme_ns->populated) {
380 		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
381 
382 		nvme_ctrlr_release(nvme_ns->ctrlr);
383 	} else {
384 		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
385 	}
386 
387 	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);
388 
389 	return 0;
390 }
391 
392 static int
393 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
394 		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
395 {
396 	bdev_nvme_io_complete(bio, 0);
397 
398 	return 0;
399 }
400 
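/* Allocate an I/O qpair for this controller channel, add it to the channel's poll group,
 * and connect it. */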
401 static int
402 bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
403 {
404 	struct spdk_nvme_ctrlr *ctrlr = ctrlr_ch->ctrlr->ctrlr;
405 	struct spdk_nvme_io_qpair_opts opts;
406 	struct spdk_nvme_qpair *qpair;
407 	int rc;
408 
409 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
410 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
411 	opts.create_only = true;
412 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
413 	g_opts.io_queue_requests = opts.io_queue_requests;
414 
415 	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
416 	if (qpair == NULL) {
417 		return -1;
418 	}
419 
420 	assert(ctrlr_ch->group != NULL);
421 
422 	rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair);
423 	if (rc != 0) {
424 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
425 		goto err;
426 	}
427 
428 	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
429 	if (rc != 0) {
430 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
431 		goto err;
432 	}
433 
434 	ctrlr_ch->qpair = qpair;
435 
436 	return 0;
437 
438 err:
439 	spdk_nvme_ctrlr_free_io_qpair(qpair);
440 
441 	return rc;
442 }
443 
444 static void
445 bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
446 {
447 	if (ctrlr_ch->qpair != NULL) {
448 		spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
449 		ctrlr_ch->qpair = NULL;
450 	}
451 }
452 
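/* If destruction was deferred until the reset finished, ask the controller's thread to
 * unregister it now. */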
453 static void
454 _bdev_nvme_check_pending_destruct(struct nvme_ctrlr *nvme_ctrlr)
455 {
456 	pthread_mutex_lock(&nvme_ctrlr->mutex);
457 	if (nvme_ctrlr->destruct_after_reset) {
458 		assert(nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct);
459 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
460 
461 		spdk_thread_send_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister,
462 				     nvme_ctrlr);
463 	} else {
464 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
465 	}
466 }
467 
468 static void
469 bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
470 {
471 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
472 
473 	_bdev_nvme_check_pending_destruct(nvme_ctrlr);
474 }
475 
476 static void
477 _bdev_nvme_complete_pending_resets(struct nvme_ctrlr_channel *ctrlr_ch,
478 				   enum spdk_bdev_io_status status)
479 {
480 	struct spdk_bdev_io *bdev_io;
481 
482 	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
483 		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
484 		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
485 		spdk_bdev_io_complete(bdev_io, status);
486 	}
487 }
488 
489 static void
490 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
491 {
492 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
493 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
494 
495 	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_SUCCESS);
496 
497 	spdk_for_each_channel_continue(i, 0);
498 }
499 
500 static void
501 bdev_nvme_abort_pending_resets(struct spdk_io_channel_iter *i)
502 {
503 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
504 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
505 
506 	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_FAILED);
507 
508 	spdk_for_each_channel_continue(i, 0);
509 }
510 
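/* Finish a controller reset: record the result, invoke the caller's reset callback, and
 * complete or abort any reset requests that were queued while the reset was in progress. */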
511 static void
512 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, int rc)
513 {
514 	struct nvme_ctrlr_trid *curr_trid;
515 	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
516 	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;
517 
518 	nvme_ctrlr->reset_cb_fn = NULL;
519 	nvme_ctrlr->reset_cb_arg = NULL;
520 
521 	if (rc) {
522 		SPDK_ERRLOG("Resetting controller failed.\n");
523 	} else {
524 		SPDK_NOTICELOG("Resetting controller was successful.\n");
525 	}
526 
527 	pthread_mutex_lock(&nvme_ctrlr->mutex);
528 	nvme_ctrlr->resetting = false;
529 	nvme_ctrlr->failover_in_progress = false;
530 
531 	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
532 	assert(curr_trid != NULL);
533 	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);
534 
535 	curr_trid->is_failed = rc != 0 ? true : false;
536 
537 	if (nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct) {
538 		/* Destruct ctrlr after clearing pending resets. */
539 		nvme_ctrlr->destruct_after_reset = true;
540 	}
541 
542 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
543 
544 	if (reset_cb_fn) {
545 		reset_cb_fn(reset_cb_arg, rc);
546 	}
547 
548 	/* Make sure we clear any pending resets before returning. */
549 	spdk_for_each_channel(nvme_ctrlr,
550 			      rc == 0 ? bdev_nvme_complete_pending_resets :
551 			      bdev_nvme_abort_pending_resets,
552 			      NULL,
553 			      bdev_nvme_check_pending_destruct);
554 }
555 
556 static void
557 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
558 {
559 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
560 
561 	bdev_nvme_reset_complete(nvme_ctrlr, status);
562 }
563 
564 static void
565 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
566 {
567 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
568 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
569 	int rc;
570 
571 	rc = bdev_nvme_create_qpair(ctrlr_ch);
572 
573 	spdk_for_each_channel_continue(i, rc);
574 }
575 
576 static int
577 bdev_nvme_ctrlr_reset_poll(void *arg)
578 {
579 	struct nvme_ctrlr *nvme_ctrlr = arg;
580 	int rc;
581 
582 	rc = spdk_nvme_ctrlr_reset_poll_async(nvme_ctrlr->reset_ctx);
583 	if (rc == -EAGAIN) {
584 		return SPDK_POLLER_BUSY;
585 	}
586 
587 	spdk_poller_unregister(&nvme_ctrlr->reset_poller);
588 	if (rc == 0) {
589 		/* Recreate all of the I/O queue pairs */
590 		spdk_for_each_channel(nvme_ctrlr,
591 				      bdev_nvme_reset_create_qpair,
592 				      NULL,
593 				      bdev_nvme_reset_create_qpairs_done);
594 	} else {
595 		bdev_nvme_reset_complete(nvme_ctrlr, rc);
596 	}
597 	return SPDK_POLLER_BUSY;
598 }
599 
600 static void
601 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
602 {
603 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
604 	int rc;
605 
606 	if (status) {
607 		rc = status;
608 		goto err;
609 	}
610 
611 	rc = spdk_nvme_ctrlr_reset_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->reset_ctx);
612 	if (rc != 0) {
613 		SPDK_ERRLOG("Failed to create controller reset context\n");
614 		goto err;
615 	}
616 	assert(nvme_ctrlr->reset_poller == NULL);
617 	nvme_ctrlr->reset_poller = SPDK_POLLER_REGISTER(bdev_nvme_ctrlr_reset_poll,
618 				   nvme_ctrlr, 0);
619 
620 	return;
621 
622 err:
623 	bdev_nvme_reset_complete(nvme_ctrlr, rc);
624 }
625 
626 static void
627 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
628 {
629 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
630 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
631 
632 	bdev_nvme_destroy_qpair(ctrlr_ch);
633 	spdk_for_each_channel_continue(i, 0);
634 }
635 
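/* Start an asynchronous controller reset: destroy the I/O qpairs on every channel, reset
 * the controller, then recreate the qpairs. */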
636 static int
637 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
638 {
639 	pthread_mutex_lock(&nvme_ctrlr->mutex);
640 	if (nvme_ctrlr->destruct) {
641 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
642 		return -EBUSY;
643 	}
644 
645 	if (nvme_ctrlr->resetting) {
646 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
647 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
648 		return -EAGAIN;
649 	}
650 
651 	nvme_ctrlr->resetting = true;
652 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
653 
654 	/* First, delete all NVMe I/O queue pairs. */
655 	spdk_for_each_channel(nvme_ctrlr,
656 			      bdev_nvme_reset_destroy_qpair,
657 			      NULL,
658 			      bdev_nvme_reset_ctrlr);
659 
660 	return 0;
661 }
662 
663 int
664 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
665 {
666 	int rc;
667 
668 	rc = bdev_nvme_reset(nvme_ctrlr);
669 	if (rc == 0) {
670 		nvme_ctrlr->reset_cb_fn = cb_fn;
671 		nvme_ctrlr->reset_cb_arg = cb_arg;
672 	}
673 	return rc;
674 }
675 
676 static void
677 bdev_nvme_reset_io_complete(void *cb_arg, int rc)
678 {
679 	struct nvme_bdev_io *bio = cb_arg;
680 
681 	bdev_nvme_io_complete(bio, rc);
682 }
683 
684 static int
685 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
686 {
687 	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
688 	struct spdk_bdev_io *bdev_io;
689 	int rc;
690 
691 	rc = bdev_nvme_reset(ctrlr_ch->ctrlr);
692 	if (rc == 0) {
693 		assert(ctrlr_ch->ctrlr->reset_cb_fn == NULL);
694 		assert(ctrlr_ch->ctrlr->reset_cb_arg == NULL);
695 		ctrlr_ch->ctrlr->reset_cb_fn = bdev_nvme_reset_io_complete;
696 		ctrlr_ch->ctrlr->reset_cb_arg = bio;
697 	} else if (rc == -EAGAIN) {
698 		/*
699 		 * Reset call is queued only if it is from the app framework. This is on purpose so that
700 		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
701 		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
702 		 */
703 		bdev_io = spdk_bdev_io_from_ctx(bio);
704 		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
705 	} else {
706 		return rc;
707 	}
708 
709 	return 0;
710 }
711 
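/* Prepare a failover: mark the current path as failed and, if an alternate trid is
 * available, switch the controller over to it. The caller drives the actual reset. */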
712 static int
713 bdev_nvme_failover_start(struct nvme_ctrlr *nvme_ctrlr, bool remove)
714 {
715 	struct nvme_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
716 	int rc;
717 
718 	pthread_mutex_lock(&nvme_ctrlr->mutex);
719 	if (nvme_ctrlr->destruct) {
720 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
721 		/* Don't bother resetting if the controller is in the process of being destructed. */
722 		return -EBUSY;
723 	}
724 
725 	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
726 	assert(curr_trid);
727 	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);
728 	next_trid = TAILQ_NEXT(curr_trid, link);
729 
730 	if (nvme_ctrlr->resetting) {
731 		if (next_trid && !nvme_ctrlr->failover_in_progress) {
732 			rc = -EAGAIN;
733 		} else {
734 			rc = -EBUSY;
735 		}
736 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
737 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
738 		return rc;
739 	}
740 
741 	nvme_ctrlr->resetting = true;
742 	curr_trid->is_failed = true;
743 
744 	if (next_trid) {
745 		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
746 
747 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
748 			       curr_trid->trid.trsvcid,	next_trid->trid.traddr, next_trid->trid.trsvcid);
749 
750 		nvme_ctrlr->failover_in_progress = true;
751 		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
752 		nvme_ctrlr->connected_trid = &next_trid->trid;
753 		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_trid->trid);
754 		assert(rc == 0);
755 		TAILQ_REMOVE(&nvme_ctrlr->trids, curr_trid, link);
756 		if (!remove) {
757 			/** Shuffle the old trid to the end of the list and use the new one.
758 			 * Allows for round robin through multiple connections.
759 			 */
760 			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, curr_trid, link);
761 		} else {
762 			free(curr_trid);
763 		}
764 	}
765 
766 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
767 	return 0;
768 }
769 
770 static int
771 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
772 {
773 	int rc;
774 
775 	rc = bdev_nvme_failover_start(nvme_ctrlr, remove);
776 	if (rc == 0) {
777 		/* First, delete all NVMe I/O queue pairs. */
778 		spdk_for_each_channel(nvme_ctrlr,
779 				      bdev_nvme_reset_destroy_qpair,
780 				      NULL,
781 				      bdev_nvme_reset_ctrlr);
782 	} else if (rc != -EBUSY) {
783 		return rc;
784 	}
785 
786 	return 0;
787 }
788 
789 static int
790 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
791 		struct nvme_bdev_io *bio,
792 		uint64_t offset_blocks,
793 		uint64_t num_blocks);
794 
795 static int
796 bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
797 		       struct nvme_bdev_io *bio,
798 		       uint64_t offset_blocks,
799 		       uint64_t num_blocks);
800 
801 static void
802 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
803 		     bool success)
804 {
805 	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
806 	struct spdk_bdev *bdev = bdev_io->bdev;
807 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
808 	struct spdk_nvme_ns *ns;
809 	struct spdk_nvme_qpair *qpair;
810 	int ret;
811 
812 	if (!success) {
813 		ret = -EINVAL;
814 		goto exit;
815 	}
816 
817 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
818 		ret = -ENXIO;
819 		goto exit;
820 	}
821 
822 	ret = bdev_nvme_readv(ns,
823 			      qpair,
824 			      bio,
825 			      bdev_io->u.bdev.iovs,
826 			      bdev_io->u.bdev.iovcnt,
827 			      bdev_io->u.bdev.md_buf,
828 			      bdev_io->u.bdev.num_blocks,
829 			      bdev_io->u.bdev.offset_blocks,
830 			      bdev->dif_check_flags);
831 
832 exit:
833 	if (spdk_unlikely(ret != 0)) {
834 		bdev_nvme_io_complete(bio, ret);
835 	}
836 }
837 
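/* Main I/O dispatch routine: translate a generic bdev I/O request into the matching
 * NVMe command submission. */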
838 static void
839 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
840 {
841 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
842 	struct spdk_bdev *bdev = bdev_io->bdev;
843 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
844 	struct nvme_bdev_io *nbdev_io_to_abort;
845 	struct spdk_nvme_ns *ns;
846 	struct spdk_nvme_qpair *qpair;
847 	int rc = 0;
848 
849 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
850 		rc = -ENXIO;
851 		goto exit;
852 	}
853 
854 	switch (bdev_io->type) {
855 	case SPDK_BDEV_IO_TYPE_READ:
856 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
857 			rc = bdev_nvme_readv(ns,
858 					     qpair,
859 					     nbdev_io,
860 					     bdev_io->u.bdev.iovs,
861 					     bdev_io->u.bdev.iovcnt,
862 					     bdev_io->u.bdev.md_buf,
863 					     bdev_io->u.bdev.num_blocks,
864 					     bdev_io->u.bdev.offset_blocks,
865 					     bdev->dif_check_flags);
866 		} else {
867 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
868 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
869 			rc = 0;
870 		}
871 		break;
872 	case SPDK_BDEV_IO_TYPE_WRITE:
873 		rc = bdev_nvme_writev(ns,
874 				      qpair,
875 				      nbdev_io,
876 				      bdev_io->u.bdev.iovs,
877 				      bdev_io->u.bdev.iovcnt,
878 				      bdev_io->u.bdev.md_buf,
879 				      bdev_io->u.bdev.num_blocks,
880 				      bdev_io->u.bdev.offset_blocks,
881 				      bdev->dif_check_flags);
882 		break;
883 	case SPDK_BDEV_IO_TYPE_COMPARE:
884 		rc = bdev_nvme_comparev(ns,
885 					qpair,
886 					nbdev_io,
887 					bdev_io->u.bdev.iovs,
888 					bdev_io->u.bdev.iovcnt,
889 					bdev_io->u.bdev.md_buf,
890 					bdev_io->u.bdev.num_blocks,
891 					bdev_io->u.bdev.offset_blocks,
892 					bdev->dif_check_flags);
893 		break;
894 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
895 		rc = bdev_nvme_comparev_and_writev(ns,
896 						   qpair,
897 						   nbdev_io,
898 						   bdev_io->u.bdev.iovs,
899 						   bdev_io->u.bdev.iovcnt,
900 						   bdev_io->u.bdev.fused_iovs,
901 						   bdev_io->u.bdev.fused_iovcnt,
902 						   bdev_io->u.bdev.md_buf,
903 						   bdev_io->u.bdev.num_blocks,
904 						   bdev_io->u.bdev.offset_blocks,
905 						   bdev->dif_check_flags);
906 		break;
907 	case SPDK_BDEV_IO_TYPE_UNMAP:
908 		rc = bdev_nvme_unmap(ns,
909 				     qpair,
910 				     nbdev_io,
911 				     bdev_io->u.bdev.offset_blocks,
912 				     bdev_io->u.bdev.num_blocks);
913 		break;
914 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
915 		rc = bdev_nvme_write_zeroes(ns, qpair,
916 					     nbdev_io,
917 					     bdev_io->u.bdev.offset_blocks,
918 					     bdev_io->u.bdev.num_blocks);
919 		break;
920 	case SPDK_BDEV_IO_TYPE_RESET:
921 		rc = bdev_nvme_reset_io(nbdev_ch, nbdev_io);
922 		break;
923 	case SPDK_BDEV_IO_TYPE_FLUSH:
924 		rc = bdev_nvme_flush(ns,
925 				     qpair,
926 				     nbdev_io,
927 				     bdev_io->u.bdev.offset_blocks,
928 				     bdev_io->u.bdev.num_blocks);
929 		break;
930 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
931 		rc = bdev_nvme_zone_appendv(ns,
932 					    qpair,
933 					    nbdev_io,
934 					    bdev_io->u.bdev.iovs,
935 					    bdev_io->u.bdev.iovcnt,
936 					    bdev_io->u.bdev.md_buf,
937 					    bdev_io->u.bdev.num_blocks,
938 					    bdev_io->u.bdev.offset_blocks,
939 					    bdev->dif_check_flags);
940 		break;
941 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
942 		rc = bdev_nvme_get_zone_info(ns,
943 					     qpair,
944 					     nbdev_io,
945 					     bdev_io->u.zone_mgmt.zone_id,
946 					     bdev_io->u.zone_mgmt.num_zones,
947 					     bdev_io->u.zone_mgmt.buf);
948 		break;
949 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
950 		rc = bdev_nvme_zone_management(ns,
951 					       qpair,
952 					       nbdev_io,
953 					       bdev_io->u.zone_mgmt.zone_id,
954 					       bdev_io->u.zone_mgmt.zone_action);
955 		break;
956 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
957 		rc = bdev_nvme_admin_passthru(nbdev_ch,
958 					      nbdev_io,
959 					      &bdev_io->u.nvme_passthru.cmd,
960 					      bdev_io->u.nvme_passthru.buf,
961 					      bdev_io->u.nvme_passthru.nbytes);
962 		break;
963 	case SPDK_BDEV_IO_TYPE_NVME_IO:
964 		rc = bdev_nvme_io_passthru(ns,
965 					   qpair,
966 					   nbdev_io,
967 					   &bdev_io->u.nvme_passthru.cmd,
968 					   bdev_io->u.nvme_passthru.buf,
969 					   bdev_io->u.nvme_passthru.nbytes);
970 		break;
971 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
972 		rc = bdev_nvme_io_passthru_md(ns,
973 					      qpair,
974 					      nbdev_io,
975 					      &bdev_io->u.nvme_passthru.cmd,
976 					      bdev_io->u.nvme_passthru.buf,
977 					      bdev_io->u.nvme_passthru.nbytes,
978 					      bdev_io->u.nvme_passthru.md_buf,
979 					      bdev_io->u.nvme_passthru.md_len);
980 		break;
981 	case SPDK_BDEV_IO_TYPE_ABORT:
982 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
983 		rc = bdev_nvme_abort(nbdev_ch,
984 				     nbdev_io,
985 				     nbdev_io_to_abort);
986 		break;
987 	default:
988 		rc = -EINVAL;
989 		break;
990 	}
991 
992 exit:
993 	if (spdk_unlikely(rc != 0)) {
994 		bdev_nvme_io_complete(nbdev_io, rc);
995 	}
996 }
997 
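/* Report whether the namespace/controller pair can service a given bdev I/O type, based
 * on namespace and controller capabilities (ONCS bits, controller flags, command set). */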
998 static bool
999 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
1000 {
1001 	struct nvme_bdev *nbdev = ctx;
1002 	struct nvme_ns *nvme_ns;
1003 	struct spdk_nvme_ns *ns;
1004 	struct spdk_nvme_ctrlr *ctrlr;
1005 	const struct spdk_nvme_ctrlr_data *cdata;
1006 
1007 	nvme_ns = nbdev->nvme_ns;
1008 	assert(nvme_ns != NULL);
1009 	ns = nvme_ns->ns;
1010 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1011 
1012 	switch (io_type) {
1013 	case SPDK_BDEV_IO_TYPE_READ:
1014 	case SPDK_BDEV_IO_TYPE_WRITE:
1015 	case SPDK_BDEV_IO_TYPE_RESET:
1016 	case SPDK_BDEV_IO_TYPE_FLUSH:
1017 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1018 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1019 	case SPDK_BDEV_IO_TYPE_ABORT:
1020 		return true;
1021 
1022 	case SPDK_BDEV_IO_TYPE_COMPARE:
1023 		return spdk_nvme_ns_supports_compare(ns);
1024 
1025 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1026 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
1027 
1028 	case SPDK_BDEV_IO_TYPE_UNMAP:
1029 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1030 		return cdata->oncs.dsm;
1031 
1032 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1033 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1034 		return cdata->oncs.write_zeroes;
1035 
1036 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
1037 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
1038 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
1039 			return true;
1040 		}
1041 		return false;
1042 
1043 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
1044 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
1045 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
1046 
1047 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
1048 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
1049 		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
1050 
1051 	default:
1052 		return false;
1053 	}
1054 }
1055 
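/* Per-controller channel constructor: join the shared poll group, create the OCSSD
 * channel if the controller supports it, and set up the I/O qpair. */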
1056 static int
1057 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
1058 {
1059 	struct nvme_ctrlr *nvme_ctrlr = io_device;
1060 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
1061 	struct spdk_io_channel *pg_ch;
1062 	int rc;
1063 
1064 	pg_ch = spdk_get_io_channel(&g_nvme_ctrlrs);
1065 	if (!pg_ch) {
1066 		return -1;
1067 	}
1068 
1069 	ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch);
1070 
1071 #ifdef SPDK_CONFIG_VTUNE
1072 	ctrlr_ch->group->collect_spin_stat = true;
1073 #else
1074 	ctrlr_ch->group->collect_spin_stat = false;
1075 #endif
1076 
1077 	TAILQ_INIT(&ctrlr_ch->pending_resets);
1078 
1079 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_ctrlr->ctrlr)) {
1080 		rc = bdev_ocssd_create_io_channel(ctrlr_ch);
1081 		if (rc != 0) {
1082 			goto err_ocssd_ch;
1083 		}
1084 	}
1085 
1086 	ctrlr_ch->ctrlr = nvme_ctrlr;
1087 
1088 	rc = bdev_nvme_create_qpair(ctrlr_ch);
1089 	if (rc != 0) {
1090 		goto err_qpair;
1091 	}
1092 
1093 	return 0;
1094 
1095 err_qpair:
1096 	if (ctrlr_ch->ocssd_ch) {
1097 		bdev_ocssd_destroy_io_channel(ctrlr_ch);
1098 	}
1099 err_ocssd_ch:
1100 	spdk_put_io_channel(pg_ch);
1101 
1102 	return rc;
1103 }
1104 
1105 static void
1106 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
1107 {
1108 	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
1109 
1110 	assert(ctrlr_ch->group != NULL);
1111 
1112 	if (ctrlr_ch->ocssd_ch != NULL) {
1113 		bdev_ocssd_destroy_io_channel(ctrlr_ch);
1114 	}
1115 
1116 	bdev_nvme_destroy_qpair(ctrlr_ch);
1117 
1118 	spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group));
1119 }
1120 
1121 static void
1122 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
1123 			      uint32_t iov_cnt, uint32_t seed,
1124 			      spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
1125 {
1126 	struct nvme_poll_group *group = ctx;
1127 	int rc;
1128 
1129 	assert(group->accel_channel != NULL);
1130 	assert(cb_fn != NULL);
1131 
1132 	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
1133 	if (rc) {
1134 		/* In these two error cases, spdk_accel_submit_crc32cv does not call the user's cb_fn, so invoke it here. */
1135 		if (rc == -ENOMEM || rc == -EINVAL) {
1136 			cb_fn(cb_arg, rc);
1137 		}
1138 		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
1139 	}
1140 }
1141 
1142 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
1143 	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
1144 	.submit_accel_crc32c	= bdev_nvme_submit_accel_crc32c,
1145 };
1146 
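/* Poll group constructor: create the NVMe poll group, get an accel-engine channel for
 * offloaded CRC32C, and register the completion poller. */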
1147 static int
1148 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
1149 {
1150 	struct nvme_poll_group *group = ctx_buf;
1151 
1152 	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
1153 	if (group->group == NULL) {
1154 		return -1;
1155 	}
1156 
1157 	group->accel_channel = spdk_accel_engine_get_io_channel();
1158 	if (!group->accel_channel) {
1159 		spdk_nvme_poll_group_destroy(group->group);
1160 		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
1161 			    group);
1162 		return -1;
1163 	}
1164 
1165 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
1166 
1167 	if (group->poller == NULL) {
1168 		spdk_put_io_channel(group->accel_channel);
1169 		spdk_nvme_poll_group_destroy(group->group);
1170 		return -1;
1171 	}
1172 
1173 	return 0;
1174 }
1175 
1176 static void
1177 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
1178 {
1179 	struct nvme_poll_group *group = ctx_buf;
1180 
1181 	if (group->accel_channel) {
1182 		spdk_put_io_channel(group->accel_channel);
1183 	}
1184 
1185 	spdk_poller_unregister(&group->poller);
1186 	if (spdk_nvme_poll_group_destroy(group->group)) {
1187 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
1188 		assert(false);
1189 	}
1190 }
1191 
1192 static struct spdk_io_channel *
1193 bdev_nvme_get_io_channel(void *ctx)
1194 {
1195 	struct nvme_bdev *nvme_bdev = ctx;
1196 
1197 	return spdk_get_io_channel(nvme_bdev);
1198 }
1199 
1200 static void *
1201 bdev_nvme_get_module_ctx(void *ctx)
1202 {
1203 	struct nvme_bdev *nvme_bdev = ctx;
1204 
1205 	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
1206 }
1207 
1208 static const char *
1209 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
1210 {
1211 	switch (ana_state) {
1212 	case SPDK_NVME_ANA_OPTIMIZED_STATE:
1213 		return "optimized";
1214 	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1215 		return "non_optimized";
1216 	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
1217 		return "inaccessible";
1218 	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
1219 		return "persistent_loss";
1220 	case SPDK_NVME_ANA_CHANGE_STATE:
1221 		return "change";
1222 	default:
1223 		return NULL;
1224 	}
1225 }
1226 
1227 static int
1228 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
1229 {
1230 	struct nvme_bdev *nvme_bdev = ctx;
1231 	struct nvme_ns *nvme_ns;
1232 	struct spdk_nvme_ns *ns;
1233 	struct spdk_nvme_ctrlr *ctrlr;
1234 	const struct spdk_nvme_ctrlr_data *cdata;
1235 	const struct spdk_nvme_transport_id *trid;
1236 	union spdk_nvme_vs_register vs;
1237 	union spdk_nvme_csts_register csts;
1238 	char buf[128];
1239 
1240 	nvme_ns = nvme_bdev->nvme_ns;
1241 	assert(nvme_ns != NULL);
1242 	ns = nvme_ns->ns;
1243 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1244 
1245 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1246 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
1247 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
1248 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1249 
1250 	spdk_json_write_named_object_begin(w, "nvme");
1251 
1252 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1253 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
1254 	}
1255 
1256 	spdk_json_write_named_object_begin(w, "trid");
1257 
1258 	nvme_bdev_dump_trid_json(trid, w);
1259 
1260 	spdk_json_write_object_end(w);
1261 
1262 #ifdef SPDK_CONFIG_NVME_CUSE
1263 	size_t cuse_name_size = 128;
1264 	char cuse_name[cuse_name_size];
1265 
1266 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
1267 					    cuse_name, &cuse_name_size);
1268 	if (rc == 0) {
1269 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
1270 	}
1271 #endif
1272 
1273 	spdk_json_write_named_object_begin(w, "ctrlr_data");
1274 
1275 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
1276 
1277 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
1278 	spdk_str_trim(buf);
1279 	spdk_json_write_named_string(w, "model_number", buf);
1280 
1281 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
1282 	spdk_str_trim(buf);
1283 	spdk_json_write_named_string(w, "serial_number", buf);
1284 
1285 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
1286 	spdk_str_trim(buf);
1287 	spdk_json_write_named_string(w, "firmware_revision", buf);
1288 
1289 	if (cdata->subnqn[0] != '\0') {
1290 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
1291 	}
1292 
1293 	spdk_json_write_named_object_begin(w, "oacs");
1294 
1295 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
1296 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
1297 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
1298 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
1299 
1300 	spdk_json_write_object_end(w);
1301 
1302 	spdk_json_write_object_end(w);
1303 
1304 	spdk_json_write_named_object_begin(w, "vs");
1305 
1306 	spdk_json_write_name(w, "nvme_version");
1307 	if (vs.bits.ter) {
1308 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
1309 	} else {
1310 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
1311 	}
1312 
1313 	spdk_json_write_object_end(w);
1314 
1315 	spdk_json_write_named_object_begin(w, "csts");
1316 
1317 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
1318 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
1319 
1320 	spdk_json_write_object_end(w);
1321 
1322 	spdk_json_write_named_object_begin(w, "ns_data");
1323 
1324 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
1325 
1326 	if (cdata->cmic.ana_reporting) {
1327 		spdk_json_write_named_string(w, "ana_state",
1328 					     _nvme_ana_state_str(nvme_ns->ana_state));
1329 	}
1330 
1331 	spdk_json_write_object_end(w);
1332 
1333 	if (cdata->oacs.security) {
1334 		spdk_json_write_named_object_begin(w, "security");
1335 
1336 		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);
1337 
1338 		spdk_json_write_object_end(w);
1339 	}
1340 
1341 	spdk_json_write_object_end(w);
1342 
1343 	return 0;
1344 }
1345 
1346 static void
1347 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1348 {
1349 	/* No config per bdev needed */
1350 }
1351 
1352 static uint64_t
1353 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
1354 {
1355 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
1356 	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
1357 	struct nvme_poll_group *group = ctrlr_ch->group;
1358 	uint64_t spin_time;
1359 
1360 	if (!group || !group->collect_spin_stat) {
1361 		return 0;
1362 	}
1363 
1364 	if (group->end_ticks != 0) {
1365 		group->spin_ticks += (group->end_ticks - group->start_ticks);
1366 		group->end_ticks = 0;
1367 	}
1368 
1369 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1370 	group->start_ticks = 0;
1371 	group->spin_ticks = 0;
1372 
1373 	return spin_time;
1374 }
1375 
1376 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1377 	.destruct		= bdev_nvme_destruct,
1378 	.submit_request		= bdev_nvme_submit_request,
1379 	.io_type_supported	= bdev_nvme_io_type_supported,
1380 	.get_io_channel		= bdev_nvme_get_io_channel,
1381 	.dump_info_json		= bdev_nvme_dump_info_json,
1382 	.write_config_json	= bdev_nvme_write_config_json,
1383 	.get_spin_time		= bdev_nvme_get_spin_time,
1384 	.get_module_ctx		= bdev_nvme_get_module_ctx,
1385 };
1386 
1387 typedef int (*bdev_nvme_parse_ana_log_page_cb)(
1388 	const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);
1389 
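/* Iterate over the group descriptors in the cached ANA log page, invoking cb_fn on a
 * copy of each descriptor until cb_fn returns non-zero or all descriptors are consumed. */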
1390 static int
1391 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
1392 			     bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
1393 {
1394 	struct spdk_nvme_ana_group_descriptor *copied_desc;
1395 	uint8_t *orig_desc;
1396 	uint32_t i, desc_size, copy_len;
1397 	int rc = 0;
1398 
1399 	if (nvme_ctrlr->ana_log_page == NULL) {
1400 		return -EINVAL;
1401 	}
1402 
1403 	copied_desc = nvme_ctrlr->copied_ana_desc;
1404 
1405 	orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
1406 	copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page);
1407 
1408 	for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
1409 		memcpy(copied_desc, orig_desc, copy_len);
1410 
1411 		rc = cb_fn(copied_desc, cb_arg);
1412 		if (rc != 0) {
1413 			break;
1414 		}
1415 
1416 		desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
1417 			    copied_desc->num_of_nsid * sizeof(uint32_t);
1418 		orig_desc += desc_size;
1419 		copy_len -= desc_size;
1420 	}
1421 
1422 	return rc;
1423 }
1424 
1425 static int
1426 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
1427 {
1428 	struct nvme_ns *nvme_ns = cb_arg;
1429 	uint32_t i;
1430 
1431 	for (i = 0; i < desc->num_of_nsid; i++) {
1432 		if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
1433 			continue;
1434 		}
1435 		nvme_ns->ana_group_id = desc->ana_group_id;
1436 		nvme_ns->ana_state = desc->ana_state;
1437 		return 1;
1438 	}
1439 
1440 	return 0;
1441 }
1442 
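/* Fill in the spdk_bdev fields (name, block geometry, UUID, DIF settings, zoned
 * parameters) for a bdev that exposes the given NVMe namespace. */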
1443 static int
1444 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
1445 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
1446 		 uint32_t prchk_flags, void *ctx)
1447 {
1448 	const struct spdk_uuid		*uuid;
1449 	const uint8_t *nguid;
1450 	const struct spdk_nvme_ctrlr_data *cdata;
1451 	const struct spdk_nvme_ns_data	*nsdata;
1452 	enum spdk_nvme_csi		csi;
1453 	uint32_t atomic_bs, phys_bs, bs;
1454 
1455 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1456 	csi = spdk_nvme_ns_get_csi(ns);
1457 
1458 	switch (csi) {
1459 	case SPDK_NVME_CSI_NVM:
1460 		disk->product_name = "NVMe disk";
1461 		break;
1462 	case SPDK_NVME_CSI_ZNS:
1463 		disk->product_name = "NVMe ZNS disk";
1464 		disk->zoned = true;
1465 		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
1466 		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
1467 					     spdk_nvme_ns_get_extended_sector_size(ns);
1468 		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
1469 		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
1470 		break;
1471 	default:
1472 		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
1473 		return -ENOTSUP;
1474 	}
1475 
1476 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
1477 	if (!disk->name) {
1478 		return -ENOMEM;
1479 	}
1480 
1481 	disk->write_cache = 0;
1482 	if (cdata->vwc.present) {
1483 		/* Enable if the Volatile Write Cache exists */
1484 		disk->write_cache = 1;
1485 	}
1486 	if (cdata->oncs.write_zeroes) {
1487 		disk->max_write_zeroes = UINT16_MAX + 1;
1488 	}
1489 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1490 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1491 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1492 
1493 	nguid = spdk_nvme_ns_get_nguid(ns);
1494 	if (!nguid) {
1495 		uuid = spdk_nvme_ns_get_uuid(ns);
1496 		if (uuid) {
1497 			disk->uuid = *uuid;
1498 		}
1499 	} else {
1500 		memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
1501 	}
1502 
1503 	nsdata = spdk_nvme_ns_get_data(ns);
1504 	bs = spdk_nvme_ns_get_sector_size(ns);
1505 	atomic_bs = bs;
1506 	phys_bs = bs;
1507 	if (nsdata->nabo == 0) {
1508 		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
1509 			atomic_bs = bs * (1 + nsdata->nawupf);
1510 		} else {
1511 			atomic_bs = bs * (1 + cdata->awupf);
1512 		}
1513 	}
1514 	if (nsdata->nsfeat.optperf) {
1515 		phys_bs = bs * (1 + nsdata->npwg);
1516 	}
1517 	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
1518 
1519 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
1520 	if (disk->md_len != 0) {
1521 		disk->md_interleave = nsdata->flbas.extended;
1522 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1523 		if (disk->dif_type != SPDK_DIF_DISABLE) {
1524 			disk->dif_is_head_of_md = nsdata->dps.md_start;
1525 			disk->dif_check_flags = prchk_flags;
1526 		}
1527 	}
1528 
1529 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
1530 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
1531 		disk->acwu = 0;
1532 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1533 		disk->acwu = nsdata->nacwu;
1534 	} else {
1535 		disk->acwu = cdata->acwu;
1536 	}
1537 
1538 	disk->ctxt = ctx;
1539 	disk->fn_table = &nvmelib_fn_table;
1540 	disk->module = &nvme_if;
1541 
1542 	return 0;
1543 }
1544 
1545 static int
1546 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
1547 {
1548 	struct nvme_bdev *bdev;
1549 	int rc;
1550 
1551 	bdev = calloc(1, sizeof(*bdev));
1552 	if (!bdev) {
1553 		SPDK_ERRLOG("bdev calloc() failed\n");
1554 		return -ENOMEM;
1555 	}
1556 
1557 	bdev->nvme_ns = nvme_ns;
1558 	bdev->opal = nvme_ctrlr->opal_dev != NULL;
1559 
1560 	rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->name, nvme_ctrlr->ctrlr,
1561 			      nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev);
1562 	if (rc != 0) {
1563 		SPDK_ERRLOG("Failed to create NVMe disk\n");
1564 		free(bdev);
1565 		return rc;
1566 	}
1567 
1568 	spdk_io_device_register(bdev,
1569 				bdev_nvme_create_bdev_channel_cb,
1570 				bdev_nvme_destroy_bdev_channel_cb,
1571 				sizeof(struct nvme_bdev_channel),
1572 				bdev->disk.name);
1573 
1574 	rc = spdk_bdev_register(&bdev->disk);
1575 	if (rc != 0) {
1576 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
1577 		spdk_io_device_unregister(bdev, NULL);
1578 		free(bdev->disk.name);
1579 		free(bdev);
1580 		return rc;
1581 	}
1582 
1583 	nvme_ns->bdev = bdev;
1584 
1585 	return 0;
1586 }
1587 
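/* Two namespaces are considered identical only if their NGUID and EUI64 match and both
 * report the same non-NULL UUID. */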
1588 static bool
1589 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
1590 {
1591 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
1592 	const struct spdk_uuid *uuid1, *uuid2;
1593 
1594 	nsdata1 = spdk_nvme_ns_get_data(ns1);
1595 	nsdata2 = spdk_nvme_ns_get_data(ns2);
1596 	uuid1 = spdk_nvme_ns_get_uuid(ns1);
1597 	uuid2 = spdk_nvme_ns_get_uuid(ns2);
1598 
1599 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
1600 	       nsdata1->eui64 == nsdata2->eui64 &&
1601 	       uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0;
1602 }
1603 
1604 static void
1605 nvme_ctrlr_populate_standard_namespace(struct nvme_ctrlr *nvme_ctrlr,
1606 				       struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
1607 {
1608 	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
1609 	struct spdk_nvme_ns	*ns;
1610 	int			rc = 0;
1611 
1612 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1613 	if (!ns) {
1614 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
1615 		rc = -EINVAL;
1616 		goto done;
1617 	}
1618 
1619 	nvme_ns->ns = ns;
1620 	nvme_ns->populated = true;
1621 	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
1622 
1623 	if (nvme_ctrlr->ana_log_page != NULL) {
1624 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
1625 	}
1626 
1627 	rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
1628 done:
1629 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1630 }
1631 
1632 static bool
1633 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1634 		 struct spdk_nvme_ctrlr_opts *opts)
1635 {
1636 	struct nvme_probe_skip_entry *entry;
1637 
1638 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1639 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1640 			return false;
1641 		}
1642 	}
1643 
1644 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1645 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1646 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1647 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1648 	opts->disable_read_ana_log_page = true;
1649 
1650 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1651 
1652 	return true;
1653 }
1654 
1655 static void
1656 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1657 {
1658 	struct nvme_ctrlr *nvme_ctrlr = ctx;
1659 
1660 	if (spdk_nvme_cpl_is_error(cpl)) {
1661 		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
1662 			     cpl->status.sct);
1663 		bdev_nvme_reset(nvme_ctrlr);
1664 	}
1665 }
1666 
1667 static void
1668 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1669 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1670 {
1671 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
1672 	union spdk_nvme_csts_register csts;
1673 	int rc;
1674 
1675 	assert(nvme_ctrlr->ctrlr == ctrlr);
1676 
1677 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1678 
1679 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
1680 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
1681 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
1682 	 * completion recursively.
1683 	 */
1684 	if (nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
1685 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1686 		if (csts.bits.cfs) {
1687 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1688 			bdev_nvme_reset(nvme_ctrlr);
1689 			return;
1690 		}
1691 	}
1692 
1693 	switch (g_opts.action_on_timeout) {
1694 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1695 		if (qpair) {
1696 			/* Don't send abort to ctrlr when reset is running. */
1697 			pthread_mutex_lock(&nvme_ctrlr->mutex);
1698 			if (nvme_ctrlr->resetting) {
1699 				pthread_mutex_unlock(&nvme_ctrlr->mutex);
1700 				SPDK_NOTICELOG("Quit abort. Ctrlr is in the process of resetting.\n");
1701 				return;
1702 			}
1703 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
1704 
1705 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1706 						       nvme_abort_cpl, nvme_ctrlr);
1707 			if (rc == 0) {
1708 				return;
1709 			}
1710 
1711 			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
1712 		}
1713 
1714 	/* FALLTHROUGH */
1715 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1716 		bdev_nvme_reset(nvme_ctrlr);
1717 		break;
1718 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1719 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1720 		break;
1721 	default:
1722 		SPDK_ERRLOG("An invalid timeout action value is found.\n");
1723 		break;
1724 	}
1725 }
1726 
1727 static void
1728 nvme_ctrlr_depopulate_standard_namespace(struct nvme_ns *nvme_ns)
1729 {
1730 	struct nvme_bdev *bdev;
1731 
1732 	bdev = nvme_ns->bdev;
1733 	if (bdev != NULL) {
1734 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1735 	}
1736 
1737 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
1738 }
1739 
1740 static void
1741 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *ctrlr, struct nvme_ns *nvme_ns,
1742 			      struct nvme_async_probe_ctx *ctx)
1743 {
1744 	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
1745 }
1746 
1747 static void
1748 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *ctrlr, struct nvme_ns *nvme_ns)
1749 {
1750 	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
1751 }
1752 
1753 void
1754 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1755 				   struct nvme_ns *nvme_ns, int rc)
1756 {
1757 	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
1758 
1759 	assert(nvme_ctrlr != NULL);
1760 
1761 	if (rc == 0) {
1762 		pthread_mutex_lock(&nvme_ctrlr->mutex);
1763 		nvme_ctrlr->ref++;
1764 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1765 	} else {
1766 		memset(nvme_ns, 0, sizeof(*nvme_ns));
1767 	}
1768 
1769 	if (ctx) {
1770 		ctx->populates_in_progress--;
1771 		if (ctx->populates_in_progress == 0) {
1772 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
1773 		}
1774 	}
1775 }
1776 
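/*
 * Scan all namespaces of the controller: resize bdevs whose namespace size
 * changed, populate namespaces that became active, and depopulate namespaces
 * that are no longer active.
 */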
1777 static void
1778 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
1779 			       struct nvme_async_probe_ctx *ctx)
1780 {
1781 	struct spdk_nvme_ctrlr	*ctrlr = nvme_ctrlr->ctrlr;
1782 	struct nvme_ns	*nvme_ns;
1783 	struct spdk_nvme_ns	*ns;
1784 	struct nvme_bdev	*bdev;
1785 	uint32_t		i;
1786 	int			rc;
1787 	uint64_t		num_sectors;
1788 	bool			ns_is_active;
1789 
1790 	if (ctx) {
1791 		/* Initialize this count to 1 to handle the populate functions
1792 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1793 		 */
1794 		ctx->populates_in_progress = 1;
1795 	}
1796 
1797 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
1798 		uint32_t	nsid = i + 1;
1799 
1800 		nvme_ns = nvme_ctrlr->namespaces[i];
1801 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1802 
1803 		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_NS_STANDARD) {
1804 			/* NS is still there but attributes may have changed */
1805 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1806 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
1807 			bdev = nvme_ns->bdev;
1808 			assert(bdev != NULL);
1809 			if (bdev->disk.blockcnt != num_sectors) {
1810 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
1811 					       nsid,
1812 					       bdev->disk.name,
1813 					       bdev->disk.blockcnt,
1814 					       num_sectors);
1815 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1816 				if (rc != 0) {
1817 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1818 						    bdev->disk.name, rc);
1819 				}
1820 			}
1821 		}
1822 
1823 		if (!nvme_ns->populated && ns_is_active) {
1824 			nvme_ns->id = nsid;
1825 			nvme_ns->ctrlr = nvme_ctrlr;
1826 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1827 				nvme_ns->type = NVME_NS_OCSSD;
1828 			} else {
1829 				nvme_ns->type = NVME_NS_STANDARD;
1830 			}
1831 
1832 			nvme_ns->bdev = NULL;
1833 
1834 			if (ctx) {
1835 				ctx->populates_in_progress++;
1836 			}
1837 			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns, ctx);
1838 		}
1839 
1840 		if (nvme_ns->populated && !ns_is_active) {
1841 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
1842 		}
1843 	}
1844 
1845 	if (ctx) {
1846 		/* Decrement this count now that the loop is over to account
1847 		 * for the one we started with.  If the count is then 0, we
1848 		 * know any populate_namespace functions completed immediately,
1849 		 * so we'll kick the callback here.
1850 		 */
1851 		ctx->populates_in_progress--;
1852 		if (ctx->populates_in_progress == 0) {
1853 			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
1854 		}
1855 	}
1856 
1857 }
1858 
1859 static void
1860 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
1861 {
1862 	uint32_t i;
1863 	struct nvme_ns *nvme_ns;
1864 
1865 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
1866 		uint32_t nsid = i + 1;
1867 
1868 		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
1869 		if (nvme_ns->populated) {
1870 			assert(nvme_ns->id == nsid);
1871 			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
1872 		}
1873 	}
1874 }
1875 
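/* Take a reference on the controller unless it is being destructed or reset. */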
1876 static bool
1877 nvme_ctrlr_acquire(struct nvme_ctrlr *nvme_ctrlr)
1878 {
1879 	pthread_mutex_lock(&nvme_ctrlr->mutex);
1880 	if (nvme_ctrlr->destruct || nvme_ctrlr->resetting) {
1881 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
1882 		return false;
1883 	}
1884 	nvme_ctrlr->ref++;
1885 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
1886 	return true;
1887 }
1888 
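/*
 * Callback invoked for each ANA group descriptor in the ANA log page.
 * Record the ANA group ID and ANA state for every populated namespace
 * listed in the descriptor.
 */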
1889 static int
1890 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
1891 			  void *cb_arg)
1892 {
1893 	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
1894 	struct nvme_ns *nvme_ns;
1895 	uint32_t i, nsid;
1896 
1897 	for (i = 0; i < desc->num_of_nsid; i++) {
1898 		nsid = desc->nsid[i];
1899 		if (nsid == 0 || nsid > nvme_ctrlr->num_ns) {
1900 			continue;
1901 		}
1902 
1903 		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
1904 		assert(nvme_ns != NULL);
1905 
1906 		if (!nvme_ns->populated) {
1907 			continue;
1908 		}
1909 
1910 		nvme_ns->ana_group_id = desc->ana_group_id;
1911 		nvme_ns->ana_state = desc->ana_state;
1912 	}
1913 
1914 	return 0;
1915 }
1916 
1917 static void
1918 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
1919 {
1920 	struct nvme_ctrlr *nvme_ctrlr = ctx;
1921 
1922 	if (spdk_nvme_cpl_is_success(cpl)) {
1923 		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
1924 					     nvme_ctrlr);
1925 	}
1926 
1927 	nvme_ctrlr_release(nvme_ctrlr);
1928 }
1929 
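/*
 * Re-read the ANA log page, e.g. after an ANA change asynchronous event.
 * A reference is held on the controller for the duration of the command.
 */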
1930 static void
1931 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
1932 {
1933 	int rc;
1934 
1935 	if (nvme_ctrlr->ana_log_page == NULL) {
1936 		return;
1937 	}
1938 
1939 	if (!nvme_ctrlr_acquire(nvme_ctrlr)) {
1940 		return;
1941 	}
1942 
1943 	rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
1944 					      SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
1945 					      SPDK_NVME_GLOBAL_NS_TAG,
1946 					      nvme_ctrlr->ana_log_page,
1947 					      nvme_ctrlr->ana_log_page_size, 0,
1948 					      nvme_ctrlr_read_ana_log_page_done,
1949 					      nvme_ctrlr);
1950 	if (rc != 0) {
1951 		nvme_ctrlr_release(nvme_ctrlr);
1952 	}
1953 }
1954 
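/*
 * Asynchronous Event Request callback.  Dispatch namespace attribute changes,
 * OCSSD chunk notifications, and ANA change events to their handlers.
 */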
1955 static void
1956 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1957 {
1958 	struct nvme_ctrlr *nvme_ctrlr		= arg;
1959 	union spdk_nvme_async_event_completion	event;
1960 
1961 	if (spdk_nvme_cpl_is_error(cpl)) {
1962 		SPDK_WARNLOG("AER request execution failed\n");
1963 		return;
1964 	}
1965 
1966 	event.raw = cpl->cdw0;
1967 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1968 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1969 		nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
1970 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1971 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1972 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_ctrlr->ctrlr)) {
1973 		bdev_ocssd_handle_chunk_notification(nvme_ctrlr);
1974 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1975 		   (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
1976 		nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
1977 	}
1978 }
1979 
1980 static void
1981 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1982 {
1983 	if (ctx->cb_fn) {
1984 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1985 	}
1986 
1987 	ctx->namespaces_populated = true;
1988 	if (ctx->probe_done) {
1989 		/* The probe was already completed, so we need to free the context
1990 		 * here.  This can happen for cases like OCSSD, where we need to
1991 		 * send additional commands to the SSD after attach.
1992 		 */
1993 		free(ctx);
1994 	}
1995 }
1996 
1997 static void
1998 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
1999 		       struct nvme_async_probe_ctx *ctx)
2000 {
2001 	spdk_io_device_register(nvme_ctrlr,
2002 				bdev_nvme_create_ctrlr_channel_cb,
2003 				bdev_nvme_destroy_ctrlr_channel_cb,
2004 				sizeof(struct nvme_ctrlr_channel),
2005 				nvme_ctrlr->name);
2006 
2007 	nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
2008 }
2009 
2010 static void
2011 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
2012 {
2013 	struct nvme_ctrlr *nvme_ctrlr = _ctx;
2014 	struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
2015 
2016 	nvme_ctrlr->probe_ctx = NULL;
2017 
2018 	if (spdk_nvme_cpl_is_error(cpl)) {
2019 		nvme_ctrlr_delete(nvme_ctrlr);
2020 
2021 		if (ctx != NULL) {
2022 			populate_namespaces_cb(ctx, 0, -1);
2023 		}
2024 		return;
2025 	}
2026 
2027 	nvme_ctrlr_create_done(nvme_ctrlr, ctx);
2028 }
2029 
2030 static int
2031 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
2032 			     struct nvme_async_probe_ctx *ctx)
2033 {
2034 	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
2035 	const struct spdk_nvme_ctrlr_data *cdata;
2036 	uint32_t ana_log_page_size;
2037 
2038 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2039 
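	/* Worst-case ANA log page size: the page header plus one descriptor per
	 * ANA group plus one NSID entry for every namespace on the controller.
	 */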
2040 	ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
2041 			    sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn *
2042 			    sizeof(uint32_t);
2043 
2044 	nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
2045 						SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
2046 	if (nvme_ctrlr->ana_log_page == NULL) {
2047 		SPDK_ERRLOG("could not allocate ANA log page buffer\n");
2048 		return -ENXIO;
2049 	}
2050 
2051 	/* Each descriptor in an ANA log page is not guaranteed to be 8-byte aligned.
2052 	 * Hence copy each descriptor to a temporary area when parsing it.
2053 	 *
2054 	 * Allocate a buffer as large as the ANA log page buffer because
2055 	 * we do not know the size of a descriptor until actually reading it.
2056 	 */
2057 	nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
2058 	if (nvme_ctrlr->copied_ana_desc == NULL) {
2059 		SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
2060 		return -ENOMEM;
2061 	}
2062 
2063 	nvme_ctrlr->ana_log_page_size = ana_log_page_size;
2064 
2065 	nvme_ctrlr->probe_ctx = ctx;
2066 
2067 	return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
2068 						SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
2069 						SPDK_NVME_GLOBAL_NS_TAG,
2070 						nvme_ctrlr->ana_log_page,
2071 						nvme_ctrlr->ana_log_page_size, 0,
2072 						nvme_ctrlr_init_ana_log_page_done,
2073 						nvme_ctrlr);
2074 }
2075 
2076 static int
2077 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
2078 		  const char *name,
2079 		  const struct spdk_nvme_transport_id *trid,
2080 		  uint32_t prchk_flags,
2081 		  struct nvme_async_probe_ctx *ctx)
2082 {
2083 	struct nvme_ctrlr *nvme_ctrlr;
2084 	struct nvme_ctrlr_trid *trid_entry;
2085 	uint32_t i, num_ns;
2086 	const struct spdk_nvme_ctrlr_data *cdata;
2087 	int rc;
2088 
2089 	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
2090 	if (nvme_ctrlr == NULL) {
2091 		SPDK_ERRLOG("Failed to allocate device struct\n");
2092 		return -ENOMEM;
2093 	}
2094 
2095 	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
2096 	if (rc != 0) {
2097 		free(nvme_ctrlr);
2098 		return rc;
2099 	}
2100 
2101 	TAILQ_INIT(&nvme_ctrlr->trids);
2102 
2103 	num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
2104 	if (num_ns != 0) {
2105 		nvme_ctrlr->namespaces = calloc(num_ns, sizeof(struct nvme_ns *));
2106 		if (!nvme_ctrlr->namespaces) {
2107 			SPDK_ERRLOG("Failed to allocate namespace pointer array\n");
2108 			rc = -ENOMEM;
2109 			goto err;
2110 		}
2111 
2112 		for (i = 0; i < num_ns; i++) {
2113 			nvme_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_ns));
2114 			if (nvme_ctrlr->namespaces[i] == NULL) {
2115 				SPDK_ERRLOG("Failed to allocate block namespace struct\n");
2116 				rc = -ENOMEM;
2117 				goto err;
2118 			}
2119 			nvme_ctrlr->num_ns++;
2120 		}
2121 
2122 		assert(num_ns == nvme_ctrlr->num_ns);
2123 	}
2124 
2125 	trid_entry = calloc(1, sizeof(*trid_entry));
2126 	if (trid_entry == NULL) {
2127 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
2128 		rc = -ENOMEM;
2129 		goto err;
2130 	}
2131 
2132 	trid_entry->trid = *trid;
2133 	nvme_ctrlr->connected_trid = &trid_entry->trid;
2134 	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, trid_entry, link);
2135 
2136 	nvme_ctrlr->thread = spdk_get_thread();
2137 	nvme_ctrlr->ctrlr = ctrlr;
2138 	nvme_ctrlr->ref = 1;
2139 	nvme_ctrlr->name = strdup(name);
2140 	if (nvme_ctrlr->name == NULL) {
2141 		rc = -ENOMEM;
2142 		goto err;
2143 	}
2144 
2145 	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
2146 		rc = bdev_ocssd_init_ctrlr(nvme_ctrlr);
2147 		if (spdk_unlikely(rc != 0)) {
2148 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
2149 			goto err;
2150 		}
2151 	}
2152 
2153 	nvme_ctrlr->prchk_flags = prchk_flags;
2154 
2155 	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
2156 					  g_opts.nvme_adminq_poll_period_us);
2157 
2158 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2159 	TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, nvme_ctrlr, tailq);
2160 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2161 
2162 	if (g_opts.timeout_us > 0) {
2163 		/* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
2164 		/* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
2165 		uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
2166 					  g_opts.timeout_us : g_opts.timeout_admin_us;
2167 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
2168 				adm_timeout_us, timeout_cb, nvme_ctrlr);
2169 	}
2170 
2171 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
2172 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
2173 
2174 	if (spdk_nvme_ctrlr_get_flags(ctrlr) &
2175 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
2176 		nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
2177 	}
2178 
2179 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
2180 
2181 	if (cdata->cmic.ana_reporting) {
2182 		rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
2183 		if (rc == 0) {
2184 			return 0;
2185 		}
2186 	} else {
2187 		nvme_ctrlr_create_done(nvme_ctrlr, ctx);
2188 		return 0;
2189 	}
2190 
2191 err:
2192 	nvme_ctrlr_delete(nvme_ctrlr);
2193 	return rc;
2194 }
2195 
2196 static void
2197 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2198 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2199 {
2200 	struct nvme_probe_ctx *ctx = cb_ctx;
2201 	char *name = NULL;
2202 	uint32_t prchk_flags = 0;
2203 	size_t i;
2204 
2205 	if (ctx) {
2206 		for (i = 0; i < ctx->count; i++) {
2207 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
2208 				prchk_flags = ctx->prchk_flags[i];
2209 				name = strdup(ctx->names[i]);
2210 				break;
2211 			}
2212 		}
2213 	} else {
2214 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
2215 	}
2216 	if (!name) {
2217 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
2218 		return;
2219 	}
2220 
2221 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
2222 
2223 	nvme_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL);
2224 
2225 	free(name);
2226 }
2227 
2228 static void
2229 _nvme_ctrlr_destruct(void *ctx)
2230 {
2231 	struct nvme_ctrlr *nvme_ctrlr = ctx;
2232 
2233 	nvme_ctrlr_depopulate_namespaces(nvme_ctrlr);
2234 	nvme_ctrlr_release(nvme_ctrlr);
2235 }
2236 
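/*
 * Begin destruction of a controller.  For a non-hotplug delete of a PCIe
 * controller, its trid is recorded in g_skipped_nvme_ctrlrs so that subsequent
 * hotplug probing can skip the device.
 */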
2237 static int
2238 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
2239 {
2240 	struct nvme_probe_skip_entry *entry;
2241 
2242 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2243 
2244 	/* The controller's destruction was already started */
2245 	if (nvme_ctrlr->destruct) {
2246 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2247 		return 0;
2248 	}
2249 
2250 	if (!hotplug &&
2251 	    nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2252 		entry = calloc(1, sizeof(*entry));
2253 		if (!entry) {
2254 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
2255 			return -ENOMEM;
2256 		}
2257 		entry->trid = *nvme_ctrlr->connected_trid;
2258 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
2259 	}
2260 
2261 	nvme_ctrlr->destruct = true;
2262 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2263 
2264 	_nvme_ctrlr_destruct(nvme_ctrlr);
2265 
2266 	return 0;
2267 }
2268 
2269 static void
2270 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
2271 {
2272 	struct nvme_ctrlr *nvme_ctrlr = cb_ctx;
2273 
2274 	_bdev_nvme_delete(nvme_ctrlr, true);
2275 }
2276 
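/*
 * Poller that drives an in-progress PCIe hotplug probe to completion and
 * unregisters itself once the probe context is finished or dropped.
 */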
2277 static int
2278 bdev_nvme_hotplug_probe(void *arg)
2279 {
2280 	if (g_hotplug_probe_ctx == NULL) {
2281 		spdk_poller_unregister(&g_hotplug_probe_poller);
2282 		return SPDK_POLLER_IDLE;
2283 	}
2284 
2285 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
2286 		g_hotplug_probe_ctx = NULL;
2287 		spdk_poller_unregister(&g_hotplug_probe_poller);
2288 	}
2289 
2290 	return SPDK_POLLER_BUSY;
2291 }
2292 
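/*
 * Periodic hotplug poller.  Start a new asynchronous PCIe probe if one is not
 * already in progress and hand it off to bdev_nvme_hotplug_probe().
 */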
2293 static int
2294 bdev_nvme_hotplug(void *arg)
2295 {
2296 	struct spdk_nvme_transport_id trid_pcie;
2297 
2298 	if (g_hotplug_probe_ctx) {
2299 		return SPDK_POLLER_BUSY;
2300 	}
2301 
2302 	memset(&trid_pcie, 0, sizeof(trid_pcie));
2303 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
2304 
2305 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
2306 			      hotplug_probe_cb, attach_cb, NULL);
2307 
2308 	if (g_hotplug_probe_ctx) {
2309 		assert(g_hotplug_probe_poller == NULL);
2310 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
2311 	}
2312 
2313 	return SPDK_POLLER_BUSY;
2314 }
2315 
2316 void
2317 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
2318 {
2319 	*opts = g_opts;
2320 }
2321 
2322 static int
2323 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
2324 {
2325 	if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
2326 		/* Can't set timeout_admin_us without also setting timeout_us */
2327 		SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
2328 		return -EINVAL;
2329 	}
2330 
2331 	return 0;
2332 }
2333 
2334 int
2335 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
2336 {
2337 	int ret = bdev_nvme_validate_opts(opts);
2338 	if (ret) {
2339 		SPDK_WARNLOG("Failed to set nvme opts.\n");
2340 		return ret;
2341 	}
2342 
2343 	if (g_bdev_nvme_init_thread != NULL) {
2344 		if (!TAILQ_EMPTY(&g_nvme_ctrlrs)) {
2345 			return -EPERM;
2346 		}
2347 	}
2348 
2349 	g_opts = *opts;
2350 
2351 	return 0;
2352 }
2353 
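/* Context sent to the init thread when enabling or disabling the hotplug poller. */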
2354 struct set_nvme_hotplug_ctx {
2355 	uint64_t period_us;
2356 	bool enabled;
2357 	spdk_msg_fn fn;
2358 	void *fn_ctx;
2359 };
2360 
2361 static void
2362 set_nvme_hotplug_period_cb(void *_ctx)
2363 {
2364 	struct set_nvme_hotplug_ctx *ctx = _ctx;
2365 
2366 	spdk_poller_unregister(&g_hotplug_poller);
2367 	if (ctx->enabled) {
2368 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
2369 	}
2370 
2371 	g_nvme_hotplug_poll_period_us = ctx->period_us;
2372 	g_nvme_hotplug_enabled = ctx->enabled;
2373 	if (ctx->fn) {
2374 		ctx->fn(ctx->fn_ctx);
2375 	}
2376 
2377 	free(ctx);
2378 }
2379 
2380 int
2381 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
2382 {
2383 	struct set_nvme_hotplug_ctx *ctx;
2384 
2385 	if (enabled == true && !spdk_process_is_primary()) {
2386 		return -EPERM;
2387 	}
2388 
2389 	ctx = calloc(1, sizeof(*ctx));
2390 	if (ctx == NULL) {
2391 		return -ENOMEM;
2392 	}
2393 
2394 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
2395 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
2396 	ctx->enabled = enabled;
2397 	ctx->fn = cb;
2398 	ctx->fn_ctx = cb_ctx;
2399 
2400 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
2401 	return 0;
2402 }
2403 
2404 static void
2405 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
2406 				    struct nvme_async_probe_ctx *ctx)
2407 {
2408 	struct nvme_ns	*nvme_ns;
2409 	struct nvme_bdev	*nvme_bdev;
2410 	uint32_t		i, nsid;
2411 	size_t			j;
2412 
2413 	assert(nvme_ctrlr != NULL);
2414 
2415 	/*
2416 	 * Report the new bdevs that were created in this call.
2417 	 * There can be more than one bdev per NVMe controller.
2418 	 */
2419 	j = 0;
2420 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
2421 		nsid = i + 1;
2422 		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
2423 		if (!nvme_ns->populated) {
2424 			continue;
2425 		}
2426 		assert(nvme_ns->id == nsid);
2427 		nvme_bdev = nvme_ns->bdev;
2428 		if (nvme_bdev == NULL) {
2429 			assert(nvme_ns->type == NVME_NS_OCSSD);
2430 			continue;
2431 		}
2432 		if (j < ctx->count) {
2433 			ctx->names[j] = nvme_bdev->disk.name;
2434 			j++;
2435 		} else {
2436 			SPDK_ERRLOG("Maximum number of bdev names that can be returned is %u. Unable to return all names of created bdevs\n",
2437 				    ctx->count);
2438 			populate_namespaces_cb(ctx, 0, -ERANGE);
2439 			return;
2440 		}
2441 	}
2442 
2443 	populate_namespaces_cb(ctx, j, 0);
2444 }
2445 
2446 static int
2447 bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr,
2448 			struct spdk_nvme_ctrlr *new_ctrlr,
2449 			struct spdk_nvme_transport_id *trid)
2450 {
2451 	struct nvme_ctrlr_trid *tmp_trid;
2452 
2453 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2454 		SPDK_ERRLOG("PCIe failover is not supported.\n");
2455 		return -ENOTSUP;
2456 	}
2457 
2458 	/* Currently we only support failover to the same transport type. */
2459 	if (nvme_ctrlr->connected_trid->trtype != trid->trtype) {
2460 		return -EINVAL;
2461 	}
2462 
2463 	/* Currently we only support failover to the same NQN. */
2464 	if (strncmp(trid->subnqn, nvme_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
2465 		return -EINVAL;
2466 	}
2467 
2468 	/* Skip all the other checks if we've already registered this path. */
2469 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
2470 		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
2471 			return -EEXIST;
2472 		}
2473 	}
2474 
2475 	return 0;
2476 }
2477 
2478 static int
2479 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr,
2480 			     struct spdk_nvme_ctrlr *new_ctrlr)
2481 {
2482 	uint32_t i, nsid;
2483 	struct nvme_ns *nvme_ns;
2484 	struct spdk_nvme_ns *new_ns;
2485 
2486 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_ctrlr->num_ns) {
2487 		return -EINVAL;
2488 	}
2489 
2490 	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
2491 		nsid = i + 1;
2492 
2493 		nvme_ns = nvme_ctrlr->namespaces[i];
2494 		if (!nvme_ns->populated) {
2495 			continue;
2496 		}
2497 
2498 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
2499 		assert(new_ns != NULL);
2500 
2501 		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
2502 			return -EINVAL;
2503 		}
2504 	}
2505 
2506 	return 0;
2507 }
2508 
2509 static int
2510 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2511 			      struct spdk_nvme_transport_id *trid)
2512 {
2513 	struct nvme_ctrlr_trid *new_trid, *tmp_trid;
2514 
2515 	new_trid = calloc(1, sizeof(*new_trid));
2516 	if (new_trid == NULL) {
2517 		return -ENOMEM;
2518 	}
2519 	new_trid->trid = *trid;
2520 	new_trid->is_failed = false;
2521 
2522 	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
2523 		if (tmp_trid->is_failed) {
2524 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
2525 			return 0;
2526 		}
2527 	}
2528 
2529 	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
2530 	return 0;
2531 }
2532 
2533 /* Handle the case where a secondary path is added to an existing
2534  * nvme_ctrlr for failover. After verifying that it can access the same
2535  * namespaces as the primary path, the new path is disconnected until failover occurs.
2536  */
2537 static int
2538 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2539 			     struct spdk_nvme_ctrlr *new_ctrlr,
2540 			     struct spdk_nvme_transport_id *trid)
2541 {
2542 	int rc;
2543 
2544 	assert(nvme_ctrlr != NULL);
2545 
2546 	pthread_mutex_lock(&nvme_ctrlr->mutex);
2547 
2548 	rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid);
2549 	if (rc != 0) {
2550 		goto exit;
2551 	}
2552 
2553 	rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr);
2554 	if (rc != 0) {
2555 		goto exit;
2556 	}
2557 
2558 	rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
2559 
2560 exit:
2561 	pthread_mutex_unlock(&nvme_ctrlr->mutex);
2562 
2563 	spdk_nvme_detach(new_ctrlr);
2564 
2565 	return rc;
2566 }
2567 
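/*
 * Attach callback for spdk_nvme_connect_async().  If a controller with the
 * requested name already exists, register the new path as a secondary trid
 * for failover; otherwise create a new nvme_ctrlr.
 */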
2568 static void
2569 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2570 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2571 {
2572 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
2573 	struct nvme_ctrlr	*nvme_ctrlr;
2574 	struct nvme_async_probe_ctx *ctx;
2575 	int rc;
2576 
2577 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2578 	ctx->ctrlr_attached = true;
2579 
2580 	nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
2581 	if (nvme_ctrlr) {
2582 		rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
2583 	} else {
2584 		rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx);
2585 		if (rc == 0) {
2586 			return;
2587 		}
2588 	}
2589 
2590 	populate_namespaces_cb(ctx, 0, rc);
2591 }
2592 
2593 static int
2594 bdev_nvme_async_poll(void *arg)
2595 {
2596 	struct nvme_async_probe_ctx	*ctx = arg;
2597 	int				rc;
2598 
2599 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
2600 	if (spdk_unlikely(rc != -EAGAIN)) {
2601 		ctx->probe_done = true;
2602 		spdk_poller_unregister(&ctx->poller);
2603 		if (!ctx->ctrlr_attached) {
2604 			/* The probe is done, but no controller was attached.
2605 			 * That means we had a failure, so report -EIO back to
2606 			 * the caller (usually the RPC). populate_namespaces_cb()
2607 			 * will take care of freeing the nvme_async_probe_ctx.
2608 			 */
2609 			populate_namespaces_cb(ctx, 0, -EIO);
2610 		} else if (ctx->namespaces_populated) {
2611 			/* The namespaces for the attached controller were all
2612 			 * populated and the response was already sent to the
2613 			 * caller (usually the RPC).  So free the context here.
2614 			 */
2615 			free(ctx);
2616 		}
2617 	}
2618 
2619 	return SPDK_POLLER_BUSY;
2620 }
2621 
2622 int
2623 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
2624 		 struct spdk_nvme_host_id *hostid,
2625 		 const char *base_name,
2626 		 const char **names,
2627 		 uint32_t count,
2628 		 const char *hostnqn,
2629 		 uint32_t prchk_flags,
2630 		 spdk_bdev_create_nvme_fn cb_fn,
2631 		 void *cb_ctx,
2632 		 struct spdk_nvme_ctrlr_opts *opts)
2633 {
2634 	struct nvme_probe_skip_entry	*entry, *tmp;
2635 	struct nvme_async_probe_ctx	*ctx;
2636 
2637 	/* TODO expand this check to include both the host and target TRIDs.
2638 	 * Only if both are the same should we fail.
2639 	 */
2640 	if (nvme_ctrlr_get(trid) != NULL) {
2641 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
2642 		return -EEXIST;
2643 	}
2644 
2645 	ctx = calloc(1, sizeof(*ctx));
2646 	if (!ctx) {
2647 		return -ENOMEM;
2648 	}
2649 	ctx->base_name = base_name;
2650 	ctx->names = names;
2651 	ctx->count = count;
2652 	ctx->cb_fn = cb_fn;
2653 	ctx->cb_ctx = cb_ctx;
2654 	ctx->prchk_flags = prchk_flags;
2655 	ctx->trid = *trid;
2656 
2657 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2658 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
2659 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2660 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2661 				free(entry);
2662 				break;
2663 			}
2664 		}
2665 	}
2666 
2667 	if (opts) {
2668 		memcpy(&ctx->opts, opts, sizeof(*opts));
2669 	} else {
2670 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
2671 	}
2672 
2673 	ctx->opts.transport_retry_count = g_opts.retry_count;
2674 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
2675 	ctx->opts.disable_read_ana_log_page = true;
2676 
2677 	if (hostnqn) {
2678 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
2679 	}
2680 
2681 	if (hostid->hostaddr[0] != '\0') {
2682 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
2683 	}
2684 
2685 	if (hostid->hostsvcid[0] != '\0') {
2686 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
2687 	}
2688 
2689 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
2690 	if (ctx->probe_ctx == NULL) {
2691 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
2692 		free(ctx);
2693 		return -ENODEV;
2694 	}
2695 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
2696 
2697 	return 0;
2698 }
2699 
2700 static int
2701 bdev_nvme_delete_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
2702 				const struct spdk_nvme_transport_id *trid)
2703 {
2704 	struct nvme_ctrlr_trid	*ctrlr_trid, *tmp_trid;
2705 
2706 	if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) {
2707 		return -EBUSY;
2708 	}
2709 
2710 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_ctrlr->trids, link, tmp_trid) {
2711 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
2712 			TAILQ_REMOVE(&nvme_ctrlr->trids, ctrlr_trid, link);
2713 			free(ctrlr_trid);
2714 			return 0;
2715 		}
2716 	}
2717 
2718 	return -ENXIO;
2719 }
2720 
2721 int
2722 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid)
2723 {
2724 	struct nvme_ctrlr	*nvme_ctrlr;
2725 	struct nvme_ctrlr_trid	*ctrlr_trid;
2726 
2727 	if (name == NULL) {
2728 		return -EINVAL;
2729 	}
2730 
2731 	nvme_ctrlr = nvme_ctrlr_get_by_name(name);
2732 	if (nvme_ctrlr == NULL) {
2733 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2734 		return -ENODEV;
2735 	}
2736 
2737 	/* case 1: remove the controller itself. */
2738 	if (trid == NULL) {
2739 		return _bdev_nvme_delete(nvme_ctrlr, false);
2740 	}
2741 
2742 	/* case 2: we are currently using the path to be removed. */
2743 	if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) {
2744 		ctrlr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
2745 		assert(nvme_ctrlr->connected_trid == &ctrlr_trid->trid);
2746 		/* case 2A: the current path is the only path. */
2747 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
2748 			return _bdev_nvme_delete(nvme_ctrlr, false);
2749 		}
2750 
2751 		/* case 2B: there is an alternative path. */
2752 		return bdev_nvme_failover(nvme_ctrlr, true);
2753 	}
2754 
2755 	/* case 3: We are not using the specified path. */
2756 	return bdev_nvme_delete_secondary_trid(nvme_ctrlr, trid);
2757 }
2758 
2759 static int
2760 bdev_nvme_library_init(void)
2761 {
2762 	g_bdev_nvme_init_thread = spdk_get_thread();
2763 
2764 	spdk_io_device_register(&g_nvme_ctrlrs, bdev_nvme_create_poll_group_cb,
2765 				bdev_nvme_destroy_poll_group_cb,
2766 				sizeof(struct nvme_poll_group),  "nvme_poll_groups");
2767 
2768 	return 0;
2769 }
2770 
2771 static void
2772 bdev_nvme_library_fini(void)
2773 {
2774 	struct nvme_ctrlr *nvme_ctrlr, *tmp;
2775 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2776 
2777 	spdk_poller_unregister(&g_hotplug_poller);
2778 	free(g_hotplug_probe_ctx);
2779 	g_hotplug_probe_ctx = NULL;
2780 
2781 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2782 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2783 		free(entry);
2784 	}
2785 
2786 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2787 	TAILQ_FOREACH_SAFE(nvme_ctrlr, &g_nvme_ctrlrs, tailq, tmp) {
2788 		pthread_mutex_lock(&nvme_ctrlr->mutex);
2789 		if (nvme_ctrlr->destruct) {
2790 			/* This controller's destruction was already started
2791 			 * before the application started shutting down
2792 			 */
2793 			pthread_mutex_unlock(&nvme_ctrlr->mutex);
2794 			continue;
2795 		}
2796 		nvme_ctrlr->destruct = true;
2797 		pthread_mutex_unlock(&nvme_ctrlr->mutex);
2798 
2799 		spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
2800 				     nvme_ctrlr);
2801 	}
2802 
2803 	g_bdev_nvme_module_finish = true;
2804 	if (TAILQ_EMPTY(&g_nvme_ctrlrs)) {
2805 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2806 		spdk_io_device_unregister(&g_nvme_ctrlrs, NULL);
2807 		spdk_bdev_module_finish_done();
2808 		return;
2809 	}
2810 
2811 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2812 }
2813 
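/*
 * After the controller reports a Protection Information error, re-run the
 * DIF/DIX verification in software to locate and log the offending block.
 */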
2814 static void
2815 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
2816 {
2817 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2818 	struct spdk_bdev *bdev = bdev_io->bdev;
2819 	struct spdk_dif_ctx dif_ctx;
2820 	struct spdk_dif_error err_blk = {};
2821 	int rc;
2822 
2823 	rc = spdk_dif_ctx_init(&dif_ctx,
2824 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2825 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2826 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2827 	if (rc != 0) {
2828 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2829 		return;
2830 	}
2831 
2832 	if (bdev->md_interleave) {
2833 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2834 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2835 	} else {
2836 		struct iovec md_iov = {
2837 			.iov_base	= bdev_io->u.bdev.md_buf,
2838 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2839 		};
2840 
2841 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2842 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2843 	}
2844 
2845 	if (rc != 0) {
2846 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2847 			    err_blk.err_type, err_blk.err_offset);
2848 	} else {
2849 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2850 	}
2851 }
2852 
2853 static void
2854 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2855 {
2856 	struct nvme_bdev_io *bio = ref;
2857 
2858 	if (spdk_nvme_cpl_is_success(cpl)) {
2859 		/* Run PI verification for read data buffer. */
2860 		bdev_nvme_verify_pi_error(bio);
2861 	}
2862 
2863 	/* Return original completion status */
2864 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
2865 }
2866 
2867 static void
2868 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2869 {
2870 	struct nvme_bdev_io *bio = ref;
2871 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2872 	struct nvme_bdev_channel *nbdev_ch;
2873 	struct spdk_nvme_ns *ns;
2874 	struct spdk_nvme_qpair *qpair;
2875 	int ret;
2876 
2877 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2878 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2879 			    cpl->status.sct, cpl->status.sc);
2880 
2881 		/* Save completion status to use after verifying PI error. */
2882 		bio->cpl = *cpl;
2883 
2884 		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2885 
2886 		if (spdk_likely(bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
2887 			/* Read without PI checking to verify PI error. */
2888 			ret = bdev_nvme_no_pi_readv(ns,
2889 						    qpair,
2890 						    bio,
2891 						    bdev_io->u.bdev.iovs,
2892 						    bdev_io->u.bdev.iovcnt,
2893 						    bdev_io->u.bdev.md_buf,
2894 						    bdev_io->u.bdev.num_blocks,
2895 						    bdev_io->u.bdev.offset_blocks);
2896 			if (ret == 0) {
2897 				return;
2898 			}
2899 		}
2900 	}
2901 
2902 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2903 }
2904 
2905 static void
2906 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2907 {
2908 	struct nvme_bdev_io *bio = ref;
2909 
2910 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2911 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2912 			    cpl->status.sct, cpl->status.sc);
2913 		/* Run PI verification for write data buffer if PI error is detected. */
2914 		bdev_nvme_verify_pi_error(bio);
2915 	}
2916 
2917 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2918 }
2919 
2920 static void
2921 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2922 {
2923 	struct nvme_bdev_io *bio = ref;
2924 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2925 
2926 	/* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
2927 	 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
2928 	 */
2929 	bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
2930 
2931 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2932 		SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
2933 			    cpl->status.sct, cpl->status.sc);
2934 		/* Run PI verification for zone append data buffer if PI error is detected. */
2935 		bdev_nvme_verify_pi_error(bio);
2936 	}
2937 
2938 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2939 }
2940 
2941 static void
2942 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2943 {
2944 	struct nvme_bdev_io *bio = ref;
2945 
2946 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2947 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2948 			    cpl->status.sct, cpl->status.sc);
2949 		/* Run PI verification for compare data buffer if PI error is detected. */
2950 		bdev_nvme_verify_pi_error(bio);
2951 	}
2952 
2953 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2954 }
2955 
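/*
 * Completion callback shared by both halves of a fused compare-and-write.
 * The compare completion is stashed in bio->cpl; the write completion then
 * decides the final status of the I/O.
 */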
2956 static void
2957 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2958 {
2959 	struct nvme_bdev_io *bio = ref;
2960 
2961 	/* Compare operation completion */
2962 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2963 		/* Save compare result for write callback */
2964 		bio->cpl = *cpl;
2965 		return;
2966 	}
2967 
2968 	/* Write operation completion */
2969 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2970 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2971 		 * complete the IO with the compare operation's status.
2972 		 */
2973 		if (!spdk_nvme_cpl_is_error(cpl)) {
2974 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2975 		}
2976 
2977 		bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
2978 	} else {
2979 		bdev_nvme_io_complete_nvme_status(bio, cpl);
2980 	}
2981 }
2982 
2983 static void
2984 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2985 {
2986 	struct nvme_bdev_io *bio = ref;
2987 
2988 	bdev_nvme_io_complete_nvme_status(bio, cpl);
2989 }
2990 
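/* Translate an NVMe ZNS zone descriptor into the generic bdev zone info format. */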
2991 static int
2992 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
2993 {
2994 	switch (desc->zs) {
2995 	case SPDK_NVME_ZONE_STATE_EMPTY:
2996 		info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
2997 		break;
2998 	case SPDK_NVME_ZONE_STATE_IOPEN:
2999 		info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
3000 		break;
3001 	case SPDK_NVME_ZONE_STATE_EOPEN:
3002 		info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
3003 		break;
3004 	case SPDK_NVME_ZONE_STATE_CLOSED:
3005 		info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
3006 		break;
3007 	case SPDK_NVME_ZONE_STATE_RONLY:
3008 		info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
3009 		break;
3010 	case SPDK_NVME_ZONE_STATE_FULL:
3011 		info->state = SPDK_BDEV_ZONE_STATE_FULL;
3012 		break;
3013 	case SPDK_NVME_ZONE_STATE_OFFLINE:
3014 		info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
3015 		break;
3016 	default:
3017 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
3018 		return -EIO;
3019 	}
3020 
3021 	info->zone_id = desc->zslba;
3022 	info->write_pointer = desc->wp;
3023 	info->capacity = desc->zcap;
3024 
3025 	return 0;
3026 }
3027 
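/*
 * Completion callback for a Report Zones command.  Copy the returned zone
 * descriptors into the caller's buffer and, if more zones were requested than
 * fit in one report, issue another Report Zones command starting at the next
 * unhandled zone.
 */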
3028 static void
3029 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
3030 {
3031 	struct nvme_bdev_io *bio = ref;
3032 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3033 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
3034 	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
3035 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
3036 	uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
3037 	struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
3038 	uint64_t max_zones_per_buf, i;
3039 	uint32_t zone_report_bufsize;
3040 	struct spdk_nvme_ns *ns;
3041 	struct spdk_nvme_qpair *qpair;
3042 	int ret;
3043 
3044 	if (spdk_nvme_cpl_is_error(cpl)) {
3045 		goto out_complete_io_nvme_cpl;
3046 	}
3047 
3048 	if (!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair)) {
3049 		ret = -ENXIO;
3050 		goto out_complete_io_ret;
3051 	}
3052 
3053 	zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
3054 	max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
3055 			    sizeof(bio->zone_report_buf->descs[0]);
3056 
3057 	if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
3058 		ret = -EINVAL;
3059 		goto out_complete_io_ret;
3060 	}
3061 
3062 	if (!bio->zone_report_buf->nr_zones) {
3063 		ret = -EINVAL;
3064 		goto out_complete_io_ret;
3065 	}
3066 
3067 	for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
3068 		ret = fill_zone_from_report(&info[bio->handled_zones],
3069 					    &bio->zone_report_buf->descs[i]);
3070 		if (ret) {
3071 			goto out_complete_io_ret;
3072 		}
3073 		bio->handled_zones++;
3074 	}
3075 
3076 	if (bio->handled_zones < zones_to_copy) {
3077 		uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
3078 		uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
3079 
3080 		memset(bio->zone_report_buf, 0, zone_report_bufsize);
3081 		ret = spdk_nvme_zns_report_zones(ns, qpair,
3082 						 bio->zone_report_buf, zone_report_bufsize,
3083 						 slba, SPDK_NVME_ZRA_LIST_ALL, true,
3084 						 bdev_nvme_get_zone_info_done, bio);
3085 		if (!ret) {
3086 			return;
3087 		} else {
3088 			goto out_complete_io_ret;
3089 		}
3090 	}
3091 
3092 out_complete_io_nvme_cpl:
3093 	free(bio->zone_report_buf);
3094 	bio->zone_report_buf = NULL;
3095 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3096 	return;
3097 
3098 out_complete_io_ret:
3099 	free(bio->zone_report_buf);
3100 	bio->zone_report_buf = NULL;
3101 	bdev_nvme_io_complete(bio, ret);
3102 }
3103 
3104 static void
3105 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
3106 {
3107 	struct nvme_bdev_io *bio = ref;
3108 
3109 	bdev_nvme_io_complete_nvme_status(bio, cpl);
3110 }
3111 
3112 static void
3113 bdev_nvme_admin_passthru_completion(void *ctx)
3114 {
3115 	struct nvme_bdev_io *bio = ctx;
3116 
3117 	bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
3118 }
3119 
3120 static void
3121 bdev_nvme_abort_completion(void *ctx)
3122 {
3123 	struct nvme_bdev_io *bio = ctx;
3124 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3125 
3126 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
3127 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
3128 	} else {
3129 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3130 	}
3131 }
3132 
3133 static void
3134 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
3135 {
3136 	struct nvme_bdev_io *bio = ref;
3137 
3138 	bio->cpl = *cpl;
3139 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
3140 }
3141 
3142 static void
3143 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
3144 {
3145 	struct nvme_bdev_io *bio = ref;
3146 
3147 	bio->cpl = *cpl;
3148 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
3149 }
3150 
3151 static void
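/*
 * SGL callbacks used by the NVMe driver to walk the request's iovec array
 * during command submission.  reset_sgl positions the cursor at sgl_offset
 * and next_sge returns the next contiguous buffer segment.
 */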
3152 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
3153 {
3154 	struct nvme_bdev_io *bio = ref;
3155 	struct iovec *iov;
3156 
3157 	bio->iov_offset = sgl_offset;
3158 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
3159 		iov = &bio->iovs[bio->iovpos];
3160 		if (bio->iov_offset < iov->iov_len) {
3161 			break;
3162 		}
3163 
3164 		bio->iov_offset -= iov->iov_len;
3165 	}
3166 }
3167 
3168 static int
3169 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
3170 {
3171 	struct nvme_bdev_io *bio = ref;
3172 	struct iovec *iov;
3173 
3174 	assert(bio->iovpos < bio->iovcnt);
3175 
3176 	iov = &bio->iovs[bio->iovpos];
3177 
3178 	*address = iov->iov_base;
3179 	*length = iov->iov_len;
3180 
3181 	if (bio->iov_offset) {
3182 		assert(bio->iov_offset <= iov->iov_len);
3183 		*address += bio->iov_offset;
3184 		*length -= bio->iov_offset;
3185 	}
3186 
3187 	bio->iov_offset += *length;
3188 	if (bio->iov_offset == iov->iov_len) {
3189 		bio->iovpos++;
3190 		bio->iov_offset = 0;
3191 	}
3192 
3193 	return 0;
3194 }
3195 
3196 static void
3197 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
3198 {
3199 	struct nvme_bdev_io *bio = ref;
3200 	struct iovec *iov;
3201 
3202 	bio->fused_iov_offset = sgl_offset;
3203 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
3204 		iov = &bio->fused_iovs[bio->fused_iovpos];
3205 		if (bio->fused_iov_offset < iov->iov_len) {
3206 			break;
3207 		}
3208 
3209 		bio->fused_iov_offset -= iov->iov_len;
3210 	}
3211 }
3212 
3213 static int
3214 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
3215 {
3216 	struct nvme_bdev_io *bio = ref;
3217 	struct iovec *iov;
3218 
3219 	assert(bio->fused_iovpos < bio->fused_iovcnt);
3220 
3221 	iov = &bio->fused_iovs[bio->fused_iovpos];
3222 
3223 	*address = iov->iov_base;
3224 	*length = iov->iov_len;
3225 
3226 	if (bio->fused_iov_offset) {
3227 		assert(bio->fused_iov_offset <= iov->iov_len);
3228 		*address += bio->fused_iov_offset;
3229 		*length -= bio->fused_iov_offset;
3230 	}
3231 
3232 	bio->fused_iov_offset += *length;
3233 	if (bio->fused_iov_offset == iov->iov_len) {
3234 		bio->fused_iovpos++;
3235 		bio->fused_iov_offset = 0;
3236 	}
3237 
3238 	return 0;
3239 }
3240 
3241 static int
3242 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3243 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
3244 		      void *md, uint64_t lba_count, uint64_t lba)
3245 {
3246 	int rc;
3247 
3248 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
3249 		      lba_count, lba);
3250 
3251 	bio->iovs = iov;
3252 	bio->iovcnt = iovcnt;
3253 	bio->iovpos = 0;
3254 	bio->iov_offset = 0;
3255 
3256 	rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
3257 					    bdev_nvme_no_pi_readv_done, bio, 0,
3258 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3259 					    md, 0, 0);
3260 
3261 	if (rc != 0 && rc != -ENOMEM) {
3262 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
3263 	}
3264 	return rc;
3265 }
3266 
3267 static int
3268 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3269 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
3270 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
3271 {
3272 	int rc;
3273 
3274 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3275 		      lba_count, lba);
3276 
3277 	bio->iovs = iov;
3278 	bio->iovcnt = iovcnt;
3279 	bio->iovpos = 0;
3280 	bio->iov_offset = 0;
3281 
3282 	if (iovcnt == 1) {
3283 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
3284 						   lba_count,
3285 						   bdev_nvme_readv_done, bio,
3286 						   flags,
3287 						   0, 0);
3288 	} else {
3289 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
3290 						    bdev_nvme_readv_done, bio, flags,
3291 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3292 						    md, 0, 0);
3293 	}
3294 
3295 	if (rc != 0 && rc != -ENOMEM) {
3296 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
3297 	}
3298 	return rc;
3299 }
3300 
3301 static int
3302 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3303 		 struct nvme_bdev_io *bio,
3304 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
3305 		 uint32_t flags)
3306 {
3307 	int rc;
3308 
3309 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3310 		      lba_count, lba);
3311 
3312 	bio->iovs = iov;
3313 	bio->iovcnt = iovcnt;
3314 	bio->iovpos = 0;
3315 	bio->iov_offset = 0;
3316 
3317 	if (iovcnt == 1) {
3318 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
3319 						    lba_count,
3320 						    bdev_nvme_writev_done, bio,
3321 						    flags,
3322 						    0, 0);
3323 	} else {
3324 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
3325 						     bdev_nvme_writev_done, bio, flags,
3326 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3327 						     md, 0, 0);
3328 	}
3329 
3330 	if (rc != 0 && rc != -ENOMEM) {
3331 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
3332 	}
3333 	return rc;
3334 }
3335 
3336 static int
3337 bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3338 		       struct nvme_bdev_io *bio,
3339 		       struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t zslba,
3340 		       uint32_t flags)
3341 {
3342 	int rc;
3343 
3344 	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
3345 		      lba_count, zslba);
3346 
3347 	bio->iovs = iov;
3348 	bio->iovcnt = iovcnt;
3349 	bio->iovpos = 0;
3350 	bio->iov_offset = 0;
3351 
3352 	if (iovcnt == 1) {
3353 		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
3354 						       lba_count,
3355 						       bdev_nvme_zone_appendv_done, bio,
3356 						       flags,
3357 						       0, 0);
3358 	} else {
3359 		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
3360 							bdev_nvme_zone_appendv_done, bio, flags,
3361 							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3362 							md, 0, 0);
3363 	}
3364 
3365 	if (rc != 0 && rc != -ENOMEM) {
3366 		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
3367 	}
3368 	return rc;
3369 }
3370 
3371 static int
3372 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3373 		   struct nvme_bdev_io *bio,
3374 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
3375 		   uint32_t flags)
3376 {
3377 	int rc;
3378 
3379 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3380 		      lba_count, lba);
3381 
3382 	bio->iovs = iov;
3383 	bio->iovcnt = iovcnt;
3384 	bio->iovpos = 0;
3385 	bio->iov_offset = 0;
3386 
3387 	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
3388 					       bdev_nvme_comparev_done, bio, flags,
3389 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3390 					       md, 0, 0);
3391 
3392 	if (rc != 0 && rc != -ENOMEM) {
3393 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
3394 	}
3395 	return rc;
3396 }
3397 
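/*
 * Submit a fused compare-and-write: the compare is sent with FUSE_FIRST and
 * the write with FUSE_SECOND.  On a retry the compare is not resubmitted if it
 * was already sent for this bdev_io.
 */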
3398 static int
3399 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3400 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
3401 			      struct iovec *write_iov, int write_iovcnt,
3402 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
3403 {
3404 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3405 	int rc;
3406 
3407 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3408 		      lba_count, lba);
3409 
3410 	bio->iovs = cmp_iov;
3411 	bio->iovcnt = cmp_iovcnt;
3412 	bio->iovpos = 0;
3413 	bio->iov_offset = 0;
3414 	bio->fused_iovs = write_iov;
3415 	bio->fused_iovcnt = write_iovcnt;
3416 	bio->fused_iovpos = 0;
3417 	bio->fused_iov_offset = 0;
3418 
3419 	if (bdev_io->num_retries == 0) {
3420 		bio->first_fused_submitted = false;
3421 	}
3422 
3423 	if (!bio->first_fused_submitted) {
3424 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
3425 		memset(&bio->cpl, 0, sizeof(bio->cpl));
3426 
3427 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
3428 						       bdev_nvme_comparev_and_writev_done, bio, flags,
3429 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
3430 		if (rc == 0) {
3431 			bio->first_fused_submitted = true;
3432 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
3433 		} else {
3434 			if (rc != -ENOMEM) {
3435 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
3436 			}
3437 			return rc;
3438 		}
3439 	}
3440 
3441 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
3442 
3443 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
3444 					     bdev_nvme_comparev_and_writev_done, bio, flags,
3445 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
3446 	if (rc != 0 && rc != -ENOMEM) {
3447 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
3448 		rc = 0;
3449 	}
3450 
3451 	return rc;
3452 }
3453 
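/*
 * Translate an unmap request into a Dataset Management (deallocate) command,
 * splitting the LBA range into DSM ranges of at most
 * SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks each.
 */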
3454 static int
3455 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3456 		struct nvme_bdev_io *bio,
3457 		uint64_t offset_blocks,
3458 		uint64_t num_blocks)
3459 {
3460 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
3461 	struct spdk_nvme_dsm_range *range;
3462 	uint64_t offset, remaining;
3463 	uint64_t num_ranges_u64;
3464 	uint16_t num_ranges;
3465 	int rc;
3466 
3467 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
3468 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3469 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
3470 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
3471 		return -EINVAL;
3472 	}
3473 	num_ranges = (uint16_t)num_ranges_u64;
3474 
3475 	offset = offset_blocks;
3476 	remaining = num_blocks;
3477 	range = &dsm_ranges[0];
3478 
3479 	/* Fill max-size ranges until the remaining blocks fit into one range */
3480 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
3481 		range->attributes.raw = 0;
3482 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3483 		range->starting_lba = offset;
3484 
3485 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3486 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3487 		range++;
3488 	}
3489 
3490 	/* Final range describes the remaining blocks */
3491 	range->attributes.raw = 0;
3492 	range->length = remaining;
3493 	range->starting_lba = offset;
3494 
3495 	rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair,
3496 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
3497 			dsm_ranges, num_ranges,
3498 			bdev_nvme_queued_done, bio);
3499 
3500 	return rc;
3501 }
3502 
3503 static int
3504 bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3505 		       struct nvme_bdev_io *bio,
3506 		       uint64_t offset_blocks,
3507 		       uint64_t num_blocks)
3508 {
3509 	if (num_blocks > UINT16_MAX + 1) {
3510 		SPDK_ERRLOG("NVMe write zeroes is limited to a 16-bit block count\n");
3511 		return -EINVAL;
3512 	}
3513 
3514 	return spdk_nvme_ns_cmd_write_zeroes(ns, qpair,
3515 					     offset_blocks, num_blocks,
3516 					     bdev_nvme_queued_done, bio,
3517 					     0);
3518 }
3519 
3520 static int
3521 bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3522 			struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
3523 			struct spdk_bdev_zone_info *info)
3524 {
3525 	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
3526 	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
3527 	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
3528 
3529 	if (zone_id % zone_size != 0) {
3530 		return -EINVAL;
3531 	}
3532 
3533 	if (num_zones > total_zones || !num_zones) {
3534 		return -EINVAL;
3535 	}
3536 
3537 	assert(!bio->zone_report_buf);
3538 	bio->zone_report_buf = calloc(1, zone_report_bufsize);
3539 	if (!bio->zone_report_buf) {
3540 		return -ENOMEM;
3541 	}
3542 
3543 	bio->handled_zones = 0;
3544 
3545 	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
3546 					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
3547 					  bdev_nvme_get_zone_info_done, bio);
3548 }
3549 
3550 static int
3551 bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3552 			  struct nvme_bdev_io *bio, uint64_t zone_id,
3553 			  enum spdk_bdev_zone_action action)
3554 {
3555 	switch (action) {
3556 	case SPDK_BDEV_ZONE_CLOSE:
3557 		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
3558 						bdev_nvme_zone_management_done, bio);
3559 	case SPDK_BDEV_ZONE_FINISH:
3560 		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
3561 						 bdev_nvme_zone_management_done, bio);
3562 	case SPDK_BDEV_ZONE_OPEN:
3563 		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
3564 					       bdev_nvme_zone_management_done, bio);
3565 	case SPDK_BDEV_ZONE_RESET:
3566 		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
3567 						bdev_nvme_zone_management_done, bio);
3568 	case SPDK_BDEV_ZONE_OFFLINE:
3569 		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
3570 						  bdev_nvme_zone_management_done, bio);
3571 	default:
3572 		return -EINVAL;
3573 	}
3574 }
3575 
3576 static int
3577 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
3578 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
3579 {
3580 	struct nvme_ctrlr *nvme_ctrlr;
3581 	uint32_t max_xfer_size;
3582 
3583 	if (!bdev_nvme_find_admin_path(nbdev_ch, &nvme_ctrlr)) {
3584 		return -EINVAL;
3585 	}
3586 
3587 	max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);
3588 
3589 	if (nbytes > max_xfer_size) {
3590 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3591 		return -EINVAL;
3592 	}
3593 
3594 	bio->orig_thread = spdk_get_thread();
3595 
3596 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf,
3597 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
3598 }
3599 
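/*
 * Submit a raw NVMe I/O command without a metadata buffer. The transfer is
 * bounded by the namespace's maximum I/O transfer size, and the nsid is
 * always overwritten with the namespace backing this bdev.
 */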
3600 static int
3601 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3602 		      struct nvme_bdev_io *bio,
3603 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
3604 {
3605 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
3606 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3607 
3608 	if (nbytes > max_xfer_size) {
3609 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3610 		return -EINVAL;
3611 	}
3612 
3613 	/*
3614 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
3615 	 * so fill it out automatically.
3616 	 */
3617 	cmd->nsid = spdk_nvme_ns_get_id(ns);
3618 
3619 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
3620 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
3621 }
3622 
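/*
 * Submit a raw NVMe I/O command with a separate metadata buffer. In addition
 * to the transfer size check, the metadata buffer must be exactly the
 * per-block metadata size times the number of blocks covered by nbytes.
 */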
3623 static int
3624 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3625 			 struct nvme_bdev_io *bio,
3626 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
3627 {
3628 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
3629 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
3630 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3631 
3632 	if (nbytes > max_xfer_size) {
3633 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3634 		return -EINVAL;
3635 	}
3636 
3637 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
3638 		SPDK_ERRLOG("invalid metadata buffer size\n");
3639 		return -EINVAL;
3640 	}
3641 
3642 	/*
3643 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
3644 	 * so fill it out automatically.
3645 	 */
3646 	cmd->nsid = spdk_nvme_ns_get_id(ns);
3647 
3648 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
3649 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
3650 }
3651 
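/*
 * Abort an outstanding bdev I/O. The abort is first attempted against the
 * I/O qpair; if the command is not found there, it is retried against the
 * admin queue (qpair == NULL). If neither attempt finds the command, a
 * completion with CDW0 bit 0 set is fabricated locally, which is how the NVMe
 * Abort command indicates that the target command was not aborted.
 */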
3652 static int
3653 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
3654 		struct nvme_bdev_io *bio_to_abort)
3655 {
3656 	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
3657 	int rc;
3658 
3659 	bio->orig_thread = spdk_get_thread();
3660 
3661 	rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr,
3662 					   ctrlr_ch->qpair,
3663 					   bio_to_abort,
3664 					   bdev_nvme_abort_done, bio);
3665 	if (rc == -ENOENT) {
3666 		/* If no command was found in the I/O qpair, the target command may be
3667 		 * an admin command.
3668 		 */
3669 		rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr,
3670 						   NULL,
3671 						   bio_to_abort,
3672 						   bdev_nvme_abort_done, bio);
3673 	}
3674 
3675 	if (rc == -ENOENT) {
3676 		/* If no command was found in either queue, complete the abort request with failure. */
3677 		bio->cpl.cdw0 |= 1U;
3678 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
3679 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
3680 
3681 		bdev_nvme_abort_completion(bio);
3682 
3683 		rc = 0;
3684 	}
3685 
3686 	return rc;
3687 }
3688 
3689 static void
3690 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
3691 		struct nvme_ns *nvme_ns)
3692 {
3693 	/* nop - standard namespaces are created automatically when the controller is attached, so there is nothing to dump here. */
3694 }
3695 
3696 static void
3697 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_ns *nvme_ns)
3698 {
3699 	g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns);
3700 }
3701 
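/*
 * Emit a "bdev_nvme_set_options" entry reproducing the active global options,
 * so that a saved configuration restores them on the next load.
 */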
3702 static void
3703 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
3704 {
3705 	const char	*action;
3706 
3707 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
3708 		action = "reset";
3709 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
3710 		action = "abort";
3711 	} else {
3712 		action = "none";
3713 	}
3714 
3715 	spdk_json_write_object_begin(w);
3716 
3717 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
3718 
3719 	spdk_json_write_named_object_begin(w, "params");
3720 	spdk_json_write_named_string(w, "action_on_timeout", action);
3721 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
3722 	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
3723 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
3724 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
3725 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
3726 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
3727 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
3728 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
3729 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
3730 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
3731 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
3732 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
3733 	spdk_json_write_object_end(w);
3734 
3735 	spdk_json_write_object_end(w);
3736 }
3737 
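/*
 * Emit a "bdev_nvme_attach_controller" entry for one controller, including the
 * transport ID it is currently connected through and its protection
 * information check flags.
 */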
3738 static void
3739 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
3740 		       struct nvme_ctrlr *nvme_ctrlr)
3741 {
3742 	struct spdk_nvme_transport_id	*trid;
3743 
3744 	trid = nvme_ctrlr->connected_trid;
3745 
3746 	spdk_json_write_object_begin(w);
3747 
3748 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
3749 
3750 	spdk_json_write_named_object_begin(w, "params");
3751 	spdk_json_write_named_string(w, "name", nvme_ctrlr->name);
3752 	nvme_bdev_dump_trid_json(trid, w);
3753 	spdk_json_write_named_bool(w, "prchk_reftag",
3754 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
3755 	spdk_json_write_named_bool(w, "prchk_guard",
3756 				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
3757 
3758 	spdk_json_write_object_end(w);
3759 
3760 	spdk_json_write_object_end(w);
3761 }
3762 
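/*
 * Emit a "bdev_nvme_set_hotplug" entry with the current hotplug poll period
 * and enable state.
 */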
3763 static void
3764 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
3765 {
3766 	spdk_json_write_object_begin(w);
3767 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
3768 
3769 	spdk_json_write_named_object_begin(w, "params");
3770 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
3771 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
3772 	spdk_json_write_object_end(w);
3773 
3774 	spdk_json_write_object_end(w);
3775 }
3776 
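/*
 * Dump the module configuration: global options first, then one attach entry
 * per controller plus any namespace-specific entries, and the hotplug
 * settings last.
 */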
3777 static int
3778 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
3779 {
3780 	struct nvme_ctrlr	*nvme_ctrlr;
3781 	uint32_t		nsid;
3782 
3783 	bdev_nvme_opts_config_json(w);
3784 
3785 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3786 
3787 	TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) {
3788 		nvme_ctrlr_config_json(w, nvme_ctrlr);
3789 
3790 		for (nsid = 0; nsid < nvme_ctrlr->num_ns; ++nsid) {
3791 			if (!nvme_ctrlr->namespaces[nsid]->populated) {
3792 				continue;
3793 			}
3794 
3795 			nvme_namespace_config_json(w, nvme_ctrlr->namespaces[nsid]);
3796 		}
3797 	}
3798 
3799 	/* Dump the hotplug configuration last to give all NVMe bdevs a chance to be
3800 	 * constructed before the hotplug poller is enabled.
3801 	 */
3802 	bdev_nvme_hotplug_config_json(w);
3803 
3804 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3805 	return 0;
3806 }
3807 
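/*
 * Return the spdk_nvme_ctrlr backing a bdev, or NULL if the bdev does not
 * belong to this module.
 */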
3808 struct spdk_nvme_ctrlr *
3809 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
3810 {
3811 	if (!bdev || bdev->module != &nvme_if) {
3812 		return NULL;
3813 	}
3814 
3815 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
3816 }
3817 
3818 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
3819