xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 4f2f8e8d44672fcbab37196b32c44795f26df470)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/config.h"
40 #include "spdk/endian.h"
41 #include "spdk/bdev.h"
42 #include "spdk/json.h"
43 #include "spdk/nvme.h"
44 #include "spdk/nvme_ocssd.h"
45 #include "spdk/thread.h"
46 #include "spdk/string.h"
47 #include "spdk/util.h"
48 
49 #include "spdk/bdev_module.h"
50 #include "spdk/log.h"
51 
52 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
53 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
54 
55 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
56 
57 struct nvme_bdev_io {
58 	/** array of iovecs to transfer. */
59 	struct iovec *iovs;
60 
61 	/** Number of iovecs in iovs array. */
62 	int iovcnt;
63 
64 	/** Current iovec position. */
65 	int iovpos;
66 
67 	/** Offset in current iovec. */
68 	uint32_t iov_offset;
69 
70 	/** array of iovecs for the second (fused) command to transfer. */
71 	struct iovec *fused_iovs;
72 
73 	/** Number of iovecs in fused_iovs array. */
74 	int fused_iovcnt;
75 
76 	/** Current position in fused_iovs. */
77 	int fused_iovpos;
78 
79 	/** Offset in current fused iovec. */
80 	uint32_t fused_iov_offset;
81 
82 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
83 	struct spdk_nvme_cpl cpl;
84 
85 	/** Originating thread */
86 	struct spdk_thread *orig_thread;
87 
88 	/** Tracks whether the first of the fused commands has been submitted */
89 	bool first_fused_submitted;
90 };
91 
92 struct nvme_probe_ctx {
93 	size_t count;
94 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
95 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
96 	const char *names[NVME_MAX_CONTROLLERS];
97 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
98 	const char *hostnqn;
99 };
100 
101 struct nvme_probe_skip_entry {
102 	struct spdk_nvme_transport_id		trid;
103 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
104 };
105 /* All controllers deleted by users via RPC are skipped by the hotplug monitor. */
106 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
107 			g_skipped_nvme_ctrlrs);
108 
109 static struct spdk_bdev_nvme_opts g_opts = {
110 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
111 	.timeout_us = 0,
112 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
113 	.retry_count = 4,
114 	.arbitration_burst = 0,
115 	.low_priority_weight = 0,
116 	.medium_priority_weight = 0,
117 	.high_priority_weight = 0,
118 	.nvme_adminq_poll_period_us = 10000ULL,
119 	.nvme_ioq_poll_period_us = 0,
120 	.io_queue_requests = 0,
121 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
122 };
123 
124 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
125 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
126 
127 static int g_hot_insert_nvme_controller_index = 0;
128 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
129 static bool g_nvme_hotplug_enabled = false;
130 static struct spdk_thread *g_bdev_nvme_init_thread;
131 static struct spdk_poller *g_hotplug_poller;
132 static struct spdk_poller *g_hotplug_probe_poller;
133 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
134 
135 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
136 		struct nvme_async_probe_ctx *ctx);
137 static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx);
138 static int bdev_nvme_library_init(void);
139 static void bdev_nvme_library_fini(void);
140 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
141 			   struct nvme_bdev_io *bio,
142 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
143 			   uint32_t flags);
144 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
145 				 struct nvme_bdev_io *bio,
146 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
147 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
148 			    struct nvme_bdev_io *bio,
149 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
150 			    uint32_t flags);
151 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
152 			      struct nvme_bdev_io *bio,
153 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
154 			      uint32_t flags);
155 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
156 		struct spdk_nvme_qpair *qpair,
157 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
158 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
159 		uint32_t flags);
160 static int bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch,
161 				    struct nvme_bdev_io *bio,
162 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
163 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
164 				 struct nvme_bdev_io *bio,
165 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
166 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
167 				    struct nvme_bdev_io *bio,
168 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
169 static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch,
170 			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
171 static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio);
172 static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove);
173 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
174 
175 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
176 				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
177 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
178 		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
179 
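/*
 * The tables below are dispatch tables indexed by the namespace type
 * (nvme_ns->type); index 0 (unknown type) is unused.
 */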
180 static populate_namespace_fn g_populate_namespace_fn[] = {
181 	NULL,
182 	nvme_ctrlr_populate_standard_namespace,
183 	bdev_ocssd_populate_namespace,
184 };
185 
186 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns);
187 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns);
188 
189 static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
190 	NULL,
191 	nvme_ctrlr_depopulate_standard_namespace,
192 	bdev_ocssd_depopulate_namespace,
193 };
194 
195 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
196 		struct nvme_bdev_ns *nvme_ns);
197 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
198 		struct nvme_bdev_ns *nvme_ns);
199 
200 static config_json_namespace_fn g_config_json_namespace_fn[] = {
201 	NULL,
202 	nvme_ctrlr_config_json_standard_namespace,
203 	bdev_ocssd_namespace_config_json,
204 };
205 
206 struct spdk_nvme_qpair *
207 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
208 {
209 	struct nvme_io_channel *nvme_ch;
210 
211 	nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
212 
213 	return nvme_ch->qpair;
214 }
215 
216 static int
217 bdev_nvme_get_ctx_size(void)
218 {
219 	return sizeof(struct nvme_bdev_io);
220 }
221 
222 static struct spdk_bdev_module nvme_if = {
223 	.name = "nvme",
224 	.async_fini = true,
225 	.module_init = bdev_nvme_library_init,
226 	.module_fini = bdev_nvme_library_fini,
227 	.config_json = bdev_nvme_config_json,
228 	.get_ctx_size = bdev_nvme_get_ctx_size,
229 
230 };
231 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
232 
233 static void
234 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
235 {
236 	int rc;
237 
238 	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
239 	/*
240 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
241 	 * reconnect a qpair and we will stop getting a callback for this one.
242 	 */
243 	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
244 	if (rc != 0) {
245 		SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
246 	}
247 }
248 
249 static int
250 bdev_nvme_poll(void *arg)
251 {
252 	struct nvme_bdev_poll_group *group = arg;
253 	int64_t num_completions;
254 
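	/* Spin-stat tracking (only active when collect_spin_stat is set, i.e. VTune builds):
	 * accumulate the ticks spent polling without finding any completions.
	 */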
255 	if (group->collect_spin_stat && group->start_ticks == 0) {
256 		group->start_ticks = spdk_get_ticks();
257 	}
258 
259 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
260 			  bdev_nvme_disconnected_qpair_cb);
261 	if (group->collect_spin_stat) {
262 		if (num_completions > 0) {
263 			if (group->end_ticks != 0) {
264 				group->spin_ticks += (group->end_ticks - group->start_ticks);
265 				group->end_ticks = 0;
266 			}
267 			group->start_ticks = 0;
268 		} else {
269 			group->end_ticks = spdk_get_ticks();
270 		}
271 	}
272 
273 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
274 }
275 
276 static int
277 bdev_nvme_poll_adminq(void *arg)
278 {
279 	int32_t rc;
280 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
281 
282 	assert(nvme_bdev_ctrlr != NULL);
283 
284 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr);
285 	if (rc < 0) {
286 		bdev_nvme_failover(nvme_bdev_ctrlr, false);
287 	}
288 
289 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
290 }
291 
292 static int
293 bdev_nvme_destruct(void *ctx)
294 {
295 	struct nvme_bdev *nvme_disk = ctx;
296 	struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns;
297 
298 	nvme_ns->bdev = NULL;
299 
300 	nvme_bdev_ns_detach(nvme_ns);
301 
302 	free(nvme_disk->disk.name);
303 	free(nvme_disk);
304 
305 	return 0;
306 }
307 
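/*
 * Note: the flush request is completed immediately with success; no NVMe Flush command
 * is submitted to the controller.
 */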
308 static int
309 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
310 		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
311 {
312 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
313 
314 	return 0;
315 }
316 
317 static int
318 bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch)
319 {
320 	struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr;
321 	struct spdk_nvme_io_qpair_opts opts;
322 	int rc;
323 
324 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
325 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
326 	opts.create_only = true;
327 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
328 	g_opts.io_queue_requests = opts.io_queue_requests;
329 
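	/* The qpair is created in a disconnected state (create_only) so that it can be added
	 * to the poll group before it is connected.
	 */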
330 	nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
331 	if (nvme_ch->qpair == NULL) {
332 		return -1;
333 	}
334 
335 	assert(nvme_ch->group != NULL);
336 
337 	rc = spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair);
338 	if (rc != 0) {
339 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
340 		goto err;
341 	}
342 
343 	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, nvme_ch->qpair);
344 	if (rc != 0) {
345 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
346 		goto err;
347 	}
348 
349 	return 0;
350 
351 err:
352 	spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
353 
354 	return rc;
355 }
356 
357 static void
358 _bdev_nvme_reset_destruct_ctrlr(struct spdk_io_channel_iter *i, int status)
359 {
360 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
361 
362 	spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_do_destruct,
363 			     nvme_bdev_ctrlr);
364 }
365 
366 static void
367 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
368 {
369 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
370 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
371 	struct spdk_bdev_io *bdev_io;
372 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
373 
374 	/* A NULL ctx means success. */
375 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
376 		status = SPDK_BDEV_IO_STATUS_FAILED;
377 	}
378 
379 	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
380 		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
381 		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
382 		spdk_bdev_io_complete(bdev_io, status);
383 	}
384 
385 	spdk_for_each_channel_continue(i, 0);
386 }
387 
388 static void
389 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
390 {
391 	/* We are using the for_each_channel cb_arg like a return code here:
392 	 * if it is zero, the reset succeeded; otherwise, it failed. */
393 	void *cb_arg = NULL;
394 	struct nvme_bdev_ctrlr_trid *curr_trid;
395 	bool do_destruct = false;
396 
397 	if (rc) {
398 		cb_arg = (void *)0x1;
399 		SPDK_ERRLOG("Resetting controller failed.\n");
400 	} else {
401 		SPDK_NOTICELOG("Resetting controller was successful.\n");
402 	}
403 
404 	pthread_mutex_lock(&g_bdev_nvme_mutex);
405 	nvme_bdev_ctrlr->resetting = false;
406 	nvme_bdev_ctrlr->failover_in_progress = false;
407 
408 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
409 	assert(curr_trid != NULL);
410 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
411 
412 	curr_trid->is_failed = (cb_arg != NULL);
413 
414 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
415 		/* Destruct ctrlr after clearing pending resets. */
416 		do_destruct = true;
417 	}
418 
419 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
420 	/* Make sure we clear any pending resets before returning. */
421 	spdk_for_each_channel(nvme_bdev_ctrlr,
422 			      _bdev_nvme_complete_pending_resets,
423 			      cb_arg,
424 			      do_destruct ? _bdev_nvme_reset_destruct_ctrlr : NULL);
425 }
426 
427 static void
428 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
429 {
430 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
431 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
432 	enum spdk_bdev_io_status bio_status = SPDK_BDEV_IO_STATUS_SUCCESS;
433 
434 	if (status) {
435 		bio_status = SPDK_BDEV_IO_STATUS_FAILED;
436 	}
437 	if (bio) {
438 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), bio_status);
439 	}
440 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
441 }
442 
443 static void
444 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
445 {
446 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
447 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
448 	int rc;
449 
450 	rc = bdev_nvme_create_qpair(nvme_ch);
451 
452 	spdk_for_each_channel_continue(i, rc);
453 }
454 
455 static void
456 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
457 {
458 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
459 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
460 	int rc;
461 
462 	if (status) {
463 		rc = status;
464 		goto err;
465 	}
466 
467 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
468 	if (rc != 0) {
469 		goto err;
470 	}
471 
472 	/* Recreate all of the I/O queue pairs */
473 	spdk_for_each_channel(nvme_bdev_ctrlr,
474 			      _bdev_nvme_reset_create_qpair,
475 			      bio,
476 			      _bdev_nvme_reset_create_qpairs_done);
477 	return;
478 
479 err:
480 	if (bio) {
481 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
482 	}
483 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
484 }
485 
486 static void
487 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
488 {
489 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
490 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
491 	int rc;
492 
493 	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
494 	if (!rc) {
495 		nvme_ch->qpair = NULL;
496 	}
497 
498 	spdk_for_each_channel_continue(i, rc);
499 }
500 
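/*
 * Reset sequence: destroy every I/O qpair (_bdev_nvme_reset_destroy_qpair), reset the
 * controller (_bdev_nvme_reset_ctrlr), recreate the qpairs (_bdev_nvme_reset_create_qpair),
 * and finally complete any queued resets in _bdev_nvme_reset_complete().
 */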
501 static int
502 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, void *ctx)
503 {
504 	pthread_mutex_lock(&g_bdev_nvme_mutex);
505 	if (nvme_bdev_ctrlr->destruct) {
506 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
507 		return -EBUSY;
508 	}
509 
510 	if (nvme_bdev_ctrlr->resetting) {
511 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
512 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
513 		return -EAGAIN;
514 	}
515 
516 	nvme_bdev_ctrlr->resetting = true;
517 
518 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
519 	/* First, delete all NVMe I/O queue pairs. */
520 	spdk_for_each_channel(nvme_bdev_ctrlr,
521 			      _bdev_nvme_reset_destroy_qpair,
522 			      ctx,
523 			      _bdev_nvme_reset_ctrlr);
524 
525 	return 0;
526 }
527 
528 static int
529 bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio)
530 {
531 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
532 	int rc;
533 
534 	rc = _bdev_nvme_reset(nvme_ch->ctrlr, bio);
535 	if (rc == -EBUSY) {
536 		/* Don't bother resetting if the controller is in the process of being destructed. */
537 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
538 		return 0;
539 	} else if (rc == -EAGAIN) {
540 		/*
541 		 * Reset call is queued only if it is from the app framework. This is on purpose so that
542 		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
543 		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
544 		 */
545 		TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link);
546 		return 0;
547 	} else {
548 		return rc;
549 	}
550 }
551 
552 static int
553 bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
554 {
555 	struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
556 	int rc = 0;
557 
558 	pthread_mutex_lock(&g_bdev_nvme_mutex);
559 	if (nvme_bdev_ctrlr->destruct) {
560 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
561 		/* Don't bother resetting if the controller is in the process of being destructed. */
562 		return 0;
563 	}
564 
565 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
566 	assert(curr_trid);
567 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
568 	next_trid = TAILQ_NEXT(curr_trid, link);
569 
570 	if (nvme_bdev_ctrlr->resetting) {
571 		if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) {
572 			rc = -EAGAIN;
573 		}
574 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
575 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
576 		return rc;
577 	}
578 
579 	nvme_bdev_ctrlr->resetting = true;
580 	curr_trid->is_failed = true;
581 
582 	if (next_trid) {
583 		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
584 
585 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
586 			       curr_trid->trid.trsvcid,	next_trid->trid.traddr, next_trid->trid.trsvcid);
587 
588 		nvme_bdev_ctrlr->failover_in_progress = true;
589 		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
590 		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
591 		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
592 		assert(rc == 0);
593 		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link);
594 		if (!remove) {
595 			/** Shuffle the old trid to the end of the list and use the new one.
596 			 * Allows for round robin through multiple connections.
597 			 */
598 			TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link);
599 		} else {
600 			free(curr_trid);
601 		}
602 	}
603 
604 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
605 	/* First, delete all NVMe I/O queue pairs. */
606 	spdk_for_each_channel(nvme_bdev_ctrlr,
607 			      _bdev_nvme_reset_destroy_qpair,
608 			      NULL,
609 			      _bdev_nvme_reset_ctrlr);
610 
611 	return 0;
612 }
613 
614 static int
615 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
616 		struct nvme_bdev_io *bio,
617 		uint64_t offset_blocks,
618 		uint64_t num_blocks);
619 
620 static void
621 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
622 		     bool success)
623 {
624 	struct spdk_bdev *bdev = bdev_io->bdev;
625 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
626 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
627 	struct nvme_bdev_ns *nvme_ns;
628 	struct spdk_nvme_qpair *qpair;
629 	int ret;
630 
631 	if (!success) {
632 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
633 		return;
634 	}
635 
636 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
637 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
638 		return;
639 	}
640 
641 	ret = bdev_nvme_readv(nvme_ns->ns,
642 			      qpair,
643 			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
644 			      bdev_io->u.bdev.iovs,
645 			      bdev_io->u.bdev.iovcnt,
646 			      bdev_io->u.bdev.md_buf,
647 			      bdev_io->u.bdev.num_blocks,
648 			      bdev_io->u.bdev.offset_blocks,
649 			      bdev->dif_check_flags);
650 
651 	if (spdk_likely(ret == 0)) {
652 		return;
653 	} else if (ret == -ENOMEM) {
654 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
655 	} else {
656 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
657 	}
658 }
659 
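/*
 * Returns 0 if the I/O was submitted; -ENOMEM if it should be retried later (the caller
 * completes it with SPDK_BDEV_IO_STATUS_NOMEM); any other negative value fails the I/O.
 */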
660 static int
661 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
662 {
663 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
664 	struct spdk_bdev *bdev = bdev_io->bdev;
665 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
666 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
667 	struct nvme_bdev_io *nbdev_io_to_abort;
668 	struct nvme_bdev_ns *nvme_ns;
669 	struct spdk_nvme_qpair *qpair;
670 
671 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
672 		return -1;
673 	}
674 
675 	switch (bdev_io->type) {
676 	case SPDK_BDEV_IO_TYPE_READ:
677 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
678 			return bdev_nvme_readv(nvme_ns->ns,
679 					       qpair,
680 					       nbdev_io,
681 					       bdev_io->u.bdev.iovs,
682 					       bdev_io->u.bdev.iovcnt,
683 					       bdev_io->u.bdev.md_buf,
684 					       bdev_io->u.bdev.num_blocks,
685 					       bdev_io->u.bdev.offset_blocks,
686 					       bdev->dif_check_flags);
687 		} else {
688 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
689 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
690 			return 0;
691 		}
692 
693 	case SPDK_BDEV_IO_TYPE_WRITE:
694 		return bdev_nvme_writev(nvme_ns->ns,
695 					qpair,
696 					nbdev_io,
697 					bdev_io->u.bdev.iovs,
698 					bdev_io->u.bdev.iovcnt,
699 					bdev_io->u.bdev.md_buf,
700 					bdev_io->u.bdev.num_blocks,
701 					bdev_io->u.bdev.offset_blocks,
702 					bdev->dif_check_flags);
703 
704 	case SPDK_BDEV_IO_TYPE_COMPARE:
705 		return bdev_nvme_comparev(nvme_ns->ns,
706 					  qpair,
707 					  nbdev_io,
708 					  bdev_io->u.bdev.iovs,
709 					  bdev_io->u.bdev.iovcnt,
710 					  bdev_io->u.bdev.md_buf,
711 					  bdev_io->u.bdev.num_blocks,
712 					  bdev_io->u.bdev.offset_blocks,
713 					  bdev->dif_check_flags);
714 
715 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
716 		return bdev_nvme_comparev_and_writev(nvme_ns->ns,
717 						     qpair,
718 						     nbdev_io,
719 						     bdev_io->u.bdev.iovs,
720 						     bdev_io->u.bdev.iovcnt,
721 						     bdev_io->u.bdev.fused_iovs,
722 						     bdev_io->u.bdev.fused_iovcnt,
723 						     bdev_io->u.bdev.md_buf,
724 						     bdev_io->u.bdev.num_blocks,
725 						     bdev_io->u.bdev.offset_blocks,
726 						     bdev->dif_check_flags);
727 
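	/* WRITE_ZEROES is serviced by the deallocate (unmap) path; bdev_nvme_io_type_supported()
	 * only advertises it when deallocated blocks are guaranteed to read back as zeroes.
	 */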
728 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
729 		return bdev_nvme_unmap(nvme_ns->ns,
730 				       qpair,
731 				       nbdev_io,
732 				       bdev_io->u.bdev.offset_blocks,
733 				       bdev_io->u.bdev.num_blocks);
734 
735 	case SPDK_BDEV_IO_TYPE_UNMAP:
736 		return bdev_nvme_unmap(nvme_ns->ns,
737 				       qpair,
738 				       nbdev_io,
739 				       bdev_io->u.bdev.offset_blocks,
740 				       bdev_io->u.bdev.num_blocks);
741 
742 	case SPDK_BDEV_IO_TYPE_RESET:
743 		return bdev_nvme_reset(nvme_ch, nbdev_io);
744 
745 	case SPDK_BDEV_IO_TYPE_FLUSH:
746 		return bdev_nvme_flush(nvme_ns->ns,
747 				       qpair,
748 				       nbdev_io,
749 				       bdev_io->u.bdev.offset_blocks,
750 				       bdev_io->u.bdev.num_blocks);
751 
752 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
753 		return bdev_nvme_admin_passthru(nvme_ch,
754 						nbdev_io,
755 						&bdev_io->u.nvme_passthru.cmd,
756 						bdev_io->u.nvme_passthru.buf,
757 						bdev_io->u.nvme_passthru.nbytes);
758 
759 	case SPDK_BDEV_IO_TYPE_NVME_IO:
760 		return bdev_nvme_io_passthru(nvme_ns->ns,
761 					     qpair,
762 					     nbdev_io,
763 					     &bdev_io->u.nvme_passthru.cmd,
764 					     bdev_io->u.nvme_passthru.buf,
765 					     bdev_io->u.nvme_passthru.nbytes);
766 
767 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
768 		return bdev_nvme_io_passthru_md(nvme_ns->ns,
769 						qpair,
770 						nbdev_io,
771 						&bdev_io->u.nvme_passthru.cmd,
772 						bdev_io->u.nvme_passthru.buf,
773 						bdev_io->u.nvme_passthru.nbytes,
774 						bdev_io->u.nvme_passthru.md_buf,
775 						bdev_io->u.nvme_passthru.md_len);
776 
777 	case SPDK_BDEV_IO_TYPE_ABORT:
778 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
779 		return bdev_nvme_abort(nvme_ch,
780 				       nbdev_io,
781 				       nbdev_io_to_abort);
782 
783 	default:
784 		return -EINVAL;
785 	}
786 	return 0;
787 }
788 
789 static void
790 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
791 {
792 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
793 
794 	if (spdk_unlikely(rc != 0)) {
795 		if (rc == -ENOMEM) {
796 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
797 		} else {
798 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
799 		}
800 	}
801 }
802 
803 static bool
804 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
805 {
806 	struct nvme_bdev *nbdev = ctx;
807 	struct nvme_bdev_ns *nvme_ns;
808 	struct spdk_nvme_ns *ns;
809 	struct spdk_nvme_ctrlr *ctrlr;
810 	const struct spdk_nvme_ctrlr_data *cdata;
811 
812 	nvme_ns = nvme_bdev_to_bdev_ns(nbdev);
813 	assert(nvme_ns != NULL);
814 	ns = nvme_ns->ns;
815 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
816 
817 	switch (io_type) {
818 	case SPDK_BDEV_IO_TYPE_READ:
819 	case SPDK_BDEV_IO_TYPE_WRITE:
820 	case SPDK_BDEV_IO_TYPE_RESET:
821 	case SPDK_BDEV_IO_TYPE_FLUSH:
822 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
823 	case SPDK_BDEV_IO_TYPE_NVME_IO:
824 	case SPDK_BDEV_IO_TYPE_ABORT:
825 		return true;
826 
827 	case SPDK_BDEV_IO_TYPE_COMPARE:
828 		return spdk_nvme_ns_supports_compare(ns);
829 
830 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
831 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
832 
833 	case SPDK_BDEV_IO_TYPE_UNMAP:
834 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
835 		return cdata->oncs.dsm;
836 
837 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
838 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
839 		/*
840 		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
841 		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
842 		 */
843 		if (cdata->oncs.dsm &&
844 		    spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) ==
845 		    SPDK_NVME_DEALLOC_READ_00) {
846 			return true;
847 		}
848 		/*
849 		 * The NVMe controller write_zeroes function is currently not used by our driver.
850 		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
851 		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
852 		 */
853 		return false;
854 
855 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
856 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
857 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
858 			return true;
859 		}
860 		return false;
861 
862 	default:
863 		return false;
864 	}
865 }
866 
867 static int
868 bdev_nvme_create_cb(void *io_device, void *ctx_buf)
869 {
870 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
871 	struct nvme_io_channel *nvme_ch = ctx_buf;
872 	struct spdk_io_channel *pg_ch = NULL;
873 	int rc;
874 
875 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
876 		rc = bdev_ocssd_create_io_channel(nvme_ch);
877 		if (rc != 0) {
878 			return rc;
879 		}
880 	}
881 
882 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
883 	if (!pg_ch) {
884 		rc = -1;
885 		goto err_pg_ch;
886 	}
887 
888 	nvme_ch->group = spdk_io_channel_get_ctx(pg_ch);
889 
890 #ifdef SPDK_CONFIG_VTUNE
891 	nvme_ch->group->collect_spin_stat = true;
892 #else
893 	nvme_ch->group->collect_spin_stat = false;
894 #endif
895 
896 	TAILQ_INIT(&nvme_ch->pending_resets);
897 
898 	nvme_ch->ctrlr = nvme_bdev_ctrlr;
899 
900 	rc = bdev_nvme_create_qpair(nvme_ch);
901 	if (rc != 0) {
902 		goto err_qpair;
903 	}
904 
905 	return 0;
906 
907 err_qpair:
908 	spdk_put_io_channel(pg_ch);
909 err_pg_ch:
910 	if (nvme_ch->ocssd_ch) {
911 		bdev_ocssd_destroy_io_channel(nvme_ch);
912 	}
913 
914 	return rc;
915 }
916 
917 static void
918 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
919 {
920 	struct nvme_io_channel *nvme_ch = ctx_buf;
921 
922 	assert(nvme_ch->group != NULL);
923 
924 	if (nvme_ch->ocssd_ch != NULL) {
925 		bdev_ocssd_destroy_io_channel(nvme_ch);
926 	}
927 
928 	spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
929 
930 	spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group));
931 }
932 
933 static int
934 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
935 {
936 	struct nvme_bdev_poll_group *group = ctx_buf;
937 
938 	group->group = spdk_nvme_poll_group_create(group);
939 	if (group->group == NULL) {
940 		return -1;
941 	}
942 
943 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
944 
945 	if (group->poller == NULL) {
946 		spdk_nvme_poll_group_destroy(group->group);
947 		return -1;
948 	}
949 
950 	return 0;
951 }
952 
953 static void
954 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
955 {
956 	struct nvme_bdev_poll_group *group = ctx_buf;
957 
958 	spdk_poller_unregister(&group->poller);
959 	if (spdk_nvme_poll_group_destroy(group->group)) {
960 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
961 		assert(false);
962 	}
963 }
964 
965 static struct spdk_io_channel *
966 bdev_nvme_get_io_channel(void *ctx)
967 {
968 	struct nvme_bdev *nvme_bdev = ctx;
969 
970 	return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr);
971 }
972 
973 static void *
974 bdev_nvme_get_module_ctx(void *ctx)
975 {
976 	struct nvme_bdev *nvme_bdev = ctx;
977 
978 	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
979 }
980 
981 static int
982 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
983 {
984 	struct nvme_bdev *nvme_bdev = ctx;
985 	struct nvme_bdev_ns *nvme_ns;
986 	struct spdk_nvme_ns *ns;
987 	struct spdk_nvme_ctrlr *ctrlr;
988 	const struct spdk_nvme_ctrlr_data *cdata;
989 	const struct spdk_nvme_transport_id *trid;
990 	union spdk_nvme_vs_register vs;
991 	union spdk_nvme_csts_register csts;
992 	char buf[128];
993 
994 	nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev);
995 	assert(nvme_ns != NULL);
996 	ns = nvme_ns->ns;
997 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
998 
999 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1000 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
1001 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
1002 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1003 
1004 	spdk_json_write_named_object_begin(w, "nvme");
1005 
1006 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1007 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
1008 	}
1009 
1010 	spdk_json_write_named_object_begin(w, "trid");
1011 
1012 	nvme_bdev_dump_trid_json(trid, w);
1013 
1014 	spdk_json_write_object_end(w);
1015 
1016 #ifdef SPDK_CONFIG_NVME_CUSE
1017 	size_t cuse_name_size = 128;
1018 	char cuse_name[cuse_name_size];
1019 
1020 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
1021 					    cuse_name, &cuse_name_size);
1022 	if (rc == 0) {
1023 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
1024 	}
1025 #endif
1026 
1027 	spdk_json_write_named_object_begin(w, "ctrlr_data");
1028 
1029 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
1030 
1031 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
1032 	spdk_str_trim(buf);
1033 	spdk_json_write_named_string(w, "model_number", buf);
1034 
1035 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
1036 	spdk_str_trim(buf);
1037 	spdk_json_write_named_string(w, "serial_number", buf);
1038 
1039 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
1040 	spdk_str_trim(buf);
1041 	spdk_json_write_named_string(w, "firmware_revision", buf);
1042 
1043 	if (cdata->subnqn[0] != '\0') {
1044 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
1045 	}
1046 
1047 	spdk_json_write_named_object_begin(w, "oacs");
1048 
1049 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
1050 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
1051 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
1052 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
1053 
1054 	spdk_json_write_object_end(w);
1055 
1056 	spdk_json_write_object_end(w);
1057 
1058 	spdk_json_write_named_object_begin(w, "vs");
1059 
1060 	spdk_json_write_name(w, "nvme_version");
1061 	if (vs.bits.ter) {
1062 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
1063 	} else {
1064 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
1065 	}
1066 
1067 	spdk_json_write_object_end(w);
1068 
1069 	spdk_json_write_named_object_begin(w, "csts");
1070 
1071 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
1072 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
1073 
1074 	spdk_json_write_object_end(w);
1075 
1076 	spdk_json_write_named_object_begin(w, "ns_data");
1077 
1078 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
1079 
1080 	spdk_json_write_object_end(w);
1081 
1082 	if (cdata->oacs.security) {
1083 		spdk_json_write_named_object_begin(w, "security");
1084 
1085 		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);
1086 
1087 		spdk_json_write_object_end(w);
1088 	}
1089 
1090 	spdk_json_write_object_end(w);
1091 
1092 	return 0;
1093 }
1094 
1095 static void
1096 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1097 {
1098 	/* No config per bdev needed */
1099 }
1100 
1101 static uint64_t
1102 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
1103 {
1104 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
1105 	struct nvme_bdev_poll_group *group = nvme_ch->group;
1106 	uint64_t spin_time;
1107 
1108 	if (!group || !group->collect_spin_stat) {
1109 		return 0;
1110 	}
1111 
1112 	if (group->end_ticks != 0) {
1113 		group->spin_ticks += (group->end_ticks - group->start_ticks);
1114 		group->end_ticks = 0;
1115 	}
1116 
1117 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1118 	group->start_ticks = 0;
1119 	group->spin_ticks = 0;
1120 
1121 	return spin_time;
1122 }
1123 
1124 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1125 	.destruct		= bdev_nvme_destruct,
1126 	.submit_request		= bdev_nvme_submit_request,
1127 	.io_type_supported	= bdev_nvme_io_type_supported,
1128 	.get_io_channel		= bdev_nvme_get_io_channel,
1129 	.dump_info_json		= bdev_nvme_dump_info_json,
1130 	.write_config_json	= bdev_nvme_write_config_json,
1131 	.get_spin_time		= bdev_nvme_get_spin_time,
1132 	.get_module_ctx		= bdev_nvme_get_module_ctx,
1133 };
1134 
1135 static int
1136 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
1137 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
1138 		 uint32_t prchk_flags, void *ctx)
1139 {
1140 	const struct spdk_uuid		*uuid;
1141 	const struct spdk_nvme_ctrlr_data *cdata;
1142 	const struct spdk_nvme_ns_data	*nsdata;
1143 	int				rc;
1144 
1145 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1146 
1147 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
1148 	if (!disk->name) {
1149 		return -ENOMEM;
1150 	}
1151 	disk->product_name = "NVMe disk";
1152 
1153 	disk->write_cache = 0;
1154 	if (cdata->vwc.present) {
1155 		/* Enable if the Volatile Write Cache exists */
1156 		disk->write_cache = 1;
1157 	}
1158 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1159 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1160 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1161 
1162 	uuid = spdk_nvme_ns_get_uuid(ns);
1163 	if (uuid != NULL) {
1164 		disk->uuid = *uuid;
1165 	}
1166 
1167 	nsdata = spdk_nvme_ns_get_data(ns);
1168 
1169 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
1170 	if (disk->md_len != 0) {
1171 		disk->md_interleave = nsdata->flbas.extended;
1172 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1173 		if (disk->dif_type != SPDK_DIF_DISABLE) {
1174 			disk->dif_is_head_of_md = nsdata->dps.md_start;
1175 			disk->dif_check_flags = prchk_flags;
1176 		}
1177 	}
1178 
1179 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
1180 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
1181 		disk->acwu = 0;
1182 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1183 		disk->acwu = nsdata->nacwu;
1184 	} else {
1185 		disk->acwu = cdata->acwu;
1186 	}
1187 
1188 	disk->ctxt = ctx;
1189 	disk->fn_table = &nvmelib_fn_table;
1190 	disk->module = &nvme_if;
1191 	rc = spdk_bdev_register(disk);
1192 	if (rc) {
1193 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
1194 		free(disk->name);
1195 		return rc;
1196 	}
1197 
1198 	return 0;
1199 }
1200 
1201 static int
1202 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns)
1203 {
1204 	struct nvme_bdev *bdev;
1205 	int rc;
1206 
1207 	bdev = calloc(1, sizeof(*bdev));
1208 	if (!bdev) {
1209 		SPDK_ERRLOG("bdev calloc() failed\n");
1210 		return -ENOMEM;
1211 	}
1212 
1213 	bdev->nvme_ns = nvme_ns;
1214 	bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL;
1215 
1216 	rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr,
1217 			      nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev);
1218 	if (rc != 0) {
1219 		SPDK_ERRLOG("Failed to create NVMe disk\n");
1220 		free(bdev);
1221 		return rc;
1222 	}
1223 
1224 	nvme_ns->ref++;
1225 	nvme_ns->bdev = bdev;
1226 
1227 	return 0;
1228 }
1229 
1230 static void
1231 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1232 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
1233 {
1234 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1235 	struct spdk_nvme_ns	*ns;
1236 	int			rc = 0;
1237 
1238 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1239 	if (!ns) {
1240 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
1241 		rc = -EINVAL;
1242 		goto done;
1243 	}
1244 
1245 	nvme_ns->ns = ns;
1246 	nvme_ns->ref = 1;
1247 
1248 	rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns);
1249 done:
1250 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1251 }
1252 
1253 static bool
1254 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1255 		 struct spdk_nvme_ctrlr_opts *opts)
1256 {
1257 	struct nvme_probe_skip_entry *entry;
1258 
1259 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1260 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1261 			return false;
1262 		}
1263 	}
1264 
1265 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1266 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1267 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1268 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1269 
1270 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1271 
1272 	return true;
1273 }
1274 
1275 static void
1276 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1277 {
1278 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx;
1279 
1280 	if (spdk_nvme_cpl_is_error(cpl)) {
1281 		SPDK_WARNLOG("Abort failed. Resetting controller.\n");
1282 		_bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1283 	}
1284 }
1285 
1286 static void
1287 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1288 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1289 {
1290 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg;
1291 	union spdk_nvme_csts_register csts;
1292 	int rc;
1293 
1294 	assert(nvme_bdev_ctrlr->ctrlr == ctrlr);
1295 
1296 	SPDK_WARNLOG("Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1297 
1298 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
1299 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
1300 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
1301 	 * completion recursively.
1302 	 */
1303 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
1304 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1305 		if (csts.bits.cfs) {
1306 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1307 			_bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1308 			return;
1309 		}
1310 	}
1311 
1312 	switch (g_opts.action_on_timeout) {
1313 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1314 		if (qpair) {
1315 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1316 						       nvme_abort_cpl, nvme_bdev_ctrlr);
1317 			if (rc == 0) {
1318 				return;
1319 			}
1320 
1321 			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
1322 		}
1323 
1324 	/* FALLTHROUGH */
1325 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1326 		_bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1327 		break;
1328 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1329 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1330 		break;
1331 	default:
1332 		SPDK_ERRLOG("Invalid timeout action value found.\n");
1333 		break;
1334 	}
1335 }
1336 
1337 void
1338 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ns *nvme_ns)
1339 {
1340 	nvme_bdev_ns_detach(nvme_ns);
1341 }
1342 
1343 static void
1344 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns)
1345 {
1346 	struct nvme_bdev *bdev;
1347 
1348 	bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1349 	if (bdev != NULL) {
1350 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1351 	}
1352 
1353 	nvme_ns->populated = false;
1354 
1355 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
1356 }
1357 
1358 static void
1359 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns,
1360 			      struct nvme_async_probe_ctx *ctx)
1361 {
1362 	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
1363 }
1364 
1365 static void
1366 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns)
1367 {
1368 	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
1369 }
1370 
1371 void
1372 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1373 				   struct nvme_bdev_ns *nvme_ns, int rc)
1374 {
1375 	if (rc == 0) {
1376 		nvme_ns->populated = true;
1377 		pthread_mutex_lock(&g_bdev_nvme_mutex);
1378 		nvme_ns->ctrlr->ref++;
1379 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1380 	} else {
1381 		memset(nvme_ns, 0, sizeof(*nvme_ns));
1382 	}
1383 
1384 	if (ctx) {
1385 		ctx->populates_in_progress--;
1386 		if (ctx->populates_in_progress == 0) {
1387 			nvme_ctrlr_populate_namespaces_done(ctx);
1388 		}
1389 	}
1390 }
1391 
1392 static void
1393 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1394 			       struct nvme_async_probe_ctx *ctx)
1395 {
1396 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1397 	struct nvme_bdev_ns	*nvme_ns;
1398 	struct spdk_nvme_ns	*ns;
1399 	struct nvme_bdev	*bdev;
1400 	uint32_t		i;
1401 	int			rc;
1402 	uint64_t		num_sectors;
1403 	bool			ns_is_active;
1404 
1405 	if (ctx) {
1406 		/* Initialize this count to 1 to handle the populate functions
1407 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1408 		 */
1409 		ctx->populates_in_progress = 1;
1410 	}
1411 
1412 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1413 		uint32_t	nsid = i + 1;
1414 
1415 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
1416 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1417 
1418 		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) {
1419 			/* NS is still there but attributes may have changed */
1420 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1421 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
1422 			bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1423 			assert(bdev != NULL);
1424 			if (bdev->disk.blockcnt != num_sectors) {
1425 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
1426 					       nsid,
1427 					       bdev->disk.name,
1428 					       bdev->disk.blockcnt,
1429 					       num_sectors);
1430 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1431 				if (rc != 0) {
1432 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1433 						    bdev->disk.name, rc);
1434 				}
1435 			}
1436 		}
1437 
1438 		if (!nvme_ns->populated && ns_is_active) {
1439 			nvme_ns->id = nsid;
1440 			nvme_ns->ctrlr = nvme_bdev_ctrlr;
1441 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1442 				nvme_ns->type = NVME_BDEV_NS_OCSSD;
1443 			} else {
1444 				nvme_ns->type = NVME_BDEV_NS_STANDARD;
1445 			}
1446 
1447 			nvme_ns->bdev = NULL;
1448 
1449 			if (ctx) {
1450 				ctx->populates_in_progress++;
1451 			}
1452 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx);
1453 		}
1454 
1455 		if (nvme_ns->populated && !ns_is_active) {
1456 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1457 		}
1458 	}
1459 
1460 	if (ctx) {
1461 		/* Decrement this count now that the loop is over to account
1462 		 * for the one we started with.  If the count is then 0, we
1463 		 * know any populate_namespace functions completed immediately,
1464 		 * so we'll kick the callback here.
1465 		 */
1466 		ctx->populates_in_progress--;
1467 		if (ctx->populates_in_progress == 0) {
1468 			nvme_ctrlr_populate_namespaces_done(ctx);
1469 		}
1470 	}
1471 
1472 }
1473 
1474 static void
1475 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1476 {
1477 	uint32_t i;
1478 	struct nvme_bdev_ns *nvme_ns;
1479 
1480 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1481 		uint32_t nsid = i + 1;
1482 
1483 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1484 		if (nvme_ns->populated) {
1485 			assert(nvme_ns->id == nsid);
1486 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1487 		}
1488 	}
1489 }
1490 
1491 static void
1492 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1493 {
1494 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1495 	union spdk_nvme_async_event_completion	event;
1496 
1497 	if (spdk_nvme_cpl_is_error(cpl)) {
1498 		SPDK_WARNLOG("AER request execution failed\n");
1499 		return;
1500 	}
1501 
1502 	event.raw = cpl->cdw0;
1503 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1504 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1505 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1506 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1507 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1508 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1509 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1510 	}
1511 }
1512 
1513 static int
1514 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
1515 		       const char *name,
1516 		       const struct spdk_nvme_transport_id *trid,
1517 		       uint32_t prchk_flags)
1518 {
1519 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1520 	struct nvme_bdev_ctrlr_trid *trid_entry;
1521 	uint32_t i;
1522 	int rc;
1523 
1524 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1525 	if (nvme_bdev_ctrlr == NULL) {
1526 		SPDK_ERRLOG("Failed to allocate device struct\n");
1527 		return -ENOMEM;
1528 	}
1529 
1530 	TAILQ_INIT(&nvme_bdev_ctrlr->trids);
1531 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1532 	if (nvme_bdev_ctrlr->num_ns != 0) {
1533 		nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1534 		if (!nvme_bdev_ctrlr->namespaces) {
1535 			SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
1536 			rc = -ENOMEM;
1537 			goto err_alloc_namespaces;
1538 		}
1539 	}
1540 
1541 	trid_entry = calloc(1, sizeof(*trid_entry));
1542 	if (trid_entry == NULL) {
1543 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
1544 		rc = -ENOMEM;
1545 		goto err_alloc_trid;
1546 	}
1547 
1548 	trid_entry->trid = *trid;
1549 
1550 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1551 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1552 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1553 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1554 			rc = -ENOMEM;
1555 			goto err_alloc_namespace;
1556 		}
1557 	}
1558 
1559 	nvme_bdev_ctrlr->thread = spdk_get_thread();
1560 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1561 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1562 	nvme_bdev_ctrlr->ref = 1;
1563 	nvme_bdev_ctrlr->connected_trid = &trid_entry->trid;
1564 	nvme_bdev_ctrlr->name = strdup(name);
1565 	if (nvme_bdev_ctrlr->name == NULL) {
1566 		rc = -ENOMEM;
1567 		goto err_alloc_name;
1568 	}
1569 
1570 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1571 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1572 		if (spdk_unlikely(rc != 0)) {
1573 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1574 			goto err_init_ocssd;
1575 		}
1576 	}
1577 
1578 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1579 
1580 	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
1581 				sizeof(struct nvme_io_channel),
1582 				name);
1583 
1584 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr,
1585 					       g_opts.nvme_adminq_poll_period_us);
1586 
1587 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1588 
1589 	if (g_opts.timeout_us > 0) {
1590 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1591 				timeout_cb, nvme_bdev_ctrlr);
1592 	}
1593 
1594 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1595 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr);
1596 
1597 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1598 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1599 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1600 		if (nvme_bdev_ctrlr->opal_dev == NULL) {
1601 			SPDK_ERRLOG("Failed to initialize Opal\n");
1602 		}
1603 	}
1604 
1605 	TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link);
1606 	return 0;
1607 
1608 err_init_ocssd:
1609 	free(nvme_bdev_ctrlr->name);
1610 err_alloc_name:
1611 err_alloc_namespace:
1612 	for (; i > 0; i--) {
1613 		free(nvme_bdev_ctrlr->namespaces[i - 1]);
1614 	}
1615 	free(trid_entry);
1616 err_alloc_trid:
1617 	free(nvme_bdev_ctrlr->namespaces);
1618 err_alloc_namespaces:
1619 	free(nvme_bdev_ctrlr);
1620 	return rc;
1621 }
1622 
1623 static void
1624 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1625 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1626 {
1627 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1628 	struct nvme_probe_ctx *ctx = cb_ctx;
1629 	char *name = NULL;
1630 	uint32_t prchk_flags = 0;
1631 	size_t i;
1632 
1633 	if (ctx) {
1634 		for (i = 0; i < ctx->count; i++) {
1635 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1636 				prchk_flags = ctx->prchk_flags[i];
1637 				name = strdup(ctx->names[i]);
1638 				break;
1639 			}
1640 		}
1641 	} else {
1642 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1643 	}
1644 	if (!name) {
1645 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1646 		return;
1647 	}
1648 
1649 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
1650 
1651 	nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags);
1652 
1653 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid);
1654 	if (!nvme_bdev_ctrlr) {
1655 		SPDK_ERRLOG("Failed to find new NVMe controller\n");
1656 		free(name);
1657 		return;
1658 	}
1659 
1660 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1661 
1662 	free(name);
1663 }
1664 
1665 static void
1666 _nvme_bdev_ctrlr_destruct(void *ctx)
1667 {
1668 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx;
1669 
1670 	nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr);
1671 	nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1672 }
1673 
1674 static void
1675 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
1676 {
1677 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx;
1678 
1679 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1680 	assert(nvme_bdev_ctrlr->ctrlr == ctrlr);
1681 	/* The controller's destruction was already started */
1682 	if (nvme_bdev_ctrlr->destruct) {
1683 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1684 		return;
1685 	}
1686 	nvme_bdev_ctrlr->destruct = true;
1687 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1688 	_nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1689 }
1690 
1691 static int
1692 bdev_nvme_hotplug_probe(void *arg)
1693 {
1694 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
1695 		g_hotplug_probe_ctx = NULL;
1696 		spdk_poller_unregister(&g_hotplug_probe_poller);
1697 	}
1698 
1699 	return SPDK_POLLER_BUSY;
1700 }
1701 
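/*
 * Periodically probe for newly hot-inserted PCIe controllers.  Controllers previously
 * removed via RPC (g_skipped_nvme_ctrlrs) are filtered out in hotplug_probe_cb().
 */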
1702 static int
1703 bdev_nvme_hotplug(void *arg)
1704 {
1705 	struct spdk_nvme_transport_id trid_pcie;
1706 
1707 	if (g_hotplug_probe_ctx) {
1708 		return SPDK_POLLER_BUSY;
1709 	}
1710 
1711 	memset(&trid_pcie, 0, sizeof(trid_pcie));
1712 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
1713 
1714 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
1715 			      hotplug_probe_cb, attach_cb, NULL);
1716 
1717 	if (g_hotplug_probe_ctx) {
1718 		assert(g_hotplug_probe_poller == NULL);
1719 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
1720 	}
1721 
1722 	return SPDK_POLLER_BUSY;
1723 }
1724 
1725 void
1726 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
1727 {
1728 	*opts = g_opts;
1729 }
1730 
1731 int
1732 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
1733 {
1734 	if (g_bdev_nvme_init_thread != NULL) {
1735 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1736 			return -EPERM;
1737 		}
1738 	}
1739 
1740 	g_opts = *opts;
1741 
1742 	return 0;
1743 }
1744 
1745 struct set_nvme_hotplug_ctx {
1746 	uint64_t period_us;
1747 	bool enabled;
1748 	spdk_msg_fn fn;
1749 	void *fn_ctx;
1750 };
1751 
1752 static void
1753 set_nvme_hotplug_period_cb(void *_ctx)
1754 {
1755 	struct set_nvme_hotplug_ctx *ctx = _ctx;
1756 
1757 	spdk_poller_unregister(&g_hotplug_poller);
1758 	if (ctx->enabled) {
1759 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
1760 	}
1761 
1762 	g_nvme_hotplug_poll_period_us = ctx->period_us;
1763 	g_nvme_hotplug_enabled = ctx->enabled;
1764 	if (ctx->fn) {
1765 		ctx->fn(ctx->fn_ctx);
1766 	}
1767 
1768 	free(ctx);
1769 }
1770 
1771 int
1772 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
1773 {
1774 	struct set_nvme_hotplug_ctx *ctx;
1775 
1776 	if (enabled == true && !spdk_process_is_primary()) {
1777 		return -EPERM;
1778 	}
1779 
1780 	ctx = calloc(1, sizeof(*ctx));
1781 	if (ctx == NULL) {
1782 		return -ENOMEM;
1783 	}
1784 
1785 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
1786 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
1787 	ctx->enabled = enabled;
1788 	ctx->fn = cb;
1789 	ctx->fn_ctx = cb_ctx;
1790 
1791 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
1792 	return 0;
1793 }
1794 
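/*
 * Report the result of namespace population to the original caller and free
 * the async probe context if the probe itself has already completed.
 */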
1795 static void
1796 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1797 {
1798 	if (ctx->cb_fn) {
1799 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1800 	}
1801 
1802 	ctx->namespaces_populated = true;
1803 	if (ctx->probe_done) {
1804 		/* The probe was already completed, so we need to free the context
1805 		 * here.  This can happen for cases like OCSSD, where we need to
1806 		 * send additional commands to the SSD after attach.
1807 		 */
1808 		free(ctx);
1809 	}
1810 }
1811 
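/*
 * Called after the namespaces of a newly attached controller have been
 * populated. Collect the names of the bdevs that were created and report them
 * back to the caller through populate_namespaces_cb().
 */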
1812 static void
1813 nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx)
1814 {
1815 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1816 	struct nvme_bdev_ns	*nvme_ns;
1817 	struct nvme_bdev	*nvme_bdev;
1818 	uint32_t		i, nsid;
1819 	size_t			j;
1820 
1821 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
1822 	assert(nvme_bdev_ctrlr != NULL);
1823 
1824 	/*
1825 	 * Report the new bdevs that were created in this call.
1826 	 * There can be more than one bdev per NVMe controller.
1827 	 */
1828 	j = 0;
1829 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1830 		nsid = i + 1;
1831 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1832 		if (!nvme_ns->populated) {
1833 			continue;
1834 		}
1835 		assert(nvme_ns->id == nsid);
1836 		nvme_bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1837 		if (nvme_bdev == NULL) {
1838 			assert(nvme_ns->type == NVME_BDEV_NS_OCSSD);
1839 			continue;
1840 		}
1841 		if (j < ctx->count) {
1842 			ctx->names[j] = nvme_bdev->disk.name;
1843 			j++;
1844 		} else {
			SPDK_ERRLOG("The names array can hold only %zu entries. Unable to return all names of created bdevs\n",
				    (size_t)ctx->count);
1847 			populate_namespaces_cb(ctx, 0, -ERANGE);
1848 			return;
1849 		}
1850 	}
1851 
1852 	populate_namespaces_cb(ctx, j, 0);
1853 }
1854 
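/* Return true if the two namespaces have different NGUIDs. */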
1855 static bool
1856 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
1857 {
1858 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
1859 
1860 	nsdata1 = spdk_nvme_ns_get_data(ns1);
1861 	nsdata2 = spdk_nvme_ns_get_data(ns2);
1862 
	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) != 0;
1864 }
1865 
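/*
 * Register an additional transport ID (path) with an existing controller for
 * failover. The new path must use the same transport type and subsystem NQN
 * and must expose the same namespaces as the currently connected path. New
 * paths are inserted ahead of any paths that have already been marked failed.
 */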
1866 static int
1867 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_ctrlr *new_ctrlr,
1868 		   struct spdk_nvme_transport_id *trid)
1869 {
1870 	uint32_t			i, nsid;
1871 	struct nvme_bdev_ns		*nvme_ns;
1872 	struct spdk_nvme_ns		*new_ns;
1873 	struct nvme_bdev_ctrlr_trid	*new_trid, *tmp_trid;
1874 	int				rc = 0;
1875 
1876 	assert(nvme_bdev_ctrlr != NULL);
1877 
1878 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1879 		SPDK_ERRLOG("PCIe failover is not supported.\n");
1880 		return -ENOTSUP;
1881 	}
1882 
1883 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1884 
1885 	/* Currently we only support failover to the same transport type. */
1886 	if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) {
1887 		rc = -EINVAL;
1888 		goto exit;
1889 	}
1890 
1891 	/* Currently we only support failover to the same NQN. */
1892 	if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
1893 		rc = -EINVAL;
1894 		goto exit;
1895 	}
1896 
1897 	/* Skip all the other checks if we've already registered this path. */
1898 	TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) {
1899 		if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) {
1900 			rc = -EEXIST;
1901 			goto exit;
1902 		}
1903 	}
1904 
1905 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) {
1906 		rc = -EINVAL;
1907 		goto exit;
1908 	}
1909 
1910 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1911 		nsid = i + 1;
1912 
1913 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
1914 		if (!nvme_ns->populated) {
1915 			continue;
1916 		}
1917 
1918 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
1919 		assert(new_ns != NULL);
1920 
1921 		if (bdev_nvme_compare_ns(nvme_ns->ns, new_ns) != 0) {
1922 			rc = -EINVAL;
1923 			goto exit;
1924 		}
1925 	}
1926 
1927 	new_trid = calloc(1, sizeof(*new_trid));
1928 	if (new_trid == NULL) {
1929 		rc = -ENOMEM;
1930 		goto exit;
1931 	}
1932 	new_trid->trid = *trid;
1933 	new_trid->is_failed = false;
1934 
1935 	TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) {
1936 		if (tmp_trid->is_failed) {
1937 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
1938 			goto exit;
1939 		}
1940 	}
1941 
1942 	TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link);
1943 
1944 exit:
1945 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1946 	return rc;
1947 }
1948 
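/*
 * Attach callback for bdev_nvme_create(). If a controller with the same base
 * name already exists, try to register the new connection as an additional
 * failover path and then detach it; otherwise create a new nvme_bdev_ctrlr
 * and populate its namespaces.
 */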
1949 static void
1950 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1951 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1952 {
1953 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
1954 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1955 	struct nvme_async_probe_ctx *ctx;
1956 	int rc;
1957 
1958 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
1959 	ctx->ctrlr_attached = true;
1960 
1961 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
1962 	if (nvme_bdev_ctrlr) {
1963 		/* This is the case that a secondary path is added to an existing
1964 		 * nvme_bdev_ctrlr for failover. After checking if it can access the same
1965 		 * namespaces as the primary path, it is disconnected until failover occurs.
1966 		 */
1967 		rc = bdev_nvme_add_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid);
1968 
1969 		spdk_nvme_detach(ctrlr);
1970 		goto exit;
1971 	}
1972 
1973 	rc = nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags);
1974 	if (rc) {
1975 		SPDK_ERRLOG("Failed to create new device\n");
1976 		goto exit;
1977 	}
1978 
1979 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
1980 	assert(nvme_bdev_ctrlr != NULL);
1981 
1982 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
1983 	return;
1984 
1985 exit:
1986 	populate_namespaces_cb(ctx, 0, rc);
1987 }
1988 
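/*
 * Poller that drives the asynchronous connect started by bdev_nvme_create()
 * and handles completion and cleanup of the probe context once the probe is
 * done.
 */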
1989 static int
1990 bdev_nvme_async_poll(void *arg)
1991 {
1992 	struct nvme_async_probe_ctx	*ctx = arg;
1993 	int				rc;
1994 
1995 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
1996 	if (spdk_unlikely(rc != -EAGAIN)) {
1997 		ctx->probe_done = true;
1998 		spdk_poller_unregister(&ctx->poller);
1999 		if (!ctx->ctrlr_attached) {
2000 			/* The probe is done, but no controller was attached.
2001 			 * That means we had a failure, so report -EIO back to
2002 			 * the caller (usually the RPC). populate_namespaces_cb()
2003 			 * will take care of freeing the nvme_async_probe_ctx.
2004 			 */
2005 			populate_namespaces_cb(ctx, 0, -EIO);
2006 		} else if (ctx->namespaces_populated) {
2007 			/* The namespaces for the attached controller were all
2008 			 * populated and the response was already sent to the
2009 			 * caller (usually the RPC).  So free the context here.
2010 			 */
2011 			free(ctx);
2012 		}
2013 	}
2014 
2015 	return SPDK_POLLER_BUSY;
2016 }
2017 
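/*
 * Start an asynchronous connect to the controller described by trid. Bdev
 * creation completes later in connect_attach_cb(); cb_fn is then invoked with
 * the names of the bdevs that were created.
 */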
2018 int
2019 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
2020 		 struct spdk_nvme_host_id *hostid,
2021 		 const char *base_name,
2022 		 const char **names,
2023 		 uint32_t count,
2024 		 const char *hostnqn,
2025 		 uint32_t prchk_flags,
2026 		 spdk_bdev_create_nvme_fn cb_fn,
2027 		 void *cb_ctx,
2028 		 struct spdk_nvme_ctrlr_opts *opts)
2029 {
2030 	struct nvme_probe_skip_entry	*entry, *tmp;
2031 	struct nvme_async_probe_ctx	*ctx;
2032 
2033 	/* TODO expand this check to include both the host and target TRIDs.
2034 	 * Only if both are the same should we fail.
2035 	 */
2036 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
2037 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
2038 		return -EEXIST;
2039 	}
2040 
2041 	ctx = calloc(1, sizeof(*ctx));
2042 	if (!ctx) {
2043 		return -ENOMEM;
2044 	}
2045 	ctx->base_name = base_name;
2046 	ctx->names = names;
2047 	ctx->count = count;
2048 	ctx->cb_fn = cb_fn;
2049 	ctx->cb_ctx = cb_ctx;
2050 	ctx->prchk_flags = prchk_flags;
2051 	ctx->trid = *trid;
2052 
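	/* If this PCIe controller was previously deleted, remove it from the skip
	 * list so that it can be attached again.
	 */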
2053 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2054 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
2055 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2056 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2057 				free(entry);
2058 				break;
2059 			}
2060 		}
2061 	}
2062 
2063 	if (opts) {
2064 		memcpy(&ctx->opts, opts, sizeof(*opts));
2065 	} else {
2066 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
2067 	}
2068 
2069 	ctx->opts.transport_retry_count = g_opts.retry_count;
2070 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
2071 
2072 	if (hostnqn) {
2073 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
2074 	}
2075 
2076 	if (hostid->hostaddr[0] != '\0') {
2077 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
2078 	}
2079 
2080 	if (hostid->hostsvcid[0] != '\0') {
2081 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
2082 	}
2083 
2084 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
2085 	if (ctx->probe_ctx == NULL) {
		SPDK_ERRLOG("No controller was found with the provided trid (traddr: %s)\n", trid->traddr);
2087 		free(ctx);
2088 		return -ENODEV;
2089 	}
2090 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
2091 
2092 	return 0;
2093 }
2094 
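/*
 * Remove a registered path from the named controller. If the path to remove is
 * the one currently in use, either delete the whole controller (when it is the
 * only path) or fail over to an alternative path first.
 */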
2095 int
2096 bdev_nvme_remove_trid(const char *name, struct spdk_nvme_transport_id *trid)
2097 {
2098 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
2099 	struct nvme_bdev_ctrlr_trid	*ctrlr_trid, *tmp_trid;
2100 
2101 	if (name == NULL) {
2102 		return -EINVAL;
2103 	}
2104 
2105 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
2106 	if (nvme_bdev_ctrlr == NULL) {
2107 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2108 		return -ENODEV;
2109 	}
2110 
2111 	/* case 1: we are currently using the path to be removed. */
2112 	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
2113 		ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
2114 		assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid);
2115 		/* case 1A: the current path is the only path. */
2116 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
2117 			return bdev_nvme_delete(name);
2118 		}
2119 
2120 		/* case 1B: there is an alternative path. */
2121 		return bdev_nvme_failover(nvme_bdev_ctrlr, true);
2122 	}
2123 	/* case 2: We are not using the specified path. */
2124 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) {
2125 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
2126 			TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
2127 			free(ctrlr_trid);
2128 			return 0;
2129 		}
2130 	}
2131 
2132 	/* case 2A: The address isn't even in the registered list. */
2133 	return -ENXIO;
2134 }
2135 
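/*
 * Delete the named controller and all of its bdevs. PCIe controllers are
 * recorded in the skip list so that subsequent hotplug probes do not
 * immediately re-attach them.
 */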
2136 int
2137 bdev_nvme_delete(const char *name)
2138 {
2139 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
2140 	struct nvme_probe_skip_entry *entry;
2141 
2142 	if (name == NULL) {
2143 		return -EINVAL;
2144 	}
2145 
2146 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2147 
2148 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
2149 	if (nvme_bdev_ctrlr == NULL) {
2150 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2151 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2152 		return -ENODEV;
2153 	}
2154 
2155 	/* The controller's destruction was already started */
2156 	if (nvme_bdev_ctrlr->destruct) {
2157 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2158 		return 0;
2159 	}
2160 
2161 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2162 		entry = calloc(1, sizeof(*entry));
2163 		if (!entry) {
2164 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
2165 			return -ENOMEM;
2166 		}
2167 		entry->trid = *nvme_bdev_ctrlr->connected_trid;
2168 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
2169 	}
2170 
2171 	nvme_bdev_ctrlr->destruct = true;
2172 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2173 
2174 	_nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
2175 
2176 	return 0;
2177 }
2178 
2179 static int
2180 bdev_nvme_library_init(void)
2181 {
2182 	g_bdev_nvme_init_thread = spdk_get_thread();
2183 
2184 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
2185 				bdev_nvme_poll_group_destroy_cb,
2186 				sizeof(struct nvme_bdev_poll_group),  "bdev_nvme_poll_groups");
2187 
2188 	return 0;
2189 }
2190 
2191 static void
2192 bdev_nvme_library_fini(void)
2193 {
2194 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
2195 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2196 
2197 	spdk_poller_unregister(&g_hotplug_poller);
2198 	free(g_hotplug_probe_ctx);
2199 
2200 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2201 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2202 		free(entry);
2203 	}
2204 
2205 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2206 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
2207 		if (nvme_bdev_ctrlr->destruct) {
2208 			/* This controller's destruction was already started
2209 			 * before the application started shutting down
2210 			 */
2211 			continue;
2212 		}
2213 		nvme_bdev_ctrlr->destruct = true;
2214 
2215 		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct,
2216 				     nvme_bdev_ctrlr);
2217 	}
2218 
2219 	g_bdev_nvme_module_finish = true;
2220 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2221 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2222 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
2223 		spdk_bdev_module_finish_done();
2224 		return;
2225 	}
2226 
2227 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2228 }
2229 
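/*
 * The controller reported a protection information (PI) error. Re-verify the
 * data buffer in software using the DIF/DIX helpers to locate and log the
 * offending block.
 */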
2230 static void
2231 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
2232 {
2233 	struct spdk_bdev *bdev = bdev_io->bdev;
2234 	struct spdk_dif_ctx dif_ctx;
2235 	struct spdk_dif_error err_blk = {};
2236 	int rc;
2237 
2238 	rc = spdk_dif_ctx_init(&dif_ctx,
2239 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2240 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2241 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2242 	if (rc != 0) {
2243 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2244 		return;
2245 	}
2246 
2247 	if (bdev->md_interleave) {
2248 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2249 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2250 	} else {
2251 		struct iovec md_iov = {
2252 			.iov_base	= bdev_io->u.bdev.md_buf,
2253 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2254 		};
2255 
2256 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2257 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2258 	}
2259 
2260 	if (rc != 0) {
2261 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2262 			    err_blk.err_type, err_blk.err_offset);
2263 	} else {
2264 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2265 	}
2266 }
2267 
2268 static void
2269 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2270 {
2271 	struct nvme_bdev_io *bio = ref;
2272 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2273 
2274 	if (spdk_nvme_cpl_is_success(cpl)) {
2275 		/* Run PI verification for read data buffer. */
2276 		bdev_nvme_verify_pi_error(bdev_io);
2277 	}
2278 
2279 	/* Return original completion status */
2280 	spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
2281 					  bio->cpl.status.sc);
2282 }
2283 
2284 static void
2285 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2286 {
2287 	struct nvme_bdev_io *bio = ref;
2288 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2289 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2290 	struct nvme_io_channel *nvme_ch;
2291 	int ret;
2292 
2293 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2294 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2295 			    cpl->status.sct, cpl->status.sc);
2296 
2297 		/* Save completion status to use after verifying PI error. */
2298 		bio->cpl = *cpl;
2299 
2300 		nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2301 
2302 		/* Read without PI checking to verify PI error. */
2303 		ret = bdev_nvme_no_pi_readv(nbdev->nvme_ns->ns,
2304 					    nvme_ch->qpair,
2305 					    bio,
2306 					    bdev_io->u.bdev.iovs,
2307 					    bdev_io->u.bdev.iovcnt,
2308 					    bdev_io->u.bdev.md_buf,
2309 					    bdev_io->u.bdev.num_blocks,
2310 					    bdev_io->u.bdev.offset_blocks);
2311 		if (ret == 0) {
2312 			return;
2313 		}
2314 	}
2315 
2316 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2317 }
2318 
2319 static void
2320 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2321 {
2322 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2323 
2324 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2325 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2326 			    cpl->status.sct, cpl->status.sc);
2327 		/* Run PI verification for write data buffer if PI error is detected. */
2328 		bdev_nvme_verify_pi_error(bdev_io);
2329 	}
2330 
2331 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2332 }
2333 
2334 static void
2335 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2336 {
2337 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2338 
2339 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2340 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2341 			    cpl->status.sct, cpl->status.sc);
2342 		/* Run PI verification for compare data buffer if PI error is detected. */
2343 		bdev_nvme_verify_pi_error(bdev_io);
2344 	}
2345 
2346 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2347 }
2348 
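/*
 * Completion callback shared by both halves of a fused compare-and-write. The
 * compare completion is only recorded; the write completion finishes the bdev
 * I/O, using the saved compare status if the compare failed.
 */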
2349 static void
2350 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2351 {
2352 	struct nvme_bdev_io *bio = ref;
2353 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2354 
2355 	/* Compare operation completion */
2356 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2357 		/* Save compare result for write callback */
2358 		bio->cpl = *cpl;
2359 		return;
2360 	}
2361 
2362 	/* Write operation completion */
2363 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2364 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2365 		 * complete the IO with the compare operation's status.
2366 		 */
2367 		if (!spdk_nvme_cpl_is_error(cpl)) {
2368 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2369 		}
2370 
2371 		spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2372 	} else {
2373 		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2374 	}
2375 }
2376 
2377 static void
2378 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2379 {
2380 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2381 
2382 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2383 }
2384 
2385 static void
2386 bdev_nvme_admin_passthru_completion(void *ctx)
2387 {
2388 	struct nvme_bdev_io *bio = ctx;
2389 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2390 
2391 	spdk_bdev_io_complete_nvme_status(bdev_io,
2392 					  bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2393 }
2394 
2395 static void
2396 bdev_nvme_abort_completion(void *ctx)
2397 {
2398 	struct nvme_bdev_io *bio = ctx;
2399 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2400 
2401 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
2402 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2403 	} else {
2404 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2405 	}
2406 }
2407 
2408 static void
2409 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
2410 {
2411 	struct nvme_bdev_io *bio = ref;
2412 
2413 	bio->cpl = *cpl;
2414 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2415 }
2416 
2417 static void
2418 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2419 {
2420 	struct nvme_bdev_io *bio = ref;
2421 
2422 	bio->cpl = *cpl;
2423 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2424 }
2425 
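/*
 * SGL callbacks used when submitting vectored I/O: reset_sgl positions the
 * iovec cursor at an absolute offset, and next_sge returns the next contiguous
 * segment to transfer.
 */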
2426 static void
2427 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2428 {
2429 	struct nvme_bdev_io *bio = ref;
2430 	struct iovec *iov;
2431 
2432 	bio->iov_offset = sgl_offset;
2433 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2434 		iov = &bio->iovs[bio->iovpos];
2435 		if (bio->iov_offset < iov->iov_len) {
2436 			break;
2437 		}
2438 
2439 		bio->iov_offset -= iov->iov_len;
2440 	}
2441 }
2442 
2443 static int
2444 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2445 {
2446 	struct nvme_bdev_io *bio = ref;
2447 	struct iovec *iov;
2448 
2449 	assert(bio->iovpos < bio->iovcnt);
2450 
2451 	iov = &bio->iovs[bio->iovpos];
2452 
2453 	*address = iov->iov_base;
2454 	*length = iov->iov_len;
2455 
2456 	if (bio->iov_offset) {
2457 		assert(bio->iov_offset <= iov->iov_len);
2458 		*address += bio->iov_offset;
2459 		*length -= bio->iov_offset;
2460 	}
2461 
2462 	bio->iov_offset += *length;
2463 	if (bio->iov_offset == iov->iov_len) {
2464 		bio->iovpos++;
2465 		bio->iov_offset = 0;
2466 	}
2467 
2468 	return 0;
2469 }
2470 
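/*
 * Same as the callbacks above, but operating on the write (second fused)
 * iovec array of a compare-and-write request.
 */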
2471 static void
2472 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2473 {
2474 	struct nvme_bdev_io *bio = ref;
2475 	struct iovec *iov;
2476 
2477 	bio->fused_iov_offset = sgl_offset;
2478 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2479 		iov = &bio->fused_iovs[bio->fused_iovpos];
2480 		if (bio->fused_iov_offset < iov->iov_len) {
2481 			break;
2482 		}
2483 
2484 		bio->fused_iov_offset -= iov->iov_len;
2485 	}
2486 }
2487 
2488 static int
2489 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2490 {
2491 	struct nvme_bdev_io *bio = ref;
2492 	struct iovec *iov;
2493 
2494 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2495 
2496 	iov = &bio->fused_iovs[bio->fused_iovpos];
2497 
2498 	*address = iov->iov_base;
2499 	*length = iov->iov_len;
2500 
2501 	if (bio->fused_iov_offset) {
2502 		assert(bio->fused_iov_offset <= iov->iov_len);
2503 		*address += bio->fused_iov_offset;
2504 		*length -= bio->fused_iov_offset;
2505 	}
2506 
2507 	bio->fused_iov_offset += *length;
2508 	if (bio->fused_iov_offset == iov->iov_len) {
2509 		bio->fused_iovpos++;
2510 		bio->fused_iov_offset = 0;
2511 	}
2512 
2513 	return 0;
2514 }
2515 
2516 static int
2517 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2518 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2519 		      void *md, uint64_t lba_count, uint64_t lba)
2520 {
2521 	int rc;
2522 
2523 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
2524 		      lba_count, lba);
2525 
2526 	bio->iovs = iov;
2527 	bio->iovcnt = iovcnt;
2528 	bio->iovpos = 0;
2529 	bio->iov_offset = 0;
2530 
2531 	rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2532 					    bdev_nvme_no_pi_readv_done, bio, 0,
2533 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2534 					    md, 0, 0);
2535 
2536 	if (rc != 0 && rc != -ENOMEM) {
2537 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2538 	}
2539 	return rc;
2540 }
2541 
2542 static int
2543 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2544 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2545 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2546 {
2547 	int rc;
2548 
2549 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2550 		      lba_count, lba);
2551 
2552 	bio->iovs = iov;
2553 	bio->iovcnt = iovcnt;
2554 	bio->iovpos = 0;
2555 	bio->iov_offset = 0;
2556 
2557 	if (iovcnt == 1) {
2558 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
2559 						   lba_count,
2560 						   bdev_nvme_readv_done, bio,
2561 						   flags,
2562 						   0, 0);
2563 	} else {
2564 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2565 						    bdev_nvme_readv_done, bio, flags,
2566 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2567 						    md, 0, 0);
2568 	}
2569 
2570 	if (rc != 0 && rc != -ENOMEM) {
2571 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2572 	}
2573 	return rc;
2574 }
2575 
2576 static int
2577 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2578 		 struct nvme_bdev_io *bio,
2579 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2580 		 uint32_t flags)
2581 {
2582 	int rc;
2583 
2584 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2585 		      lba_count, lba);
2586 
2587 	bio->iovs = iov;
2588 	bio->iovcnt = iovcnt;
2589 	bio->iovpos = 0;
2590 	bio->iov_offset = 0;
2591 
2592 	if (iovcnt == 1) {
2593 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
2594 						    lba_count,
2595 						    bdev_nvme_writev_done, bio,
2596 						    flags,
2597 						    0, 0);
2598 	} else {
2599 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
2600 						     bdev_nvme_writev_done, bio, flags,
2601 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2602 						     md, 0, 0);
2603 	}
2604 
2605 	if (rc != 0 && rc != -ENOMEM) {
2606 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2607 	}
2608 	return rc;
2609 }
2610 
2611 static int
2612 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2613 		   struct nvme_bdev_io *bio,
2614 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2615 		   uint32_t flags)
2616 {
2617 	int rc;
2618 
2619 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2620 		      lba_count, lba);
2621 
2622 	bio->iovs = iov;
2623 	bio->iovcnt = iovcnt;
2624 	bio->iovpos = 0;
2625 	bio->iov_offset = 0;
2626 
2627 	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
2628 					       bdev_nvme_comparev_done, bio, flags,
2629 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2630 					       md, 0, 0);
2631 
2632 	if (rc != 0 && rc != -ENOMEM) {
2633 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
2634 	}
2635 	return rc;
2636 }
2637 
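/*
 * Submit a fused compare-and-write: the compare is sent with FUSE_FIRST and
 * the write with FUSE_SECOND. first_fused_submitted ensures that an -ENOMEM
 * retry from the bdev layer does not submit the compare twice.
 */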
2638 static int
2639 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2640 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
2641 			      struct iovec *write_iov, int write_iovcnt,
2642 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2643 {
2644 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2645 	int rc;
2646 
2647 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2648 		      lba_count, lba);
2649 
2650 	bio->iovs = cmp_iov;
2651 	bio->iovcnt = cmp_iovcnt;
2652 	bio->iovpos = 0;
2653 	bio->iov_offset = 0;
2654 	bio->fused_iovs = write_iov;
2655 	bio->fused_iovcnt = write_iovcnt;
2656 	bio->fused_iovpos = 0;
2657 	bio->fused_iov_offset = 0;
2658 
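	/* On the first submission attempt, mark the compare half as not yet sent;
	 * retries keep the flag so only the missing half is resubmitted.
	 */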
2659 	if (bdev_io->num_retries == 0) {
2660 		bio->first_fused_submitted = false;
2661 	}
2662 
2663 	if (!bio->first_fused_submitted) {
2664 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2665 		memset(&bio->cpl, 0, sizeof(bio->cpl));
2666 
2667 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
2668 						       bdev_nvme_comparev_and_writev_done, bio, flags,
2669 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
2670 		if (rc == 0) {
2671 			bio->first_fused_submitted = true;
2672 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2673 		} else {
2674 			if (rc != -ENOMEM) {
2675 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
2676 			}
2677 			return rc;
2678 		}
2679 	}
2680 
2681 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
2682 
2683 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
2684 					     bdev_nvme_comparev_and_writev_done, bio, flags,
2685 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
2686 	if (rc != 0 && rc != -ENOMEM) {
2687 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
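		/* The compare half has already been submitted, so do not propagate an
		 * error for the fused pair back to the caller here; the failure is
		 * reported through the completion path instead.
		 */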
2688 		rc = 0;
2689 	}
2690 
2691 	return rc;
2692 }
2693 
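/*
 * Translate an unmap request into one or more Dataset Management deallocate
 * ranges, each covering at most SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS
 * blocks. Requests needing more ranges than fit in a single command are
 * rejected.
 */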
2694 static int
2695 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2696 		struct nvme_bdev_io *bio,
2697 		uint64_t offset_blocks,
2698 		uint64_t num_blocks)
2699 {
2700 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
2701 	struct spdk_nvme_dsm_range *range;
2702 	uint64_t offset, remaining;
2703 	uint64_t num_ranges_u64;
2704 	uint16_t num_ranges;
2705 	int rc;
2706 
2707 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
2708 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2709 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
2710 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
2711 		return -EINVAL;
2712 	}
2713 	num_ranges = (uint16_t)num_ranges_u64;
2714 
2715 	offset = offset_blocks;
2716 	remaining = num_blocks;
2717 	range = &dsm_ranges[0];
2718 
2719 	/* Fill max-size ranges until the remaining blocks fit into one range */
2720 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
2721 		range->attributes.raw = 0;
2722 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2723 		range->starting_lba = offset;
2724 
2725 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2726 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2727 		range++;
2728 	}
2729 
2730 	/* Final range describes the remaining blocks */
2731 	range->attributes.raw = 0;
2732 	range->length = remaining;
2733 	range->starting_lba = offset;
2734 
2735 	rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair,
2736 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
2737 			dsm_ranges, num_ranges,
2738 			bdev_nvme_queued_done, bio);
2739 
2740 	return rc;
2741 }
2742 
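/*
 * Submit an admin passthrough command on the controller's admin queue. The
 * completion is forwarded back to the thread that submitted the I/O.
 */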
2743 static int
2744 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
2745 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2746 {
2747 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ch->ctrlr->ctrlr);
2748 
2749 	if (nbytes > max_xfer_size) {
2750 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2751 		return -EINVAL;
2752 	}
2753 
2754 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2755 
2756 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ch->ctrlr->ctrlr, cmd, buf,
2757 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
2758 }
2759 
2760 static int
2761 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2762 		      struct nvme_bdev_io *bio,
2763 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2764 {
2765 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
2766 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2767 
2768 	if (nbytes > max_xfer_size) {
2769 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2770 		return -EINVAL;
2771 	}
2772 
2773 	/*
2774 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2775 	 * so fill it out automatically.
2776 	 */
2777 	cmd->nsid = spdk_nvme_ns_get_id(ns);
2778 
2779 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
2780 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
2781 }
2782 
2783 static int
2784 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2785 			 struct nvme_bdev_io *bio,
2786 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
2787 {
2788 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
2789 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
2790 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2791 
2792 	if (nbytes > max_xfer_size) {
2793 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2794 		return -EINVAL;
2795 	}
2796 
2797 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid metadata buffer size\n");
2799 		return -EINVAL;
2800 	}
2801 
2802 	/*
2803 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2804 	 * so fill it out automatically.
2805 	 */
2806 	cmd->nsid = spdk_nvme_ns_get_id(ns);
2807 
2808 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
2809 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
2810 }
2811 
2812 static void
2813 bdev_nvme_abort_admin_cmd(void *ctx)
2814 {
2815 	struct nvme_bdev_io *bio = ctx;
2816 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2817 	struct nvme_io_channel *nvme_ch;
2818 	struct nvme_bdev_io *bio_to_abort;
2819 	int rc;
2820 
2821 	nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2822 	bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
2823 
2824 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
2825 					   NULL,
2826 					   bio_to_abort,
2827 					   bdev_nvme_abort_done, bio);
2828 	if (rc == -ENOENT) {
		/* If no matching command was found in the admin qpair, complete the
		 * abort request with failure. Setting bit 0 of CDW0 indicates that the
		 * command was not aborted, per the NVMe Abort command definition.
		 */
2832 		bio->cpl.cdw0 |= 1U;
2833 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
2834 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2835 
2836 		spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2837 	}
2838 }
2839 
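/*
 * Abort a previously submitted I/O. The abort is first attempted against the
 * I/O qpair; if the command is not found there, the request is forwarded to
 * the controller thread to try the admin queue.
 */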
2840 static int
2841 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
2842 		struct nvme_bdev_io *bio_to_abort)
2843 {
2844 	int rc;
2845 
2846 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2847 
2848 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
2849 					   nvme_ch->qpair,
2850 					   bio_to_abort,
2851 					   bdev_nvme_abort_done, bio);
2852 	if (rc == -ENOENT) {
2853 		/* If no command was found in I/O qpair, the target command may be
		/* If no matching command was found in the I/O qpair, the command to
		 * abort may be an admin command. Forward the request to the controller
		 * thread so that admin commands are only aborted from a single thread.
		 */
2858 				     bdev_nvme_abort_admin_cmd, bio);
2859 		rc = 0;
2860 	}
2861 
2862 	return rc;
2863 }
2864 
2865 static void
2866 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
2867 		struct nvme_bdev_ns *nvme_ns)
2868 {
2869 	/* nop */
2870 }
2871 
2872 static void
2873 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns)
2874 {
2875 	g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns);
2876 }
2877 
2878 static void
2879 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
2880 {
2881 	const char	*action;
2882 
2883 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
2884 		action = "reset";
2885 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
2886 		action = "abort";
2887 	} else {
2888 		action = "none";
2889 	}
2890 
2891 	spdk_json_write_object_begin(w);
2892 
2893 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
2894 
2895 	spdk_json_write_named_object_begin(w, "params");
2896 	spdk_json_write_named_string(w, "action_on_timeout", action);
2897 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
2898 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
2899 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
2900 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
2901 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
2902 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
2903 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
2904 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
2905 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
2906 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
2907 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
2908 	spdk_json_write_object_end(w);
2909 
2910 	spdk_json_write_object_end(w);
2911 }
2912 
2913 static void
2914 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w,
2915 			    struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
2916 {
2917 	struct spdk_nvme_transport_id	*trid;
2918 
2919 	trid = nvme_bdev_ctrlr->connected_trid;
2920 
2921 	spdk_json_write_object_begin(w);
2922 
2923 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
2924 
2925 	spdk_json_write_named_object_begin(w, "params");
2926 	spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
2927 	nvme_bdev_dump_trid_json(trid, w);
2928 	spdk_json_write_named_bool(w, "prchk_reftag",
2929 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
2930 	spdk_json_write_named_bool(w, "prchk_guard",
2931 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
2932 
2933 	spdk_json_write_object_end(w);
2934 
2935 	spdk_json_write_object_end(w);
2936 }
2937 
2938 static void
2939 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
2940 {
2941 	spdk_json_write_object_begin(w);
2942 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
2943 
2944 	spdk_json_write_named_object_begin(w, "params");
2945 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
2946 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
2947 	spdk_json_write_object_end(w);
2948 
2949 	spdk_json_write_object_end(w);
2950 }
2951 
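/*
 * Write the JSON configuration needed to recreate the current state: global
 * options, one attach RPC per controller, per-namespace configuration and,
 * last, the hotplug settings.
 */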
2952 static int
2953 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
2954 {
2955 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
2956 	uint32_t		nsid;
2957 
2958 	bdev_nvme_opts_config_json(w);
2959 
2960 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2961 
2962 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2963 		nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr);
2964 
2965 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
2966 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
2967 				continue;
2968 			}
2969 
2970 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
2971 		}
2972 	}
2973 
	/* Dump this last so that all NVMe bdevs get a chance to be constructed
	 * before the hotplug poller is enabled.
	 */
2977 	bdev_nvme_hotplug_config_json(w);
2978 
2979 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2980 	return 0;
2981 }
2982 
2983 struct spdk_nvme_ctrlr *
2984 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
2985 {
2986 	if (!bdev || bdev->module != &nvme_if) {
2987 		return NULL;
2988 	}
2989 
2990 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
2991 }
2992 
2993 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
2994