xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision ceea3088870a3919d6bdfe61d7adba11b9733fb7)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/config.h"
40 #include "spdk/endian.h"
41 #include "spdk/bdev.h"
42 #include "spdk/json.h"
43 #include "spdk/nvme.h"
44 #include "spdk/nvme_ocssd.h"
45 #include "spdk/thread.h"
46 #include "spdk/string.h"
47 #include "spdk/likely.h"
48 #include "spdk/util.h"
49 
50 #include "spdk/bdev_module.h"
51 #include "spdk/log.h"
52 
53 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
54 
55 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
56 
57 struct nvme_bdev_io {
58 	/** array of iovecs to transfer. */
59 	struct iovec *iovs;
60 
61 	/** Number of iovecs in iovs array. */
62 	int iovcnt;
63 
64 	/** Current iovec position. */
65 	int iovpos;
66 
67 	/** Offset in current iovec. */
68 	uint32_t iov_offset;
69 
70 	/** array of iovecs to transfer. */
71 	struct iovec *fused_iovs;
72 
73 	/** Number of iovecs in iovs array. */
74 	int fused_iovcnt;
75 
76 	/** Current iovec position. */
77 	int fused_iovpos;
78 
79 	/** Offset in current iovec. */
80 	uint32_t fused_iov_offset;
81 
82 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
83 	struct spdk_nvme_cpl cpl;
84 
85 	/** Originating thread */
86 	struct spdk_thread *orig_thread;
87 
88 	/** Keeps track if first of fused commands was submitted */
89 	bool first_fused_submitted;
90 };
91 
92 struct nvme_probe_ctx {
93 	size_t count;
94 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
95 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
96 	const char *names[NVME_MAX_CONTROLLERS];
97 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
98 	const char *hostnqn;
99 };
100 
101 struct nvme_probe_skip_entry {
102 	struct spdk_nvme_transport_id		trid;
103 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
104 };
105 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */
106 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
107 			g_skipped_nvme_ctrlrs);
108 
109 static struct spdk_bdev_nvme_opts g_opts = {
110 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
111 	.timeout_us = 0,
112 	.retry_count = 4,
113 	.arbitration_burst = 0,
114 	.low_priority_weight = 0,
115 	.medium_priority_weight = 0,
116 	.high_priority_weight = 0,
117 	.nvme_adminq_poll_period_us = 10000ULL,
118 	.nvme_ioq_poll_period_us = 0,
119 	.io_queue_requests = 0,
120 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
121 };
122 
123 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
124 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
125 
126 static int g_hot_insert_nvme_controller_index = 0;
127 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
128 static bool g_nvme_hotplug_enabled = false;
129 static struct spdk_thread *g_bdev_nvme_init_thread;
130 static struct spdk_poller *g_hotplug_poller;
131 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
132 
133 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
134 		struct nvme_async_probe_ctx *ctx);
135 static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx);
136 static int bdev_nvme_library_init(void);
137 static void bdev_nvme_library_fini(void);
138 static int bdev_nvme_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
139 			   struct nvme_bdev_io *bio,
140 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
141 			   uint32_t flags);
142 static int bdev_nvme_no_pi_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
143 				 struct nvme_bdev_io *bio,
144 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
145 static int bdev_nvme_writev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
146 			    struct nvme_bdev_io *bio,
147 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
148 			    uint32_t flags);
149 static int bdev_nvme_comparev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
150 			      struct nvme_bdev_io *bio,
151 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
152 			      uint32_t flags);
153 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_ns *nvme_ns,
154 		struct nvme_io_channel *nvme_ch,
155 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
156 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
157 		uint32_t flags);
158 static int bdev_nvme_admin_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
159 				    struct nvme_bdev_io *bio,
160 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
161 static int bdev_nvme_io_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
162 				 struct nvme_bdev_io *bio,
163 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
164 static int bdev_nvme_io_passthru_md(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
165 				    struct nvme_bdev_io *bio,
166 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
167 static int bdev_nvme_abort(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
168 			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
169 static int bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio,
170 			   bool failover);
171 
172 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
173 				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
174 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
175 		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
176 
177 static populate_namespace_fn g_populate_namespace_fn[] = {
178 	NULL,
179 	nvme_ctrlr_populate_standard_namespace,
180 	bdev_ocssd_populate_namespace,
181 };
182 
183 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *ns);
184 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns);
185 
186 static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
187 	NULL,
188 	nvme_ctrlr_depopulate_standard_namespace,
189 	bdev_ocssd_depopulate_namespace,
190 };
191 
192 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns);
193 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
194 		struct nvme_bdev_ns *ns);
195 
196 static config_json_namespace_fn g_config_json_namespace_fn[] = {
197 	NULL,
198 	nvme_ctrlr_config_json_standard_namespace,
199 	bdev_ocssd_namespace_config_json,
200 };
201 
202 struct spdk_nvme_qpair *
203 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
204 {
205 	struct nvme_io_channel *nvme_ch;
206 
207 	nvme_ch =  spdk_io_channel_get_ctx(ctrlr_io_ch);
208 
209 	return nvme_ch->qpair;
210 }
211 
212 static int
213 bdev_nvme_get_ctx_size(void)
214 {
215 	return sizeof(struct nvme_bdev_io);
216 }
217 
218 static struct spdk_bdev_module nvme_if = {
219 	.name = "nvme",
220 	.async_fini = true,
221 	.module_init = bdev_nvme_library_init,
222 	.module_fini = bdev_nvme_library_fini,
223 	.config_json = bdev_nvme_config_json,
224 	.get_ctx_size = bdev_nvme_get_ctx_size,
225 
226 };
227 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
228 
229 static void
230 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
231 {
232 	SPDK_DEBUGLOG(bdev_nvme, "qpar %p is disconnected, attempting reconnect.\n", qpair);
233 	/*
234 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
235 	 * reconnect a qpair and we will stop getting a callback for this one.
236 	 */
237 	spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
238 }
239 
240 static int
241 bdev_nvme_poll(void *arg)
242 {
243 	struct nvme_bdev_poll_group *group = arg;
244 	int64_t num_completions;
245 
246 	if (group->collect_spin_stat && group->start_ticks == 0) {
247 		group->start_ticks = spdk_get_ticks();
248 	}
249 
250 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
251 			  bdev_nvme_disconnected_qpair_cb);
252 	if (group->collect_spin_stat) {
253 		if (num_completions > 0) {
254 			if (group->end_ticks != 0) {
255 				group->spin_ticks += (group->end_ticks - group->start_ticks);
256 				group->end_ticks = 0;
257 			}
258 			group->start_ticks = 0;
259 		} else {
260 			group->end_ticks = spdk_get_ticks();
261 		}
262 	}
263 
264 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
265 }
266 
267 static int
268 bdev_nvme_poll_adminq(void *arg)
269 {
270 	int32_t rc;
271 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
272 
273 	assert(nvme_bdev_ctrlr != NULL);
274 
275 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr);
276 	if (rc < 0) {
277 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, true);
278 	}
279 
280 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
281 }
282 
283 static int
284 bdev_nvme_destruct(void *ctx)
285 {
286 	struct nvme_bdev *nvme_disk = ctx;
287 
288 	nvme_bdev_detach_bdev_from_ns(nvme_disk);
289 
290 	free(nvme_disk->disk.name);
291 	free(nvme_disk);
292 
293 	return 0;
294 }
295 
296 static int
297 bdev_nvme_flush(struct nvme_bdev_ns *nvme_ns, struct nvme_bdev_io *bio,
298 		uint64_t offset, uint64_t nbytes)
299 {
300 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
301 
302 	return 0;
303 }
304 
305 static void
306 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
307 {
308 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
309 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
310 	struct spdk_bdev_io *bdev_io;
311 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
312 
313 	/* A NULL ctx means success. */
314 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
315 		status = SPDK_BDEV_IO_STATUS_FAILED;
316 	}
317 
318 	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
319 		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
320 		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
321 		spdk_bdev_io_complete(bdev_io, status);
322 	}
323 
324 	spdk_for_each_channel_continue(i, 0);
325 }
326 
327 static void
328 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
329 {
330 	/* we are using the for_each_channel cb_arg like a return code here. */
331 	/* If it's zero, we succeeded, otherwise, the reset failed. */
332 	void *cb_arg = NULL;
333 
334 	if (rc) {
335 		cb_arg = (void *)0x1;
336 		SPDK_ERRLOG("Resetting controller failed.\n");
337 	} else {
338 		SPDK_NOTICELOG("Resetting controller successful.\n");
339 	}
340 
341 	pthread_mutex_lock(&g_bdev_nvme_mutex);
342 	nvme_bdev_ctrlr->resetting = false;
343 	nvme_bdev_ctrlr->failover_in_progress = false;
344 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
345 	/* Make sure we clear any pending resets before returning. */
346 	spdk_for_each_channel(nvme_bdev_ctrlr,
347 			      _bdev_nvme_complete_pending_resets,
348 			      cb_arg, NULL);
349 }
350 
351 static void
352 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
353 {
354 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
355 	void *ctx = spdk_io_channel_iter_get_ctx(i);
356 	int rc = SPDK_BDEV_IO_STATUS_SUCCESS;
357 
358 	if (status) {
359 		rc = SPDK_BDEV_IO_STATUS_FAILED;
360 	}
361 	if (ctx) {
362 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(ctx), rc);
363 	}
364 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
365 }
366 
367 static void
368 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
369 {
370 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
371 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
372 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
373 	struct spdk_nvme_io_qpair_opts opts;
374 
375 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
376 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
377 	opts.create_only = true;
378 
379 	nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
380 	if (!nvme_ch->qpair) {
381 		spdk_for_each_channel_continue(i, -1);
382 		return;
383 	}
384 
385 	assert(nvme_ch->group != NULL);
386 	if (spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair) != 0) {
387 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
388 		spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
389 		spdk_for_each_channel_continue(i, -1);
390 		return;
391 	}
392 
393 	if (spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair)) {
394 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
395 		spdk_nvme_poll_group_remove(nvme_ch->group->group, nvme_ch->qpair);
396 		spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
397 		spdk_for_each_channel_continue(i, -1);
398 		return;
399 	}
400 
401 	spdk_for_each_channel_continue(i, 0);
402 }
403 
404 static void
405 _bdev_nvme_reset(struct spdk_io_channel_iter *i, int status)
406 {
407 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
408 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
409 	int rc;
410 
411 	if (status) {
412 		if (bio) {
413 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
414 		}
415 		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
416 		return;
417 	}
418 
419 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
420 	if (rc != 0) {
421 		if (bio) {
422 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
423 		}
424 		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
425 		return;
426 	}
427 
428 	/* Recreate all of the I/O queue pairs */
429 	spdk_for_each_channel(nvme_bdev_ctrlr,
430 			      _bdev_nvme_reset_create_qpair,
431 			      bio,
432 			      _bdev_nvme_reset_create_qpairs_done);
433 }
434 
435 static void
436 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
437 {
438 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
439 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
440 	int rc;
441 
442 	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
443 	if (!rc) {
444 		nvme_ch->qpair = NULL;
445 	}
446 
447 	spdk_for_each_channel_continue(i, rc);
448 }
449 
450 static int
451 bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio, bool failover)
452 {
453 	struct spdk_io_channel *ch;
454 	struct nvme_io_channel *nvme_ch;
455 	struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
456 	int rc = 0;
457 
458 	pthread_mutex_lock(&g_bdev_nvme_mutex);
459 	if (nvme_bdev_ctrlr->destruct) {
460 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
461 		/* Don't bother resetting if the controller is in the process of being destructed. */
462 		if (bio) {
463 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
464 		}
465 		return 0;
466 	}
467 
468 	if (failover) {
469 		curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
470 		assert(curr_trid);
471 		assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
472 		next_trid = TAILQ_NEXT(curr_trid, link);
473 		if (!next_trid) {
474 			failover = false;
475 		}
476 	}
477 
478 	if (nvme_bdev_ctrlr->resetting) {
479 		if (failover && !nvme_bdev_ctrlr->failover_in_progress) {
480 			rc = -EAGAIN;
481 		}
482 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
483 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
484 		/*
485 		 * The internal reset calls won't be queued. This is on purpose so that we don't
486 		 * interfere with the app framework reset strategy. i.e. we are deferring to the
487 		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
488 		 */
489 		if (bio) {
490 			ch = spdk_get_io_channel(nvme_bdev_ctrlr);
491 			assert(ch != NULL);
492 			nvme_ch = spdk_io_channel_get_ctx(ch);
493 			TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, spdk_bdev_io_from_ctx(bio), module_link);
494 			spdk_put_io_channel(ch);
495 		}
496 		return rc;
497 	}
498 
499 	nvme_bdev_ctrlr->resetting = true;
500 	if (failover) {
501 		nvme_bdev_ctrlr->failover_in_progress = true;
502 
503 		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
504 		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
505 		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
506 		assert(rc == 0);
507 		/** Shuffle the old trid to the end of the list and use the new one.
508 		 * Allows for round robin through multiple connections.
509 		 */
510 		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link);
511 		TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link);
512 	}
513 
514 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
515 	/* First, delete all NVMe I/O queue pairs. */
516 	spdk_for_each_channel(nvme_bdev_ctrlr,
517 			      _bdev_nvme_reset_destroy_qpair,
518 			      bio,
519 			      _bdev_nvme_reset);
520 
521 	return 0;
522 }
523 
524 static int
525 bdev_nvme_unmap(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
526 		struct nvme_bdev_io *bio,
527 		uint64_t offset_blocks,
528 		uint64_t num_blocks);
529 
530 static void
531 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
532 		     bool success)
533 {
534 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
535 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
536 	int ret;
537 
538 	if (!success) {
539 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
540 		return;
541 	}
542 
543 	ret = bdev_nvme_readv(nbdev->nvme_ns,
544 			      nvme_ch,
545 			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
546 			      bdev_io->u.bdev.iovs,
547 			      bdev_io->u.bdev.iovcnt,
548 			      bdev_io->u.bdev.md_buf,
549 			      bdev_io->u.bdev.num_blocks,
550 			      bdev_io->u.bdev.offset_blocks,
551 			      nbdev->disk.dif_check_flags);
552 
553 	if (spdk_likely(ret == 0)) {
554 		return;
555 	} else if (ret == -ENOMEM) {
556 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
557 	} else {
558 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
559 	}
560 }
561 
562 static int
563 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
564 {
565 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
566 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
567 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
568 	struct nvme_bdev_io *nbdev_io_to_abort;
569 
570 	if (nvme_ch->qpair == NULL) {
571 		/* The device is currently resetting */
572 		return -1;
573 	}
574 
575 	switch (bdev_io->type) {
576 	case SPDK_BDEV_IO_TYPE_READ:
577 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
578 			bdev_nvme_get_buf_cb(ch, bdev_io, true);
579 		} else {
580 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
581 					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
582 		}
583 		return 0;
584 
585 	case SPDK_BDEV_IO_TYPE_WRITE:
586 		return bdev_nvme_writev(nbdev->nvme_ns,
587 					nvme_ch,
588 					nbdev_io,
589 					bdev_io->u.bdev.iovs,
590 					bdev_io->u.bdev.iovcnt,
591 					bdev_io->u.bdev.md_buf,
592 					bdev_io->u.bdev.num_blocks,
593 					bdev_io->u.bdev.offset_blocks,
594 					nbdev->disk.dif_check_flags);
595 
596 	case SPDK_BDEV_IO_TYPE_COMPARE:
597 		return bdev_nvme_comparev(nbdev->nvme_ns,
598 					  nvme_ch,
599 					  nbdev_io,
600 					  bdev_io->u.bdev.iovs,
601 					  bdev_io->u.bdev.iovcnt,
602 					  bdev_io->u.bdev.md_buf,
603 					  bdev_io->u.bdev.num_blocks,
604 					  bdev_io->u.bdev.offset_blocks,
605 					  nbdev->disk.dif_check_flags);
606 
607 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
608 		return bdev_nvme_comparev_and_writev(nbdev->nvme_ns,
609 						     nvme_ch,
610 						     nbdev_io,
611 						     bdev_io->u.bdev.iovs,
612 						     bdev_io->u.bdev.iovcnt,
613 						     bdev_io->u.bdev.fused_iovs,
614 						     bdev_io->u.bdev.fused_iovcnt,
615 						     bdev_io->u.bdev.md_buf,
616 						     bdev_io->u.bdev.num_blocks,
617 						     bdev_io->u.bdev.offset_blocks,
618 						     nbdev->disk.dif_check_flags);
619 
620 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
621 		return bdev_nvme_unmap(nbdev->nvme_ns,
622 				       nvme_ch,
623 				       nbdev_io,
624 				       bdev_io->u.bdev.offset_blocks,
625 				       bdev_io->u.bdev.num_blocks);
626 
627 	case SPDK_BDEV_IO_TYPE_UNMAP:
628 		return bdev_nvme_unmap(nbdev->nvme_ns,
629 				       nvme_ch,
630 				       nbdev_io,
631 				       bdev_io->u.bdev.offset_blocks,
632 				       bdev_io->u.bdev.num_blocks);
633 
634 	case SPDK_BDEV_IO_TYPE_RESET:
635 		return bdev_nvme_reset(nbdev->nvme_ns->ctrlr, nbdev_io, false);
636 
637 	case SPDK_BDEV_IO_TYPE_FLUSH:
638 		return bdev_nvme_flush(nbdev->nvme_ns,
639 				       nbdev_io,
640 				       bdev_io->u.bdev.offset_blocks,
641 				       bdev_io->u.bdev.num_blocks);
642 
643 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
644 		return bdev_nvme_admin_passthru(nbdev->nvme_ns,
645 						nvme_ch,
646 						nbdev_io,
647 						&bdev_io->u.nvme_passthru.cmd,
648 						bdev_io->u.nvme_passthru.buf,
649 						bdev_io->u.nvme_passthru.nbytes);
650 
651 	case SPDK_BDEV_IO_TYPE_NVME_IO:
652 		return bdev_nvme_io_passthru(nbdev->nvme_ns,
653 					     nvme_ch,
654 					     nbdev_io,
655 					     &bdev_io->u.nvme_passthru.cmd,
656 					     bdev_io->u.nvme_passthru.buf,
657 					     bdev_io->u.nvme_passthru.nbytes);
658 
659 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
660 		return bdev_nvme_io_passthru_md(nbdev->nvme_ns,
661 						nvme_ch,
662 						nbdev_io,
663 						&bdev_io->u.nvme_passthru.cmd,
664 						bdev_io->u.nvme_passthru.buf,
665 						bdev_io->u.nvme_passthru.nbytes,
666 						bdev_io->u.nvme_passthru.md_buf,
667 						bdev_io->u.nvme_passthru.md_len);
668 
669 	case SPDK_BDEV_IO_TYPE_ABORT:
670 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
671 		return bdev_nvme_abort(nbdev->nvme_ns,
672 				       nvme_ch,
673 				       nbdev_io,
674 				       nbdev_io_to_abort);
675 
676 	default:
677 		return -EINVAL;
678 	}
679 	return 0;
680 }
681 
682 static void
683 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
684 {
685 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
686 
687 	if (spdk_unlikely(rc != 0)) {
688 		if (rc == -ENOMEM) {
689 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
690 		} else {
691 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
692 		}
693 	}
694 }
695 
696 static bool
697 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
698 {
699 	struct nvme_bdev *nbdev = ctx;
700 	struct nvme_bdev_ns *nvme_ns = nbdev->nvme_ns;
701 	const struct spdk_nvme_ctrlr_data *cdata;
702 
703 	switch (io_type) {
704 	case SPDK_BDEV_IO_TYPE_READ:
705 	case SPDK_BDEV_IO_TYPE_WRITE:
706 	case SPDK_BDEV_IO_TYPE_RESET:
707 	case SPDK_BDEV_IO_TYPE_FLUSH:
708 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
709 	case SPDK_BDEV_IO_TYPE_NVME_IO:
710 	case SPDK_BDEV_IO_TYPE_ABORT:
711 		return true;
712 
713 	case SPDK_BDEV_IO_TYPE_COMPARE:
714 		return spdk_nvme_ns_supports_compare(nvme_ns->ns);
715 
716 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
717 		return spdk_nvme_ns_get_md_size(nvme_ns->ns) ? true : false;
718 
719 	case SPDK_BDEV_IO_TYPE_UNMAP:
720 		cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
721 		return cdata->oncs.dsm;
722 
723 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
724 		cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
725 		/*
726 		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
727 		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
728 		 */
729 		if (cdata->oncs.dsm &&
730 		    spdk_nvme_ns_get_dealloc_logical_block_read_value(nvme_ns->ns) ==
731 		    SPDK_NVME_DEALLOC_READ_00) {
732 			return true;
733 		}
734 		/*
735 		 * The NVMe controller write_zeroes function is currently not used by our driver.
736 		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
737 		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
738 		 */
739 		return false;
740 
741 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
742 		if (spdk_nvme_ctrlr_get_flags(nvme_ns->ctrlr->ctrlr) &
743 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
744 			return true;
745 		}
746 		return false;
747 
748 	default:
749 		return false;
750 	}
751 }
752 
753 static int
754 bdev_nvme_create_cb(void *io_device, void *ctx_buf)
755 {
756 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
757 	struct nvme_io_channel *ch = ctx_buf;
758 	struct spdk_nvme_io_qpair_opts opts;
759 	struct spdk_io_channel *pg_ch = NULL;
760 	int rc;
761 
762 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
763 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
764 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
765 	opts.create_only = true;
766 	g_opts.io_queue_requests = opts.io_queue_requests;
767 
768 	ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
769 
770 	if (ch->qpair == NULL) {
771 		return -1;
772 	}
773 
774 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
775 		if (bdev_ocssd_create_io_channel(ch)) {
776 			goto err;
777 		}
778 	}
779 
780 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
781 	if (!pg_ch) {
782 		goto err;
783 	}
784 
785 	ch->group = spdk_io_channel_get_ctx(pg_ch);
786 	if (spdk_nvme_poll_group_add(ch->group->group, ch->qpair) != 0) {
787 		goto err;
788 	}
789 
790 	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, ch->qpair);
791 	if (rc) {
792 		spdk_nvme_poll_group_remove(ch->group->group, ch->qpair);
793 		goto err;
794 	}
795 
796 #ifdef SPDK_CONFIG_VTUNE
797 	ch->group->collect_spin_stat = true;
798 #else
799 	ch->group->collect_spin_stat = false;
800 #endif
801 
802 	TAILQ_INIT(&ch->pending_resets);
803 	return 0;
804 
805 err:
806 	if (pg_ch) {
807 		spdk_put_io_channel(pg_ch);
808 	}
809 	spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
810 	return -1;
811 }
812 
813 static void
814 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
815 {
816 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
817 	struct nvme_io_channel *ch = ctx_buf;
818 	struct nvme_bdev_poll_group *group;
819 
820 	group = ch->group;
821 	assert(group != NULL);
822 
823 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
824 		bdev_ocssd_destroy_io_channel(ch);
825 	}
826 
827 	if (ch->qpair != NULL) {
828 		spdk_nvme_poll_group_remove(group->group, ch->qpair);
829 	}
830 	spdk_put_io_channel(spdk_io_channel_from_ctx(group));
831 
832 	spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
833 }
834 
835 static int
836 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
837 {
838 	struct nvme_bdev_poll_group *group = ctx_buf;
839 
840 	group->group = spdk_nvme_poll_group_create(group);
841 	if (group->group == NULL) {
842 		return -1;
843 	}
844 
845 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
846 
847 	if (group->poller == NULL) {
848 		spdk_nvme_poll_group_destroy(group->group);
849 		return -1;
850 	}
851 
852 	return 0;
853 }
854 
855 static void
856 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
857 {
858 	struct nvme_bdev_poll_group *group = ctx_buf;
859 
860 	spdk_poller_unregister(&group->poller);
861 	if (spdk_nvme_poll_group_destroy(group->group)) {
862 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.");
863 		assert(false);
864 	}
865 }
866 
867 static struct spdk_io_channel *
868 bdev_nvme_get_io_channel(void *ctx)
869 {
870 	struct nvme_bdev *nvme_bdev = ctx;
871 
872 	return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr);
873 }
874 
875 static int
876 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
877 {
878 	struct nvme_bdev *nvme_bdev = ctx;
879 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_bdev->nvme_ns->ctrlr;
880 	const struct spdk_nvme_ctrlr_data *cdata;
881 	struct spdk_nvme_ns *ns;
882 	union spdk_nvme_vs_register vs;
883 	union spdk_nvme_csts_register csts;
884 	char buf[128];
885 
886 	cdata = spdk_nvme_ctrlr_get_data(nvme_bdev_ctrlr->ctrlr);
887 	vs = spdk_nvme_ctrlr_get_regs_vs(nvme_bdev_ctrlr->ctrlr);
888 	csts = spdk_nvme_ctrlr_get_regs_csts(nvme_bdev_ctrlr->ctrlr);
889 	ns = nvme_bdev->nvme_ns->ns;
890 
891 	spdk_json_write_named_object_begin(w, "nvme");
892 
893 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
894 		spdk_json_write_named_string(w, "pci_address", nvme_bdev_ctrlr->connected_trid->traddr);
895 	}
896 
897 	spdk_json_write_named_object_begin(w, "trid");
898 
899 	nvme_bdev_dump_trid_json(nvme_bdev_ctrlr->connected_trid, w);
900 
901 	spdk_json_write_object_end(w);
902 
903 #ifdef SPDK_CONFIG_NVME_CUSE
904 	size_t cuse_name_size = 128;
905 	char cuse_name[cuse_name_size];
906 
907 	int rc = spdk_nvme_cuse_get_ns_name(nvme_bdev_ctrlr->ctrlr, spdk_nvme_ns_get_id(ns),
908 					    cuse_name, &cuse_name_size);
909 	if (rc == 0) {
910 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
911 	}
912 #endif
913 
914 	spdk_json_write_named_object_begin(w, "ctrlr_data");
915 
916 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
917 
918 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
919 	spdk_str_trim(buf);
920 	spdk_json_write_named_string(w, "model_number", buf);
921 
922 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
923 	spdk_str_trim(buf);
924 	spdk_json_write_named_string(w, "serial_number", buf);
925 
926 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
927 	spdk_str_trim(buf);
928 	spdk_json_write_named_string(w, "firmware_revision", buf);
929 
930 	spdk_json_write_named_object_begin(w, "oacs");
931 
932 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
933 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
934 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
935 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
936 
937 	spdk_json_write_object_end(w);
938 
939 	spdk_json_write_object_end(w);
940 
941 	spdk_json_write_named_object_begin(w, "vs");
942 
943 	spdk_json_write_name(w, "nvme_version");
944 	if (vs.bits.ter) {
945 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
946 	} else {
947 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
948 	}
949 
950 	spdk_json_write_object_end(w);
951 
952 	spdk_json_write_named_object_begin(w, "csts");
953 
954 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
955 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
956 
957 	spdk_json_write_object_end(w);
958 
959 	spdk_json_write_named_object_begin(w, "ns_data");
960 
961 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
962 
963 	spdk_json_write_object_end(w);
964 
965 	if (cdata->oacs.security) {
966 		spdk_json_write_named_object_begin(w, "security");
967 
968 		spdk_json_write_named_bool(w, "opal", nvme_bdev_ctrlr->opal_dev ? true : false);
969 
970 		spdk_json_write_object_end(w);
971 	}
972 
973 	spdk_json_write_object_end(w);
974 
975 	return 0;
976 }
977 
978 static void
979 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
980 {
981 	/* No config per bdev needed */
982 }
983 
984 static uint64_t
985 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
986 {
987 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
988 	struct nvme_bdev_poll_group *group = nvme_ch->group;
989 	uint64_t spin_time;
990 
991 	if (!group || !group->collect_spin_stat) {
992 		return 0;
993 	}
994 
995 	if (group->end_ticks != 0) {
996 		group->spin_ticks += (group->end_ticks - group->start_ticks);
997 		group->end_ticks = 0;
998 	}
999 
1000 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1001 	group->start_ticks = 0;
1002 	group->spin_ticks = 0;
1003 
1004 	return spin_time;
1005 }
1006 
1007 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1008 	.destruct		= bdev_nvme_destruct,
1009 	.submit_request		= bdev_nvme_submit_request,
1010 	.io_type_supported	= bdev_nvme_io_type_supported,
1011 	.get_io_channel		= bdev_nvme_get_io_channel,
1012 	.dump_info_json		= bdev_nvme_dump_info_json,
1013 	.write_config_json	= bdev_nvme_write_config_json,
1014 	.get_spin_time		= bdev_nvme_get_spin_time,
1015 };
1016 
1017 static void
1018 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1019 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
1020 {
1021 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1022 	struct nvme_bdev	*bdev;
1023 	struct spdk_nvme_ns	*ns;
1024 	const struct spdk_uuid	*uuid;
1025 	const struct spdk_nvme_ctrlr_data *cdata;
1026 	const struct spdk_nvme_ns_data *nsdata;
1027 	int			rc;
1028 
1029 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1030 
1031 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1032 	if (!ns) {
1033 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
1034 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -EINVAL);
1035 		return;
1036 	}
1037 
1038 	bdev = calloc(1, sizeof(*bdev));
1039 	if (!bdev) {
1040 		SPDK_ERRLOG("bdev calloc() failed\n");
1041 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
1042 		return;
1043 	}
1044 
1045 	nvme_ns->ns = ns;
1046 	bdev->nvme_ns = nvme_ns;
1047 
1048 	bdev->disk.name = spdk_sprintf_alloc("%sn%d", nvme_bdev_ctrlr->name, spdk_nvme_ns_get_id(ns));
1049 	if (!bdev->disk.name) {
1050 		free(bdev);
1051 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
1052 		return;
1053 	}
1054 	bdev->disk.product_name = "NVMe disk";
1055 
1056 	bdev->disk.write_cache = 0;
1057 	if (cdata->vwc.present) {
1058 		/* Enable if the Volatile Write Cache exists */
1059 		bdev->disk.write_cache = 1;
1060 	}
1061 	bdev->disk.blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1062 	bdev->disk.blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1063 	bdev->disk.optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1064 
1065 	uuid = spdk_nvme_ns_get_uuid(ns);
1066 	if (uuid != NULL) {
1067 		bdev->disk.uuid = *uuid;
1068 	}
1069 
1070 	nsdata = spdk_nvme_ns_get_data(ns);
1071 
1072 	bdev->disk.md_len = spdk_nvme_ns_get_md_size(ns);
1073 	if (bdev->disk.md_len != 0) {
1074 		bdev->disk.md_interleave = nsdata->flbas.extended;
1075 		bdev->disk.dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1076 		if (bdev->disk.dif_type != SPDK_DIF_DISABLE) {
1077 			bdev->disk.dif_is_head_of_md = nsdata->dps.md_start;
1078 			bdev->disk.dif_check_flags = nvme_bdev_ctrlr->prchk_flags;
1079 		}
1080 	}
1081 
1082 	if (!bdev_nvme_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
1083 		bdev->disk.acwu = 0;
1084 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1085 		bdev->disk.acwu = nsdata->nacwu;
1086 	} else {
1087 		bdev->disk.acwu = cdata->acwu;
1088 	}
1089 
1090 	bdev->disk.ctxt = bdev;
1091 	bdev->disk.fn_table = &nvmelib_fn_table;
1092 	bdev->disk.module = &nvme_if;
1093 	rc = spdk_bdev_register(&bdev->disk);
1094 	if (rc) {
1095 		free(bdev->disk.name);
1096 		free(bdev);
1097 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1098 		return;
1099 	}
1100 
1101 	nvme_bdev_attach_bdev_to_ns(nvme_ns, bdev);
1102 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, 0);
1103 }
1104 
1105 static bool
1106 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1107 		 struct spdk_nvme_ctrlr_opts *opts)
1108 {
1109 	struct nvme_probe_skip_entry *entry;
1110 
1111 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1112 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1113 			return false;
1114 		}
1115 	}
1116 
1117 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1118 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1119 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1120 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1121 
1122 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1123 
1124 	return true;
1125 }
1126 
1127 static void
1128 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1129 {
1130 	struct spdk_nvme_ctrlr *ctrlr = ctx;
1131 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1132 
1133 	if (spdk_nvme_cpl_is_error(cpl)) {
1134 		SPDK_WARNLOG("Abort failed. Resetting controller.\n");
1135 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1136 		assert(nvme_bdev_ctrlr != NULL);
1137 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false);
1138 	}
1139 }
1140 
1141 static void
1142 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1143 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1144 {
1145 	int rc;
1146 	union spdk_nvme_csts_register csts;
1147 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1148 
1149 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1150 
1151 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1152 	if (csts.bits.cfs) {
1153 		SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1154 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1155 		assert(nvme_bdev_ctrlr != NULL);
1156 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false);
1157 		return;
1158 	}
1159 
1160 	switch (g_opts.action_on_timeout) {
1161 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1162 		if (qpair) {
1163 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1164 						       nvme_abort_cpl, ctrlr);
1165 			if (rc == 0) {
1166 				return;
1167 			}
1168 
1169 			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
1170 		}
1171 
1172 	/* FALLTHROUGH */
1173 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1174 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1175 		assert(nvme_bdev_ctrlr != NULL);
1176 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false);
1177 		break;
1178 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1179 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1180 		break;
1181 	default:
1182 		SPDK_ERRLOG("An invalid timeout action value is found.\n");
1183 		break;
1184 	}
1185 }
1186 
1187 void
1188 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1189 {
1190 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1191 	nvme_bdev_ctrlr->ref--;
1192 
1193 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
1194 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1195 		nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1196 		return;
1197 	}
1198 
1199 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1200 }
1201 
1202 static void
1203 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns)
1204 {
1205 	struct nvme_bdev *bdev, *tmp;
1206 
1207 	TAILQ_FOREACH_SAFE(bdev, &ns->bdevs, tailq, tmp) {
1208 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1209 	}
1210 
1211 	ns->populated = false;
1212 
1213 	nvme_ctrlr_depopulate_namespace_done(ns->ctrlr);
1214 }
1215 
1216 static void
1217 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns,
1218 			      struct nvme_async_probe_ctx *ctx)
1219 {
1220 	g_populate_namespace_fn[ns->type](ctrlr, ns, ctx);
1221 }
1222 
1223 static void
1224 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns)
1225 {
1226 	g_depopulate_namespace_fn[ns->type](ns);
1227 }
1228 
1229 void
1230 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1231 				   struct nvme_bdev_ns *ns, int rc)
1232 {
1233 	if (rc == 0) {
1234 		ns->populated = true;
1235 		pthread_mutex_lock(&g_bdev_nvme_mutex);
1236 		ns->ctrlr->ref++;
1237 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1238 	} else {
1239 		memset(ns, 0, sizeof(*ns));
1240 	}
1241 
1242 	if (ctx) {
1243 		ctx->populates_in_progress--;
1244 		if (ctx->populates_in_progress == 0) {
1245 			nvme_ctrlr_populate_namespaces_done(ctx);
1246 		}
1247 	}
1248 }
1249 
1250 static void
1251 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1252 			       struct nvme_async_probe_ctx *ctx)
1253 {
1254 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1255 	struct nvme_bdev_ns	*ns;
1256 	struct spdk_nvme_ns	*nvme_ns;
1257 	struct nvme_bdev	*bdev;
1258 	uint32_t		i;
1259 	int			rc;
1260 	uint64_t		num_sectors;
1261 	bool			ns_is_active;
1262 
1263 	if (ctx) {
1264 		/* Initialize this count to 1 to handle the populate functions
1265 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1266 		 */
1267 		ctx->populates_in_progress = 1;
1268 	}
1269 
1270 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1271 		uint32_t	nsid = i + 1;
1272 
1273 		ns = nvme_bdev_ctrlr->namespaces[i];
1274 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1275 
1276 		if (ns->populated && ns_is_active && ns->type == NVME_BDEV_NS_STANDARD) {
1277 			/* NS is still there but attributes may have changed */
1278 			nvme_ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1279 			num_sectors = spdk_nvme_ns_get_num_sectors(nvme_ns);
1280 			bdev = TAILQ_FIRST(&ns->bdevs);
1281 			if (bdev->disk.blockcnt != num_sectors) {
1282 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %lu, new size %lu\n",
1283 					       nsid,
1284 					       bdev->disk.name,
1285 					       bdev->disk.blockcnt,
1286 					       num_sectors);
1287 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1288 				if (rc != 0) {
1289 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1290 						    bdev->disk.name, rc);
1291 				}
1292 			}
1293 		}
1294 
1295 		if (!ns->populated && ns_is_active) {
1296 			ns->id = nsid;
1297 			ns->ctrlr = nvme_bdev_ctrlr;
1298 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1299 				ns->type = NVME_BDEV_NS_OCSSD;
1300 			} else {
1301 				ns->type = NVME_BDEV_NS_STANDARD;
1302 			}
1303 
1304 			TAILQ_INIT(&ns->bdevs);
1305 
1306 			if (ctx) {
1307 				ctx->populates_in_progress++;
1308 			}
1309 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, ns, ctx);
1310 		}
1311 
1312 		if (ns->populated && !ns_is_active) {
1313 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
1314 		}
1315 	}
1316 
1317 	if (ctx) {
1318 		/* Decrement this count now that the loop is over to account
1319 		 * for the one we started with.  If the count is then 0, we
1320 		 * know any populate_namespace functions completed immediately,
1321 		 * so we'll kick the callback here.
1322 		 */
1323 		ctx->populates_in_progress--;
1324 		if (ctx->populates_in_progress == 0) {
1325 			nvme_ctrlr_populate_namespaces_done(ctx);
1326 		}
1327 	}
1328 
1329 }
1330 
1331 static void
1332 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1333 {
1334 	uint32_t i;
1335 	struct nvme_bdev_ns *ns;
1336 
1337 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1338 		uint32_t nsid = i + 1;
1339 
1340 		ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1341 		if (ns->populated) {
1342 			assert(ns->id == nsid);
1343 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
1344 		}
1345 	}
1346 }
1347 
1348 static void
1349 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1350 {
1351 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1352 	union spdk_nvme_async_event_completion	event;
1353 
1354 	if (spdk_nvme_cpl_is_error(cpl)) {
1355 		SPDK_WARNLOG("AER request execute failed");
1356 		return;
1357 	}
1358 
1359 	event.raw = cpl->cdw0;
1360 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1361 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1362 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1363 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1364 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1365 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1366 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1367 	}
1368 }
1369 
1370 static int
1371 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
1372 		       const char *name,
1373 		       const struct spdk_nvme_transport_id *trid,
1374 		       uint32_t prchk_flags)
1375 {
1376 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1377 	struct nvme_bdev_ctrlr_trid *trid_entry;
1378 	uint32_t i;
1379 	int rc;
1380 
1381 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1382 	if (nvme_bdev_ctrlr == NULL) {
1383 		SPDK_ERRLOG("Failed to allocate device struct\n");
1384 		return -ENOMEM;
1385 	}
1386 
1387 	TAILQ_INIT(&nvme_bdev_ctrlr->trids);
1388 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1389 	nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1390 	if (!nvme_bdev_ctrlr->namespaces) {
1391 		SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
1392 		rc = -ENOMEM;
1393 		goto err_alloc_namespaces;
1394 	}
1395 
1396 	trid_entry = calloc(1, sizeof(*trid_entry));
1397 	if (trid_entry == NULL) {
1398 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
1399 		rc = -ENOMEM;
1400 		goto err_alloc_trid;
1401 	}
1402 
1403 	trid_entry->trid = *trid;
1404 
1405 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1406 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1407 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1408 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1409 			rc = -ENOMEM;
1410 			goto err_alloc_namespace;
1411 		}
1412 	}
1413 
1414 	nvme_bdev_ctrlr->thread = spdk_get_thread();
1415 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1416 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1417 	nvme_bdev_ctrlr->ref = 0;
1418 	nvme_bdev_ctrlr->connected_trid = &trid_entry->trid;
1419 	nvme_bdev_ctrlr->name = strdup(name);
1420 	if (nvme_bdev_ctrlr->name == NULL) {
1421 		rc = -ENOMEM;
1422 		goto err_alloc_name;
1423 	}
1424 
1425 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1426 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1427 		if (spdk_unlikely(rc != 0)) {
1428 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1429 			goto err_init_ocssd;
1430 		}
1431 	}
1432 
1433 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1434 
1435 	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
1436 				sizeof(struct nvme_io_channel),
1437 				name);
1438 
1439 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr,
1440 					       g_opts.nvme_adminq_poll_period_us);
1441 
1442 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1443 
1444 	if (g_opts.timeout_us > 0) {
1445 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1446 				timeout_cb, NULL);
1447 	}
1448 
1449 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1450 
1451 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1452 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1453 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1454 		if (nvme_bdev_ctrlr->opal_dev == NULL) {
1455 			SPDK_ERRLOG("Failed to initialize Opal\n");
1456 		}
1457 	}
1458 
1459 	TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link);
1460 	return 0;
1461 
1462 err_init_ocssd:
1463 	free(nvme_bdev_ctrlr->name);
1464 err_alloc_name:
1465 err_alloc_namespace:
1466 	for (; i > 0; i--) {
1467 		free(nvme_bdev_ctrlr->namespaces[i - 1]);
1468 	}
1469 	free(trid_entry);
1470 err_alloc_trid:
1471 	free(nvme_bdev_ctrlr->namespaces);
1472 err_alloc_namespaces:
1473 	free(nvme_bdev_ctrlr);
1474 	return rc;
1475 }
1476 
1477 static void
1478 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1479 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1480 {
1481 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1482 	struct nvme_probe_ctx *ctx = cb_ctx;
1483 	char *name = NULL;
1484 	uint32_t prchk_flags = 0;
1485 	size_t i;
1486 
1487 	if (ctx) {
1488 		for (i = 0; i < ctx->count; i++) {
1489 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1490 				prchk_flags = ctx->prchk_flags[i];
1491 				name = strdup(ctx->names[i]);
1492 				break;
1493 			}
1494 		}
1495 	} else {
1496 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1497 	}
1498 	if (!name) {
1499 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1500 		return;
1501 	}
1502 
1503 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
1504 
1505 	nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags);
1506 
1507 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid);
1508 	if (!nvme_bdev_ctrlr) {
1509 		SPDK_ERRLOG("Failed to find new NVMe controller\n");
1510 		free(name);
1511 		return;
1512 	}
1513 
1514 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1515 
1516 	free(name);
1517 }
1518 
1519 static void
1520 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
1521 {
1522 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1523 
1524 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1525 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
1526 		if (nvme_bdev_ctrlr->ctrlr == ctrlr) {
1527 			/* The controller's destruction was already started */
1528 			if (nvme_bdev_ctrlr->destruct) {
1529 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1530 				return;
1531 			}
1532 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
1533 
1534 			nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr);
1535 
1536 			pthread_mutex_lock(&g_bdev_nvme_mutex);
1537 			nvme_bdev_ctrlr->destruct = true;
1538 			if (nvme_bdev_ctrlr->ref == 0) {
1539 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1540 				nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1541 			} else {
1542 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1543 			}
1544 			return;
1545 		}
1546 	}
1547 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1548 }
1549 
1550 static int
1551 bdev_nvme_hotplug(void *arg)
1552 {
1553 	struct spdk_nvme_transport_id trid_pcie;
1554 	int done;
1555 
1556 	if (!g_hotplug_probe_ctx) {
1557 		memset(&trid_pcie, 0, sizeof(trid_pcie));
1558 		spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
1559 
1560 		g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
1561 				      hotplug_probe_cb,
1562 				      attach_cb, remove_cb);
1563 		if (!g_hotplug_probe_ctx) {
1564 			return SPDK_POLLER_BUSY;
1565 		}
1566 	}
1567 
1568 	done = spdk_nvme_probe_poll_async(g_hotplug_probe_ctx);
1569 	if (done != -EAGAIN) {
1570 		g_hotplug_probe_ctx = NULL;
1571 	}
1572 
1573 	return SPDK_POLLER_BUSY;
1574 }
1575 
1576 void
1577 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
1578 {
1579 	*opts = g_opts;
1580 }
1581 
1582 int
1583 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
1584 {
1585 	if (g_bdev_nvme_init_thread != NULL) {
1586 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1587 			return -EPERM;
1588 		}
1589 	}
1590 
1591 	g_opts = *opts;
1592 
1593 	return 0;
1594 }
1595 
1596 struct set_nvme_hotplug_ctx {
1597 	uint64_t period_us;
1598 	bool enabled;
1599 	spdk_msg_fn fn;
1600 	void *fn_ctx;
1601 };
1602 
1603 static void
1604 set_nvme_hotplug_period_cb(void *_ctx)
1605 {
1606 	struct set_nvme_hotplug_ctx *ctx = _ctx;
1607 
1608 	spdk_poller_unregister(&g_hotplug_poller);
1609 	if (ctx->enabled) {
1610 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
1611 	}
1612 
1613 	g_nvme_hotplug_poll_period_us = ctx->period_us;
1614 	g_nvme_hotplug_enabled = ctx->enabled;
1615 	if (ctx->fn) {
1616 		ctx->fn(ctx->fn_ctx);
1617 	}
1618 
1619 	free(ctx);
1620 }
1621 
1622 int
1623 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
1624 {
1625 	struct set_nvme_hotplug_ctx *ctx;
1626 
1627 	if (enabled == true && !spdk_process_is_primary()) {
1628 		return -EPERM;
1629 	}
1630 
1631 	ctx = calloc(1, sizeof(*ctx));
1632 	if (ctx == NULL) {
1633 		return -ENOMEM;
1634 	}
1635 
1636 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
1637 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
1638 	ctx->enabled = enabled;
1639 	ctx->fn = cb;
1640 	ctx->fn_ctx = cb_ctx;
1641 
1642 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
1643 	return 0;
1644 }
1645 
1646 static void
1647 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1648 {
1649 	if (ctx->cb_fn) {
1650 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1651 	}
1652 
1653 	free(ctx);
1654 }
1655 
1656 static void
1657 nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx)
1658 {
1659 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1660 	struct nvme_bdev_ns	*ns;
1661 	struct nvme_bdev	*nvme_bdev, *tmp;
1662 	uint32_t		i, nsid;
1663 	size_t			j;
1664 
1665 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
1666 	assert(nvme_bdev_ctrlr != NULL);
1667 
1668 	/*
1669 	 * Report the new bdevs that were created in this call.
1670 	 * There can be more than one bdev per NVMe controller.
1671 	 */
1672 	j = 0;
1673 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1674 		nsid = i + 1;
1675 		ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1676 		if (!ns->populated) {
1677 			continue;
1678 		}
1679 		assert(ns->id == nsid);
1680 		TAILQ_FOREACH_SAFE(nvme_bdev, &ns->bdevs, tailq, tmp) {
1681 			if (j < ctx->count) {
1682 				ctx->names[j] = nvme_bdev->disk.name;
1683 				j++;
1684 			} else {
1685 				SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n",
1686 					    ctx->count);
1687 				populate_namespaces_cb(ctx, 0, -ERANGE);
1688 				return;
1689 			}
1690 		}
1691 	}
1692 
1693 	populate_namespaces_cb(ctx, j, 0);
1694 }
1695 
1696 static void
1697 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1698 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1699 {
1700 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
1701 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1702 	struct nvme_async_probe_ctx *ctx;
1703 	int rc;
1704 
1705 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
1706 
1707 	spdk_poller_unregister(&ctx->poller);
1708 
1709 	rc = nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags);
1710 	if (rc) {
1711 		SPDK_ERRLOG("Failed to create new device\n");
1712 		populate_namespaces_cb(ctx, 0, rc);
1713 		return;
1714 	}
1715 
1716 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
1717 	assert(nvme_bdev_ctrlr != NULL);
1718 
1719 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
1720 }
1721 
1722 static int
1723 bdev_nvme_async_poll(void *arg)
1724 {
1725 	struct nvme_async_probe_ctx	*ctx = arg;
1726 	int				rc;
1727 
1728 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
1729 	if (spdk_unlikely(rc != -EAGAIN && rc != 0)) {
1730 		spdk_poller_unregister(&ctx->poller);
1731 		free(ctx);
1732 	}
1733 
1734 	return SPDK_POLLER_BUSY;
1735 }
1736 
1737 static int
1738 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_transport_id *trid)
1739 {
1740 	struct spdk_nvme_ctrlr		*new_ctrlr;
1741 	struct spdk_nvme_ctrlr_opts	opts;
1742 	uint32_t			i;
1743 	struct spdk_nvme_ns		*ns, *new_ns;
1744 	const struct spdk_nvme_ns_data	*ns_data, *new_ns_data;
1745 	struct nvme_bdev_ctrlr_trid	*new_trid;
1746 	int				rc = 0;
1747 
1748 	assert(nvme_bdev_ctrlr != NULL);
1749 
1750 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1751 		SPDK_ERRLOG("PCIe failover is not supported.\n");
1752 		return -ENOTSUP;
1753 	}
1754 
1755 	/* Currently we only support failover to the same transport type. */
1756 	if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) {
1757 		return -EINVAL;
1758 	}
1759 
1760 	/* Currently we only support failover to the same NQN. */
1761 	if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
1762 		return -EINVAL;
1763 	}
1764 
1765 	/* Skip all the other checks if we've already registered this path. */
1766 	TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) {
1767 		if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) {
1768 			return -EEXIST;
1769 		}
1770 	}
1771 
1772 	spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
1773 	opts.transport_retry_count = g_opts.retry_count;
1774 
1775 	new_ctrlr = spdk_nvme_connect(trid, &opts, sizeof(opts));
1776 
1777 	if (new_ctrlr == NULL) {
1778 		return -ENODEV;
1779 	}
1780 
1781 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) {
1782 		rc = -EINVAL;
1783 		goto out;
1784 	}
1785 
1786 	for (i = 1; i <= nvme_bdev_ctrlr->num_ns; i++) {
1787 		ns = spdk_nvme_ctrlr_get_ns(nvme_bdev_ctrlr->ctrlr, i);
1788 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, i);
1789 		assert(ns != NULL);
1790 		assert(new_ns != NULL);
1791 
1792 		ns_data = spdk_nvme_ns_get_data(ns);
1793 		new_ns_data = spdk_nvme_ns_get_data(new_ns);
1794 		if (memcmp(ns_data->nguid, new_ns_data->nguid, sizeof(ns_data->nguid))) {
1795 			rc = -EINVAL;
1796 			goto out;
1797 		}
1798 	}
1799 
1800 	new_trid = calloc(1, sizeof(*new_trid));
1801 	if (new_trid == NULL) {
1802 		rc = -ENOMEM;
1803 		goto out;
1804 	}
1805 	new_trid->trid = *trid;
1806 	TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link);
1807 
1808 out:
1809 	spdk_nvme_detach(new_ctrlr);
1810 	return rc;
1811 }
1812 
1813 int
1814 bdev_nvme_remove_trid(const char *name, struct spdk_nvme_transport_id *trid)
1815 {
1816 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
1817 	struct nvme_bdev_ctrlr_trid	*ctrlr_trid, *tmp_trid;
1818 
1819 	if (name == NULL) {
1820 		return -EINVAL;
1821 	}
1822 
1823 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
1824 	if (nvme_bdev_ctrlr == NULL) {
1825 		SPDK_ERRLOG("Failed to find NVMe controller\n");
1826 		return -ENODEV;
1827 	}
1828 
1829 	/* case 1: we are currently using the path to be removed. */
1830 	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
1831 		ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
1832 		assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid);
1833 		/* case 1A: the current path is the only path. */
1834 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
1835 			return bdev_nvme_delete(name);
1836 		}
1837 
1838 		/* case 1B: there is an alternative path. */
1839 		if (bdev_nvme_reset(nvme_bdev_ctrlr, NULL, true) == -EAGAIN) {
1840 			return -EAGAIN;
1841 		}
1842 		assert(nvme_bdev_ctrlr->connected_trid != &ctrlr_trid->trid);
1843 		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
1844 		free(ctrlr_trid);
1845 		return 0;
1846 	}
1847 	/* case 2: We are not using the specified path. */
1848 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) {
1849 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
1850 			TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
1851 			free(ctrlr_trid);
1852 			return 0;
1853 		}
1854 	}
1855 
1856 	/* case 2A: The address isn't even in the registered list. */
1857 	return -ENXIO;
1858 }
1859 
1860 int
1861 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
1862 		 struct spdk_nvme_host_id *hostid,
1863 		 const char *base_name,
1864 		 const char **names,
1865 		 uint32_t count,
1866 		 const char *hostnqn,
1867 		 uint32_t prchk_flags,
1868 		 spdk_bdev_create_nvme_fn cb_fn,
1869 		 void *cb_ctx)
1870 {
1871 	struct nvme_probe_skip_entry	*entry, *tmp;
1872 	struct nvme_async_probe_ctx	*ctx;
1873 	struct nvme_bdev_ctrlr		*existing_ctrlr;
1874 	int				rc;
1875 
1876 	/* TODO expand this check to include both the host and target TRIDs.
1877 	 * Only if both are the same should we fail.
1878 	 */
1879 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
1880 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
1881 		return -EEXIST;
1882 	}
1883 
1884 	ctx = calloc(1, sizeof(*ctx));
1885 	if (!ctx) {
1886 		return -ENOMEM;
1887 	}
1888 	ctx->base_name = base_name;
1889 	ctx->names = names;
1890 	ctx->count = count;
1891 	ctx->cb_fn = cb_fn;
1892 	ctx->cb_ctx = cb_ctx;
1893 	ctx->prchk_flags = prchk_flags;
1894 	ctx->trid = *trid;
1895 
1896 	existing_ctrlr = nvme_bdev_ctrlr_get_by_name(base_name);
1897 	if (existing_ctrlr) {
1898 		rc = bdev_nvme_add_trid(existing_ctrlr, trid);
1899 		if (rc) {
1900 			free(ctx);
1901 			return rc;
1902 		}
1903 
1904 		nvme_ctrlr_populate_namespaces_done(ctx);
1905 		return 0;
1906 	}
1907 
1908 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1909 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
1910 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1911 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
1912 				free(entry);
1913 				break;
1914 			}
1915 		}
1916 	}
1917 
1918 	spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
1919 	ctx->opts.transport_retry_count = g_opts.retry_count;
1920 
1921 	if (hostnqn) {
1922 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
1923 	}
1924 
1925 	if (hostid->hostaddr[0] != '\0') {
1926 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
1927 	}
1928 
1929 	if (hostid->hostsvcid[0] != '\0') {
1930 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
1931 	}
1932 
1933 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
1934 	if (ctx->probe_ctx == NULL) {
1935 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
1936 		free(ctx);
1937 		return -ENODEV;
1938 	}
1939 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
1940 
1941 	return 0;
1942 }
1943 
1944 int
1945 bdev_nvme_delete(const char *name)
1946 {
1947 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL;
1948 	struct nvme_probe_skip_entry *entry;
1949 
1950 	if (name == NULL) {
1951 		return -EINVAL;
1952 	}
1953 
1954 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
1955 	if (nvme_bdev_ctrlr == NULL) {
1956 		SPDK_ERRLOG("Failed to find NVMe controller\n");
1957 		return -ENODEV;
1958 	}
1959 
1960 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1961 		entry = calloc(1, sizeof(*entry));
1962 		if (!entry) {
1963 			return -ENOMEM;
1964 		}
1965 		entry->trid = *nvme_bdev_ctrlr->connected_trid;
1966 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
1967 	}
1968 
1969 	remove_cb(NULL, nvme_bdev_ctrlr->ctrlr);
1970 	return 0;
1971 }
1972 
1973 static int
1974 bdev_nvme_library_init(void)
1975 {
1976 	g_bdev_nvme_init_thread = spdk_get_thread();
1977 
1978 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
1979 				bdev_nvme_poll_group_destroy_cb,
1980 				sizeof(struct nvme_bdev_poll_group),  "bdev_nvme_poll_groups");
1981 
1982 	return 0;
1983 }
1984 
1985 static void
1986 bdev_nvme_library_fini(void)
1987 {
1988 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
1989 	struct nvme_probe_skip_entry *entry, *entry_tmp;
1990 
1991 	spdk_poller_unregister(&g_hotplug_poller);
1992 	free(g_hotplug_probe_ctx);
1993 
1994 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
1995 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
1996 		free(entry);
1997 	}
1998 
1999 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2000 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
2001 		if (nvme_bdev_ctrlr->destruct) {
2002 			/* This controller's destruction was already started
2003 			 * before the application started shutting down
2004 			 */
2005 			continue;
2006 		}
2007 
2008 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2009 
2010 		nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr);
2011 
2012 		pthread_mutex_lock(&g_bdev_nvme_mutex);
2013 		nvme_bdev_ctrlr->destruct = true;
2014 
2015 		if (nvme_bdev_ctrlr->ref == 0) {
2016 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
2017 			nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
2018 			pthread_mutex_lock(&g_bdev_nvme_mutex);
2019 		}
2020 	}
2021 
2022 	g_bdev_nvme_module_finish = true;
2023 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2024 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2025 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
2026 		spdk_bdev_module_finish_done();
2027 		return;
2028 	}
2029 
2030 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2031 }
2032 
2033 static void
2034 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
2035 {
2036 	struct spdk_bdev *bdev = bdev_io->bdev;
2037 	struct spdk_dif_ctx dif_ctx;
2038 	struct spdk_dif_error err_blk = {};
2039 	int rc;
2040 
2041 	rc = spdk_dif_ctx_init(&dif_ctx,
2042 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2043 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2044 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2045 	if (rc != 0) {
2046 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2047 		return;
2048 	}
2049 
2050 	if (bdev->md_interleave) {
2051 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2052 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2053 	} else {
2054 		struct iovec md_iov = {
2055 			.iov_base	= bdev_io->u.bdev.md_buf,
2056 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2057 		};
2058 
2059 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2060 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2061 	}
2062 
2063 	if (rc != 0) {
2064 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2065 			    err_blk.err_type, err_blk.err_offset);
2066 	} else {
2067 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2068 	}
2069 }
2070 
2071 static void
2072 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2073 {
2074 	struct nvme_bdev_io *bio = ref;
2075 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2076 
2077 	if (spdk_nvme_cpl_is_success(cpl)) {
2078 		/* Run PI verification for read data buffer. */
2079 		bdev_nvme_verify_pi_error(bdev_io);
2080 	}
2081 
2082 	/* Return original completion status */
2083 	spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
2084 					  bio->cpl.status.sc);
2085 }
2086 
2087 static void
2088 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2089 {
2090 	struct nvme_bdev_io *bio = ref;
2091 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2092 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2093 	struct nvme_io_channel *nvme_ch;
2094 	int ret;
2095 
2096 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2097 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2098 			    cpl->status.sct, cpl->status.sc);
2099 
2100 		/* Save completion status to use after verifying PI error. */
2101 		bio->cpl = *cpl;
2102 
2103 		nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2104 
2105 		/* Read without PI checking to verify PI error. */
2106 		ret = bdev_nvme_no_pi_readv(nbdev->nvme_ns,
2107 					    nvme_ch,
2108 					    bio,
2109 					    bdev_io->u.bdev.iovs,
2110 					    bdev_io->u.bdev.iovcnt,
2111 					    bdev_io->u.bdev.md_buf,
2112 					    bdev_io->u.bdev.num_blocks,
2113 					    bdev_io->u.bdev.offset_blocks);
2114 		if (ret == 0) {
2115 			return;
2116 		}
2117 	}
2118 
2119 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2120 }
2121 
2122 static void
2123 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2124 {
2125 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2126 
2127 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2128 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2129 			    cpl->status.sct, cpl->status.sc);
2130 		/* Run PI verification for write data buffer if PI error is detected. */
2131 		bdev_nvme_verify_pi_error(bdev_io);
2132 	}
2133 
2134 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2135 }
2136 
2137 static void
2138 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2139 {
2140 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2141 
2142 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2143 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2144 			    cpl->status.sct, cpl->status.sc);
2145 		/* Run PI verification for compare data buffer if PI error is detected. */
2146 		bdev_nvme_verify_pi_error(bdev_io);
2147 	}
2148 
2149 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2150 }
2151 
2152 static void
2153 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2154 {
2155 	struct nvme_bdev_io *bio = ref;
2156 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2157 
2158 	/* Compare operation completion */
2159 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2160 		/* Save compare result for write callback */
2161 		bio->cpl = *cpl;
2162 		return;
2163 	}
2164 
2165 	/* Write operation completion */
2166 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2167 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2168 		 * complete the IO with the compare operation's status.
2169 		 */
2170 		if (!spdk_nvme_cpl_is_error(cpl)) {
2171 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2172 		}
2173 
2174 		spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2175 	} else {
2176 		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2177 	}
2178 }
2179 
2180 static void
2181 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2182 {
2183 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2184 
2185 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2186 }
2187 
2188 static void
2189 bdev_nvme_admin_passthru_completion(void *ctx)
2190 {
2191 	struct nvme_bdev_io *bio = ctx;
2192 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2193 
2194 	spdk_bdev_io_complete_nvme_status(bdev_io,
2195 					  bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2196 }
2197 
2198 static void
2199 bdev_nvme_abort_completion(void *ctx)
2200 {
2201 	struct nvme_bdev_io *bio = ctx;
2202 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2203 
2204 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
2205 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2206 	} else {
2207 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2208 	}
2209 }
2210 
2211 static void
2212 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
2213 {
2214 	struct nvme_bdev_io *bio = ref;
2215 
2216 	bio->cpl = *cpl;
2217 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2218 }
2219 
2220 static void
2221 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2222 {
2223 	struct nvme_bdev_io *bio = ref;
2224 
2225 	bio->cpl = *cpl;
2226 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2227 }
2228 
2229 static void
2230 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2231 {
2232 	struct nvme_bdev_io *bio = ref;
2233 	struct iovec *iov;
2234 
2235 	bio->iov_offset = sgl_offset;
2236 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2237 		iov = &bio->iovs[bio->iovpos];
2238 		if (bio->iov_offset < iov->iov_len) {
2239 			break;
2240 		}
2241 
2242 		bio->iov_offset -= iov->iov_len;
2243 	}
2244 }
2245 
2246 static int
2247 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2248 {
2249 	struct nvme_bdev_io *bio = ref;
2250 	struct iovec *iov;
2251 
2252 	assert(bio->iovpos < bio->iovcnt);
2253 
2254 	iov = &bio->iovs[bio->iovpos];
2255 
2256 	*address = iov->iov_base;
2257 	*length = iov->iov_len;
2258 
2259 	if (bio->iov_offset) {
2260 		assert(bio->iov_offset <= iov->iov_len);
2261 		*address += bio->iov_offset;
2262 		*length -= bio->iov_offset;
2263 	}
2264 
2265 	bio->iov_offset += *length;
2266 	if (bio->iov_offset == iov->iov_len) {
2267 		bio->iovpos++;
2268 		bio->iov_offset = 0;
2269 	}
2270 
2271 	return 0;
2272 }
2273 
2274 static void
2275 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2276 {
2277 	struct nvme_bdev_io *bio = ref;
2278 	struct iovec *iov;
2279 
2280 	bio->fused_iov_offset = sgl_offset;
2281 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2282 		iov = &bio->fused_iovs[bio->fused_iovpos];
2283 		if (bio->fused_iov_offset < iov->iov_len) {
2284 			break;
2285 		}
2286 
2287 		bio->fused_iov_offset -= iov->iov_len;
2288 	}
2289 }
2290 
2291 static int
2292 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2293 {
2294 	struct nvme_bdev_io *bio = ref;
2295 	struct iovec *iov;
2296 
2297 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2298 
2299 	iov = &bio->fused_iovs[bio->fused_iovpos];
2300 
2301 	*address = iov->iov_base;
2302 	*length = iov->iov_len;
2303 
2304 	if (bio->fused_iov_offset) {
2305 		assert(bio->fused_iov_offset <= iov->iov_len);
2306 		*address += bio->fused_iov_offset;
2307 		*length -= bio->fused_iov_offset;
2308 	}
2309 
2310 	bio->fused_iov_offset += *length;
2311 	if (bio->fused_iov_offset == iov->iov_len) {
2312 		bio->fused_iovpos++;
2313 		bio->fused_iov_offset = 0;
2314 	}
2315 
2316 	return 0;
2317 }
2318 
2319 static int
2320 bdev_nvme_no_pi_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2321 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2322 		      void *md, uint64_t lba_count, uint64_t lba)
2323 {
2324 	int rc;
2325 
2326 	SPDK_DEBUGLOG(bdev_nvme, "read %lu blocks with offset %#lx without PI check\n",
2327 		      lba_count, lba);
2328 
2329 	bio->iovs = iov;
2330 	bio->iovcnt = iovcnt;
2331 	bio->iovpos = 0;
2332 	bio->iov_offset = 0;
2333 
2334 	rc = spdk_nvme_ns_cmd_readv_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2335 					    bdev_nvme_no_pi_readv_done, bio, 0,
2336 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2337 					    md, 0, 0);
2338 
2339 	if (rc != 0 && rc != -ENOMEM) {
2340 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2341 	}
2342 	return rc;
2343 }
2344 
2345 static int
2346 bdev_nvme_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2347 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2348 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2349 {
2350 	int rc;
2351 
2352 	SPDK_DEBUGLOG(bdev_nvme, "read %lu blocks with offset %#lx\n",
2353 		      lba_count, lba);
2354 
2355 	bio->iovs = iov;
2356 	bio->iovcnt = iovcnt;
2357 	bio->iovpos = 0;
2358 	bio->iov_offset = 0;
2359 
2360 	if (iovcnt == 1) {
2361 		rc = spdk_nvme_ns_cmd_read_with_md(nvme_ns->ns, nvme_ch->qpair, iov[0].iov_base, md, lba,
2362 						   lba_count,
2363 						   bdev_nvme_readv_done, bio,
2364 						   flags,
2365 						   0, 0);
2366 	} else {
2367 		rc = spdk_nvme_ns_cmd_readv_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2368 						    bdev_nvme_readv_done, bio, flags,
2369 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2370 						    md, 0, 0);
2371 	}
2372 
2373 	if (rc != 0 && rc != -ENOMEM) {
2374 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2375 	}
2376 	return rc;
2377 }
2378 
2379 static int
2380 bdev_nvme_writev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2381 		 struct nvme_bdev_io *bio,
2382 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2383 		 uint32_t flags)
2384 {
2385 	int rc;
2386 
2387 	SPDK_DEBUGLOG(bdev_nvme, "write %lu blocks with offset %#lx\n",
2388 		      lba_count, lba);
2389 
2390 	bio->iovs = iov;
2391 	bio->iovcnt = iovcnt;
2392 	bio->iovpos = 0;
2393 	bio->iov_offset = 0;
2394 
2395 	if (iovcnt == 1) {
2396 		rc = spdk_nvme_ns_cmd_write_with_md(nvme_ns->ns, nvme_ch->qpair, iov[0].iov_base, md, lba,
2397 						    lba_count,
2398 						    bdev_nvme_readv_done, bio,
2399 						    flags,
2400 						    0, 0);
2401 	} else {
2402 		rc = spdk_nvme_ns_cmd_writev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2403 						     bdev_nvme_writev_done, bio, flags,
2404 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2405 						     md, 0, 0);
2406 	}
2407 
2408 	if (rc != 0 && rc != -ENOMEM) {
2409 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2410 	}
2411 	return rc;
2412 }
2413 
2414 static int
2415 bdev_nvme_comparev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2416 		   struct nvme_bdev_io *bio,
2417 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2418 		   uint32_t flags)
2419 {
2420 	int rc;
2421 
2422 	SPDK_DEBUGLOG(bdev_nvme, "compare %lu blocks with offset %#lx\n",
2423 		      lba_count, lba);
2424 
2425 	bio->iovs = iov;
2426 	bio->iovcnt = iovcnt;
2427 	bio->iovpos = 0;
2428 	bio->iov_offset = 0;
2429 
2430 	rc = spdk_nvme_ns_cmd_comparev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2431 					       bdev_nvme_comparev_done, bio, flags,
2432 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2433 					       md, 0, 0);
2434 
2435 	if (rc != 0 && rc != -ENOMEM) {
2436 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
2437 	}
2438 	return rc;
2439 }
2440 
2441 static int
2442 bdev_nvme_comparev_and_writev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2443 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
2444 			      struct iovec *write_iov, int write_iovcnt,
2445 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2446 {
2447 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2448 	int rc;
2449 
2450 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %lu blocks with offset %#lx\n",
2451 		      lba_count, lba);
2452 
2453 	bio->iovs = cmp_iov;
2454 	bio->iovcnt = cmp_iovcnt;
2455 	bio->iovpos = 0;
2456 	bio->iov_offset = 0;
2457 	bio->fused_iovs = write_iov;
2458 	bio->fused_iovcnt = write_iovcnt;
2459 	bio->fused_iovpos = 0;
2460 	bio->fused_iov_offset = 0;
2461 
2462 	if (bdev_io->num_retries == 0) {
2463 		bio->first_fused_submitted = false;
2464 	}
2465 
2466 	if (!bio->first_fused_submitted) {
2467 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2468 		memset(&bio->cpl, 0, sizeof(bio->cpl));
2469 
2470 		rc = spdk_nvme_ns_cmd_comparev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2471 						       bdev_nvme_comparev_and_writev_done, bio, flags,
2472 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
2473 		if (rc == 0) {
2474 			bio->first_fused_submitted = true;
2475 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2476 		} else {
2477 			if (rc != -ENOMEM) {
2478 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
2479 			}
2480 			return rc;
2481 		}
2482 	}
2483 
2484 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
2485 
2486 	rc = spdk_nvme_ns_cmd_writev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2487 					     bdev_nvme_comparev_and_writev_done, bio, flags,
2488 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
2489 	if (rc != 0 && rc != -ENOMEM) {
2490 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
2491 		rc = 0;
2492 	}
2493 
2494 	return rc;
2495 }
2496 
2497 static int
2498 bdev_nvme_unmap(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2499 		struct nvme_bdev_io *bio,
2500 		uint64_t offset_blocks,
2501 		uint64_t num_blocks)
2502 {
2503 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
2504 	struct spdk_nvme_dsm_range *range;
2505 	uint64_t offset, remaining;
2506 	uint64_t num_ranges_u64;
2507 	uint16_t num_ranges;
2508 	int rc;
2509 
2510 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
2511 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2512 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
2513 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
2514 		return -EINVAL;
2515 	}
2516 	num_ranges = (uint16_t)num_ranges_u64;
2517 
2518 	offset = offset_blocks;
2519 	remaining = num_blocks;
2520 	range = &dsm_ranges[0];
2521 
2522 	/* Fill max-size ranges until the remaining blocks fit into one range */
2523 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
2524 		range->attributes.raw = 0;
2525 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2526 		range->starting_lba = offset;
2527 
2528 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2529 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2530 		range++;
2531 	}
2532 
2533 	/* Final range describes the remaining blocks */
2534 	range->attributes.raw = 0;
2535 	range->length = remaining;
2536 	range->starting_lba = offset;
2537 
2538 	rc = spdk_nvme_ns_cmd_dataset_management(nvme_ns->ns, nvme_ch->qpair,
2539 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
2540 			dsm_ranges, num_ranges,
2541 			bdev_nvme_queued_done, bio);
2542 
2543 	return rc;
2544 }
2545 
2546 static int
2547 bdev_nvme_admin_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2548 			 struct nvme_bdev_io *bio,
2549 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2550 {
2551 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ns->ctrlr->ctrlr);
2552 
2553 	if (nbytes > max_xfer_size) {
2554 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2555 		return -EINVAL;
2556 	}
2557 
2558 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2559 
2560 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ns->ctrlr->ctrlr, cmd, buf,
2561 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
2562 }
2563 
2564 static int
2565 bdev_nvme_io_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2566 		      struct nvme_bdev_io *bio,
2567 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2568 {
2569 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ns->ctrlr->ctrlr);
2570 
2571 	if (nbytes > max_xfer_size) {
2572 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2573 		return -EINVAL;
2574 	}
2575 
2576 	/*
2577 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2578 	 * so fill it out automatically.
2579 	 */
2580 	cmd->nsid = spdk_nvme_ns_get_id(nvme_ns->ns);
2581 
2582 	return spdk_nvme_ctrlr_cmd_io_raw(nvme_ns->ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
2583 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
2584 }
2585 
2586 static int
2587 bdev_nvme_io_passthru_md(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2588 			 struct nvme_bdev_io *bio,
2589 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
2590 {
2591 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(nvme_ns->ns);
2592 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ns->ctrlr->ctrlr);
2593 
2594 	if (nbytes > max_xfer_size) {
2595 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2596 		return -EINVAL;
2597 	}
2598 
2599 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(nvme_ns->ns)) {
2600 		SPDK_ERRLOG("invalid meta data buffer size\n");
2601 		return -EINVAL;
2602 	}
2603 
2604 	/*
2605 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2606 	 * so fill it out automatically.
2607 	 */
2608 	cmd->nsid = spdk_nvme_ns_get_id(nvme_ns->ns);
2609 
2610 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(nvme_ns->ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
2611 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
2612 }
2613 
2614 static void
2615 bdev_nvme_abort_admin_cmd(void *ctx)
2616 {
2617 	struct nvme_bdev_io *bio = ctx;
2618 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2619 	struct nvme_bdev *nbdev;
2620 	struct nvme_bdev_io *bio_to_abort;
2621 	int rc;
2622 
2623 	nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2624 	bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
2625 
2626 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nbdev->nvme_ns->ctrlr->ctrlr,
2627 					   NULL,
2628 					   bio_to_abort,
2629 					   bdev_nvme_abort_done, bio);
2630 	if (rc == -ENOENT) {
2631 		/* If no admin command was found in admin qpair, complete the abort
2632 		 * request with failure.
2633 		 */
2634 		bio->cpl.cdw0 |= 1U;
2635 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
2636 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2637 
2638 		spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2639 	}
2640 }
2641 
2642 static int
2643 bdev_nvme_abort(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2644 		struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort)
2645 {
2646 	int rc;
2647 
2648 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2649 
2650 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ns->ctrlr->ctrlr,
2651 					   nvme_ch->qpair,
2652 					   bio_to_abort,
2653 					   bdev_nvme_abort_done, bio);
2654 	if (rc == -ENOENT) {
2655 		/* If no command was found in I/O qpair, the target command may be
2656 		 * admin command. Only a single thread tries aborting admin command
2657 		 * to clean I/O flow.
2658 		 */
2659 		spdk_thread_send_msg(nvme_ns->ctrlr->thread,
2660 				     bdev_nvme_abort_admin_cmd, bio);
2661 		rc = 0;
2662 	}
2663 
2664 	return rc;
2665 }
2666 
2667 static void
2668 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
2669 {
2670 	/* nop */
2671 }
2672 
2673 static void
2674 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
2675 {
2676 	g_config_json_namespace_fn[ns->type](w, ns);
2677 }
2678 
2679 static int
2680 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
2681 {
2682 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
2683 	struct spdk_nvme_transport_id	*trid;
2684 	const char			*action;
2685 	uint32_t			nsid;
2686 
2687 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
2688 		action = "reset";
2689 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
2690 		action = "abort";
2691 	} else {
2692 		action = "none";
2693 	}
2694 
2695 	spdk_json_write_object_begin(w);
2696 
2697 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
2698 
2699 	spdk_json_write_named_object_begin(w, "params");
2700 	spdk_json_write_named_string(w, "action_on_timeout", action);
2701 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
2702 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
2703 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
2704 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
2705 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
2706 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
2707 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
2708 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
2709 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
2710 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
2711 	spdk_json_write_object_end(w);
2712 
2713 	spdk_json_write_object_end(w);
2714 
2715 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2716 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2717 		trid = nvme_bdev_ctrlr->connected_trid;
2718 
2719 		spdk_json_write_object_begin(w);
2720 
2721 		spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
2722 
2723 		spdk_json_write_named_object_begin(w, "params");
2724 		spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
2725 		nvme_bdev_dump_trid_json(trid, w);
2726 		spdk_json_write_named_bool(w, "prchk_reftag",
2727 					   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
2728 		spdk_json_write_named_bool(w, "prchk_guard",
2729 					   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
2730 
2731 		spdk_json_write_object_end(w);
2732 
2733 		spdk_json_write_object_end(w);
2734 
2735 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
2736 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
2737 				continue;
2738 			}
2739 
2740 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
2741 		}
2742 	}
2743 
2744 	/* Dump as last parameter to give all NVMe bdevs chance to be constructed
2745 	 * before enabling hotplug poller.
2746 	 */
2747 	spdk_json_write_object_begin(w);
2748 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
2749 
2750 	spdk_json_write_named_object_begin(w, "params");
2751 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
2752 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
2753 	spdk_json_write_object_end(w);
2754 
2755 	spdk_json_write_object_end(w);
2756 
2757 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2758 	return 0;
2759 }
2760 
2761 struct spdk_nvme_ctrlr *
2762 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
2763 {
2764 	if (!bdev || bdev->module != &nvme_if) {
2765 		return NULL;
2766 	}
2767 
2768 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
2769 }
2770 
2771 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
2772