xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 9854c138f7b7c1833766c3ab89bbec9516f99364)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/config.h"
40 #include "spdk/endian.h"
41 #include "spdk/bdev.h"
42 #include "spdk/json.h"
43 #include "spdk/nvme.h"
44 #include "spdk/nvme_ocssd.h"
45 #include "spdk/thread.h"
46 #include "spdk/string.h"
47 #include "spdk/util.h"
48 
49 #include "spdk/bdev_module.h"
50 #include "spdk/log.h"
51 
/* Default for g_opts.delay_cmd_submit, which is copied into every I/O qpair's options. */
#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
/* Default for g_opts.keep_alive_timeout_ms (milliseconds). */
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

/* Forward declaration: referenced by the nvme_if module descriptor as its config_json hook. */
static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
56 
/**
 * Per-I/O driver context of the NVMe bdev module.
 *
 * It lives in spdk_bdev_io::driver_ctx; its size is reported to the bdev layer through
 * bdev_nvme_get_ctx_size().
 */
struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/**
	 * array of iovecs to transfer for the fused command.
	 * NOTE(review): presumably the write half of a compare-and-write - confirm against
	 * bdev_nvme_comparev_and_writev().
	 */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current position in the fused_iovs array. */
	int fused_iovpos;

	/** Offset in current fused iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;
};
91 
/**
 * Probe context describing a set of controllers as parallel arrays, each with room for
 * NVME_MAX_CONTROLLERS entries.
 *
 * NOTE(review): count is presumably the number of valid entries in the arrays, and the
 * arrays are presumably indexed consistently (entry i of each array describes the same
 * controller) - the code consuming this struct is not in this part of the file; confirm.
 */
struct nvme_probe_ctx {
	size_t count;
	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
	const char *names[NVME_MAX_CONTROLLERS];
	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
	const char *hostnqn;
};
100 
/** One element of g_skipped_nvme_ctrlrs: the transport ID of a controller to skip. */
struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id		trid;
	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);
108 
/*
 * Module-wide options, initialised here to their defaults.
 *
 * Within this part of the file, delay_cmd_submit and io_queue_requests are applied to every
 * I/O qpair in bdev_nvme_create_qpair() (which also writes the effective io_queue_requests
 * back), and nvme_ioq_poll_period_us is the period of the per-poll-group I/O poller.
 */
static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
};
123 
/*
 * Hotplug poll period limits. The default initialises g_nvme_hotplug_poll_period_us, so it is
 * in microseconds. NOTE(review): the maximum is presumably in the same unit - confirm where
 * it is enforced.
 */
#define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL

/* Module-global hotplug and initialisation state; the code using it is outside this part of the file. */
static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
static struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
134 
/*
 * Forward declarations of file-local functions that are referenced before their definitions:
 * the module init/fini hooks stored in nvme_if, the per-I/O-type submission helpers called
 * from the request dispatcher, and the reset/failover entry points.
 */
static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			   struct nvme_bdev_io *bio,
			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags);
static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				 struct nvme_bdev_io *bio,
				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			    struct nvme_bdev_io *bio,
			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags);
static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			      struct nvme_bdev_io *bio,
			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
		struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				 struct nvme_bdev_io *bio,
				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch,
			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
175 
/*
 * Per-namespace-kind dispatch tables. All three tables share the same layout: slot 0 is
 * NULL, slot 1 holds the standard-namespace handler and slot 2 the OCSSD handler.
 *
 * NOTE(review): the tables are presumably indexed by a namespace-type enum declared in a
 * header (with 0 meaning "unknown") - confirm before adding or reordering entries.
 */

/* Handler that creates the bdev-level representation of one namespace. */
typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);

static populate_namespace_fn g_populate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_populate_standard_namespace,
	bdev_ocssd_populate_namespace,
};

/* Handler that undoes the matching populate handler for one namespace. */
typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns);
static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns);

static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_depopulate_standard_namespace,
	bdev_ocssd_depopulate_namespace,
};

/* Handler that writes the JSON configuration of one namespace. */
typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
		struct nvme_bdev_ns *nvme_ns);
static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
		struct nvme_bdev_ns *nvme_ns);

static config_json_namespace_fn g_config_json_namespace_fn[] = {
	NULL,
	nvme_ctrlr_config_json_standard_namespace,
	bdev_ocssd_namespace_config_json,
};
206 
207 struct spdk_nvme_qpair *
208 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
209 {
210 	struct nvme_io_channel *nvme_ch;
211 
212 	nvme_ch =  spdk_io_channel_get_ctx(ctrlr_io_ch);
213 
214 	return nvme_ch->qpair;
215 }
216 
217 static int
218 bdev_nvme_get_ctx_size(void)
219 {
220 	return sizeof(struct nvme_bdev_io);
221 }
222 
/*
 * Module descriptor of the NVMe bdev module, registered with the bdev layer by the
 * SPDK_BDEV_MODULE_REGISTER() line below.
 *
 * NOTE(review): async_fini presumably tells the bdev layer that module_fini completes
 * asynchronously - confirm against spdk/bdev_module.h.
 */
static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
233 
234 static void
235 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
236 {
237 	int rc;
238 
239 	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
240 	/*
241 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
242 	 * reconnect a qpair and we will stop getting a callback for this one.
243 	 */
244 	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
245 	if (rc != 0) {
246 		SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
247 	}
248 }
249 
250 static int
251 bdev_nvme_poll(void *arg)
252 {
253 	struct nvme_bdev_poll_group *group = arg;
254 	int64_t num_completions;
255 
256 	if (group->collect_spin_stat && group->start_ticks == 0) {
257 		group->start_ticks = spdk_get_ticks();
258 	}
259 
260 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
261 			  bdev_nvme_disconnected_qpair_cb);
262 	if (group->collect_spin_stat) {
263 		if (num_completions > 0) {
264 			if (group->end_ticks != 0) {
265 				group->spin_ticks += (group->end_ticks - group->start_ticks);
266 				group->end_ticks = 0;
267 			}
268 			group->start_ticks = 0;
269 		} else {
270 			group->end_ticks = spdk_get_ticks();
271 		}
272 	}
273 
274 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
275 }
276 
277 static int
278 bdev_nvme_poll_adminq(void *arg)
279 {
280 	int32_t rc;
281 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
282 
283 	assert(nvme_bdev_ctrlr != NULL);
284 
285 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr);
286 	if (rc < 0) {
287 		bdev_nvme_failover(nvme_bdev_ctrlr, false);
288 	}
289 
290 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
291 }
292 
293 static int
294 bdev_nvme_destruct(void *ctx)
295 {
296 	struct nvme_bdev *nvme_disk = ctx;
297 	struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns;
298 
299 	nvme_ns->bdev = NULL;
300 
301 	nvme_bdev_ns_detach(nvme_ns);
302 
303 	free(nvme_disk->disk.name);
304 	free(nvme_disk);
305 
306 	return 0;
307 }
308 
309 static int
310 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
311 		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
312 {
313 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
314 
315 	return 0;
316 }
317 
318 static int
319 bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch)
320 {
321 	struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr;
322 	struct spdk_nvme_io_qpair_opts opts;
323 	int rc;
324 
325 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
326 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
327 	opts.create_only = true;
328 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
329 	g_opts.io_queue_requests = opts.io_queue_requests;
330 
331 	nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
332 	if (nvme_ch->qpair == NULL) {
333 		return -1;
334 	}
335 
336 	assert(nvme_ch->group != NULL);
337 
338 	rc = spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair);
339 	if (rc != 0) {
340 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
341 		goto err;
342 	}
343 
344 	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, nvme_ch->qpair);
345 	if (rc != 0) {
346 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
347 		goto err;
348 	}
349 
350 	return 0;
351 
352 err:
353 	spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
354 
355 	return rc;
356 }
357 
358 static void
359 _bdev_nvme_reset_destruct_ctrlr(struct spdk_io_channel_iter *i, int status)
360 {
361 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
362 
363 	spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_do_destruct,
364 			     nvme_bdev_ctrlr);
365 }
366 
367 static void
368 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
369 {
370 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
371 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
372 	struct spdk_bdev_io *bdev_io;
373 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
374 
375 	/* A NULL ctx means success. */
376 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
377 		status = SPDK_BDEV_IO_STATUS_FAILED;
378 	}
379 
380 	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
381 		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
382 		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
383 		spdk_bdev_io_complete(bdev_io, status);
384 	}
385 
386 	spdk_for_each_channel_continue(i, 0);
387 }
388 
389 static void
390 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
391 {
392 	/* we are using the for_each_channel cb_arg like a return code here. */
393 	/* If it's zero, we succeeded, otherwise, the reset failed. */
394 	void *cb_arg = NULL;
395 	struct nvme_bdev_ctrlr_trid *curr_trid;
396 	bool do_destruct = false;
397 
398 	if (rc) {
399 		cb_arg = (void *)0x1;
400 		SPDK_ERRLOG("Resetting controller failed.\n");
401 	} else {
402 		SPDK_NOTICELOG("Resetting controller successful.\n");
403 	}
404 
405 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
406 	nvme_bdev_ctrlr->resetting = false;
407 	nvme_bdev_ctrlr->failover_in_progress = false;
408 
409 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
410 	assert(curr_trid != NULL);
411 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
412 
413 	curr_trid->is_failed = cb_arg != NULL ? true : false;
414 
415 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
416 		/* Destruct ctrlr after clearing pending resets. */
417 		do_destruct = true;
418 	}
419 
420 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
421 
422 	/* Make sure we clear any pending resets before returning. */
423 	spdk_for_each_channel(nvme_bdev_ctrlr,
424 			      _bdev_nvme_complete_pending_resets,
425 			      cb_arg,
426 			      do_destruct ? _bdev_nvme_reset_destruct_ctrlr : NULL);
427 }
428 
429 static void
430 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
431 {
432 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
433 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
434 	int rc = SPDK_BDEV_IO_STATUS_SUCCESS;
435 
436 	if (status) {
437 		rc = SPDK_BDEV_IO_STATUS_FAILED;
438 	}
439 	if (bio) {
440 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), rc);
441 	}
442 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
443 }
444 
/*
 * Per-channel step after a controller reset: create a fresh I/O qpair for the channel and
 * pass the result on to the channel iterator.
 */
static void
_bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
{
	struct nvme_io_channel *nvme_ch =
		spdk_io_channel_get_ctx(spdk_io_channel_iter_get_channel(i));

	spdk_for_each_channel_continue(i, bdev_nvme_create_qpair(nvme_ch));
}
456 
457 static void
458 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
459 {
460 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
461 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
462 	int rc;
463 
464 	if (status) {
465 		rc = status;
466 		goto err;
467 	}
468 
469 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
470 	if (rc != 0) {
471 		goto err;
472 	}
473 
474 	/* Recreate all of the I/O queue pairs */
475 	spdk_for_each_channel(nvme_bdev_ctrlr,
476 			      _bdev_nvme_reset_create_qpair,
477 			      bio,
478 			      _bdev_nvme_reset_create_qpairs_done);
479 	return;
480 
481 err:
482 	if (bio) {
483 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
484 	}
485 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
486 }
487 
488 static void
489 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
490 {
491 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
492 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
493 	int rc;
494 
495 	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
496 	if (!rc) {
497 		nvme_ch->qpair = NULL;
498 	}
499 
500 	spdk_for_each_channel_continue(i, rc);
501 }
502 
503 static int
504 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, void *ctx)
505 {
506 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
507 	if (nvme_bdev_ctrlr->destruct) {
508 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
509 		return -EBUSY;
510 	}
511 
512 	if (nvme_bdev_ctrlr->resetting) {
513 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
514 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
515 		return -EAGAIN;
516 	}
517 
518 	nvme_bdev_ctrlr->resetting = true;
519 
520 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
521 
522 	/* First, delete all NVMe I/O queue pairs. */
523 	spdk_for_each_channel(nvme_bdev_ctrlr,
524 			      _bdev_nvme_reset_destroy_qpair,
525 			      ctx,
526 			      _bdev_nvme_reset_ctrlr);
527 
528 	return 0;
529 }
530 
531 static int
532 bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio)
533 {
534 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
535 	int rc;
536 
537 	rc = _bdev_nvme_reset(nvme_ch->ctrlr, bio);
538 	if (rc == -EBUSY) {
539 		/* Don't bother resetting if the controller is in the process of being destructed. */
540 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
541 		return 0;
542 	} else if (rc == -EAGAIN) {
543 		/*
544 		 * Reset call is queued only if it is from the app framework. This is on purpose so that
545 		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
546 		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
547 		 */
548 		TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link);
549 		return 0;
550 	} else {
551 		return rc;
552 	}
553 }
554 
/*
 * Start a controller reset and, when an alternate transport ID is queued behind the current
 * one, switch the controller over to that alternate path first.
 *
 * \param nvme_bdev_ctrlr controller to reset / fail over.
 * \param remove only consulted when an alternate trid exists: if true the current trid
 * entry is freed after being unlinked, if false it is moved to the tail of the trid list.
 *
 * \return 0 when the reset was started, when the controller is being destructed, or when a
 * reset is already running and no failover is owed; -EAGAIN when a reset is already running,
 * an alternate trid exists and that running reset is not a failover.
 */
static int
bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
{
	struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
	int rc = 0;

	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
	if (nvme_bdev_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		/* Don't bother resetting if the controller is in the process of being destructed. */
		return 0;
	}

	/* The head of the trid list is expected to be the currently connected path. */
	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
	assert(curr_trid);
	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
	next_trid = TAILQ_NEXT(curr_trid, link);

	if (nvme_bdev_ctrlr->resetting) {
		/* A plain reset is running but an alternate path exists: ask the caller to retry. */
		if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) {
			rc = -EAGAIN;
		}
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return rc;
	}

	nvme_bdev_ctrlr->resetting = true;
	curr_trid->is_failed = true;

	if (next_trid) {
		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
			       curr_trid->trid.trsvcid,	next_trid->trid.traddr, next_trid->trid.trsvcid);

		nvme_bdev_ctrlr->failover_in_progress = true;
		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
		/* NOTE(review): the result is only checked by assert(), i.e. not at all in NDEBUG builds. */
		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
		assert(rc == 0);
		/* Unlinking the old head makes next_trid the new head of the list. */
		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link);
		if (!remove) {
			/** Shuffle the old trid to the end of the list and use the new one.
			 * Allows for round robin through multiple connections.
			 */
			TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link);
		} else {
			free(curr_trid);
		}
	}

	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);

	/* First, delete all NVMe I/O queue pairs. */
	/* The NULL context means there is no originating reset I/O to complete afterwards. */
	spdk_for_each_channel(nvme_bdev_ctrlr,
			      _bdev_nvme_reset_destroy_qpair,
			      NULL,
			      _bdev_nvme_reset_ctrlr);

	return 0;
}
617 
/*
 * Forward declaration: the request dispatcher below uses bdev_nvme_unmap() for both UNMAP
 * and WRITE_ZEROES requests; the definition is not in this part of the file.
 */
static int
bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		uint64_t offset_blocks,
		uint64_t num_blocks);
623 
624 static void
625 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
626 		     bool success)
627 {
628 	struct spdk_bdev *bdev = bdev_io->bdev;
629 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
630 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
631 	struct nvme_bdev_ns *nvme_ns;
632 	struct spdk_nvme_qpair *qpair;
633 	int ret;
634 
635 	if (!success) {
636 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
637 		return;
638 	}
639 
640 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
641 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
642 		return;
643 	}
644 
645 	ret = bdev_nvme_readv(nvme_ns->ns,
646 			      qpair,
647 			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
648 			      bdev_io->u.bdev.iovs,
649 			      bdev_io->u.bdev.iovcnt,
650 			      bdev_io->u.bdev.md_buf,
651 			      bdev_io->u.bdev.num_blocks,
652 			      bdev_io->u.bdev.offset_blocks,
653 			      bdev->dif_check_flags);
654 
655 	if (spdk_likely(ret == 0)) {
656 		return;
657 	} else if (ret == -ENOMEM) {
658 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
659 	} else {
660 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
661 	}
662 }
663 
/*
 * Dispatch a bdev I/O to the NVMe helper matching its type.
 *
 * An I/O path (namespace + qpair) is looked up before the type is examined, so every
 * request type - including RESET, NVME_ADMIN and ABORT, which do not use the qpair - is
 * rejected when no path is available.
 *
 * \param ch I/O channel the request was submitted on.
 * \param bdev_io request to submit.
 *
 * \return 0 when the request was submitted (or, for a read without a buffer, when buffer
 * allocation was started); -1 when no I/O path is available; -EINVAL for an unsupported
 * I/O type; otherwise the return value of the helper that was called. The caller,
 * bdev_nvme_submit_request(), completes the I/O on any non-zero return.
 */
static int
_bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct nvme_bdev_io *nbdev_io_to_abort;
	struct nvme_bdev_ns *nvme_ns;
	struct spdk_nvme_qpair *qpair;

	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
		return -1;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		/* A read may arrive without a data buffer; in that case one is requested first. */
		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
			return bdev_nvme_readv(nvme_ns->ns,
					       qpair,
					       nbdev_io,
					       bdev_io->u.bdev.iovs,
					       bdev_io->u.bdev.iovcnt,
					       bdev_io->u.bdev.md_buf,
					       bdev_io->u.bdev.num_blocks,
					       bdev_io->u.bdev.offset_blocks,
					       bdev->dif_check_flags);
		} else {
			/* bdev_nvme_get_buf_cb() submits the read once the buffer is available. */
			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
			return 0;
		}

	case SPDK_BDEV_IO_TYPE_WRITE:
		return bdev_nvme_writev(nvme_ns->ns,
					qpair,
					nbdev_io,
					bdev_io->u.bdev.iovs,
					bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.md_buf,
					bdev_io->u.bdev.num_blocks,
					bdev_io->u.bdev.offset_blocks,
					bdev->dif_check_flags);

	case SPDK_BDEV_IO_TYPE_COMPARE:
		return bdev_nvme_comparev(nvme_ns->ns,
					  qpair,
					  nbdev_io,
					  bdev_io->u.bdev.iovs,
					  bdev_io->u.bdev.iovcnt,
					  bdev_io->u.bdev.md_buf,
					  bdev_io->u.bdev.num_blocks,
					  bdev_io->u.bdev.offset_blocks,
					  bdev->dif_check_flags);

	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		/* iovs carries the compare data, fused_iovs the data to write. */
		return bdev_nvme_comparev_and_writev(nvme_ns->ns,
						     qpair,
						     nbdev_io,
						     bdev_io->u.bdev.iovs,
						     bdev_io->u.bdev.iovcnt,
						     bdev_io->u.bdev.fused_iovs,
						     bdev_io->u.bdev.fused_iovcnt,
						     bdev_io->u.bdev.md_buf,
						     bdev_io->u.bdev.num_blocks,
						     bdev_io->u.bdev.offset_blocks,
						     bdev->dif_check_flags);

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		/*
		 * Implemented through the unmap helper; bdev_nvme_io_type_supported() only
		 * advertises WRITE_ZEROES when deallocated blocks read back as zeroes.
		 */
		return bdev_nvme_unmap(nvme_ns->ns,
				       qpair,
				       nbdev_io,
				       bdev_io->u.bdev.offset_blocks,
				       bdev_io->u.bdev.num_blocks);

	case SPDK_BDEV_IO_TYPE_UNMAP:
		return bdev_nvme_unmap(nvme_ns->ns,
				       qpair,
				       nbdev_io,
				       bdev_io->u.bdev.offset_blocks,
				       bdev_io->u.bdev.num_blocks);

	case SPDK_BDEV_IO_TYPE_RESET:
		return bdev_nvme_reset(nvme_ch, nbdev_io);

	case SPDK_BDEV_IO_TYPE_FLUSH:
		return bdev_nvme_flush(nvme_ns->ns,
				       qpair,
				       nbdev_io,
				       bdev_io->u.bdev.offset_blocks,
				       bdev_io->u.bdev.num_blocks);

	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
		return bdev_nvme_admin_passthru(nvme_ch,
						nbdev_io,
						&bdev_io->u.nvme_passthru.cmd,
						bdev_io->u.nvme_passthru.buf,
						bdev_io->u.nvme_passthru.nbytes);

	case SPDK_BDEV_IO_TYPE_NVME_IO:
		return bdev_nvme_io_passthru(nvme_ns->ns,
					     qpair,
					     nbdev_io,
					     &bdev_io->u.nvme_passthru.cmd,
					     bdev_io->u.nvme_passthru.buf,
					     bdev_io->u.nvme_passthru.nbytes);

	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return bdev_nvme_io_passthru_md(nvme_ns->ns,
						qpair,
						nbdev_io,
						&bdev_io->u.nvme_passthru.cmd,
						bdev_io->u.nvme_passthru.buf,
						bdev_io->u.nvme_passthru.nbytes,
						bdev_io->u.nvme_passthru.md_buf,
						bdev_io->u.nvme_passthru.md_len);

	case SPDK_BDEV_IO_TYPE_ABORT:
		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
		return bdev_nvme_abort(nvme_ch,
				       nbdev_io,
				       nbdev_io_to_abort);

	default:
		return -EINVAL;
	}
	/* Not reached: every case above returns. */
	return 0;
}
792 
793 static void
794 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
795 {
796 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
797 
798 	if (spdk_unlikely(rc != 0)) {
799 		if (rc == -ENOMEM) {
800 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
801 		} else {
802 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
803 		}
804 	}
805 }
806 
807 static bool
808 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
809 {
810 	struct nvme_bdev *nbdev = ctx;
811 	struct nvme_bdev_ns *nvme_ns;
812 	struct spdk_nvme_ns *ns;
813 	struct spdk_nvme_ctrlr *ctrlr;
814 	const struct spdk_nvme_ctrlr_data *cdata;
815 
816 	nvme_ns = nvme_bdev_to_bdev_ns(nbdev);
817 	assert(nvme_ns != NULL);
818 	ns = nvme_ns->ns;
819 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
820 
821 	switch (io_type) {
822 	case SPDK_BDEV_IO_TYPE_READ:
823 	case SPDK_BDEV_IO_TYPE_WRITE:
824 	case SPDK_BDEV_IO_TYPE_RESET:
825 	case SPDK_BDEV_IO_TYPE_FLUSH:
826 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
827 	case SPDK_BDEV_IO_TYPE_NVME_IO:
828 	case SPDK_BDEV_IO_TYPE_ABORT:
829 		return true;
830 
831 	case SPDK_BDEV_IO_TYPE_COMPARE:
832 		return spdk_nvme_ns_supports_compare(ns);
833 
834 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
835 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
836 
837 	case SPDK_BDEV_IO_TYPE_UNMAP:
838 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
839 		return cdata->oncs.dsm;
840 
841 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
842 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
843 		/*
844 		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
845 		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
846 		 */
847 		if (cdata->oncs.dsm &&
848 		    spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) ==
849 		    SPDK_NVME_DEALLOC_READ_00) {
850 			return true;
851 		}
852 		/*
853 		 * The NVMe controller write_zeroes function is currently not used by our driver.
854 		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
855 		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
856 		 */
857 		return false;
858 
859 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
860 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
861 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
862 			return true;
863 		}
864 		return false;
865 
866 	default:
867 		return false;
868 	}
869 }
870 
871 static int
872 bdev_nvme_create_cb(void *io_device, void *ctx_buf)
873 {
874 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
875 	struct nvme_io_channel *nvme_ch = ctx_buf;
876 	struct spdk_io_channel *pg_ch = NULL;
877 	int rc;
878 
879 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
880 		rc = bdev_ocssd_create_io_channel(nvme_ch);
881 		if (rc != 0) {
882 			return rc;
883 		}
884 	}
885 
886 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
887 	if (!pg_ch) {
888 		rc = -1;
889 		goto err_pg_ch;
890 	}
891 
892 	nvme_ch->group = spdk_io_channel_get_ctx(pg_ch);
893 
894 #ifdef SPDK_CONFIG_VTUNE
895 	nvme_ch->group->collect_spin_stat = true;
896 #else
897 	nvme_ch->group->collect_spin_stat = false;
898 #endif
899 
900 	TAILQ_INIT(&nvme_ch->pending_resets);
901 
902 	nvme_ch->ctrlr = nvme_bdev_ctrlr;
903 
904 	rc = bdev_nvme_create_qpair(nvme_ch);
905 	if (rc != 0) {
906 		goto err_qpair;
907 	}
908 
909 	return 0;
910 
911 err_qpair:
912 	spdk_put_io_channel(pg_ch);
913 err_pg_ch:
914 	if (nvme_ch->ocssd_ch) {
915 		bdev_ocssd_destroy_io_channel(nvme_ch);
916 	}
917 
918 	return rc;
919 }
920 
921 static void
922 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
923 {
924 	struct nvme_io_channel *nvme_ch = ctx_buf;
925 
926 	assert(nvme_ch->group != NULL);
927 
928 	if (nvme_ch->ocssd_ch != NULL) {
929 		bdev_ocssd_destroy_io_channel(nvme_ch);
930 	}
931 
932 	spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
933 
934 	spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group));
935 }
936 
937 static int
938 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
939 {
940 	struct nvme_bdev_poll_group *group = ctx_buf;
941 
942 	group->group = spdk_nvme_poll_group_create(group);
943 	if (group->group == NULL) {
944 		return -1;
945 	}
946 
947 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
948 
949 	if (group->poller == NULL) {
950 		spdk_nvme_poll_group_destroy(group->group);
951 		return -1;
952 	}
953 
954 	return 0;
955 }
956 
957 static void
958 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
959 {
960 	struct nvme_bdev_poll_group *group = ctx_buf;
961 
962 	spdk_poller_unregister(&group->poller);
963 	if (spdk_nvme_poll_group_destroy(group->group)) {
964 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.");
965 		assert(false);
966 	}
967 }
968 
969 static struct spdk_io_channel *
970 bdev_nvme_get_io_channel(void *ctx)
971 {
972 	struct nvme_bdev *nvme_bdev = ctx;
973 
974 	return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr);
975 }
976 
977 static void *
978 bdev_nvme_get_module_ctx(void *ctx)
979 {
980 	struct nvme_bdev *nvme_bdev = ctx;
981 
982 	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
983 }
984 
985 static int
986 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
987 {
988 	struct nvme_bdev *nvme_bdev = ctx;
989 	struct nvme_bdev_ns *nvme_ns;
990 	struct spdk_nvme_ns *ns;
991 	struct spdk_nvme_ctrlr *ctrlr;
992 	const struct spdk_nvme_ctrlr_data *cdata;
993 	const struct spdk_nvme_transport_id *trid;
994 	union spdk_nvme_vs_register vs;
995 	union spdk_nvme_csts_register csts;
996 	char buf[128];
997 
998 	nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev);
999 	assert(nvme_ns != NULL);
1000 	ns = nvme_ns->ns;
1001 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1002 
1003 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1004 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
1005 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
1006 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1007 
1008 	spdk_json_write_named_object_begin(w, "nvme");
1009 
1010 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1011 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
1012 	}
1013 
1014 	spdk_json_write_named_object_begin(w, "trid");
1015 
1016 	nvme_bdev_dump_trid_json(trid, w);
1017 
1018 	spdk_json_write_object_end(w);
1019 
1020 #ifdef SPDK_CONFIG_NVME_CUSE
1021 	size_t cuse_name_size = 128;
1022 	char cuse_name[cuse_name_size];
1023 
1024 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
1025 					    cuse_name, &cuse_name_size);
1026 	if (rc == 0) {
1027 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
1028 	}
1029 #endif
1030 
1031 	spdk_json_write_named_object_begin(w, "ctrlr_data");
1032 
1033 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
1034 
1035 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
1036 	spdk_str_trim(buf);
1037 	spdk_json_write_named_string(w, "model_number", buf);
1038 
1039 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
1040 	spdk_str_trim(buf);
1041 	spdk_json_write_named_string(w, "serial_number", buf);
1042 
1043 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
1044 	spdk_str_trim(buf);
1045 	spdk_json_write_named_string(w, "firmware_revision", buf);
1046 
1047 	if (cdata->subnqn[0] != '\0') {
1048 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
1049 	}
1050 
1051 	spdk_json_write_named_object_begin(w, "oacs");
1052 
1053 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
1054 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
1055 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
1056 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
1057 
1058 	spdk_json_write_object_end(w);
1059 
1060 	spdk_json_write_object_end(w);
1061 
1062 	spdk_json_write_named_object_begin(w, "vs");
1063 
1064 	spdk_json_write_name(w, "nvme_version");
1065 	if (vs.bits.ter) {
1066 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
1067 	} else {
1068 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
1069 	}
1070 
1071 	spdk_json_write_object_end(w);
1072 
1073 	spdk_json_write_named_object_begin(w, "csts");
1074 
1075 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
1076 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
1077 
1078 	spdk_json_write_object_end(w);
1079 
1080 	spdk_json_write_named_object_begin(w, "ns_data");
1081 
1082 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
1083 
1084 	spdk_json_write_object_end(w);
1085 
1086 	if (cdata->oacs.security) {
1087 		spdk_json_write_named_object_begin(w, "security");
1088 
1089 		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);
1090 
1091 		spdk_json_write_object_end(w);
1092 	}
1093 
1094 	spdk_json_write_object_end(w);
1095 
1096 	return 0;
1097 }
1098 
/*
 * bdev fn_table hook: write per-bdev configuration.
 *
 * Intentionally empty: NVMe bdevs have no per-bdev configuration to emit.
 */
static void
bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* Nothing to write for an individual bdev. */
}
1104 
1105 static uint64_t
1106 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
1107 {
1108 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
1109 	struct nvme_bdev_poll_group *group = nvme_ch->group;
1110 	uint64_t spin_time;
1111 
1112 	if (!group || !group->collect_spin_stat) {
1113 		return 0;
1114 	}
1115 
1116 	if (group->end_ticks != 0) {
1117 		group->spin_ticks += (group->end_ticks - group->start_ticks);
1118 		group->end_ticks = 0;
1119 	}
1120 
1121 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1122 	group->start_ticks = 0;
1123 	group->spin_ticks = 0;
1124 
1125 	return spin_time;
1126 }
1127 
1128 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1129 	.destruct		= bdev_nvme_destruct,
1130 	.submit_request		= bdev_nvme_submit_request,
1131 	.io_type_supported	= bdev_nvme_io_type_supported,
1132 	.get_io_channel		= bdev_nvme_get_io_channel,
1133 	.dump_info_json		= bdev_nvme_dump_info_json,
1134 	.write_config_json	= bdev_nvme_write_config_json,
1135 	.get_spin_time		= bdev_nvme_get_spin_time,
1136 	.get_module_ctx		= bdev_nvme_get_module_ctx,
1137 };
1138 
1139 static int
1140 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
1141 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
1142 		 uint32_t prchk_flags, void *ctx)
1143 {
1144 	const struct spdk_uuid		*uuid;
1145 	const struct spdk_nvme_ctrlr_data *cdata;
1146 	const struct spdk_nvme_ns_data	*nsdata;
1147 	int				rc;
1148 
1149 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1150 
1151 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
1152 	if (!disk->name) {
1153 		return -ENOMEM;
1154 	}
1155 	disk->product_name = "NVMe disk";
1156 
1157 	disk->write_cache = 0;
1158 	if (cdata->vwc.present) {
1159 		/* Enable if the Volatile Write Cache exists */
1160 		disk->write_cache = 1;
1161 	}
1162 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1163 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1164 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1165 
1166 	uuid = spdk_nvme_ns_get_uuid(ns);
1167 	if (uuid != NULL) {
1168 		disk->uuid = *uuid;
1169 	}
1170 
1171 	nsdata = spdk_nvme_ns_get_data(ns);
1172 
1173 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
1174 	if (disk->md_len != 0) {
1175 		disk->md_interleave = nsdata->flbas.extended;
1176 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1177 		if (disk->dif_type != SPDK_DIF_DISABLE) {
1178 			disk->dif_is_head_of_md = nsdata->dps.md_start;
1179 			disk->dif_check_flags = prchk_flags;
1180 		}
1181 	}
1182 
1183 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
1184 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
1185 		disk->acwu = 0;
1186 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1187 		disk->acwu = nsdata->nacwu;
1188 	} else {
1189 		disk->acwu = cdata->acwu;
1190 	}
1191 
1192 	disk->ctxt = ctx;
1193 	disk->fn_table = &nvmelib_fn_table;
1194 	disk->module = &nvme_if;
1195 	rc = spdk_bdev_register(disk);
1196 	if (rc) {
1197 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
1198 		free(disk->name);
1199 		return rc;
1200 	}
1201 
1202 	return 0;
1203 }
1204 
1205 static int
1206 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns)
1207 {
1208 	struct nvme_bdev *bdev;
1209 	int rc;
1210 
1211 	bdev = calloc(1, sizeof(*bdev));
1212 	if (!bdev) {
1213 		SPDK_ERRLOG("bdev calloc() failed\n");
1214 		return -ENOMEM;
1215 	}
1216 
1217 	bdev->nvme_ns = nvme_ns;
1218 	bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL;
1219 
1220 	rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr,
1221 			      nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev);
1222 	if (rc != 0) {
1223 		SPDK_ERRLOG("Failed to create NVMe disk\n");
1224 		free(bdev);
1225 		return rc;
1226 	}
1227 
1228 	nvme_ns->ref++;
1229 	nvme_ns->bdev = bdev;
1230 
1231 	return 0;
1232 }
1233 
1234 static void
1235 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1236 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
1237 {
1238 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1239 	struct spdk_nvme_ns	*ns;
1240 	int			rc = 0;
1241 
1242 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1243 	if (!ns) {
1244 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
1245 		rc = -EINVAL;
1246 		goto done;
1247 	}
1248 
1249 	nvme_ns->ns = ns;
1250 	nvme_ns->ref = 1;
1251 
1252 	rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns);
1253 done:
1254 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1255 }
1256 
1257 static bool
1258 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1259 		 struct spdk_nvme_ctrlr_opts *opts)
1260 {
1261 	struct nvme_probe_skip_entry *entry;
1262 
1263 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1264 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1265 			return false;
1266 		}
1267 	}
1268 
1269 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1270 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1271 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1272 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1273 
1274 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1275 
1276 	return true;
1277 }
1278 
/*
 * Completion callback for the abort command issued from timeout_cb().
 *
 * A successful abort needs no further action; a failed one escalates to a
 * controller reset.  ctx is the struct nvme_bdev_ctrlr.
 */
static void
nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev_ctrlr *target = ctx;

	if (!spdk_nvme_cpl_is_error(cpl)) {
		return;
	}

	SPDK_WARNLOG("Abort failed. Resetting controller.\n");
	_bdev_nvme_reset(target, NULL);
}
1289 
1290 static void
1291 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1292 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1293 {
1294 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg;
1295 	union spdk_nvme_csts_register csts;
1296 	int rc;
1297 
1298 	assert(nvme_bdev_ctrlr->ctrlr == ctrlr);
1299 
1300 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1301 
1302 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
1303 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
1304 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
1305 	 * completion recursively.
1306 	 */
1307 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
1308 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1309 		if (csts.bits.cfs) {
1310 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1311 			_bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1312 			return;
1313 		}
1314 	}
1315 
1316 	switch (g_opts.action_on_timeout) {
1317 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1318 		if (qpair) {
1319 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1320 						       nvme_abort_cpl, nvme_bdev_ctrlr);
1321 			if (rc == 0) {
1322 				return;
1323 			}
1324 
1325 			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
1326 		}
1327 
1328 	/* FALLTHROUGH */
1329 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1330 		_bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1331 		break;
1332 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1333 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1334 		break;
1335 	default:
1336 		SPDK_ERRLOG("An invalid timeout action value is found.\n");
1337 		break;
1338 	}
1339 }
1340 
/*
 * Completion hook for namespace depopulation: detaches the namespace via
 * nvme_bdev_ns_detach().
 */
void
nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ns *nvme_ns)
{
	/* Depopulation is finished; detach the namespace. */
	nvme_bdev_ns_detach(nvme_ns);
}
1346 
1347 static void
1348 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns)
1349 {
1350 	struct nvme_bdev *bdev;
1351 
1352 	bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1353 	if (bdev != NULL) {
1354 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1355 	}
1356 
1357 	nvme_ns->populated = false;
1358 
1359 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
1360 }
1361 
1362 static void
1363 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns,
1364 			      struct nvme_async_probe_ctx *ctx)
1365 {
1366 	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
1367 }
1368 
1369 static void
1370 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns)
1371 {
1372 	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
1373 }
1374 
1375 void
1376 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1377 				   struct nvme_bdev_ns *nvme_ns, int rc)
1378 {
1379 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_ns->ctrlr;
1380 
1381 	assert(nvme_bdev_ctrlr != NULL);
1382 
1383 	if (rc == 0) {
1384 		nvme_ns->populated = true;
1385 		pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1386 		nvme_bdev_ctrlr->ref++;
1387 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1388 	} else {
1389 		memset(nvme_ns, 0, sizeof(*nvme_ns));
1390 	}
1391 
1392 	if (ctx) {
1393 		ctx->populates_in_progress--;
1394 		if (ctx->populates_in_progress == 0) {
1395 			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
1396 		}
1397 	}
1398 }
1399 
1400 static void
1401 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1402 			       struct nvme_async_probe_ctx *ctx)
1403 {
1404 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1405 	struct nvme_bdev_ns	*nvme_ns;
1406 	struct spdk_nvme_ns	*ns;
1407 	struct nvme_bdev	*bdev;
1408 	uint32_t		i;
1409 	int			rc;
1410 	uint64_t		num_sectors;
1411 	bool			ns_is_active;
1412 
1413 	if (ctx) {
1414 		/* Initialize this count to 1 to handle the populate functions
1415 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1416 		 */
1417 		ctx->populates_in_progress = 1;
1418 	}
1419 
1420 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1421 		uint32_t	nsid = i + 1;
1422 
1423 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
1424 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1425 
1426 		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) {
1427 			/* NS is still there but attributes may have changed */
1428 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1429 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
1430 			bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1431 			assert(bdev != NULL);
1432 			if (bdev->disk.blockcnt != num_sectors) {
1433 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
1434 					       nsid,
1435 					       bdev->disk.name,
1436 					       bdev->disk.blockcnt,
1437 					       num_sectors);
1438 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1439 				if (rc != 0) {
1440 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1441 						    bdev->disk.name, rc);
1442 				}
1443 			}
1444 		}
1445 
1446 		if (!nvme_ns->populated && ns_is_active) {
1447 			nvme_ns->id = nsid;
1448 			nvme_ns->ctrlr = nvme_bdev_ctrlr;
1449 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1450 				nvme_ns->type = NVME_BDEV_NS_OCSSD;
1451 			} else {
1452 				nvme_ns->type = NVME_BDEV_NS_STANDARD;
1453 			}
1454 
1455 			nvme_ns->bdev = NULL;
1456 
1457 			if (ctx) {
1458 				ctx->populates_in_progress++;
1459 			}
1460 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx);
1461 		}
1462 
1463 		if (nvme_ns->populated && !ns_is_active) {
1464 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1465 		}
1466 	}
1467 
1468 	if (ctx) {
1469 		/* Decrement this count now that the loop is over to account
1470 		 * for the one we started with.  If the count is then 0, we
1471 		 * know any populate_namespace functions completed immediately,
1472 		 * so we'll kick the callback here.
1473 		 */
1474 		ctx->populates_in_progress--;
1475 		if (ctx->populates_in_progress == 0) {
1476 			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
1477 		}
1478 	}
1479 
1480 }
1481 
1482 static void
1483 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1484 {
1485 	uint32_t i;
1486 	struct nvme_bdev_ns *nvme_ns;
1487 
1488 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1489 		uint32_t nsid = i + 1;
1490 
1491 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1492 		if (nvme_ns->populated) {
1493 			assert(nvme_ns->id == nsid);
1494 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1495 		}
1496 	}
1497 }
1498 
1499 static void
1500 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1501 {
1502 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1503 	union spdk_nvme_async_event_completion	event;
1504 
1505 	if (spdk_nvme_cpl_is_error(cpl)) {
1506 		SPDK_WARNLOG("AER request execute failed");
1507 		return;
1508 	}
1509 
1510 	event.raw = cpl->cdw0;
1511 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1512 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1513 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1514 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1515 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1516 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1517 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1518 	}
1519 }
1520 
1521 static int
1522 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
1523 		       const char *name,
1524 		       const struct spdk_nvme_transport_id *trid,
1525 		       uint32_t prchk_flags,
1526 		       struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr)
1527 {
1528 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1529 	struct nvme_bdev_ctrlr_trid *trid_entry;
1530 	uint32_t i;
1531 	int rc;
1532 
1533 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1534 	if (nvme_bdev_ctrlr == NULL) {
1535 		SPDK_ERRLOG("Failed to allocate device struct\n");
1536 		return -ENOMEM;
1537 	}
1538 
1539 	rc = pthread_mutex_init(&nvme_bdev_ctrlr->mutex, NULL);
1540 	if (rc != 0) {
1541 		goto err_init_mutex;
1542 	}
1543 
1544 	TAILQ_INIT(&nvme_bdev_ctrlr->trids);
1545 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1546 	if (nvme_bdev_ctrlr->num_ns != 0) {
1547 		nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1548 		if (!nvme_bdev_ctrlr->namespaces) {
1549 			SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
1550 			rc = -ENOMEM;
1551 			goto err_alloc_namespaces;
1552 		}
1553 	}
1554 
1555 	trid_entry = calloc(1, sizeof(*trid_entry));
1556 	if (trid_entry == NULL) {
1557 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
1558 		rc = -ENOMEM;
1559 		goto err_alloc_trid;
1560 	}
1561 
1562 	trid_entry->trid = *trid;
1563 
1564 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1565 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1566 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1567 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1568 			rc = -ENOMEM;
1569 			goto err_alloc_namespace;
1570 		}
1571 	}
1572 
1573 	nvme_bdev_ctrlr->thread = spdk_get_thread();
1574 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1575 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1576 	nvme_bdev_ctrlr->ref = 1;
1577 	nvme_bdev_ctrlr->connected_trid = &trid_entry->trid;
1578 	nvme_bdev_ctrlr->name = strdup(name);
1579 	if (nvme_bdev_ctrlr->name == NULL) {
1580 		rc = -ENOMEM;
1581 		goto err_alloc_name;
1582 	}
1583 
1584 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1585 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1586 		if (spdk_unlikely(rc != 0)) {
1587 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1588 			goto err_init_ocssd;
1589 		}
1590 	}
1591 
1592 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1593 
1594 	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
1595 				sizeof(struct nvme_io_channel),
1596 				name);
1597 
1598 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr,
1599 					       g_opts.nvme_adminq_poll_period_us);
1600 
1601 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1602 
1603 	if (g_opts.timeout_us > 0) {
1604 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1605 				timeout_cb, nvme_bdev_ctrlr);
1606 	}
1607 
1608 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1609 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr);
1610 
1611 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1612 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1613 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1614 		if (nvme_bdev_ctrlr->opal_dev == NULL) {
1615 			SPDK_ERRLOG("Failed to initialize Opal\n");
1616 		}
1617 	}
1618 
1619 	TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link);
1620 
1621 	if (_nvme_bdev_ctrlr != NULL) {
1622 		*_nvme_bdev_ctrlr = nvme_bdev_ctrlr;
1623 	}
1624 	return 0;
1625 
1626 err_init_ocssd:
1627 	free(nvme_bdev_ctrlr->name);
1628 err_alloc_name:
1629 err_alloc_namespace:
1630 	for (; i > 0; i--) {
1631 		free(nvme_bdev_ctrlr->namespaces[i - 1]);
1632 	}
1633 	free(trid_entry);
1634 err_alloc_trid:
1635 	free(nvme_bdev_ctrlr->namespaces);
1636 err_alloc_namespaces:
1637 	pthread_mutex_destroy(&nvme_bdev_ctrlr->mutex);
1638 err_init_mutex:
1639 	free(nvme_bdev_ctrlr);
1640 	return rc;
1641 }
1642 
1643 static void
1644 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1645 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1646 {
1647 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1648 	struct nvme_probe_ctx *ctx = cb_ctx;
1649 	char *name = NULL;
1650 	uint32_t prchk_flags = 0;
1651 	size_t i;
1652 	int rc;
1653 
1654 	if (ctx) {
1655 		for (i = 0; i < ctx->count; i++) {
1656 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1657 				prchk_flags = ctx->prchk_flags[i];
1658 				name = strdup(ctx->names[i]);
1659 				break;
1660 			}
1661 		}
1662 	} else {
1663 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1664 	}
1665 	if (!name) {
1666 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1667 		return;
1668 	}
1669 
1670 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
1671 
1672 	rc = nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, &nvme_bdev_ctrlr);
1673 	if (rc != 0) {
1674 		SPDK_ERRLOG("Failed to create new NVMe controller\n");
1675 		free(name);
1676 		return;
1677 	}
1678 
1679 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1680 
1681 	free(name);
1682 }
1683 
/*
 * Tear down a controller: depopulate all of its namespaces, then hand it to
 * nvme_bdev_ctrlr_destruct().  ctx is the struct nvme_bdev_ctrlr.
 */
static void
_nvme_bdev_ctrlr_destruct(void *ctx)
{
	struct nvme_bdev_ctrlr *doomed = ctx;

	nvme_ctrlr_depopulate_namespaces(doomed);
	nvme_bdev_ctrlr_destruct(doomed);
}
1692 
1693 static int
1694 _bdev_nvme_delete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool hotplug)
1695 {
1696 	struct nvme_probe_skip_entry *entry;
1697 
1698 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1699 
1700 	/* The controller's destruction was already started */
1701 	if (nvme_bdev_ctrlr->destruct) {
1702 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1703 		return 0;
1704 	}
1705 
1706 	if (!hotplug &&
1707 	    nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1708 		entry = calloc(1, sizeof(*entry));
1709 		if (!entry) {
1710 			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1711 			return -ENOMEM;
1712 		}
1713 		entry->trid = *nvme_bdev_ctrlr->connected_trid;
1714 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
1715 	}
1716 
1717 	nvme_bdev_ctrlr->destruct = true;
1718 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1719 
1720 	_nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1721 
1722 	return 0;
1723 }
1724 
/*
 * Controller removal callback: deletes the controller as a hotplug removal.
 * cb_ctx is the struct nvme_bdev_ctrlr; the result of the delete is ignored.
 */
static void
remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_bdev_ctrlr *removed = cb_ctx;

	_bdev_nvme_delete(removed, true);
}
1732 
1733 static int
1734 bdev_nvme_hotplug_probe(void *arg)
1735 {
1736 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
1737 		g_hotplug_probe_ctx = NULL;
1738 		spdk_poller_unregister(&g_hotplug_probe_poller);
1739 	}
1740 
1741 	return SPDK_POLLER_BUSY;
1742 }
1743 
1744 static int
1745 bdev_nvme_hotplug(void *arg)
1746 {
1747 	struct spdk_nvme_transport_id trid_pcie;
1748 
1749 	if (g_hotplug_probe_ctx) {
1750 		return SPDK_POLLER_BUSY;
1751 	}
1752 
1753 	memset(&trid_pcie, 0, sizeof(trid_pcie));
1754 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
1755 
1756 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
1757 			      hotplug_probe_cb, attach_cb, NULL);
1758 
1759 	if (g_hotplug_probe_ctx) {
1760 		assert(g_hotplug_probe_poller == NULL);
1761 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
1762 	}
1763 
1764 	return SPDK_POLLER_BUSY;
1765 }
1766 
1767 void
1768 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
1769 {
1770 	*opts = g_opts;
1771 }
1772 
1773 int
1774 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
1775 {
1776 	if (g_bdev_nvme_init_thread != NULL) {
1777 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1778 			return -EPERM;
1779 		}
1780 	}
1781 
1782 	g_opts = *opts;
1783 
1784 	return 0;
1785 }
1786 
/*
 * Request passed from bdev_nvme_set_hotplug() to set_nvme_hotplug_period_cb(),
 * which applies it and then frees it.
 */
struct set_nvme_hotplug_ctx {
	/* Poll period handed to the hotplug poller and stored in g_nvme_hotplug_poll_period_us. */
	uint64_t period_us;
	/* Whether the hotplug poller should be registered. */
	bool enabled;
	/* Optional completion callback, invoked with fn_ctx after the change is applied. */
	spdk_msg_fn fn;
	/* Argument passed to fn. */
	void *fn_ctx;
};
1793 
1794 static void
1795 set_nvme_hotplug_period_cb(void *_ctx)
1796 {
1797 	struct set_nvme_hotplug_ctx *ctx = _ctx;
1798 
1799 	spdk_poller_unregister(&g_hotplug_poller);
1800 	if (ctx->enabled) {
1801 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
1802 	}
1803 
1804 	g_nvme_hotplug_poll_period_us = ctx->period_us;
1805 	g_nvme_hotplug_enabled = ctx->enabled;
1806 	if (ctx->fn) {
1807 		ctx->fn(ctx->fn_ctx);
1808 	}
1809 
1810 	free(ctx);
1811 }
1812 
1813 int
1814 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
1815 {
1816 	struct set_nvme_hotplug_ctx *ctx;
1817 
1818 	if (enabled == true && !spdk_process_is_primary()) {
1819 		return -EPERM;
1820 	}
1821 
1822 	ctx = calloc(1, sizeof(*ctx));
1823 	if (ctx == NULL) {
1824 		return -ENOMEM;
1825 	}
1826 
1827 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
1828 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
1829 	ctx->enabled = enabled;
1830 	ctx->fn = cb;
1831 	ctx->fn_ctx = cb_ctx;
1832 
1833 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
1834 	return 0;
1835 }
1836 
1837 static void
1838 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1839 {
1840 	if (ctx->cb_fn) {
1841 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1842 	}
1843 
1844 	ctx->namespaces_populated = true;
1845 	if (ctx->probe_done) {
1846 		/* The probe was already completed, so we need to free the context
1847 		 * here.  This can happen for cases like OCSSD, where we need to
1848 		 * send additional commands to the SSD after attach.
1849 		 */
1850 		free(ctx);
1851 	}
1852 }
1853 
1854 static void
1855 nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1856 				    struct nvme_async_probe_ctx *ctx)
1857 {
1858 	struct nvme_bdev_ns	*nvme_ns;
1859 	struct nvme_bdev	*nvme_bdev;
1860 	uint32_t		i, nsid;
1861 	size_t			j;
1862 
1863 	assert(nvme_bdev_ctrlr != NULL);
1864 
1865 	/*
1866 	 * Report the new bdevs that were created in this call.
1867 	 * There can be more than one bdev per NVMe controller.
1868 	 */
1869 	j = 0;
1870 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1871 		nsid = i + 1;
1872 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1873 		if (!nvme_ns->populated) {
1874 			continue;
1875 		}
1876 		assert(nvme_ns->id == nsid);
1877 		nvme_bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1878 		if (nvme_bdev == NULL) {
1879 			assert(nvme_ns->type == NVME_BDEV_NS_OCSSD);
1880 			continue;
1881 		}
1882 		if (j < ctx->count) {
1883 			ctx->names[j] = nvme_bdev->disk.name;
1884 			j++;
1885 		} else {
1886 			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n",
1887 				    ctx->count);
1888 			populate_namespaces_cb(ctx, 0, -ERANGE);
1889 			return;
1890 		}
1891 	}
1892 
1893 	populate_namespaces_cb(ctx, j, 0);
1894 }
1895 
1896 static bool
1897 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
1898 {
1899 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
1900 
1901 	nsdata1 = spdk_nvme_ns_get_data(ns1);
1902 	nsdata2 = spdk_nvme_ns_get_data(ns2);
1903 
1904 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid));
1905 }
1906 
1907 static int
1908 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_ctrlr *new_ctrlr,
1909 		   struct spdk_nvme_transport_id *trid)
1910 {
1911 	uint32_t			i, nsid;
1912 	struct nvme_bdev_ns		*nvme_ns;
1913 	struct spdk_nvme_ns		*new_ns;
1914 	struct nvme_bdev_ctrlr_trid	*new_trid, *tmp_trid;
1915 	int				rc = 0;
1916 
1917 	assert(nvme_bdev_ctrlr != NULL);
1918 
1919 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1920 		SPDK_ERRLOG("PCIe failover is not supported.\n");
1921 		return -ENOTSUP;
1922 	}
1923 
1924 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1925 
1926 	/* Currently we only support failover to the same transport type. */
1927 	if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) {
1928 		rc = -EINVAL;
1929 		goto exit;
1930 	}
1931 
1932 	/* Currently we only support failover to the same NQN. */
1933 	if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
1934 		rc = -EINVAL;
1935 		goto exit;
1936 	}
1937 
1938 	/* Skip all the other checks if we've already registered this path. */
1939 	TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) {
1940 		if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) {
1941 			rc = -EEXIST;
1942 			goto exit;
1943 		}
1944 	}
1945 
1946 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) {
1947 		rc = -EINVAL;
1948 		goto exit;
1949 	}
1950 
1951 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1952 		nsid = i + 1;
1953 
1954 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
1955 		if (!nvme_ns->populated) {
1956 			continue;
1957 		}
1958 
1959 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
1960 		assert(new_ns != NULL);
1961 
1962 		if (bdev_nvme_compare_ns(nvme_ns->ns, new_ns) != 0) {
1963 			rc = -EINVAL;
1964 			goto exit;
1965 		}
1966 	}
1967 
1968 	new_trid = calloc(1, sizeof(*new_trid));
1969 	if (new_trid == NULL) {
1970 		rc = -ENOMEM;
1971 		goto exit;
1972 	}
1973 	new_trid->trid = *trid;
1974 	new_trid->is_failed = false;
1975 
1976 	TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) {
1977 		if (tmp_trid->is_failed) {
1978 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
1979 			goto exit;
1980 		}
1981 	}
1982 
1983 	TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link);
1984 
1985 exit:
1986 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1987 	return rc;
1988 }
1989 
1990 static void
1991 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1992 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1993 {
1994 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
1995 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1996 	struct nvme_async_probe_ctx *ctx;
1997 	int rc;
1998 
1999 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2000 	ctx->ctrlr_attached = true;
2001 
2002 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
2003 	if (nvme_bdev_ctrlr) {
2004 		/* This is the case that a secondary path is added to an existing
2005 		 * nvme_bdev_ctrlr for failover. After checking if it can access the same
2006 		 * namespaces as the primary path, it is disconnected until failover occurs.
2007 		 */
2008 		rc = bdev_nvme_add_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid);
2009 
2010 		spdk_nvme_detach(ctrlr);
2011 		goto exit;
2012 	}
2013 
2014 	rc = nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags,
2015 				    &nvme_bdev_ctrlr);
2016 	if (rc) {
2017 		SPDK_ERRLOG("Failed to create new device\n");
2018 		goto exit;
2019 	}
2020 
2021 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
2022 	return;
2023 
2024 exit:
2025 	populate_namespaces_cb(ctx, 0, rc);
2026 }
2027 
2028 static int
2029 bdev_nvme_async_poll(void *arg)
2030 {
2031 	struct nvme_async_probe_ctx	*ctx = arg;
2032 	int				rc;
2033 
2034 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
2035 	if (spdk_unlikely(rc != -EAGAIN)) {
2036 		ctx->probe_done = true;
2037 		spdk_poller_unregister(&ctx->poller);
2038 		if (!ctx->ctrlr_attached) {
2039 			/* The probe is done, but no controller was attached.
2040 			 * That means we had a failure, so report -EIO back to
2041 			 * the caller (usually the RPC). populate_namespaces_cb()
2042 			 * will take care of freeing the nvme_async_probe_ctx.
2043 			 */
2044 			populate_namespaces_cb(ctx, 0, -EIO);
2045 		} else if (ctx->namespaces_populated) {
2046 			/* The namespaces for the attached controller were all
2047 			 * populated and the response was already sent to the
2048 			 * caller (usually the RPC).  So free the context here.
2049 			 */
2050 			free(ctx);
2051 		}
2052 	}
2053 
2054 	return SPDK_POLLER_BUSY;
2055 }
2056 
2057 int
2058 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
2059 		 struct spdk_nvme_host_id *hostid,
2060 		 const char *base_name,
2061 		 const char **names,
2062 		 uint32_t count,
2063 		 const char *hostnqn,
2064 		 uint32_t prchk_flags,
2065 		 spdk_bdev_create_nvme_fn cb_fn,
2066 		 void *cb_ctx,
2067 		 struct spdk_nvme_ctrlr_opts *opts)
2068 {
2069 	struct nvme_probe_skip_entry	*entry, *tmp;
2070 	struct nvme_async_probe_ctx	*ctx;
2071 
2072 	/* TODO expand this check to include both the host and target TRIDs.
2073 	 * Only if both are the same should we fail.
2074 	 */
2075 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
2076 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
2077 		return -EEXIST;
2078 	}
2079 
2080 	ctx = calloc(1, sizeof(*ctx));
2081 	if (!ctx) {
2082 		return -ENOMEM;
2083 	}
2084 	ctx->base_name = base_name;
2085 	ctx->names = names;
2086 	ctx->count = count;
2087 	ctx->cb_fn = cb_fn;
2088 	ctx->cb_ctx = cb_ctx;
2089 	ctx->prchk_flags = prchk_flags;
2090 	ctx->trid = *trid;
2091 
2092 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2093 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
2094 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2095 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2096 				free(entry);
2097 				break;
2098 			}
2099 		}
2100 	}
2101 
2102 	if (opts) {
2103 		memcpy(&ctx->opts, opts, sizeof(*opts));
2104 	} else {
2105 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
2106 	}
2107 
2108 	ctx->opts.transport_retry_count = g_opts.retry_count;
2109 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
2110 
2111 	if (hostnqn) {
2112 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
2113 	}
2114 
2115 	if (hostid->hostaddr[0] != '\0') {
2116 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
2117 	}
2118 
2119 	if (hostid->hostsvcid[0] != '\0') {
2120 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
2121 	}
2122 
2123 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
2124 	if (ctx->probe_ctx == NULL) {
2125 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
2126 		free(ctx);
2127 		return -ENODEV;
2128 	}
2129 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
2130 
2131 	return 0;
2132 }
2133 
2134 int
2135 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid)
2136 {
2137 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
2138 	struct nvme_bdev_ctrlr_trid	*ctrlr_trid, *tmp_trid;
2139 
2140 	if (name == NULL) {
2141 		return -EINVAL;
2142 	}
2143 
2144 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
2145 	if (nvme_bdev_ctrlr == NULL) {
2146 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2147 		return -ENODEV;
2148 	}
2149 
2150 	/* case 1: remove the controller itself. */
2151 	if (trid == NULL) {
2152 		return _bdev_nvme_delete(nvme_bdev_ctrlr, false);
2153 	}
2154 
2155 	/* case 2: we are currently using the path to be removed. */
2156 	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
2157 		ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
2158 		assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid);
2159 		/* case 2A: the current path is the only path. */
2160 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
2161 			return _bdev_nvme_delete(nvme_bdev_ctrlr, false);
2162 		}
2163 
2164 		/* case 1B: there is an alternative path. */
2165 		return bdev_nvme_failover(nvme_bdev_ctrlr, true);
2166 	}
2167 	/* case 3: We are not using the specified path. */
2168 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) {
2169 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
2170 			TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
2171 			free(ctrlr_trid);
2172 			return 0;
2173 		}
2174 	}
2175 
2176 	/* case 3A: The address isn't even in the registered list. */
2177 	return -ENXIO;
2178 }
2179 
2180 static int
2181 bdev_nvme_library_init(void)
2182 {
2183 	g_bdev_nvme_init_thread = spdk_get_thread();
2184 
2185 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
2186 				bdev_nvme_poll_group_destroy_cb,
2187 				sizeof(struct nvme_bdev_poll_group),  "bdev_nvme_poll_groups");
2188 
2189 	return 0;
2190 }
2191 
2192 static void
2193 bdev_nvme_library_fini(void)
2194 {
2195 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
2196 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2197 
2198 	spdk_poller_unregister(&g_hotplug_poller);
2199 	free(g_hotplug_probe_ctx);
2200 
2201 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2202 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2203 		free(entry);
2204 	}
2205 
2206 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2207 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
2208 		pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
2209 		if (nvme_bdev_ctrlr->destruct) {
2210 			/* This controller's destruction was already started
2211 			 * before the application started shutting down
2212 			 */
2213 			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2214 			continue;
2215 		}
2216 		nvme_bdev_ctrlr->destruct = true;
2217 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2218 
2219 		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct,
2220 				     nvme_bdev_ctrlr);
2221 	}
2222 
2223 	g_bdev_nvme_module_finish = true;
2224 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2225 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2226 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
2227 		spdk_bdev_module_finish_done();
2228 		return;
2229 	}
2230 
2231 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2232 }
2233 
2234 static void
2235 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
2236 {
2237 	struct spdk_bdev *bdev = bdev_io->bdev;
2238 	struct spdk_dif_ctx dif_ctx;
2239 	struct spdk_dif_error err_blk = {};
2240 	int rc;
2241 
2242 	rc = spdk_dif_ctx_init(&dif_ctx,
2243 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2244 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2245 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2246 	if (rc != 0) {
2247 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2248 		return;
2249 	}
2250 
2251 	if (bdev->md_interleave) {
2252 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2253 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2254 	} else {
2255 		struct iovec md_iov = {
2256 			.iov_base	= bdev_io->u.bdev.md_buf,
2257 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2258 		};
2259 
2260 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2261 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2262 	}
2263 
2264 	if (rc != 0) {
2265 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2266 			    err_blk.err_type, err_blk.err_offset);
2267 	} else {
2268 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2269 	}
2270 }
2271 
2272 static void
2273 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2274 {
2275 	struct nvme_bdev_io *bio = ref;
2276 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2277 
2278 	if (spdk_nvme_cpl_is_success(cpl)) {
2279 		/* Run PI verification for read data buffer. */
2280 		bdev_nvme_verify_pi_error(bdev_io);
2281 	}
2282 
2283 	/* Return original completion status */
2284 	spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
2285 					  bio->cpl.status.sc);
2286 }
2287 
2288 static void
2289 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2290 {
2291 	struct nvme_bdev_io *bio = ref;
2292 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2293 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2294 	struct nvme_io_channel *nvme_ch;
2295 	struct nvme_bdev_ns *nvme_ns;
2296 	struct spdk_nvme_qpair *qpair;
2297 	int ret;
2298 
2299 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2300 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2301 			    cpl->status.sct, cpl->status.sc);
2302 
2303 		/* Save completion status to use after verifying PI error. */
2304 		bio->cpl = *cpl;
2305 
2306 		nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2307 
2308 		if (spdk_likely(bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
2309 			/* Read without PI checking to verify PI error. */
2310 			ret = bdev_nvme_no_pi_readv(nvme_ns->ns,
2311 						    qpair,
2312 						    bio,
2313 						    bdev_io->u.bdev.iovs,
2314 						    bdev_io->u.bdev.iovcnt,
2315 						    bdev_io->u.bdev.md_buf,
2316 						    bdev_io->u.bdev.num_blocks,
2317 						    bdev_io->u.bdev.offset_blocks);
2318 			if (ret == 0) {
2319 				return;
2320 			}
2321 		}
2322 	}
2323 
2324 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2325 }
2326 
2327 static void
2328 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2329 {
2330 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2331 
2332 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2333 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2334 			    cpl->status.sct, cpl->status.sc);
2335 		/* Run PI verification for write data buffer if PI error is detected. */
2336 		bdev_nvme_verify_pi_error(bdev_io);
2337 	}
2338 
2339 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2340 }
2341 
2342 static void
2343 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2344 {
2345 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2346 
2347 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2348 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2349 			    cpl->status.sct, cpl->status.sc);
2350 		/* Run PI verification for compare data buffer if PI error is detected. */
2351 		bdev_nvme_verify_pi_error(bdev_io);
2352 	}
2353 
2354 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2355 }
2356 
2357 static void
2358 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2359 {
2360 	struct nvme_bdev_io *bio = ref;
2361 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2362 
2363 	/* Compare operation completion */
2364 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2365 		/* Save compare result for write callback */
2366 		bio->cpl = *cpl;
2367 		return;
2368 	}
2369 
2370 	/* Write operation completion */
2371 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2372 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2373 		 * complete the IO with the compare operation's status.
2374 		 */
2375 		if (!spdk_nvme_cpl_is_error(cpl)) {
2376 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2377 		}
2378 
2379 		spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2380 	} else {
2381 		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2382 	}
2383 }
2384 
2385 static void
2386 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2387 {
2388 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2389 
2390 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2391 }
2392 
2393 static void
2394 bdev_nvme_admin_passthru_completion(void *ctx)
2395 {
2396 	struct nvme_bdev_io *bio = ctx;
2397 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2398 
2399 	spdk_bdev_io_complete_nvme_status(bdev_io,
2400 					  bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2401 }
2402 
2403 static void
2404 bdev_nvme_abort_completion(void *ctx)
2405 {
2406 	struct nvme_bdev_io *bio = ctx;
2407 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2408 
2409 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
2410 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2411 	} else {
2412 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2413 	}
2414 }
2415 
2416 static void
2417 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
2418 {
2419 	struct nvme_bdev_io *bio = ref;
2420 
2421 	bio->cpl = *cpl;
2422 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2423 }
2424 
2425 static void
2426 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2427 {
2428 	struct nvme_bdev_io *bio = ref;
2429 
2430 	bio->cpl = *cpl;
2431 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2432 }
2433 
2434 static void
2435 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2436 {
2437 	struct nvme_bdev_io *bio = ref;
2438 	struct iovec *iov;
2439 
2440 	bio->iov_offset = sgl_offset;
2441 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2442 		iov = &bio->iovs[bio->iovpos];
2443 		if (bio->iov_offset < iov->iov_len) {
2444 			break;
2445 		}
2446 
2447 		bio->iov_offset -= iov->iov_len;
2448 	}
2449 }
2450 
2451 static int
2452 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2453 {
2454 	struct nvme_bdev_io *bio = ref;
2455 	struct iovec *iov;
2456 
2457 	assert(bio->iovpos < bio->iovcnt);
2458 
2459 	iov = &bio->iovs[bio->iovpos];
2460 
2461 	*address = iov->iov_base;
2462 	*length = iov->iov_len;
2463 
2464 	if (bio->iov_offset) {
2465 		assert(bio->iov_offset <= iov->iov_len);
2466 		*address += bio->iov_offset;
2467 		*length -= bio->iov_offset;
2468 	}
2469 
2470 	bio->iov_offset += *length;
2471 	if (bio->iov_offset == iov->iov_len) {
2472 		bio->iovpos++;
2473 		bio->iov_offset = 0;
2474 	}
2475 
2476 	return 0;
2477 }
2478 
2479 static void
2480 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2481 {
2482 	struct nvme_bdev_io *bio = ref;
2483 	struct iovec *iov;
2484 
2485 	bio->fused_iov_offset = sgl_offset;
2486 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2487 		iov = &bio->fused_iovs[bio->fused_iovpos];
2488 		if (bio->fused_iov_offset < iov->iov_len) {
2489 			break;
2490 		}
2491 
2492 		bio->fused_iov_offset -= iov->iov_len;
2493 	}
2494 }
2495 
2496 static int
2497 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2498 {
2499 	struct nvme_bdev_io *bio = ref;
2500 	struct iovec *iov;
2501 
2502 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2503 
2504 	iov = &bio->fused_iovs[bio->fused_iovpos];
2505 
2506 	*address = iov->iov_base;
2507 	*length = iov->iov_len;
2508 
2509 	if (bio->fused_iov_offset) {
2510 		assert(bio->fused_iov_offset <= iov->iov_len);
2511 		*address += bio->fused_iov_offset;
2512 		*length -= bio->fused_iov_offset;
2513 	}
2514 
2515 	bio->fused_iov_offset += *length;
2516 	if (bio->fused_iov_offset == iov->iov_len) {
2517 		bio->fused_iovpos++;
2518 		bio->fused_iov_offset = 0;
2519 	}
2520 
2521 	return 0;
2522 }
2523 
2524 static int
2525 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2526 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2527 		      void *md, uint64_t lba_count, uint64_t lba)
2528 {
2529 	int rc;
2530 
2531 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
2532 		      lba_count, lba);
2533 
2534 	bio->iovs = iov;
2535 	bio->iovcnt = iovcnt;
2536 	bio->iovpos = 0;
2537 	bio->iov_offset = 0;
2538 
2539 	rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2540 					    bdev_nvme_no_pi_readv_done, bio, 0,
2541 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2542 					    md, 0, 0);
2543 
2544 	if (rc != 0 && rc != -ENOMEM) {
2545 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2546 	}
2547 	return rc;
2548 }
2549 
2550 static int
2551 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2552 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2553 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2554 {
2555 	int rc;
2556 
2557 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2558 		      lba_count, lba);
2559 
2560 	bio->iovs = iov;
2561 	bio->iovcnt = iovcnt;
2562 	bio->iovpos = 0;
2563 	bio->iov_offset = 0;
2564 
2565 	if (iovcnt == 1) {
2566 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
2567 						   lba_count,
2568 						   bdev_nvme_readv_done, bio,
2569 						   flags,
2570 						   0, 0);
2571 	} else {
2572 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2573 						    bdev_nvme_readv_done, bio, flags,
2574 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2575 						    md, 0, 0);
2576 	}
2577 
2578 	if (rc != 0 && rc != -ENOMEM) {
2579 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2580 	}
2581 	return rc;
2582 }
2583 
2584 static int
2585 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2586 		 struct nvme_bdev_io *bio,
2587 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2588 		 uint32_t flags)
2589 {
2590 	int rc;
2591 
2592 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2593 		      lba_count, lba);
2594 
2595 	bio->iovs = iov;
2596 	bio->iovcnt = iovcnt;
2597 	bio->iovpos = 0;
2598 	bio->iov_offset = 0;
2599 
2600 	if (iovcnt == 1) {
2601 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
2602 						    lba_count,
2603 						    bdev_nvme_writev_done, bio,
2604 						    flags,
2605 						    0, 0);
2606 	} else {
2607 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
2608 						     bdev_nvme_writev_done, bio, flags,
2609 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2610 						     md, 0, 0);
2611 	}
2612 
2613 	if (rc != 0 && rc != -ENOMEM) {
2614 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2615 	}
2616 	return rc;
2617 }
2618 
2619 static int
2620 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2621 		   struct nvme_bdev_io *bio,
2622 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2623 		   uint32_t flags)
2624 {
2625 	int rc;
2626 
2627 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2628 		      lba_count, lba);
2629 
2630 	bio->iovs = iov;
2631 	bio->iovcnt = iovcnt;
2632 	bio->iovpos = 0;
2633 	bio->iov_offset = 0;
2634 
2635 	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
2636 					       bdev_nvme_comparev_done, bio, flags,
2637 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2638 					       md, 0, 0);
2639 
2640 	if (rc != 0 && rc != -ENOMEM) {
2641 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
2642 	}
2643 	return rc;
2644 }
2645 
2646 static int
2647 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2648 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
2649 			      struct iovec *write_iov, int write_iovcnt,
2650 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2651 {
2652 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2653 	int rc;
2654 
2655 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2656 		      lba_count, lba);
2657 
2658 	bio->iovs = cmp_iov;
2659 	bio->iovcnt = cmp_iovcnt;
2660 	bio->iovpos = 0;
2661 	bio->iov_offset = 0;
2662 	bio->fused_iovs = write_iov;
2663 	bio->fused_iovcnt = write_iovcnt;
2664 	bio->fused_iovpos = 0;
2665 	bio->fused_iov_offset = 0;
2666 
2667 	if (bdev_io->num_retries == 0) {
2668 		bio->first_fused_submitted = false;
2669 	}
2670 
2671 	if (!bio->first_fused_submitted) {
2672 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2673 		memset(&bio->cpl, 0, sizeof(bio->cpl));
2674 
2675 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
2676 						       bdev_nvme_comparev_and_writev_done, bio, flags,
2677 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
2678 		if (rc == 0) {
2679 			bio->first_fused_submitted = true;
2680 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2681 		} else {
2682 			if (rc != -ENOMEM) {
2683 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
2684 			}
2685 			return rc;
2686 		}
2687 	}
2688 
2689 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
2690 
2691 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
2692 					     bdev_nvme_comparev_and_writev_done, bio, flags,
2693 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
2694 	if (rc != 0 && rc != -ENOMEM) {
2695 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
2696 		rc = 0;
2697 	}
2698 
2699 	return rc;
2700 }
2701 
2702 static int
2703 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2704 		struct nvme_bdev_io *bio,
2705 		uint64_t offset_blocks,
2706 		uint64_t num_blocks)
2707 {
2708 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
2709 	struct spdk_nvme_dsm_range *range;
2710 	uint64_t offset, remaining;
2711 	uint64_t num_ranges_u64;
2712 	uint16_t num_ranges;
2713 	int rc;
2714 
2715 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
2716 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2717 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
2718 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
2719 		return -EINVAL;
2720 	}
2721 	num_ranges = (uint16_t)num_ranges_u64;
2722 
2723 	offset = offset_blocks;
2724 	remaining = num_blocks;
2725 	range = &dsm_ranges[0];
2726 
2727 	/* Fill max-size ranges until the remaining blocks fit into one range */
2728 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
2729 		range->attributes.raw = 0;
2730 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2731 		range->starting_lba = offset;
2732 
2733 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2734 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2735 		range++;
2736 	}
2737 
2738 	/* Final range describes the remaining blocks */
2739 	range->attributes.raw = 0;
2740 	range->length = remaining;
2741 	range->starting_lba = offset;
2742 
2743 	rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair,
2744 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
2745 			dsm_ranges, num_ranges,
2746 			bdev_nvme_queued_done, bio);
2747 
2748 	return rc;
2749 }
2750 
2751 static int
2752 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
2753 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2754 {
2755 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ch->ctrlr->ctrlr);
2756 
2757 	if (nbytes > max_xfer_size) {
2758 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2759 		return -EINVAL;
2760 	}
2761 
2762 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2763 
2764 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ch->ctrlr->ctrlr, cmd, buf,
2765 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
2766 }
2767 
2768 static int
2769 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2770 		      struct nvme_bdev_io *bio,
2771 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2772 {
2773 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
2774 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2775 
2776 	if (nbytes > max_xfer_size) {
2777 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2778 		return -EINVAL;
2779 	}
2780 
2781 	/*
2782 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2783 	 * so fill it out automatically.
2784 	 */
2785 	cmd->nsid = spdk_nvme_ns_get_id(ns);
2786 
2787 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
2788 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
2789 }
2790 
2791 static int
2792 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2793 			 struct nvme_bdev_io *bio,
2794 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
2795 {
2796 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
2797 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
2798 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2799 
2800 	if (nbytes > max_xfer_size) {
2801 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2802 		return -EINVAL;
2803 	}
2804 
2805 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
2806 		SPDK_ERRLOG("invalid meta data buffer size\n");
2807 		return -EINVAL;
2808 	}
2809 
2810 	/*
2811 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2812 	 * so fill it out automatically.
2813 	 */
2814 	cmd->nsid = spdk_nvme_ns_get_id(ns);
2815 
2816 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
2817 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
2818 }
2819 
2820 static void
2821 bdev_nvme_abort_admin_cmd(void *ctx)
2822 {
2823 	struct nvme_bdev_io *bio = ctx;
2824 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2825 	struct nvme_io_channel *nvme_ch;
2826 	struct nvme_bdev_io *bio_to_abort;
2827 	int rc;
2828 
2829 	nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2830 	bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
2831 
2832 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
2833 					   NULL,
2834 					   bio_to_abort,
2835 					   bdev_nvme_abort_done, bio);
2836 	if (rc == -ENOENT) {
2837 		/* If no admin command was found in admin qpair, complete the abort
2838 		 * request with failure.
2839 		 */
2840 		bio->cpl.cdw0 |= 1U;
2841 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
2842 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2843 
2844 		spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2845 	}
2846 }
2847 
2848 static int
2849 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
2850 		struct nvme_bdev_io *bio_to_abort)
2851 {
2852 	int rc;
2853 
2854 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2855 
2856 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
2857 					   nvme_ch->qpair,
2858 					   bio_to_abort,
2859 					   bdev_nvme_abort_done, bio);
2860 	if (rc == -ENOENT) {
2861 		/* If no command was found in I/O qpair, the target command may be
2862 		 * admin command. Only a single thread tries aborting admin command
2863 		 * to clean I/O flow.
2864 		 */
2865 		spdk_thread_send_msg(nvme_ch->ctrlr->thread,
2866 				     bdev_nvme_abort_admin_cmd, bio);
2867 		rc = 0;
2868 	}
2869 
2870 	return rc;
2871 }
2872 
/*
 * Namespace config-JSON writer that emits nothing; both arguments are
 * intentionally unused.
 */
static void
nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
		struct nvme_bdev_ns *nvme_ns)
{
	/* nop */
	(void)w;
	(void)nvme_ns;
}
2879 
2880 static void
2881 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns)
2882 {
2883 	g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns);
2884 }
2885 
2886 static void
2887 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
2888 {
2889 	const char	*action;
2890 
2891 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
2892 		action = "reset";
2893 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
2894 		action = "abort";
2895 	} else {
2896 		action = "none";
2897 	}
2898 
2899 	spdk_json_write_object_begin(w);
2900 
2901 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
2902 
2903 	spdk_json_write_named_object_begin(w, "params");
2904 	spdk_json_write_named_string(w, "action_on_timeout", action);
2905 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
2906 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
2907 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
2908 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
2909 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
2910 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
2911 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
2912 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
2913 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
2914 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
2915 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
2916 	spdk_json_write_object_end(w);
2917 
2918 	spdk_json_write_object_end(w);
2919 }
2920 
2921 static void
2922 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w,
2923 			    struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
2924 {
2925 	struct spdk_nvme_transport_id	*trid;
2926 
2927 	trid = nvme_bdev_ctrlr->connected_trid;
2928 
2929 	spdk_json_write_object_begin(w);
2930 
2931 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
2932 
2933 	spdk_json_write_named_object_begin(w, "params");
2934 	spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
2935 	nvme_bdev_dump_trid_json(trid, w);
2936 	spdk_json_write_named_bool(w, "prchk_reftag",
2937 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
2938 	spdk_json_write_named_bool(w, "prchk_guard",
2939 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
2940 
2941 	spdk_json_write_object_end(w);
2942 
2943 	spdk_json_write_object_end(w);
2944 }
2945 
2946 static void
2947 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
2948 {
2949 	spdk_json_write_object_begin(w);
2950 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
2951 
2952 	spdk_json_write_named_object_begin(w, "params");
2953 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
2954 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
2955 	spdk_json_write_object_end(w);
2956 
2957 	spdk_json_write_object_end(w);
2958 }
2959 
2960 static int
2961 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
2962 {
2963 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
2964 	uint32_t		nsid;
2965 
2966 	bdev_nvme_opts_config_json(w);
2967 
2968 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2969 
2970 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2971 		nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr);
2972 
2973 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
2974 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
2975 				continue;
2976 			}
2977 
2978 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
2979 		}
2980 	}
2981 
2982 	/* Dump as last parameter to give all NVMe bdevs chance to be constructed
2983 	 * before enabling hotplug poller.
2984 	 */
2985 	bdev_nvme_hotplug_config_json(w);
2986 
2987 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2988 	return 0;
2989 }
2990 
2991 struct spdk_nvme_ctrlr *
2992 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
2993 {
2994 	if (!bdev || bdev->module != &nvme_if) {
2995 		return NULL;
2996 	}
2997 
2998 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
2999 }
3000 
/* Log component named by the SPDK_DEBUGLOG(bdev_nvme, ...) calls in this file. */
SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
3002