xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 4e8e97c886e47e337dc470ac8c1ffa044d729af0)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/config.h"
40 #include "spdk/conf.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/nvme.h"
45 #include "spdk/nvme_ocssd.h"
46 #include "spdk/thread.h"
47 #include "spdk/string.h"
48 #include "spdk/likely.h"
49 #include "spdk/util.h"
50 
51 #include "spdk/bdev_module.h"
52 #include "spdk/log.h"
53 
54 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
55 
56 static void bdev_nvme_get_spdk_running_config(FILE *fp);
57 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
58 
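/*
 * Per-I/O context for this module. It lives in the driver_ctx area of each
 * spdk_bdev_io (see bdev_nvme_get_ctx_size()) and tracks iovec iteration
 * state while building NVMe SGLs, plus the extra state needed for fused
 * compare-and-write and admin passthru completions.
 */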
59 struct nvme_bdev_io {
60 	/** array of iovecs to transfer. */
61 	struct iovec *iovs;
62 
63 	/** Number of iovecs in iovs array. */
64 	int iovcnt;
65 
66 	/** Current iovec position. */
67 	int iovpos;
68 
69 	/** Offset in current iovec. */
70 	uint32_t iov_offset;
71 
72 	/** Array of iovecs for the fused (second) command to transfer. */
73 	struct iovec *fused_iovs;
74 
75 	/** Number of iovecs in fused_iovs array. */
76 	int fused_iovcnt;
77 
78 	/** Current iovec position. */
79 	int fused_iovpos;
80 
81 	/** Offset in current iovec. */
82 	uint32_t fused_iov_offset;
83 
84 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
85 	struct spdk_nvme_cpl cpl;
86 
87 	/** Originating thread */
88 	struct spdk_thread *orig_thread;
89 
90 	/** Tracks whether the first command of the fused pair has been submitted */
91 	bool first_fused_submitted;
92 };
93 
94 struct nvme_probe_ctx {
95 	size_t count;
96 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
97 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
98 	const char *names[NVME_MAX_CONTROLLERS];
99 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
100 	const char *hostnqn;
101 };
102 
103 struct nvme_probe_skip_entry {
104 	struct spdk_nvme_transport_id		trid;
105 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
106 };
107 /* Controllers deleted by users via RPC are skipped by the hotplug monitor */
108 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
109 			g_skipped_nvme_ctrlrs);
110 
111 static struct spdk_bdev_nvme_opts g_opts = {
112 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
113 	.timeout_us = 0,
114 	.retry_count = 4,
115 	.arbitration_burst = 0,
116 	.low_priority_weight = 0,
117 	.medium_priority_weight = 0,
118 	.high_priority_weight = 0,
119 	.nvme_adminq_poll_period_us = 10000ULL,
120 	.nvme_ioq_poll_period_us = 0,
121 	.io_queue_requests = 0,
122 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
123 };
124 
125 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
126 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
127 
128 static int g_hot_insert_nvme_controller_index = 0;
129 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
130 static bool g_nvme_hotplug_enabled = false;
131 static struct spdk_thread *g_bdev_nvme_init_thread;
132 static struct spdk_poller *g_hotplug_poller;
133 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
134 static char *g_nvme_hostnqn = NULL;
135 
136 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
137 		struct nvme_async_probe_ctx *ctx);
138 static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx);
139 static int bdev_nvme_library_init(void);
140 static void bdev_nvme_library_fini(void);
141 static int bdev_nvme_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
142 			   struct nvme_bdev_io *bio,
143 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
144 			   uint32_t flags);
145 static int bdev_nvme_no_pi_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
146 				 struct nvme_bdev_io *bio,
147 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
148 static int bdev_nvme_writev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
149 			    struct nvme_bdev_io *bio,
150 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
151 			    uint32_t flags);
152 static int bdev_nvme_comparev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
153 			      struct nvme_bdev_io *bio,
154 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
155 			      uint32_t flags);
156 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_ns *nvme_ns,
157 		struct nvme_io_channel *nvme_ch,
158 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
159 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
160 		uint32_t flags);
161 static int bdev_nvme_admin_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
162 				    struct nvme_bdev_io *bio,
163 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
164 static int bdev_nvme_io_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
165 				 struct nvme_bdev_io *bio,
166 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
167 static int bdev_nvme_io_passthru_md(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
168 				    struct nvme_bdev_io *bio,
169 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
170 static int bdev_nvme_abort(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
171 			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
172 static int bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio,
173 			   bool failover);
174 
175 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
176 				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
177 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
178 		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
179 
180 static populate_namespace_fn g_populate_namespace_fn[] = {
181 	NULL,
182 	nvme_ctrlr_populate_standard_namespace,
183 	bdev_ocssd_populate_namespace,
184 };
185 
186 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *ns);
187 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns);
188 
189 static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
190 	NULL,
191 	nvme_ctrlr_depopulate_standard_namespace,
192 	bdev_ocssd_depopulate_namespace,
193 };
194 
195 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns);
196 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
197 		struct nvme_bdev_ns *ns);
198 
199 static config_json_namespace_fn g_config_json_namespace_fn[] = {
200 	NULL,
201 	nvme_ctrlr_config_json_standard_namespace,
202 	bdev_ocssd_namespace_config_json,
203 };
204 
205 struct spdk_nvme_qpair *
206 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
207 {
208 	struct nvme_io_channel *nvme_ch;
209 
210 	nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
211 
212 	return nvme_ch->qpair;
213 }
214 
215 static int
216 bdev_nvme_get_ctx_size(void)
217 {
218 	return sizeof(struct nvme_bdev_io);
219 }
220 
221 static struct spdk_bdev_module nvme_if = {
222 	.name = "nvme",
223 	.async_fini = true,
224 	.module_init = bdev_nvme_library_init,
225 	.module_fini = bdev_nvme_library_fini,
226 	.config_text = bdev_nvme_get_spdk_running_config,
227 	.config_json = bdev_nvme_config_json,
228 	.get_ctx_size = bdev_nvme_get_ctx_size,
229 
230 };
231 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
232 
233 static void
234 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
235 {
236 	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
237 	/*
238 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
239 	 * reconnect a qpair and we will stop getting a callback for this one.
240 	 */
241 	spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
242 }
243 
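/*
 * I/O poller registered per poll group. Processes completions for every
 * qpair in the group and, when collect_spin_stat is enabled (VTune builds),
 * accounts the time spent polling without finding any completions.
 */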
244 static int
245 bdev_nvme_poll(void *arg)
246 {
247 	struct nvme_bdev_poll_group *group = arg;
248 	int64_t num_completions;
249 
250 	if (group->collect_spin_stat && group->start_ticks == 0) {
251 		group->start_ticks = spdk_get_ticks();
252 	}
253 
254 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
255 			  bdev_nvme_disconnected_qpair_cb);
256 	if (group->collect_spin_stat) {
257 		if (num_completions > 0) {
258 			if (group->end_ticks != 0) {
259 				group->spin_ticks += (group->end_ticks - group->start_ticks);
260 				group->end_ticks = 0;
261 			}
262 			group->start_ticks = 0;
263 		} else {
264 			group->end_ticks = spdk_get_ticks();
265 		}
266 	}
267 
268 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
269 }
270 
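/*
 * Admin queue poller registered per controller. A negative return from
 * spdk_nvme_ctrlr_process_admin_completions() is treated as a controller
 * failure and triggers a reset with failover to the next registered trid.
 */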
271 static int
272 bdev_nvme_poll_adminq(void *arg)
273 {
274 	int32_t rc;
275 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
276 
277 	assert(nvme_bdev_ctrlr != NULL);
278 
279 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr);
280 	if (rc < 0) {
281 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, true);
282 	}
283 
284 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
285 }
286 
287 static int
288 bdev_nvme_destruct(void *ctx)
289 {
290 	struct nvme_bdev *nvme_disk = ctx;
291 
292 	nvme_bdev_detach_bdev_from_ns(nvme_disk);
293 
294 	free(nvme_disk->disk.name);
295 	free(nvme_disk);
296 
297 	return 0;
298 }
299 
300 static int
301 bdev_nvme_flush(struct nvme_bdev_ns *nvme_ns, struct nvme_bdev_io *bio,
302 		uint64_t offset, uint64_t nbytes)
303 {
304 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
305 
306 	return 0;
307 }
308 
309 static void
310 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
311 {
312 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
313 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
314 	struct spdk_bdev_io *bdev_io;
315 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
316 
317 	/* A NULL ctx means success. */
318 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
319 		status = SPDK_BDEV_IO_STATUS_FAILED;
320 	}
321 
322 	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
323 		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
324 		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
325 		spdk_bdev_io_complete(bdev_io, status);
326 	}
327 
328 	spdk_for_each_channel_continue(i, 0);
329 }
330 
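/*
 * Final step of a reset: clear the resetting/failover flags and then walk
 * every channel to complete any resets that were queued while this one was
 * in progress.
 */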
331 static void
332 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
333 {
334 	/* We are using the for_each_channel cb_arg like a return code here.
335 	 * If it's zero, we succeeded, otherwise, the reset failed. */
336 	void *cb_arg = NULL;
337 
338 	if (rc) {
339 		cb_arg = (void *)0x1;
340 		SPDK_ERRLOG("Resetting controller failed.\n");
341 	} else {
342 		SPDK_NOTICELOG("Resetting controller successful.\n");
343 	}
344 
345 	pthread_mutex_lock(&g_bdev_nvme_mutex);
346 	nvme_bdev_ctrlr->resetting = false;
347 	nvme_bdev_ctrlr->failover_in_progress = false;
348 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
349 	/* Make sure we clear any pending resets before returning. */
350 	spdk_for_each_channel(nvme_bdev_ctrlr,
351 			      _bdev_nvme_complete_pending_resets,
352 			      cb_arg, NULL);
353 }
354 
355 static void
356 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
357 {
358 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
359 	void *ctx = spdk_io_channel_iter_get_ctx(i);
360 	int rc = SPDK_BDEV_IO_STATUS_SUCCESS;
361 
362 	if (status) {
363 		rc = SPDK_BDEV_IO_STATUS_FAILED;
364 	}
365 	if (ctx) {
366 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(ctx), rc);
367 	}
368 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
369 }
370 
371 static void
372 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
373 {
374 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
375 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
376 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
377 	struct spdk_nvme_io_qpair_opts opts;
378 
379 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
380 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
381 	opts.create_only = true;
382 
383 	nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
384 	if (!nvme_ch->qpair) {
385 		spdk_for_each_channel_continue(i, -1);
386 		return;
387 	}
388 
389 	assert(nvme_ch->group != NULL);
390 	if (spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair) != 0) {
391 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
392 		spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
393 		spdk_for_each_channel_continue(i, -1);
394 		return;
395 	}
396 
397 	if (spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair)) {
398 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
399 		spdk_nvme_poll_group_remove(nvme_ch->group->group, nvme_ch->qpair);
400 		spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
401 		spdk_for_each_channel_continue(i, -1);
402 		return;
403 	}
404 
405 	spdk_for_each_channel_continue(i, 0);
406 }
407 
408 static void
409 _bdev_nvme_reset(struct spdk_io_channel_iter *i, int status)
410 {
411 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
412 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
413 	int rc;
414 
415 	if (status) {
416 		if (bio) {
417 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
418 		}
419 		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
420 		return;
421 	}
422 
423 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
424 	if (rc != 0) {
425 		if (bio) {
426 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
427 		}
428 		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
429 		return;
430 	}
431 
432 	/* Recreate all of the I/O queue pairs */
433 	spdk_for_each_channel(nvme_bdev_ctrlr,
434 			      _bdev_nvme_reset_create_qpair,
435 			      bio,
436 			      _bdev_nvme_reset_create_qpairs_done);
437 
439 }
440 
441 static void
442 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
443 {
444 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
445 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
446 	int rc;
447 
448 	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
449 	if (!rc) {
450 		nvme_ch->qpair = NULL;
451 	}
452 
453 	spdk_for_each_channel_continue(i, rc);
454 }
455 
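/*
 * Reset (and optionally fail over) a controller. The sequence is: destroy
 * the I/O qpairs on every channel, reset the controller, recreate and
 * reconnect the qpairs, then complete any pending resets. If a reset is
 * already in progress, a bdev-initiated reset is queued on the channel's
 * pending_resets list instead of starting a new one.
 */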
456 static int
457 bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio, bool failover)
458 {
459 	struct spdk_io_channel *ch;
460 	struct nvme_io_channel *nvme_ch;
461 	struct nvme_bdev_ctrlr_trid *next_trid = NULL, *tmp_trid = NULL;
462 	int rc = 0;
463 
464 	pthread_mutex_lock(&g_bdev_nvme_mutex);
465 	if (nvme_bdev_ctrlr->destruct) {
466 		/* Don't bother resetting if the controller is in the process of being destructed. */
467 		if (bio) {
468 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
469 		}
470 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
471 		return 0;
472 	}
473 
474 	if (failover) {
475 		tmp_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
476 		assert(tmp_trid);
477 		assert(&tmp_trid->trid == nvme_bdev_ctrlr->connected_trid);
478 		next_trid = TAILQ_NEXT(tmp_trid, link);
479 		if (!next_trid) {
480 			failover = false;
481 		}
482 	}
483 
484 	if (!nvme_bdev_ctrlr->resetting) {
485 		nvme_bdev_ctrlr->resetting = true;
486 		if (failover) {
487 			nvme_bdev_ctrlr->failover_in_progress = true;
488 		}
489 	} else {
490 		if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) {
491 			rc = -EAGAIN;
492 		}
493 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
494 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
495 		/*
496 		 * The internal reset calls won't be queued. This is on purpose so that we don't
497 		 * interfere with the app framework reset strategy. i.e. we are deferring to the
498 		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
499 		 */
500 		if (bio) {
501 			ch = spdk_get_io_channel(nvme_bdev_ctrlr);
502 			assert(ch != NULL);
503 			nvme_ch = spdk_io_channel_get_ctx(ch);
504 			TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, spdk_bdev_io_from_ctx(bio), module_link);
505 			spdk_put_io_channel(ch);
506 		}
507 		return rc;
508 	}
509 
510 	if (failover) {
511 		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
512 		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
513 		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
514 		assert(rc == 0);
515 		/* Shuffle the old trid to the end of the list and use the new one.
516 		 * Allows for round robin through multiple connections.
517 		 */
518 		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, tmp_trid, link);
519 		TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, tmp_trid, link);
520 	}
521 
522 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
523 	/* First, delete all NVMe I/O queue pairs. */
524 	spdk_for_each_channel(nvme_bdev_ctrlr,
525 			      _bdev_nvme_reset_destroy_qpair,
526 			      bio,
527 			      _bdev_nvme_reset);
528 
529 	return 0;
530 }
531 
532 static int
533 bdev_nvme_unmap(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
534 		struct nvme_bdev_io *bio,
535 		uint64_t offset_blocks,
536 		uint64_t num_blocks);
537 
538 static void
539 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
540 		     bool success)
541 {
542 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
543 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
544 	int ret;
545 
546 	if (!success) {
547 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
548 		return;
549 	}
550 
551 	ret = bdev_nvme_readv(nbdev->nvme_ns,
552 			      nvme_ch,
553 			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
554 			      bdev_io->u.bdev.iovs,
555 			      bdev_io->u.bdev.iovcnt,
556 			      bdev_io->u.bdev.md_buf,
557 			      bdev_io->u.bdev.num_blocks,
558 			      bdev_io->u.bdev.offset_blocks,
559 			      nbdev->disk.dif_check_flags);
560 
561 	if (spdk_likely(ret == 0)) {
562 		return;
563 	} else if (ret == -ENOMEM) {
564 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
565 	} else {
566 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
567 	}
568 }
569 
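/*
 * Map a generic bdev I/O to the corresponding NVMe command. Returning
 * -ENOMEM makes the caller complete the I/O with NOMEM status so the bdev
 * layer retries it later; any other non-zero value fails the I/O.
 */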
570 static int
571 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
572 {
573 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
574 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
575 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
576 	struct nvme_bdev_io *nbdev_io_to_abort;
577 
578 	if (nvme_ch->qpair == NULL) {
579 		/* The device is currently resetting */
580 		return -1;
581 	}
582 
583 	switch (bdev_io->type) {
584 	case SPDK_BDEV_IO_TYPE_READ:
585 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
586 			bdev_nvme_get_buf_cb(ch, bdev_io, true);
587 		} else {
588 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
589 					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
590 		}
591 		return 0;
592 
593 	case SPDK_BDEV_IO_TYPE_WRITE:
594 		return bdev_nvme_writev(nbdev->nvme_ns,
595 					nvme_ch,
596 					nbdev_io,
597 					bdev_io->u.bdev.iovs,
598 					bdev_io->u.bdev.iovcnt,
599 					bdev_io->u.bdev.md_buf,
600 					bdev_io->u.bdev.num_blocks,
601 					bdev_io->u.bdev.offset_blocks,
602 					nbdev->disk.dif_check_flags);
603 
604 	case SPDK_BDEV_IO_TYPE_COMPARE:
605 		return bdev_nvme_comparev(nbdev->nvme_ns,
606 					  nvme_ch,
607 					  nbdev_io,
608 					  bdev_io->u.bdev.iovs,
609 					  bdev_io->u.bdev.iovcnt,
610 					  bdev_io->u.bdev.md_buf,
611 					  bdev_io->u.bdev.num_blocks,
612 					  bdev_io->u.bdev.offset_blocks,
613 					  nbdev->disk.dif_check_flags);
614 
615 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
616 		return bdev_nvme_comparev_and_writev(nbdev->nvme_ns,
617 						     nvme_ch,
618 						     nbdev_io,
619 						     bdev_io->u.bdev.iovs,
620 						     bdev_io->u.bdev.iovcnt,
621 						     bdev_io->u.bdev.fused_iovs,
622 						     bdev_io->u.bdev.fused_iovcnt,
623 						     bdev_io->u.bdev.md_buf,
624 						     bdev_io->u.bdev.num_blocks,
625 						     bdev_io->u.bdev.offset_blocks,
626 						     nbdev->disk.dif_check_flags);
627 
628 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
629 		return bdev_nvme_unmap(nbdev->nvme_ns,
630 				       nvme_ch,
631 				       nbdev_io,
632 				       bdev_io->u.bdev.offset_blocks,
633 				       bdev_io->u.bdev.num_blocks);
634 
635 	case SPDK_BDEV_IO_TYPE_UNMAP:
636 		return bdev_nvme_unmap(nbdev->nvme_ns,
637 				       nvme_ch,
638 				       nbdev_io,
639 				       bdev_io->u.bdev.offset_blocks,
640 				       bdev_io->u.bdev.num_blocks);
641 
642 	case SPDK_BDEV_IO_TYPE_RESET:
643 		return bdev_nvme_reset(nbdev->nvme_ns->ctrlr, nbdev_io, false);
644 
645 	case SPDK_BDEV_IO_TYPE_FLUSH:
646 		return bdev_nvme_flush(nbdev->nvme_ns,
647 				       nbdev_io,
648 				       bdev_io->u.bdev.offset_blocks,
649 				       bdev_io->u.bdev.num_blocks);
650 
651 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
652 		return bdev_nvme_admin_passthru(nbdev->nvme_ns,
653 						nvme_ch,
654 						nbdev_io,
655 						&bdev_io->u.nvme_passthru.cmd,
656 						bdev_io->u.nvme_passthru.buf,
657 						bdev_io->u.nvme_passthru.nbytes);
658 
659 	case SPDK_BDEV_IO_TYPE_NVME_IO:
660 		return bdev_nvme_io_passthru(nbdev->nvme_ns,
661 					     nvme_ch,
662 					     nbdev_io,
663 					     &bdev_io->u.nvme_passthru.cmd,
664 					     bdev_io->u.nvme_passthru.buf,
665 					     bdev_io->u.nvme_passthru.nbytes);
666 
667 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
668 		return bdev_nvme_io_passthru_md(nbdev->nvme_ns,
669 						nvme_ch,
670 						nbdev_io,
671 						&bdev_io->u.nvme_passthru.cmd,
672 						bdev_io->u.nvme_passthru.buf,
673 						bdev_io->u.nvme_passthru.nbytes,
674 						bdev_io->u.nvme_passthru.md_buf,
675 						bdev_io->u.nvme_passthru.md_len);
676 
677 	case SPDK_BDEV_IO_TYPE_ABORT:
678 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
679 		return bdev_nvme_abort(nbdev->nvme_ns,
680 				       nvme_ch,
681 				       nbdev_io,
682 				       nbdev_io_to_abort);
683 
684 	default:
685 		return -EINVAL;
686 	}
687 	return 0;
688 }
689 
690 static void
691 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
692 {
693 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
694 
695 	if (spdk_unlikely(rc != 0)) {
696 		if (rc == -ENOMEM) {
697 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
698 		} else {
699 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
700 		}
701 	}
702 }
703 
704 static bool
705 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
706 {
707 	struct nvme_bdev *nbdev = ctx;
708 	struct nvme_bdev_ns *nvme_ns = nbdev->nvme_ns;
709 	const struct spdk_nvme_ctrlr_data *cdata;
710 
711 	switch (io_type) {
712 	case SPDK_BDEV_IO_TYPE_READ:
713 	case SPDK_BDEV_IO_TYPE_WRITE:
714 	case SPDK_BDEV_IO_TYPE_RESET:
715 	case SPDK_BDEV_IO_TYPE_FLUSH:
716 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
717 	case SPDK_BDEV_IO_TYPE_NVME_IO:
718 	case SPDK_BDEV_IO_TYPE_ABORT:
719 		return true;
720 
721 	case SPDK_BDEV_IO_TYPE_COMPARE:
722 		return spdk_nvme_ns_supports_compare(nvme_ns->ns);
723 
724 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
725 		return spdk_nvme_ns_get_md_size(nvme_ns->ns) ? true : false;
726 
727 	case SPDK_BDEV_IO_TYPE_UNMAP:
728 		cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
729 		return cdata->oncs.dsm;
730 
731 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
732 		cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
733 		/*
734 		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
735 		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
736 		 */
737 		if (cdata->oncs.dsm &&
738 		    spdk_nvme_ns_get_dealloc_logical_block_read_value(nvme_ns->ns) ==
739 		    SPDK_NVME_DEALLOC_READ_00) {
740 			return true;
741 		}
742 		/*
743 		 * The NVMe controller write_zeroes function is currently not used by our driver.
744 		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
745 		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
746 		 */
747 		return false;
748 
749 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
750 		if (spdk_nvme_ctrlr_get_flags(nvme_ns->ctrlr->ctrlr) &
751 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
752 			return true;
753 		}
754 		return false;
755 
756 	default:
757 		return false;
758 	}
759 }
760 
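/*
 * I/O channel create callback for a controller. Allocates an I/O qpair,
 * attaches it to the per-thread poll group and connects it, so all qpairs
 * on a thread are serviced by a single bdev_nvme_poll() poller.
 */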
761 static int
762 bdev_nvme_create_cb(void *io_device, void *ctx_buf)
763 {
764 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
765 	struct nvme_io_channel *ch = ctx_buf;
766 	struct spdk_nvme_io_qpair_opts opts;
767 	struct spdk_io_channel *pg_ch = NULL;
768 	int rc;
769 
770 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
771 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
772 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
773 	opts.create_only = true;
774 	g_opts.io_queue_requests = opts.io_queue_requests;
775 
776 	ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
777 
778 	if (ch->qpair == NULL) {
779 		return -1;
780 	}
781 
782 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
783 		if (bdev_ocssd_create_io_channel(ch)) {
784 			goto err;
785 		}
786 	}
787 
788 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
789 	if (!pg_ch) {
790 		goto err;
791 	}
792 
793 	ch->group = spdk_io_channel_get_ctx(pg_ch);
794 	if (spdk_nvme_poll_group_add(ch->group->group, ch->qpair) != 0) {
795 		goto err;
796 	}
797 
798 	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, ch->qpair);
799 	if (rc) {
800 		spdk_nvme_poll_group_remove(ch->group->group, ch->qpair);
801 		goto err;
802 	}
803 
804 #ifdef SPDK_CONFIG_VTUNE
805 	ch->group->collect_spin_stat = true;
806 #else
807 	ch->group->collect_spin_stat = false;
808 #endif
809 
810 	TAILQ_INIT(&ch->pending_resets);
811 	return 0;
812 
813 err:
814 	if (pg_ch) {
815 		spdk_put_io_channel(pg_ch);
816 	}
817 	spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
818 	return -1;
819 }
820 
821 static void
822 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
823 {
824 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
825 	struct nvme_io_channel *ch = ctx_buf;
826 	struct nvme_bdev_poll_group *group;
827 
828 	group = ch->group;
829 	assert(group != NULL);
830 
831 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
832 		bdev_ocssd_destroy_io_channel(ch);
833 	}
834 
835 	if (ch->qpair != NULL) {
836 		spdk_nvme_poll_group_remove(group->group, ch->qpair);
837 	}
838 	spdk_put_io_channel(spdk_io_channel_from_ctx(group));
839 
840 	spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
841 }
842 
843 static int
844 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
845 {
846 	struct nvme_bdev_poll_group *group = ctx_buf;
847 
848 	group->group = spdk_nvme_poll_group_create(group);
849 	if (group->group == NULL) {
850 		return -1;
851 	}
852 
853 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
854 
855 	if (group->poller == NULL) {
856 		spdk_nvme_poll_group_destroy(group->group);
857 		return -1;
858 	}
859 
860 	return 0;
861 }
862 
863 static void
864 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
865 {
866 	struct nvme_bdev_poll_group *group = ctx_buf;
867 
868 	spdk_poller_unregister(&group->poller);
869 	if (spdk_nvme_poll_group_destroy(group->group)) {
870 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
871 		assert(false);
872 	}
873 }
874 
875 static struct spdk_io_channel *
876 bdev_nvme_get_io_channel(void *ctx)
877 {
878 	struct nvme_bdev *nvme_bdev = ctx;
879 
880 	return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr);
881 }
882 
883 static int
884 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
885 {
886 	struct nvme_bdev *nvme_bdev = ctx;
887 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_bdev->nvme_ns->ctrlr;
888 	const struct spdk_nvme_ctrlr_data *cdata;
889 	struct spdk_nvme_ns *ns;
890 	union spdk_nvme_vs_register vs;
891 	union spdk_nvme_csts_register csts;
892 	char buf[128];
893 
894 	cdata = spdk_nvme_ctrlr_get_data(nvme_bdev_ctrlr->ctrlr);
895 	vs = spdk_nvme_ctrlr_get_regs_vs(nvme_bdev_ctrlr->ctrlr);
896 	csts = spdk_nvme_ctrlr_get_regs_csts(nvme_bdev_ctrlr->ctrlr);
897 	ns = nvme_bdev->nvme_ns->ns;
898 
899 	spdk_json_write_named_object_begin(w, "nvme");
900 
901 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
902 		spdk_json_write_named_string(w, "pci_address", nvme_bdev_ctrlr->connected_trid->traddr);
903 	}
904 
905 	spdk_json_write_named_object_begin(w, "trid");
906 
907 	nvme_bdev_dump_trid_json(nvme_bdev_ctrlr->connected_trid, w);
908 
909 	spdk_json_write_object_end(w);
910 
911 #ifdef SPDK_CONFIG_NVME_CUSE
912 	size_t cuse_name_size = 128;
913 	char cuse_name[cuse_name_size];
914 
915 	int rc = spdk_nvme_cuse_get_ns_name(nvme_bdev_ctrlr->ctrlr, spdk_nvme_ns_get_id(ns),
916 					    cuse_name, &cuse_name_size);
917 	if (rc == 0) {
918 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
919 	}
920 #endif
921 
922 	spdk_json_write_named_object_begin(w, "ctrlr_data");
923 
924 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
925 
926 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
927 	spdk_str_trim(buf);
928 	spdk_json_write_named_string(w, "model_number", buf);
929 
930 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
931 	spdk_str_trim(buf);
932 	spdk_json_write_named_string(w, "serial_number", buf);
933 
934 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
935 	spdk_str_trim(buf);
936 	spdk_json_write_named_string(w, "firmware_revision", buf);
937 
938 	spdk_json_write_named_object_begin(w, "oacs");
939 
940 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
941 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
942 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
943 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
944 
945 	spdk_json_write_object_end(w);
946 
947 	spdk_json_write_object_end(w);
948 
949 	spdk_json_write_named_object_begin(w, "vs");
950 
951 	spdk_json_write_name(w, "nvme_version");
952 	if (vs.bits.ter) {
953 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
954 	} else {
955 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
956 	}
957 
958 	spdk_json_write_object_end(w);
959 
960 	spdk_json_write_named_object_begin(w, "csts");
961 
962 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
963 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
964 
965 	spdk_json_write_object_end(w);
966 
967 	spdk_json_write_named_object_begin(w, "ns_data");
968 
969 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
970 
971 	spdk_json_write_object_end(w);
972 
973 	if (cdata->oacs.security) {
974 		spdk_json_write_named_object_begin(w, "security");
975 
976 		spdk_json_write_named_bool(w, "opal", nvme_bdev_ctrlr->opal_dev ? true : false);
977 
978 		spdk_json_write_object_end(w);
979 	}
980 
981 	spdk_json_write_object_end(w);
982 
983 	return 0;
984 }
985 
986 static void
987 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
988 {
989 	/* No config per bdev needed */
990 }
991 
992 static uint64_t
993 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
994 {
995 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
996 	struct nvme_bdev_poll_group *group = nvme_ch->group;
997 	uint64_t spin_time;
998 
999 	if (!group || !group->collect_spin_stat) {
1000 		return 0;
1001 	}
1002 
1003 	if (group->end_ticks != 0) {
1004 		group->spin_ticks += (group->end_ticks - group->start_ticks);
1005 		group->end_ticks = 0;
1006 	}
1007 
1008 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1009 	group->start_ticks = 0;
1010 	group->spin_ticks = 0;
1011 
1012 	return spin_time;
1013 }
1014 
1015 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1016 	.destruct		= bdev_nvme_destruct,
1017 	.submit_request		= bdev_nvme_submit_request,
1018 	.io_type_supported	= bdev_nvme_io_type_supported,
1019 	.get_io_channel		= bdev_nvme_get_io_channel,
1020 	.dump_info_json		= bdev_nvme_dump_info_json,
1021 	.write_config_json	= bdev_nvme_write_config_json,
1022 	.get_spin_time		= bdev_nvme_get_spin_time,
1023 };
1024 
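/*
 * Create and register a bdev for a standard (non-OCSSD) namespace. Block
 * size, capacity, UUID, write cache and DIF settings are derived from the
 * namespace and controller data before the bdev is registered and attached
 * to the namespace.
 */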
1025 static void
1026 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1027 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
1028 {
1029 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1030 	struct nvme_bdev	*bdev;
1031 	struct spdk_nvme_ns	*ns;
1032 	const struct spdk_uuid	*uuid;
1033 	const struct spdk_nvme_ctrlr_data *cdata;
1034 	const struct spdk_nvme_ns_data *nsdata;
1035 	int			rc;
1036 
1037 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1038 
1039 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1040 	if (!ns) {
1041 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
1042 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -EINVAL);
1043 		return;
1044 	}
1045 
1046 	bdev = calloc(1, sizeof(*bdev));
1047 	if (!bdev) {
1048 		SPDK_ERRLOG("bdev calloc() failed\n");
1049 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
1050 		return;
1051 	}
1052 
1053 	nvme_ns->ns = ns;
1054 	bdev->nvme_ns = nvme_ns;
1055 
1056 	bdev->disk.name = spdk_sprintf_alloc("%sn%d", nvme_bdev_ctrlr->name, spdk_nvme_ns_get_id(ns));
1057 	if (!bdev->disk.name) {
1058 		free(bdev);
1059 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
1060 		return;
1061 	}
1062 	bdev->disk.product_name = "NVMe disk";
1063 
1064 	bdev->disk.write_cache = 0;
1065 	if (cdata->vwc.present) {
1066 		/* Enable if the Volatile Write Cache exists */
1067 		bdev->disk.write_cache = 1;
1068 	}
1069 	bdev->disk.blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1070 	bdev->disk.blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1071 	bdev->disk.optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1072 
1073 	uuid = spdk_nvme_ns_get_uuid(ns);
1074 	if (uuid != NULL) {
1075 		bdev->disk.uuid = *uuid;
1076 	}
1077 
1078 	nsdata = spdk_nvme_ns_get_data(ns);
1079 
1080 	bdev->disk.md_len = spdk_nvme_ns_get_md_size(ns);
1081 	if (bdev->disk.md_len != 0) {
1082 		bdev->disk.md_interleave = nsdata->flbas.extended;
1083 		bdev->disk.dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1084 		if (bdev->disk.dif_type != SPDK_DIF_DISABLE) {
1085 			bdev->disk.dif_is_head_of_md = nsdata->dps.md_start;
1086 			bdev->disk.dif_check_flags = nvme_bdev_ctrlr->prchk_flags;
1087 		}
1088 	}
1089 
1090 	if (!bdev_nvme_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
1091 		bdev->disk.acwu = 0;
1092 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1093 		bdev->disk.acwu = nsdata->nacwu;
1094 	} else {
1095 		bdev->disk.acwu = cdata->acwu;
1096 	}
1097 
1098 	bdev->disk.ctxt = bdev;
1099 	bdev->disk.fn_table = &nvmelib_fn_table;
1100 	bdev->disk.module = &nvme_if;
1101 	rc = spdk_bdev_register(&bdev->disk);
1102 	if (rc) {
1103 		free(bdev->disk.name);
1104 		free(bdev);
1105 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1106 		return;
1107 	}
1108 
1109 	nvme_bdev_attach_bdev_to_ns(nvme_ns, bdev);
1110 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, 0);
1111 }
1112 
1113 static bool
1114 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1115 		 struct spdk_nvme_ctrlr_opts *opts)
1116 {
1117 	struct nvme_probe_skip_entry *entry;
1118 
1119 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1120 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1121 			return false;
1122 		}
1123 	}
1124 
1125 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1126 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1127 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1128 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1129 
1130 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1131 
1132 	return true;
1133 }
1134 
1135 static bool
1136 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1137 	 struct spdk_nvme_ctrlr_opts *opts)
1138 {
1139 	struct nvme_probe_ctx *ctx = cb_ctx;
1140 
1141 	SPDK_DEBUGLOG(bdev_nvme, "Probing device %s\n", trid->traddr);
1142 
1143 	if (nvme_bdev_ctrlr_get(trid)) {
1144 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n",
1145 			    trid->traddr);
1146 		return false;
1147 	}
1148 
1149 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1150 		bool claim_device = false;
1151 		size_t i;
1152 
1153 		for (i = 0; i < ctx->count; i++) {
1154 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1155 				claim_device = true;
1156 				break;
1157 			}
1158 		}
1159 
1160 		if (!claim_device) {
1161 			SPDK_DEBUGLOG(bdev_nvme, "Not claiming device at %s\n", trid->traddr);
1162 			return false;
1163 		}
1164 	}
1165 
1166 	if (ctx->hostnqn) {
1167 		snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", ctx->hostnqn);
1168 	}
1169 
1170 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1171 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1172 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1173 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1174 
1175 	return true;
1176 }
1177 
1178 static void
1179 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1180 {
1181 	struct spdk_nvme_ctrlr *ctrlr = ctx;
1182 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1183 
1184 	if (spdk_nvme_cpl_is_error(cpl)) {
1185 		SPDK_WARNLOG("Abort failed. Resetting controller.\n");
1186 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1187 		assert(nvme_bdev_ctrlr != NULL);
1188 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false);
1189 	}
1190 }
1191 
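/*
 * Command timeout callback. A controller in fatal status is always reset;
 * otherwise the configured action_on_timeout decides whether to abort the
 * command (falling back to a reset if the abort cannot be sent), reset the
 * controller, or do nothing.
 */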
1192 static void
1193 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1194 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1195 {
1196 	int rc;
1197 	union spdk_nvme_csts_register csts;
1198 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1199 
1200 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1201 
1202 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1203 	if (csts.bits.cfs) {
1204 		SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1205 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1206 		assert(nvme_bdev_ctrlr != NULL);
1207 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false);
1208 		return;
1209 	}
1210 
1211 	switch (g_opts.action_on_timeout) {
1212 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1213 		if (qpair) {
1214 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1215 						       nvme_abort_cpl, ctrlr);
1216 			if (rc == 0) {
1217 				return;
1218 			}
1219 
1220 			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
1221 		}
1222 
1223 	/* FALLTHROUGH */
1224 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1225 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1226 		assert(nvme_bdev_ctrlr != NULL);
1227 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL, false);
1228 		break;
1229 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1230 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1231 		break;
1232 	default:
1233 		SPDK_ERRLOG("An invalid timeout action value is found.\n");
1234 		break;
1235 	}
1236 }
1237 
1238 void
1239 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1240 {
1241 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1242 	nvme_bdev_ctrlr->ref--;
1243 
1244 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
1245 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1246 		nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1247 		return;
1248 	}
1249 
1250 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1251 }
1252 
1253 static void
1254 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns)
1255 {
1256 	struct nvme_bdev *bdev, *tmp;
1257 
1258 	TAILQ_FOREACH_SAFE(bdev, &ns->bdevs, tailq, tmp) {
1259 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1260 	}
1261 
1262 	ns->populated = false;
1263 
1264 	nvme_ctrlr_depopulate_namespace_done(ns->ctrlr);
1265 }
1266 
1267 static void nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns,
1268 		struct nvme_async_probe_ctx *ctx)
1269 {
1270 	g_populate_namespace_fn[ns->type](ctrlr, ns, ctx);
1271 }
1272 
1273 static void nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns)
1274 {
1275 	g_depopulate_namespace_fn[ns->type](ns);
1276 }
1277 
1278 void
1279 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1280 				   struct nvme_bdev_ns *ns, int rc)
1281 {
1282 	if (rc == 0) {
1283 		ns->populated = true;
1284 		pthread_mutex_lock(&g_bdev_nvme_mutex);
1285 		ns->ctrlr->ref++;
1286 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1287 	} else {
1288 		memset(ns, 0, sizeof(*ns));
1289 	}
1290 
1291 	if (ctx) {
1292 		ctx->populates_in_progress--;
1293 		if (ctx->populates_in_progress == 0) {
1294 			nvme_ctrlr_populate_namespaces_done(ctx);
1295 		}
1296 	}
1297 }
1298 
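/*
 * Scan all namespaces of a controller: resize bdevs whose namespace changed
 * size, populate newly active namespaces and depopulate ones that went away.
 * populates_in_progress starts at 1 so that populate callbacks which complete
 * synchronously cannot fire the done callback before the loop finishes.
 */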
1299 static void
1300 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1301 			       struct nvme_async_probe_ctx *ctx)
1302 {
1303 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1304 	struct nvme_bdev_ns	*ns;
1305 	struct spdk_nvme_ns	*nvme_ns;
1306 	struct nvme_bdev	*bdev;
1307 	uint32_t		i;
1308 	int			rc;
1309 	uint64_t		num_sectors;
1310 	bool			ns_is_active;
1311 
1312 	if (ctx) {
1313 		/* Initialize this count to 1 to handle the populate functions
1314 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1315 		 */
1316 		ctx->populates_in_progress = 1;
1317 	}
1318 
1319 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1320 		uint32_t	nsid = i + 1;
1321 
1322 		ns = nvme_bdev_ctrlr->namespaces[i];
1323 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1324 
1325 		if (ns->populated && ns_is_active && ns->type == NVME_BDEV_NS_STANDARD) {
1326 			/* NS is still there but attributes may have changed */
1327 			nvme_ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1328 			num_sectors = spdk_nvme_ns_get_num_sectors(nvme_ns);
1329 			bdev = TAILQ_FIRST(&ns->bdevs);
1330 			if (bdev->disk.blockcnt != num_sectors) {
1331 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %lu, new size %lu\n",
1332 					       nsid,
1333 					       bdev->disk.name,
1334 					       bdev->disk.blockcnt,
1335 					       num_sectors);
1336 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1337 				if (rc != 0) {
1338 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1339 						    bdev->disk.name, rc);
1340 				}
1341 			}
1342 		}
1343 
1344 		if (!ns->populated && ns_is_active) {
1345 			ns->id = nsid;
1346 			ns->ctrlr = nvme_bdev_ctrlr;
1347 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1348 				ns->type = NVME_BDEV_NS_OCSSD;
1349 			} else {
1350 				ns->type = NVME_BDEV_NS_STANDARD;
1351 			}
1352 
1353 			TAILQ_INIT(&ns->bdevs);
1354 
1355 			if (ctx) {
1356 				ctx->populates_in_progress++;
1357 			}
1358 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, ns, ctx);
1359 		}
1360 
1361 		if (ns->populated && !ns_is_active) {
1362 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
1363 		}
1364 	}
1365 
1366 	if (ctx) {
1367 		/* Decrement this count now that the loop is over to account
1368 		 * for the one we started with.  If the count is then 0, we
1369 		 * know any populate_namespace functions completed immediately,
1370 		 * so we'll kick the callback here.
1371 		 */
1372 		ctx->populates_in_progress--;
1373 		if (ctx->populates_in_progress == 0) {
1374 			nvme_ctrlr_populate_namespaces_done(ctx);
1375 		}
1376 	}
1377 
1378 }
1379 
1380 static void
1381 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1382 {
1383 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1384 	union spdk_nvme_async_event_completion	event;
1385 
1386 	if (spdk_nvme_cpl_is_error(cpl)) {
1387 		SPDK_WARNLOG("AER request execution failed\n");
1388 		return;
1389 	}
1390 
1391 	event.raw = cpl->cdw0;
1392 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1393 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1394 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1395 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1396 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1397 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1398 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1399 	}
1400 }
1401 
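/*
 * Create the bdev-layer representation of an attached NVMe controller:
 * allocate the namespace array and first trid entry, register the io_device,
 * start the admin queue poller, and hook up the timeout, AER and (when
 * security send/receive is supported) Opal handling.
 */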
1402 static int
1403 create_ctrlr(struct spdk_nvme_ctrlr *ctrlr,
1404 	     const char *name,
1405 	     const struct spdk_nvme_transport_id *trid,
1406 	     uint32_t prchk_flags)
1407 {
1408 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1409 	struct nvme_bdev_ctrlr_trid *trid_entry;
1410 	uint32_t i;
1411 	int rc;
1412 
1413 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1414 	if (nvme_bdev_ctrlr == NULL) {
1415 		SPDK_ERRLOG("Failed to allocate device struct\n");
1416 		return -ENOMEM;
1417 	}
1418 
1419 	TAILQ_INIT(&nvme_bdev_ctrlr->trids);
1420 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1421 	nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1422 	if (!nvme_bdev_ctrlr->namespaces) {
1423 		SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
1424 		free(nvme_bdev_ctrlr);
1425 		return -ENOMEM;
1426 	}
1427 
1428 	trid_entry = calloc(1, sizeof(*trid_entry));
1429 	if (trid_entry == NULL) {
1430 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
1431 		free(nvme_bdev_ctrlr->namespaces);
1432 		free(nvme_bdev_ctrlr);
1433 		return -ENOMEM;
1434 	}
1435 
1436 	trid_entry->trid = *trid;
1437 
1438 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1439 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1440 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1441 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1442 			for (; i > 0; i--) {
1443 				free(nvme_bdev_ctrlr->namespaces[i - 1]);
1444 			}
1445 			free(trid_entry);
1446 			free(nvme_bdev_ctrlr->namespaces);
1447 			free(nvme_bdev_ctrlr);
1448 			return -ENOMEM;
1449 		}
1450 	}
1451 
1452 	nvme_bdev_ctrlr->thread = spdk_get_thread();
1453 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1454 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1455 	nvme_bdev_ctrlr->ref = 0;
1456 	nvme_bdev_ctrlr->connected_trid = &trid_entry->trid;
1457 	nvme_bdev_ctrlr->name = strdup(name);
1458 	if (nvme_bdev_ctrlr->name == NULL) {
1459 		free(trid_entry);
1460 		free(nvme_bdev_ctrlr->namespaces);
1461 		free(nvme_bdev_ctrlr);
1462 		return -ENOMEM;
1463 	}
1464 
1465 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1466 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1467 		if (spdk_unlikely(rc != 0)) {
1468 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1469 			free(trid_entry);
1470 			free(nvme_bdev_ctrlr->name);
1471 			free(nvme_bdev_ctrlr->namespaces);
1472 			free(nvme_bdev_ctrlr);
1473 			return rc;
1474 		}
1475 	}
1476 
1477 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1478 
1479 	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
1480 				sizeof(struct nvme_io_channel),
1481 				name);
1482 
1483 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr,
1484 					       g_opts.nvme_adminq_poll_period_us);
1485 
1486 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1487 
1488 	if (g_opts.timeout_us > 0) {
1489 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1490 				timeout_cb, NULL);
1491 	}
1492 
1493 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1494 
1495 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1496 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1497 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1498 		if (nvme_bdev_ctrlr->opal_dev == NULL) {
1499 			SPDK_ERRLOG("Failed to initialize Opal\n");
1500 		}
1501 	}
1502 
1503 	TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link);
1504 	return 0;
1505 }
1506 
1507 static void
1508 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1509 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1510 {
1511 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1512 	struct nvme_probe_ctx *ctx = cb_ctx;
1513 	char *name = NULL;
1514 	uint32_t prchk_flags = 0;
1515 	size_t i;
1516 
1517 	if (ctx) {
1518 		for (i = 0; i < ctx->count; i++) {
1519 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1520 				prchk_flags = ctx->prchk_flags[i];
1521 				name = strdup(ctx->names[i]);
1522 				break;
1523 			}
1524 		}
1525 	} else {
1526 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1527 	}
1528 	if (!name) {
1529 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1530 		return;
1531 	}
1532 
1533 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
1534 
1535 	create_ctrlr(ctrlr, name, trid, prchk_flags);
1536 
1537 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid);
1538 	if (!nvme_bdev_ctrlr) {
1539 		SPDK_ERRLOG("Failed to find new NVMe controller\n");
1540 		free(name);
1541 		return;
1542 	}
1543 
1544 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1545 
1546 	free(name);
1547 }
1548 
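/*
 * Hot-remove callback. Depopulates every namespace of the removed
 * controller, marks it for destruction and destroys it immediately if no
 * bdevs still hold a reference.
 */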
1549 static void
1550 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
1551 {
1552 	uint32_t i;
1553 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1554 	struct nvme_bdev_ns *ns;
1555 
1556 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1557 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
1558 		if (nvme_bdev_ctrlr->ctrlr == ctrlr) {
1559 			/* The controller's destruction was already started */
1560 			if (nvme_bdev_ctrlr->destruct) {
1561 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1562 				return;
1563 			}
1564 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
1565 			for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1566 				uint32_t	nsid = i + 1;
1567 
1568 				ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1569 				if (ns->populated) {
1570 					assert(ns->id == nsid);
1571 					nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
1572 				}
1573 			}
1574 
1575 			pthread_mutex_lock(&g_bdev_nvme_mutex);
1576 			nvme_bdev_ctrlr->destruct = true;
1577 			if (nvme_bdev_ctrlr->ref == 0) {
1578 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1579 				nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1580 			} else {
1581 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1582 			}
1583 			return;
1584 		}
1585 	}
1586 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1587 }
1588 
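/*
 * Hotplug poller. Starts an asynchronous PCIe probe the first time it runs
 * and then polls that probe context until it completes; controllers on the
 * skip list (deleted via RPC) are filtered out in hotplug_probe_cb().
 */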
1589 static int
1590 bdev_nvme_hotplug(void *arg)
1591 {
1592 	struct spdk_nvme_transport_id trid_pcie;
1593 	int done;
1594 
1595 	if (!g_hotplug_probe_ctx) {
1596 		memset(&trid_pcie, 0, sizeof(trid_pcie));
1597 		spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
1598 
1599 		g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
1600 				      hotplug_probe_cb,
1601 				      attach_cb, remove_cb);
1602 		if (!g_hotplug_probe_ctx) {
1603 			return SPDK_POLLER_BUSY;
1604 		}
1605 	}
1606 
1607 	done = spdk_nvme_probe_poll_async(g_hotplug_probe_ctx);
1608 	if (done != -EAGAIN) {
1609 		g_hotplug_probe_ctx = NULL;
1610 	}
1611 
1612 	return SPDK_POLLER_BUSY;
1613 }
1614 
1615 void
1616 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
1617 {
1618 	*opts = g_opts;
1619 }
1620 
1621 int
1622 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
1623 {
1624 	if (g_bdev_nvme_init_thread != NULL) {
1625 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1626 			return -EPERM;
1627 		}
1628 	}
1629 
1630 	g_opts = *opts;
1631 
1632 	return 0;
1633 }
1634 
1635 struct set_nvme_hotplug_ctx {
1636 	uint64_t period_us;
1637 	bool enabled;
1638 	spdk_msg_fn fn;
1639 	void *fn_ctx;
1640 };
1641 
1642 static void
1643 set_nvme_hotplug_period_cb(void *_ctx)
1644 {
1645 	struct set_nvme_hotplug_ctx *ctx = _ctx;
1646 
1647 	spdk_poller_unregister(&g_hotplug_poller);
1648 	if (ctx->enabled) {
1649 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
1650 	}
1651 
1652 	g_nvme_hotplug_poll_period_us = ctx->period_us;
1653 	g_nvme_hotplug_enabled = ctx->enabled;
1654 	if (ctx->fn) {
1655 		ctx->fn(ctx->fn_ctx);
1656 	}
1657 
1658 	free(ctx);
1659 }
1660 
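/*
 * Enable or disable the hotplug poller. The change is applied on the init
 * thread via set_nvme_hotplug_period_cb(); period_us is clamped to
 * NVME_HOTPLUG_POLL_PERIOD_MAX and falls back to
 * NVME_HOTPLUG_POLL_PERIOD_DEFAULT when 0 is passed. A minimal sketch of a
 * hypothetical caller (the callback name is illustrative only):
 *
 *     static void hotplug_done(void *ctx) { SPDK_NOTICELOG("hotplug set\n"); }
 *     ...
 *     rc = bdev_nvme_set_hotplug(true, 0, hotplug_done, NULL);
 *
 * Only the primary process may enable hotplug.
 */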
1661 int
1662 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
1663 {
1664 	struct set_nvme_hotplug_ctx *ctx;
1665 
1666 	if (enabled == true && !spdk_process_is_primary()) {
1667 		return -EPERM;
1668 	}
1669 
1670 	ctx = calloc(1, sizeof(*ctx));
1671 	if (ctx == NULL) {
1672 		return -ENOMEM;
1673 	}
1674 
1675 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
1676 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
1677 	ctx->enabled = enabled;
1678 	ctx->fn = cb;
1679 	ctx->fn_ctx = cb_ctx;
1680 
1681 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
1682 	return 0;
1683 }
1684 
1685 static void
1686 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1687 {
1688 	if (ctx->cb_fn) {
1689 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1690 	}
1691 
1692 	free(ctx);
1693 }
1694 
1695 static void
1696 nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx)
1697 {
1698 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1699 	struct nvme_bdev_ns	*ns;
1700 	struct nvme_bdev	*nvme_bdev, *tmp;
1701 	uint32_t		i, nsid;
1702 	size_t			j;
1703 
1704 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
1705 	assert(nvme_bdev_ctrlr != NULL);
1706 
1707 	/*
1708 	 * Report the new bdevs that were created in this call.
1709 	 * There can be more than one bdev per NVMe controller.
1710 	 */
1711 	j = 0;
1712 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1713 		nsid = i + 1;
1714 		ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1715 		if (!ns->populated) {
1716 			continue;
1717 		}
1718 		assert(ns->id == nsid);
1719 		TAILQ_FOREACH_SAFE(nvme_bdev, &ns->bdevs, tailq, tmp) {
1720 			if (j < ctx->count) {
1721 				ctx->names[j] = nvme_bdev->disk.name;
1722 				j++;
1723 			} else {
1724 				SPDK_ERRLOG("The number of created bdevs exceeds the size of the provided names array (%u). Unable to return all names of created bdevs\n",
1725 					    ctx->count);
1726 				populate_namespaces_cb(ctx, 0, -ERANGE);
1727 				return;
1728 			}
1729 		}
1730 	}
1731 
1732 	populate_namespaces_cb(ctx, j, 0);
1733 }
1734 
1735 static void
1736 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1737 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1738 {
1739 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
1740 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1741 	struct nvme_async_probe_ctx *ctx;
1742 	int rc;
1743 
1744 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
1745 
1746 	spdk_poller_unregister(&ctx->poller);
1747 
1748 	rc = create_ctrlr(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags);
1749 	if (rc) {
1750 		SPDK_ERRLOG("Failed to create new device\n");
1751 		populate_namespaces_cb(ctx, 0, rc);
1752 		return;
1753 	}
1754 
1755 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
1756 	assert(nvme_bdev_ctrlr != NULL);
1757 
1758 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
1759 }
1760 
1761 static int
1762 bdev_nvme_async_poll(void *arg)
1763 {
1764 	struct nvme_async_probe_ctx	*ctx = arg;
1765 	int				rc;
1766 
1767 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
1768 	if (spdk_unlikely(rc != -EAGAIN && rc != 0)) {
1769 		spdk_poller_unregister(&ctx->poller);
1770 		free(ctx);
1771 	}
1772 
1773 	return SPDK_POLLER_BUSY;
1774 }
1775 
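/*
 * Register an additional transport ID as a failover path for an existing
 * controller. The new path must use the same transport type and subsystem NQN
 * as the currently connected path. A temporary connection is made to the new
 * path to verify that it exposes the same number of namespaces with matching
 * NGUIDs; the temporary controller is detached before returning.
 */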
1776 static int
1777 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_transport_id *trid)
1778 {
1779 	struct spdk_nvme_ctrlr		*new_ctrlr;
1780 	struct spdk_nvme_ctrlr_opts	opts;
1781 	uint32_t			i;
1782 	struct spdk_nvme_ns		*ns, *new_ns;
1783 	const struct spdk_nvme_ns_data	*ns_data, *new_ns_data;
1784 	struct nvme_bdev_ctrlr_trid	*new_trid;
1785 	int				rc = 0;
1786 
1787 	assert(nvme_bdev_ctrlr != NULL);
1788 
1789 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1790 		SPDK_ERRLOG("PCIe failover is not supported.\n");
1791 		return -ENOTSUP;
1792 	}
1793 
1794 	/* Currently we only support failover to the same transport type. */
1795 	if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) {
1796 		return -EINVAL;
1797 	}
1798 
1799 	/* Currently we only support failover to the same NQN. */
1800 	if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
1801 		return -EINVAL;
1802 	}
1803 
1804 	/* Skip all the other checks if we've already registered this path. */
1805 	TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) {
1806 		if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) {
1807 			return -EEXIST;
1808 		}
1809 	}
1810 
1811 	spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
1812 	opts.transport_retry_count = g_opts.retry_count;
1813 
1814 	new_ctrlr = spdk_nvme_connect(trid, &opts, sizeof(opts));
1815 
1816 	if (new_ctrlr == NULL) {
1817 		return -ENODEV;
1818 	}
1819 
1820 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) {
1821 		rc = -EINVAL;
1822 		goto out;
1823 	}
1824 
1825 	for (i = 1; i <= nvme_bdev_ctrlr->num_ns; i++) {
1826 		ns = spdk_nvme_ctrlr_get_ns(nvme_bdev_ctrlr->ctrlr, i);
1827 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, i);
1828 		assert(ns != NULL);
1829 		assert(new_ns != NULL);
1830 
1831 		ns_data = spdk_nvme_ns_get_data(ns);
1832 		new_ns_data = spdk_nvme_ns_get_data(new_ns);
1833 		if (memcmp(ns_data->nguid, new_ns_data->nguid, sizeof(ns_data->nguid))) {
1834 			rc = -EINVAL;
1835 			goto out;
1836 		}
1837 	}
1838 
1839 	new_trid = calloc(1, sizeof(*new_trid));
1840 	if (new_trid == NULL) {
1841 		rc = -ENOMEM;
1842 		goto out;
1843 	}
1844 	new_trid->trid = *trid;
1845 	TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link);
1846 
1847 out:
1848 	spdk_nvme_detach(new_ctrlr);
1849 	return rc;
1850 }
1851 
1852 int
1853 bdev_nvme_remove_trid(const char *name, struct spdk_nvme_transport_id *trid)
1854 {
1855 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
1856 	struct nvme_bdev_ctrlr_trid	*ctrlr_trid, *tmp_trid;
1857 
1858 	if (name == NULL) {
1859 		return -EINVAL;
1860 	}
1861 
1862 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
1863 	if (nvme_bdev_ctrlr == NULL) {
1864 		SPDK_ERRLOG("Failed to find NVMe controller\n");
1865 		return -ENODEV;
1866 	}
1867 
1868 	/* case 1: we are currently using the path to be removed. */
1869 	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
1870 		ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
1871 		assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid);
1872 		/* case 1A: the current path is the only path. */
1873 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
1874 			return bdev_nvme_delete(name);
1875 		}
1876 
1877 		/* case 1B: there is an alternative path. */
1878 		if (bdev_nvme_reset(nvme_bdev_ctrlr, NULL, true) == -EAGAIN) {
1879 			return -EAGAIN;
1880 		}
1881 		assert(nvme_bdev_ctrlr->connected_trid != &ctrlr_trid->trid);
1882 		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
1883 		free(ctrlr_trid);
1884 		return 0;
1885 	}
1886 	/* case 2: We are not using the specified path. */
1887 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) {
1888 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
1889 			TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
1890 			free(ctrlr_trid);
1891 			return 0;
1892 		}
1893 	}
1894 
1895 	/* case 2A: The address isn't even in the registered list. */
1896 	return -ENXIO;
1897 }
1898 
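/*
 * Create NVMe bdevs for the given transport ID. If a controller with the same
 * base name already exists, the transport ID is only recorded as an additional
 * failover path. Otherwise an asynchronous connect is started; connect_attach_cb()
 * creates the controller and populates its namespaces, and bdev_nvme_async_poll()
 * drives the probe context until the connect completes.
 */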
1899 int
1900 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
1901 		 struct spdk_nvme_host_id *hostid,
1902 		 const char *base_name,
1903 		 const char **names,
1904 		 uint32_t count,
1905 		 const char *hostnqn,
1906 		 uint32_t prchk_flags,
1907 		 spdk_bdev_create_nvme_fn cb_fn,
1908 		 void *cb_ctx)
1909 {
1910 	struct nvme_probe_skip_entry	*entry, *tmp;
1911 	struct nvme_async_probe_ctx	*ctx;
1912 	struct nvme_bdev_ctrlr		*existing_ctrlr;
1913 	int				rc;
1914 
1915 	/* TODO expand this check to include both the host and target TRIDs.
1916 	 * Only if both are the same should we fail.
1917 	 */
1918 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
1919 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
1920 		return -EEXIST;
1921 	}
1922 
1923 	ctx = calloc(1, sizeof(*ctx));
1924 	if (!ctx) {
1925 		return -ENOMEM;
1926 	}
1927 	ctx->base_name = base_name;
1928 	ctx->names = names;
1929 	ctx->count = count;
1930 	ctx->cb_fn = cb_fn;
1931 	ctx->cb_ctx = cb_ctx;
1932 	ctx->prchk_flags = prchk_flags;
1933 	ctx->trid = *trid;
1934 
1935 	existing_ctrlr = nvme_bdev_ctrlr_get_by_name(base_name);
1936 	if (existing_ctrlr) {
1937 		rc = bdev_nvme_add_trid(existing_ctrlr, trid);
1938 		if (rc) {
1939 			free(ctx);
1940 			return rc;
1941 		}
1942 
1943 		nvme_ctrlr_populate_namespaces_done(ctx);
1944 		return 0;
1945 	}
1946 
1947 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1948 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
1949 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1950 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
1951 				free(entry);
1952 				break;
1953 			}
1954 		}
1955 	}
1956 
1957 	spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
1958 	ctx->opts.transport_retry_count = g_opts.retry_count;
1959 
1960 	if (hostnqn) {
1961 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
1962 	}
1963 
1964 	if (hostid->hostaddr[0] != '\0') {
1965 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
1966 	}
1967 
1968 	if (hostid->hostsvcid[0] != '\0') {
1969 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
1970 	}
1971 
1972 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
1973 	if (ctx->probe_ctx == NULL) {
1974 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
1975 		free(ctx);
1976 		return -ENODEV;
1977 	}
1978 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
1979 
1980 	return 0;
1981 }
1982 
1983 int
1984 bdev_nvme_delete(const char *name)
1985 {
1986 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL;
1987 	struct nvme_probe_skip_entry *entry;
1988 
1989 	if (name == NULL) {
1990 		return -EINVAL;
1991 	}
1992 
1993 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
1994 	if (nvme_bdev_ctrlr == NULL) {
1995 		SPDK_ERRLOG("Failed to find NVMe controller\n");
1996 		return -ENODEV;
1997 	}
1998 
1999 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2000 		entry = calloc(1, sizeof(*entry));
2001 		if (!entry) {
2002 			return -ENOMEM;
2003 		}
2004 		entry->trid = *nvme_bdev_ctrlr->connected_trid;
2005 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
2006 	}
2007 
2008 	remove_cb(NULL, nvme_bdev_ctrlr->ctrlr);
2009 	return 0;
2010 }
2011 
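/*
 * Module initialization: parse the legacy [Nvme] configuration section
 * (RetryCount, TimeoutUsec, ActionOnTimeout, AdminPollRate, IOPollRate,
 * HotplugEnable, HotplugPollRate, HostNQN, DelayCmdSubmit and TransportID
 * entries), connect any fabric controllers synchronously, probe local PCIe
 * controllers, and finally arm the hotplug poller.
 */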
2012 static int
2013 bdev_nvme_library_init(void)
2014 {
2015 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
2016 	struct spdk_conf_section *sp;
2017 	const char *val;
2018 	int rc = 0;
2019 	int64_t intval = 0;
2020 	size_t i;
2021 	struct nvme_probe_ctx *probe_ctx = NULL;
2022 	int retry_count;
2023 	uint32_t local_nvme_num = 0;
2024 	int64_t hotplug_period;
2025 	bool hotplug_enabled = g_nvme_hotplug_enabled;
2026 
2027 	g_bdev_nvme_init_thread = spdk_get_thread();
2028 
2029 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
2030 				bdev_nvme_poll_group_destroy_cb,
2031 				sizeof(struct nvme_bdev_poll_group),  "bdev_nvme_poll_groups");
2032 
2033 	sp = spdk_conf_find_section(NULL, "Nvme");
2034 	if (sp == NULL) {
2035 		goto end;
2036 	}
2037 
2038 	probe_ctx = calloc(1, sizeof(*probe_ctx));
2039 	if (probe_ctx == NULL) {
2040 		SPDK_ERRLOG("Failed to allocate probe_ctx\n");
2041 		rc = -1;
2042 		goto end;
2043 	}
2044 
2045 	retry_count = spdk_conf_section_get_intval(sp, "RetryCount");
2046 	if (retry_count >= 0) {
2047 		g_opts.retry_count = retry_count;
2048 	}
2049 
2050 	val = spdk_conf_section_get_val(sp, "TimeoutUsec");
2051 	if (val != NULL) {
2052 		intval = spdk_strtoll(val, 10);
2053 		if (intval < 0) {
2054 			SPDK_ERRLOG("Invalid TimeoutUsec value\n");
2055 			rc = -1;
2056 			goto end;
2057 		}
2058 	}
2059 
2060 	g_opts.timeout_us = intval;
2061 
2062 	if (g_opts.timeout_us > 0) {
2063 		val = spdk_conf_section_get_val(sp, "ActionOnTimeout");
2064 		if (val != NULL) {
2065 			if (!strcasecmp(val, "Reset")) {
2066 				g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET;
2067 			} else if (!strcasecmp(val, "Abort")) {
2068 				g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT;
2069 			}
2070 		}
2071 	}
2072 
2073 	intval = spdk_conf_section_get_intval(sp, "AdminPollRate");
2074 	if (intval > 0) {
2075 		g_opts.nvme_adminq_poll_period_us = intval;
2076 	}
2077 
2078 	intval = spdk_conf_section_get_intval(sp, "IOPollRate");
2079 	if (intval > 0) {
2080 		g_opts.nvme_ioq_poll_period_us = intval;
2081 	}
2082 
2083 	if (spdk_process_is_primary()) {
2084 		hotplug_enabled = spdk_conf_section_get_boolval(sp, "HotplugEnable", false);
2085 	}
2086 
2087 	hotplug_period = spdk_conf_section_get_intval(sp, "HotplugPollRate");
2088 	if (hotplug_period < 0) {
2089 		hotplug_period = 0;
2090 	}
2091 
2092 	g_nvme_hostnqn = spdk_conf_section_get_val(sp, "HostNQN");
2093 	probe_ctx->hostnqn = g_nvme_hostnqn;
2094 
2095 	g_opts.delay_cmd_submit = spdk_conf_section_get_boolval(sp, "DelayCmdSubmit",
2096 				  SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT);
2097 
2098 	for (i = 0; i < NVME_MAX_CONTROLLERS; i++) {
2099 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 0);
2100 		if (val == NULL) {
2101 			break;
2102 		}
2103 
2104 		rc = spdk_nvme_transport_id_parse(&probe_ctx->trids[i], val);
2105 		if (rc < 0) {
2106 			SPDK_ERRLOG("Unable to parse TransportID: %s\n", val);
2107 			rc = -1;
2108 			goto end;
2109 		}
2110 
2111 		rc = spdk_nvme_host_id_parse(&probe_ctx->hostids[i], val);
2112 		if (rc < 0) {
2113 			SPDK_ERRLOG("Unable to parse HostID: %s\n", val);
2114 			rc = -1;
2115 			goto end;
2116 		}
2117 
2118 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 1);
2119 		if (val == NULL) {
2120 			SPDK_ERRLOG("No name provided for TransportID\n");
2121 			rc = -1;
2122 			goto end;
2123 		}
2124 
2125 		probe_ctx->names[i] = val;
2126 
2127 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 2);
2128 		if (val != NULL) {
2129 			rc = spdk_nvme_prchk_flags_parse(&probe_ctx->prchk_flags[i], val);
2130 			if (rc < 0) {
2131 				SPDK_ERRLOG("Unable to parse prchk: %s\n", val);
2132 				rc = -1;
2133 				goto end;
2134 			}
2135 		}
2136 
2137 		probe_ctx->count++;
2138 
2139 		if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) {
2140 			struct spdk_nvme_ctrlr *ctrlr;
2141 			struct spdk_nvme_ctrlr_opts opts;
2142 
2143 			if (nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) {
2144 				SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n",
2145 					    probe_ctx->trids[i].traddr);
2146 				rc = -1;
2147 				goto end;
2148 			}
2149 
2150 			if (probe_ctx->trids[i].subnqn[0] == '\0') {
2151 				SPDK_ERRLOG("Need to provide subsystem nqn\n");
2152 				rc = -1;
2153 				goto end;
2154 			}
2155 
2156 			spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
2157 			opts.transport_retry_count = g_opts.retry_count;
2158 
2159 			if (probe_ctx->hostnqn != NULL) {
2160 				snprintf(opts.hostnqn, sizeof(opts.hostnqn), "%s", probe_ctx->hostnqn);
2161 			}
2162 
2163 			if (probe_ctx->hostids[i].hostaddr[0] != '\0') {
2164 				snprintf(opts.src_addr, sizeof(opts.src_addr), "%s", probe_ctx->hostids[i].hostaddr);
2165 			}
2166 
2167 			if (probe_ctx->hostids[i].hostsvcid[0] != '\0') {
2168 				snprintf(opts.src_svcid, sizeof(opts.src_svcid), "%s", probe_ctx->hostids[i].hostsvcid);
2169 			}
2170 
2171 			ctrlr = spdk_nvme_connect(&probe_ctx->trids[i], &opts, sizeof(opts));
2172 			if (ctrlr == NULL) {
2173 				SPDK_ERRLOG("Unable to connect to provided trid (traddr: %s)\n",
2174 					    probe_ctx->trids[i].traddr);
2175 				rc = -1;
2176 				goto end;
2177 			}
2178 
2179 			rc = create_ctrlr(ctrlr, probe_ctx->names[i], &probe_ctx->trids[i], 0);
2180 			if (rc) {
2181 				goto end;
2182 			}
2183 
2184 			nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&probe_ctx->trids[i]);
2185 			if (!nvme_bdev_ctrlr) {
2186 				SPDK_ERRLOG("Failed to find new NVMe controller\n");
2187 				rc = -ENODEV;
2188 				goto end;
2189 			}
2190 
2191 			nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
2192 		} else {
2193 			local_nvme_num++;
2194 		}
2195 	}
2196 
2197 	if (local_nvme_num > 0) {
2198 		/* Probe locally attached (PCIe) NVMe devices. */
2199 		if (spdk_nvme_probe(NULL, probe_ctx, probe_cb, attach_cb, remove_cb)) {
2200 			rc = -1;
2201 			goto end;
2202 		}
2203 
2204 		for (i = 0; i < probe_ctx->count; i++) {
2205 			if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) {
2206 				continue;
2207 			}
2208 
2209 			if (!nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) {
2210 				SPDK_ERRLOG("NVMe SSD \"%s\" could not be found.\n", probe_ctx->trids[i].traddr);
2211 				SPDK_ERRLOG("Check the PCIe BDF and that it is bound to the UIO/VFIO driver.\n");
2212 			}
2213 		}
2214 	}
2215 
2216 	rc = bdev_nvme_set_hotplug(hotplug_enabled, hotplug_period, NULL, NULL);
2217 	if (rc) {
2218 		SPDK_ERRLOG("Failed to setup hotplug (%d): %s\n", rc, spdk_strerror(rc));
2219 		rc = -1;
2220 	}
2221 end:
2222 	free(probe_ctx);
2223 	return rc;
2224 }
2225 
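/*
 * Module shutdown: stop hotplug processing, drop the skipped-controller list,
 * depopulate the namespaces of every controller and mark each controller for
 * destruction. If no controllers remain, module finish is completed here;
 * otherwise it is deferred until the remaining controllers are destructed
 * (see g_bdev_nvme_module_finish).
 */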
2226 static void
2227 bdev_nvme_library_fini(void)
2228 {
2229 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
2230 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2231 	struct nvme_bdev_ns *ns;
2232 	uint32_t i;
2233 
2234 	spdk_poller_unregister(&g_hotplug_poller);
2235 	free(g_hotplug_probe_ctx);
2236 
2237 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2238 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2239 		free(entry);
2240 	}
2241 
2242 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2243 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
2244 		if (nvme_bdev_ctrlr->destruct) {
2245 			/* This controller's destruction was already started
2246 			 * before the application started shutting down
2247 			 */
2248 			continue;
2249 		}
2250 
2251 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2252 
2253 		for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
2254 			uint32_t nsid = i + 1;
2255 
2256 			ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
2257 			if (ns->populated) {
2258 				assert(ns->id == nsid);
2259 				nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
2260 			}
2261 		}
2262 
2263 		pthread_mutex_lock(&g_bdev_nvme_mutex);
2264 		nvme_bdev_ctrlr->destruct = true;
2265 
2266 		if (nvme_bdev_ctrlr->ref == 0) {
2267 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
2268 			nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
2269 			pthread_mutex_lock(&g_bdev_nvme_mutex);
2270 		}
2271 	}
2272 
2273 	g_bdev_nvme_module_finish = true;
2274 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2275 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2276 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
2277 		spdk_bdev_module_finish_done();
2278 		return;
2279 	}
2280 
2281 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2282 }
2283 
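/*
 * When the controller reports a protection information (PI) error, re-run the
 * DIF (interleaved metadata) or DIX (separate metadata buffer) verification in
 * software so the failing block type and offset can be logged.
 */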
2284 static void
2285 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
2286 {
2287 	struct spdk_bdev *bdev = bdev_io->bdev;
2288 	struct spdk_dif_ctx dif_ctx;
2289 	struct spdk_dif_error err_blk = {};
2290 	int rc;
2291 
2292 	rc = spdk_dif_ctx_init(&dif_ctx,
2293 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2294 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2295 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2296 	if (rc != 0) {
2297 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2298 		return;
2299 	}
2300 
2301 	if (bdev->md_interleave) {
2302 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2303 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2304 	} else {
2305 		struct iovec md_iov = {
2306 			.iov_base	= bdev_io->u.bdev.md_buf,
2307 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2308 		};
2309 
2310 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2311 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2312 	}
2313 
2314 	if (rc != 0) {
2315 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2316 			    err_blk.err_type, err_blk.err_offset);
2317 	} else {
2318 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2319 	}
2320 }
2321 
2322 static void
2323 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2324 {
2325 	struct nvme_bdev_io *bio = ref;
2326 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2327 
2328 	if (spdk_nvme_cpl_is_success(cpl)) {
2329 		/* Run PI verification for read data buffer. */
2330 		bdev_nvme_verify_pi_error(bdev_io);
2331 	}
2332 
2333 	/* Return original completion status */
2334 	spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
2335 					  bio->cpl.status.sc);
2336 }
2337 
2338 static void
2339 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2340 {
2341 	struct nvme_bdev_io *bio = ref;
2342 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2343 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2344 	struct nvme_io_channel *nvme_ch;
2345 	int ret;
2346 
2347 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2348 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2349 			    cpl->status.sct, cpl->status.sc);
2350 
2351 		/* Save completion status to use after verifying PI error. */
2352 		bio->cpl = *cpl;
2353 
2354 		nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2355 
2356 		/* Read without PI checking to verify PI error. */
2357 		ret = bdev_nvme_no_pi_readv(nbdev->nvme_ns,
2358 					    nvme_ch,
2359 					    bio,
2360 					    bdev_io->u.bdev.iovs,
2361 					    bdev_io->u.bdev.iovcnt,
2362 					    bdev_io->u.bdev.md_buf,
2363 					    bdev_io->u.bdev.num_blocks,
2364 					    bdev_io->u.bdev.offset_blocks);
2365 		if (ret == 0) {
2366 			return;
2367 		}
2368 	}
2369 
2370 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2371 }
2372 
2373 static void
2374 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2375 {
2376 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2377 
2378 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2379 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2380 			    cpl->status.sct, cpl->status.sc);
2381 		/* Run PI verification for write data buffer if PI error is detected. */
2382 		bdev_nvme_verify_pi_error(bdev_io);
2383 	}
2384 
2385 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2386 }
2387 
2388 static void
2389 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2390 {
2391 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2392 
2393 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2394 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2395 			    cpl->status.sct, cpl->status.sc);
2396 		/* Run PI verification for compare data buffer if PI error is detected. */
2397 		bdev_nvme_verify_pi_error(bdev_io);
2398 	}
2399 
2400 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2401 }
2402 
2403 static void
2404 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2405 {
2406 	struct nvme_bdev_io *bio = ref;
2407 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2408 
2409 	/* Compare operation completion */
2410 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2411 		/* Save compare result for write callback */
2412 		bio->cpl = *cpl;
2413 		return;
2414 	}
2415 
2416 	/* Write operation completion */
2417 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2418 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2419 		 * complete the IO with the compare operation's status.
2420 		 */
2421 		if (!spdk_nvme_cpl_is_error(cpl)) {
2422 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2423 		}
2424 
2425 		spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2426 	} else {
2427 		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2428 	}
2429 }
2430 
2431 static void
2432 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2433 {
2434 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2435 
2436 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2437 }
2438 
2439 static void
2440 bdev_nvme_admin_passthru_completion(void *ctx)
2441 {
2442 	struct nvme_bdev_io *bio = ctx;
2443 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2444 
2445 	spdk_bdev_io_complete_nvme_status(bdev_io,
2446 					  bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2447 }
2448 
2449 static void
2450 bdev_nvme_abort_completion(void *ctx)
2451 {
2452 	struct nvme_bdev_io *bio = ctx;
2453 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2454 
2455 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
2456 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2457 	} else {
2458 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2459 	}
2460 }
2461 
2462 static void
2463 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
2464 {
2465 	struct nvme_bdev_io *bio = ref;
2466 
2467 	bio->cpl = *cpl;
2468 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2469 }
2470 
2471 static void
2472 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2473 {
2474 	struct nvme_bdev_io *bio = ref;
2475 
2476 	bio->cpl = *cpl;
2477 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2478 }
2479 
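/*
 * reset_sgl/next_sge callbacks passed to the spdk_nvme_ns_cmd_*v_with_md()
 * functions below. They let the NVMe driver walk the bdev I/O's iovec array,
 * tracking the current position in bio->iovpos/bio->iov_offset.
 */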
2480 static void
2481 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2482 {
2483 	struct nvme_bdev_io *bio = ref;
2484 	struct iovec *iov;
2485 
2486 	bio->iov_offset = sgl_offset;
2487 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2488 		iov = &bio->iovs[bio->iovpos];
2489 		if (bio->iov_offset < iov->iov_len) {
2490 			break;
2491 		}
2492 
2493 		bio->iov_offset -= iov->iov_len;
2494 	}
2495 }
2496 
2497 static int
2498 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2499 {
2500 	struct nvme_bdev_io *bio = ref;
2501 	struct iovec *iov;
2502 
2503 	assert(bio->iovpos < bio->iovcnt);
2504 
2505 	iov = &bio->iovs[bio->iovpos];
2506 
2507 	*address = iov->iov_base;
2508 	*length = iov->iov_len;
2509 
2510 	if (bio->iov_offset) {
2511 		assert(bio->iov_offset <= iov->iov_len);
2512 		*address += bio->iov_offset;
2513 		*length -= bio->iov_offset;
2514 	}
2515 
2516 	bio->iov_offset += *length;
2517 	if (bio->iov_offset == iov->iov_len) {
2518 		bio->iovpos++;
2519 		bio->iov_offset = 0;
2520 	}
2521 
2522 	return 0;
2523 }
2524 
2525 static void
2526 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2527 {
2528 	struct nvme_bdev_io *bio = ref;
2529 	struct iovec *iov;
2530 
2531 	bio->fused_iov_offset = sgl_offset;
2532 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2533 		iov = &bio->fused_iovs[bio->fused_iovpos];
2534 		if (bio->fused_iov_offset < iov->iov_len) {
2535 			break;
2536 		}
2537 
2538 		bio->fused_iov_offset -= iov->iov_len;
2539 	}
2540 }
2541 
2542 static int
2543 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2544 {
2545 	struct nvme_bdev_io *bio = ref;
2546 	struct iovec *iov;
2547 
2548 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2549 
2550 	iov = &bio->fused_iovs[bio->fused_iovpos];
2551 
2552 	*address = iov->iov_base;
2553 	*length = iov->iov_len;
2554 
2555 	if (bio->fused_iov_offset) {
2556 		assert(bio->fused_iov_offset <= iov->iov_len);
2557 		*address += bio->fused_iov_offset;
2558 		*length -= bio->fused_iov_offset;
2559 	}
2560 
2561 	bio->fused_iov_offset += *length;
2562 	if (bio->fused_iov_offset == iov->iov_len) {
2563 		bio->fused_iovpos++;
2564 		bio->fused_iov_offset = 0;
2565 	}
2566 
2567 	return 0;
2568 }
2569 
2570 static int
2571 bdev_nvme_no_pi_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2572 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2573 		      void *md, uint64_t lba_count, uint64_t lba)
2574 {
2575 	int rc;
2576 
2577 	SPDK_DEBUGLOG(bdev_nvme, "read %lu blocks with offset %#lx without PI check\n",
2578 		      lba_count, lba);
2579 
2580 	bio->iovs = iov;
2581 	bio->iovcnt = iovcnt;
2582 	bio->iovpos = 0;
2583 	bio->iov_offset = 0;
2584 
2585 	rc = spdk_nvme_ns_cmd_readv_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2586 					    bdev_nvme_no_pi_readv_done, bio, 0,
2587 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2588 					    md, 0, 0);
2589 
2590 	if (rc != 0 && rc != -ENOMEM) {
2591 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2592 	}
2593 	return rc;
2594 }
2595 
2596 static int
2597 bdev_nvme_readv(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2598 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2599 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2600 {
2601 	int rc;
2602 
2603 	SPDK_DEBUGLOG(bdev_nvme, "read %lu blocks with offset %#lx\n",
2604 		      lba_count, lba);
2605 
2606 	bio->iovs = iov;
2607 	bio->iovcnt = iovcnt;
2608 	bio->iovpos = 0;
2609 	bio->iov_offset = 0;
2610 
2611 	if (iovcnt == 1) {
2612 		rc = spdk_nvme_ns_cmd_read_with_md(nvme_ns->ns, nvme_ch->qpair, iov[0].iov_base, md, lba,
2613 						   lba_count,
2614 						   bdev_nvme_readv_done, bio,
2615 						   flags,
2616 						   0, 0);
2617 	} else {
2618 		rc = spdk_nvme_ns_cmd_readv_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2619 						    bdev_nvme_readv_done, bio, flags,
2620 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2621 						    md, 0, 0);
2622 	}
2623 
2624 	if (rc != 0 && rc != -ENOMEM) {
2625 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2626 	}
2627 	return rc;
2628 }
2629 
2630 static int
2631 bdev_nvme_writev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2632 		 struct nvme_bdev_io *bio,
2633 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2634 		 uint32_t flags)
2635 {
2636 	int rc;
2637 
2638 	SPDK_DEBUGLOG(bdev_nvme, "write %lu blocks with offset %#lx\n",
2639 		      lba_count, lba);
2640 
2641 	bio->iovs = iov;
2642 	bio->iovcnt = iovcnt;
2643 	bio->iovpos = 0;
2644 	bio->iov_offset = 0;
2645 
2646 	if (iovcnt == 1) {
2647 		rc = spdk_nvme_ns_cmd_write_with_md(nvme_ns->ns, nvme_ch->qpair, iov[0].iov_base, md, lba,
2648 						    lba_count,
						    bdev_nvme_writev_done, bio,
2650 						    flags,
2651 						    0, 0);
2652 	} else {
2653 		rc = spdk_nvme_ns_cmd_writev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2654 						     bdev_nvme_writev_done, bio, flags,
2655 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2656 						     md, 0, 0);
2657 	}
2658 
2659 	if (rc != 0 && rc != -ENOMEM) {
2660 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2661 	}
2662 	return rc;
2663 }
2664 
2665 static int
2666 bdev_nvme_comparev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2667 		   struct nvme_bdev_io *bio,
2668 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2669 		   uint32_t flags)
2670 {
2671 	int rc;
2672 
2673 	SPDK_DEBUGLOG(bdev_nvme, "compare %lu blocks with offset %#lx\n",
2674 		      lba_count, lba);
2675 
2676 	bio->iovs = iov;
2677 	bio->iovcnt = iovcnt;
2678 	bio->iovpos = 0;
2679 	bio->iov_offset = 0;
2680 
2681 	rc = spdk_nvme_ns_cmd_comparev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2682 					       bdev_nvme_comparev_done, bio, flags,
2683 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2684 					       md, 0, 0);
2685 
2686 	if (rc != 0 && rc != -ENOMEM) {
2687 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
2688 	}
2689 	return rc;
2690 }
2691 
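/*
 * Fused compare-and-write: the compare is submitted with the FUSE_FIRST flag
 * and the write with FUSE_SECOND, sharing bdev_nvme_comparev_and_writev_done()
 * as the completion callback. On a bdev-level retry (num_retries != 0) only the
 * write is resubmitted if the compare was already accepted by the driver.
 */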
2692 static int
2693 bdev_nvme_comparev_and_writev(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2694 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
2695 			      struct iovec *write_iov, int write_iovcnt,
2696 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2697 {
2698 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2699 	int rc;
2700 
2701 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %lu blocks with offset %#lx\n",
2702 		      lba_count, lba);
2703 
2704 	bio->iovs = cmp_iov;
2705 	bio->iovcnt = cmp_iovcnt;
2706 	bio->iovpos = 0;
2707 	bio->iov_offset = 0;
2708 	bio->fused_iovs = write_iov;
2709 	bio->fused_iovcnt = write_iovcnt;
2710 	bio->fused_iovpos = 0;
2711 	bio->fused_iov_offset = 0;
2712 
2713 	if (bdev_io->num_retries == 0) {
2714 		bio->first_fused_submitted = false;
2715 	}
2716 
2717 	if (!bio->first_fused_submitted) {
2718 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2719 		memset(&bio->cpl, 0, sizeof(bio->cpl));
2720 
2721 		rc = spdk_nvme_ns_cmd_comparev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2722 						       bdev_nvme_comparev_and_writev_done, bio, flags,
2723 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
2724 		if (rc == 0) {
2725 			bio->first_fused_submitted = true;
2726 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2727 		} else {
2728 			if (rc != -ENOMEM) {
2729 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
2730 			}
2731 			return rc;
2732 		}
2733 	}
2734 
2735 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
2736 
2737 	rc = spdk_nvme_ns_cmd_writev_with_md(nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2738 					     bdev_nvme_comparev_and_writev_done, bio, flags,
2739 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
2740 	if (rc != 0 && rc != -ENOMEM) {
2741 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
2742 		rc = 0;
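		/* The fused compare command has already been submitted at this point,
		 * so the write submission error is only logged and not returned to the
		 * caller.
		 */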
2743 	}
2744 
2745 	return rc;
2746 }
2747 
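/*
 * Translate an unmap request into an NVMe Dataset Management (deallocate)
 * command. The block range is split into at most
 * SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES ranges, each covering no more than
 * SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks.
 */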
2748 static int
2749 bdev_nvme_unmap(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2750 		struct nvme_bdev_io *bio,
2751 		uint64_t offset_blocks,
2752 		uint64_t num_blocks)
2753 {
2754 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
2755 	struct spdk_nvme_dsm_range *range;
2756 	uint64_t offset, remaining;
2757 	uint64_t num_ranges_u64;
2758 	uint16_t num_ranges;
2759 	int rc;
2760 
2761 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
2762 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2763 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
2764 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
2765 		return -EINVAL;
2766 	}
2767 	num_ranges = (uint16_t)num_ranges_u64;
2768 
2769 	offset = offset_blocks;
2770 	remaining = num_blocks;
2771 	range = &dsm_ranges[0];
2772 
2773 	/* Fill max-size ranges until the remaining blocks fit into one range */
2774 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
2775 		range->attributes.raw = 0;
2776 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2777 		range->starting_lba = offset;
2778 
2779 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2780 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2781 		range++;
2782 	}
2783 
2784 	/* Final range describes the remaining blocks */
2785 	range->attributes.raw = 0;
2786 	range->length = remaining;
2787 	range->starting_lba = offset;
2788 
2789 	rc = spdk_nvme_ns_cmd_dataset_management(nvme_ns->ns, nvme_ch->qpair,
2790 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
2791 			dsm_ranges, num_ranges,
2792 			bdev_nvme_queued_done, bio);
2793 
2794 	return rc;
2795 }
2796 
2797 static int
2798 bdev_nvme_admin_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2799 			 struct nvme_bdev_io *bio,
2800 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2801 {
2802 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ns->ctrlr->ctrlr);
2803 
2804 	if (nbytes > max_xfer_size) {
2805 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2806 		return -EINVAL;
2807 	}
2808 
2809 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2810 
2811 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ns->ctrlr->ctrlr, cmd, buf,
2812 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
2813 }
2814 
2815 static int
2816 bdev_nvme_io_passthru(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2817 		      struct nvme_bdev_io *bio,
2818 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2819 {
2820 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ns->ctrlr->ctrlr);
2821 
2822 	if (nbytes > max_xfer_size) {
2823 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2824 		return -EINVAL;
2825 	}
2826 
2827 	/*
2828 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2829 	 * so fill it out automatically.
2830 	 */
2831 	cmd->nsid = spdk_nvme_ns_get_id(nvme_ns->ns);
2832 
2833 	return spdk_nvme_ctrlr_cmd_io_raw(nvme_ns->ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
2834 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
2835 }
2836 
2837 static int
2838 bdev_nvme_io_passthru_md(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2839 			 struct nvme_bdev_io *bio,
2840 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
2841 {
2842 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(nvme_ns->ns);
2843 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ns->ctrlr->ctrlr);
2844 
2845 	if (nbytes > max_xfer_size) {
2846 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2847 		return -EINVAL;
2848 	}
2849 
2850 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(nvme_ns->ns)) {
2851 		SPDK_ERRLOG("invalid metadata buffer size\n");
2852 		return -EINVAL;
2853 	}
2854 
2855 	/*
2856 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2857 	 * so fill it out automatically.
2858 	 */
2859 	cmd->nsid = spdk_nvme_ns_get_id(nvme_ns->ns);
2860 
2861 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(nvme_ns->ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
2862 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
2863 }
2864 
2865 static void
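/*
 * Runs on the controller thread (via spdk_thread_send_msg() from
 * bdev_nvme_abort()). Attempts to abort the target command on the admin qpair;
 * if it is not found there either, the abort request is completed as failed.
 */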
2866 bdev_nvme_abort_admin_cmd(void *ctx)
2867 {
2868 	struct nvme_bdev_io *bio = ctx;
2869 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2870 	struct nvme_bdev *nbdev;
2871 	struct nvme_bdev_io *bio_to_abort;
2872 	int rc;
2873 
2874 	nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2875 	bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
2876 
2877 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nbdev->nvme_ns->ctrlr->ctrlr,
2878 					   NULL,
2879 					   bio_to_abort,
2880 					   bdev_nvme_abort_done, bio);
2881 	if (rc == -ENOENT) {
2882 		/* If no admin command was found in admin qpair, complete the abort
2883 		 * request with failure.
2884 		 */
2885 		bio->cpl.cdw0 |= 1U;
2886 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
2887 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2888 
2889 		spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2890 	}
2891 }
2892 
2893 static int
2894 bdev_nvme_abort(struct nvme_bdev_ns *nvme_ns, struct nvme_io_channel *nvme_ch,
2895 		struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort)
2896 {
2897 	int rc;
2898 
2899 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2900 
2901 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ns->ctrlr->ctrlr,
2902 					   nvme_ch->qpair,
2903 					   bio_to_abort,
2904 					   bdev_nvme_abort_done, bio);
2905 	if (rc == -ENOENT) {
2906 		/* If no matching command was found in the I/O qpair, the command to abort
2907 		 * may be an admin command. Forward the abort to the controller thread,
2908 		 * which is the only thread that aborts admin commands.
2909 		 */
2910 		spdk_thread_send_msg(nvme_ns->ctrlr->thread,
2911 				     bdev_nvme_abort_admin_cmd, bio);
2912 		rc = 0;
2913 	}
2914 
2915 	return rc;
2916 }
2917 
2918 static void
2919 bdev_nvme_get_spdk_running_config(FILE *fp)
2920 {
2921 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
2922 
2923 	fprintf(fp, "\n[Nvme]");
2924 	fprintf(fp, "\n"
2925 		"# NVMe Device Whitelist\n"
2926 		"# Users may specify which NVMe devices to claim by their transport id.\n"
2927 		"# See spdk_nvme_transport_id_parse() in spdk/nvme.h for the correct format.\n"
2928 		"# The second argument is the assigned name, which can be referenced from\n"
2929 		"# other sections in the configuration file. For NVMe devices, a namespace\n"
2930 		"# is automatically appended to each name in the format <YourName>nY, where\n"
2931 		"# Y is the NSID (starts at 1).\n");
2932 
2933 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2934 		const char *trtype;
2935 		const char *prchk_flags;
2936 
2937 		trtype = spdk_nvme_transport_id_trtype_str(nvme_bdev_ctrlr->connected_trid->trtype);
2938 		if (!trtype) {
2939 			continue;
2940 		}
2941 
2942 		if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2943 			fprintf(fp, "TransportID \"trtype:%s traddr:%s\" %s\n",
2944 				trtype,
2945 				nvme_bdev_ctrlr->connected_trid->traddr, nvme_bdev_ctrlr->name);
2946 		} else {
2947 			const char *adrfam;
2948 
2949 			adrfam = spdk_nvme_transport_id_adrfam_str(nvme_bdev_ctrlr->connected_trid->adrfam);
2950 			prchk_flags = spdk_nvme_prchk_flags_str(nvme_bdev_ctrlr->prchk_flags);
2951 
2952 			if (adrfam) {
2953 				fprintf(fp, "TransportID \"trtype:%s adrfam:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
2954 					trtype,	adrfam,
2955 					nvme_bdev_ctrlr->connected_trid->traddr, nvme_bdev_ctrlr->connected_trid->trsvcid,
2956 					nvme_bdev_ctrlr->connected_trid->subnqn, nvme_bdev_ctrlr->name);
2957 			} else {
2958 				fprintf(fp, "TransportID \"trtype:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
2959 					trtype,
2960 					nvme_bdev_ctrlr->connected_trid->traddr, nvme_bdev_ctrlr->connected_trid->trsvcid,
2961 					nvme_bdev_ctrlr->connected_trid->subnqn, nvme_bdev_ctrlr->name);
2962 			}
2963 
2964 			if (prchk_flags) {
2965 				fprintf(fp, " \"%s\"\n", prchk_flags);
2966 			} else {
2967 				fprintf(fp, "\n");
2968 			}
2969 		}
2970 	}
2971 
2972 	fprintf(fp, "\n"
2973 		"# The number of attempts per I/O when an I/O fails. Do not include\n"
2974 		"# this key to get the default behavior.\n");
2975 	fprintf(fp, "RetryCount %d\n", g_opts.retry_count);
2976 	fprintf(fp, "\n"
2977 		"# Timeout for each command, in microseconds. If 0, don't track timeouts.\n");
2978 	fprintf(fp, "TimeoutUsec %"PRIu64"\n", g_opts.timeout_us);
2979 
2980 	fprintf(fp, "\n"
2981 		"# Action to take on command time out. Only valid when Timeout is greater\n"
2982 		"# than 0. This may be 'Reset' to reset the controller, 'Abort' to abort\n"
2983 		"# the command, or 'None' to just print a message but do nothing.\n"
2984 		"# Admin command timeouts will always result in a reset.\n");
2985 	switch (g_opts.action_on_timeout) {
2986 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
2987 		fprintf(fp, "ActionOnTimeout None\n");
2988 		break;
2989 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
2990 		fprintf(fp, "ActionOnTimeout Reset\n");
2991 		break;
2992 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
2993 		fprintf(fp, "ActionOnTimeout Abort\n");
2994 		break;
2995 	}
2996 
2997 	fprintf(fp, "\n"
2998 		"# Set how often the admin queue is polled for asynchronous events.\n"
2999 		"# Units in microseconds.\n");
3000 	fprintf(fp, "AdminPollRate %"PRIu64"\n", g_opts.nvme_adminq_poll_period_us);
3001 	fprintf(fp, "IOPollRate %" PRIu64"\n", g_opts.nvme_ioq_poll_period_us);
3002 	fprintf(fp, "\n"
3003 		"# Handling of hotplug (runtime insert and remove) events is disabled by default.\n"
3004 		"# Set this to Yes to enable it.\n"
3005 		"# Default: No\n");
3006 	fprintf(fp, "HotplugEnable %s\n", g_nvme_hotplug_enabled ? "Yes" : "No");
3007 	fprintf(fp, "\n"
3008 		"# Set how often the hotplug is processed for insert and remove events.\n"
3009 		"# Units in microseconds.\n");
3010 	fprintf(fp, "HotplugPollRate %"PRIu64"\n", g_nvme_hotplug_poll_period_us);
3011 	if (g_nvme_hostnqn) {
3012 		fprintf(fp, "HostNQN %s\n",  g_nvme_hostnqn);
3013 	}
3014 	fprintf(fp, "DelayCmdSubmit %s\n", g_opts.delay_cmd_submit ? "True" : "False");
3015 
3016 	fprintf(fp, "\n");
3017 }
3018 
3019 static void
3020 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
3021 {
3022 	/* nop */
3023 }
3024 
3025 static void
3026 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
3027 {
3028 	g_config_json_namespace_fn[ns->type](w, ns);
3029 }
3030 
3031 static int
3032 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
3033 {
3034 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
3035 	struct spdk_nvme_transport_id	*trid;
3036 	const char			*action;
3037 	uint32_t			nsid;
3038 
3039 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
3040 		action = "reset";
3041 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
3042 		action = "abort";
3043 	} else {
3044 		action = "none";
3045 	}
3046 
3047 	spdk_json_write_object_begin(w);
3048 
3049 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
3050 
3051 	spdk_json_write_named_object_begin(w, "params");
3052 	spdk_json_write_named_string(w, "action_on_timeout", action);
3053 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
3054 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
3055 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
3056 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
3057 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
3058 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
3059 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
3060 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
3061 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
3062 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
3063 	spdk_json_write_object_end(w);
3064 
3065 	spdk_json_write_object_end(w);
3066 
3067 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3068 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
3069 		trid = nvme_bdev_ctrlr->connected_trid;
3070 
3071 		spdk_json_write_object_begin(w);
3072 
3073 		spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
3074 
3075 		spdk_json_write_named_object_begin(w, "params");
3076 		spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
3077 		nvme_bdev_dump_trid_json(trid, w);
3078 		spdk_json_write_named_bool(w, "prchk_reftag",
3079 					   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
3080 		spdk_json_write_named_bool(w, "prchk_guard",
3081 					   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
3082 
3083 		spdk_json_write_object_end(w);
3084 
3085 		spdk_json_write_object_end(w);
3086 
3087 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
3088 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
3089 				continue;
3090 			}
3091 
3092 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
3093 		}
3094 	}
3095 
3096 	/* Dump as last parameter to give all NVMe bdevs chance to be constructed
3097 	 * before enabling hotplug poller.
3098 	 */
3099 	spdk_json_write_object_begin(w);
3100 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
3101 
3102 	spdk_json_write_named_object_begin(w, "params");
3103 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
3104 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
3105 	spdk_json_write_object_end(w);
3106 
3107 	spdk_json_write_object_end(w);
3108 
3109 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3110 	return 0;
3111 }
3112 
3113 struct spdk_nvme_ctrlr *
3114 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
3115 {
3116 	if (!bdev || bdev->module != &nvme_if) {
3117 		return NULL;
3118 	}
3119 
3120 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
3121 }
3122 
3123 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
3124