1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/config.h"
40 #include "spdk/conf.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/nvme.h"
45 #include "spdk/nvme_ocssd.h"
46 #include "spdk/thread.h"
47 #include "spdk/string.h"
48 #include "spdk/likely.h"
49 #include "spdk/util.h"
50 
51 #include "spdk/bdev_module.h"
52 #include "spdk_internal/log.h"
53 
54 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
55 
56 static void bdev_nvme_get_spdk_running_config(FILE *fp);
57 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
58 
59 struct nvme_bdev_io {
60 	/** array of iovecs to transfer. */
61 	struct iovec *iovs;
62 
63 	/** Number of iovecs in iovs array. */
64 	int iovcnt;
65 
66 	/** Current iovec position. */
67 	int iovpos;
68 
69 	/** Offset in current iovec. */
70 	uint32_t iov_offset;
71 
72 	/** array of iovecs for the fused command to transfer. */
73 	struct iovec *fused_iovs;
74 
75 	/** Number of iovecs in fused_iovs array. */
76 	int fused_iovcnt;
77 
78 	/** Current fused iovec position. */
79 	int fused_iovpos;
80 
81 	/** Offset in current fused iovec. */
82 	uint32_t fused_iov_offset;
83 
84 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
85 	struct spdk_nvme_cpl cpl;
86 
87 	/** Originating thread */
88 	struct spdk_thread *orig_thread;
89 
90 	/** Tracks whether the first of the fused commands has been submitted. */
91 	bool first_fused_submitted;
92 };
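
/*
 * Note on the structure above: nvme_bdev_io is the per-I/O driver context that
 * the generic bdev layer reserves inside every spdk_bdev_io, sized by
 * bdev_nvme_get_ctx_size() below.  The submit paths therefore recover it with a
 * cast instead of a separate allocation, for example (a minimal sketch of the
 * pattern used later in this file):
 *
 *	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
 *
 *	bio->iovs = bdev_io->u.bdev.iovs;
 *	bio->iovcnt = bdev_io->u.bdev.iovcnt;
 *	bio->iovpos = 0;
 *	bio->iov_offset = 0;
 */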
93 
94 struct nvme_probe_ctx {
95 	size_t count;
96 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
97 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
98 	const char *names[NVME_MAX_CONTROLLERS];
99 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
100 	const char *hostnqn;
101 };
102 
103 struct nvme_probe_skip_entry {
104 	struct spdk_nvme_transport_id		trid;
105 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
106 };
107 /* All the controllers deleted by users via RPC are skipped by the hotplug monitor. */
108 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
109 			g_skipped_nvme_ctrlrs);
110 
111 static struct spdk_bdev_nvme_opts g_opts = {
112 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
113 	.timeout_us = 0,
114 	.retry_count = 4,
115 	.arbitration_burst = 0,
116 	.low_priority_weight = 0,
117 	.medium_priority_weight = 0,
118 	.high_priority_weight = 0,
119 	.nvme_adminq_poll_period_us = 10000ULL,
120 	.nvme_ioq_poll_period_us = 0,
121 	.io_queue_requests = 0,
122 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
123 };
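
/*
 * These defaults are only a starting point.  Applications typically override
 * them before any controller is attached, either through
 * spdk_bdev_nvme_set_opts() or through the bdev_nvme_set_options RPC (assuming
 * the standard scripts/rpc.py helper; option names may differ between SPDK
 * releases).  A minimal C sketch:
 *
 *	struct spdk_bdev_nvme_opts opts;
 *
 *	spdk_bdev_nvme_get_opts(&opts);
 *	opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET;
 *	opts.timeout_us = 30 * 1000 * 1000;	(30 seconds)
 *	spdk_bdev_nvme_set_opts(&opts);		(returns -EPERM once controllers exist)
 */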
124 
125 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
126 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
127 
128 static int g_hot_insert_nvme_controller_index = 0;
129 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
130 static bool g_nvme_hotplug_enabled = false;
131 static struct spdk_thread *g_bdev_nvme_init_thread;
132 static struct spdk_poller *g_hotplug_poller;
133 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
134 static char *g_nvme_hostnqn = NULL;
135 
136 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
137 		struct nvme_async_probe_ctx *ctx);
138 static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx);
139 static int bdev_nvme_library_init(void);
140 static void bdev_nvme_library_fini(void);
141 static int bdev_nvme_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
142 			   struct nvme_bdev_io *bio,
143 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
144 static int bdev_nvme_no_pi_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
145 				 struct nvme_bdev_io *bio,
146 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
147 static int bdev_nvme_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
148 			    struct nvme_bdev_io *bio,
149 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
150 static int bdev_nvme_comparev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
151 			      struct nvme_bdev_io *bio,
152 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
153 static int bdev_nvme_comparev_and_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
154 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
155 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba);
156 static int bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
157 				    struct nvme_bdev_io *bio,
158 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
159 static int bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
160 				 struct nvme_bdev_io *bio,
161 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
162 static int bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
163 				    struct nvme_bdev_io *bio,
164 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
165 static int bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio);
166 
167 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
168 				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
169 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
170 		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
171 
172 static populate_namespace_fn g_populate_namespace_fn[] = {
173 	NULL,
174 	nvme_ctrlr_populate_standard_namespace,
175 	bdev_ocssd_populate_namespace,
176 };
177 
178 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *ns);
179 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns);
180 
181 static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
182 	NULL,
183 	nvme_ctrlr_depopulate_standard_namespace,
184 	bdev_ocssd_depopulate_namespace,
185 };
186 
187 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns);
188 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
189 		struct nvme_bdev_ns *ns);
190 
191 static config_json_namespace_fn g_config_json_namespace_fn[] = {
192 	NULL,
193 	nvme_ctrlr_config_json_standard_namespace,
194 	bdev_ocssd_namespace_config_json,
195 };
196 
197 struct spdk_nvme_qpair *
198 spdk_bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
199 {
200 	struct nvme_io_channel *nvme_ch;
201 
202 	nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
203 
204 	return nvme_ch->qpair;
205 }
206 
207 static int
208 bdev_nvme_get_ctx_size(void)
209 {
210 	return sizeof(struct nvme_bdev_io);
211 }
212 
213 static struct spdk_bdev_module nvme_if = {
214 	.name = "nvme",
215 	.async_fini = true,
216 	.module_init = bdev_nvme_library_init,
217 	.module_fini = bdev_nvme_library_fini,
218 	.config_text = bdev_nvme_get_spdk_running_config,
219 	.config_json = bdev_nvme_config_json,
220 	.get_ctx_size = bdev_nvme_get_ctx_size,
221 
222 };
223 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
224 
225 static void
226 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
227 {
228 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "qpair %p is disconnected, attempting reconnect.\n", qpair);
229 	/*
230 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
231 	 * reconnect a qpair and we will stop getting a callback for this one.
232 	 */
233 	spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
234 }
235 
236 static int
237 bdev_nvme_poll(void *arg)
238 {
239 	struct nvme_bdev_poll_group *group = arg;
240 	int64_t num_completions;
241 
242 	if (group->collect_spin_stat && group->start_ticks == 0) {
243 		group->start_ticks = spdk_get_ticks();
244 	}
245 
246 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
247 			  bdev_nvme_disconnected_qpair_cb);
248 	if (group->collect_spin_stat) {
249 		if (num_completions > 0) {
250 			if (group->end_ticks != 0) {
251 				group->spin_ticks += (group->end_ticks - group->start_ticks);
252 				group->end_ticks = 0;
253 			}
254 			group->start_ticks = 0;
255 		} else {
256 			group->end_ticks = spdk_get_ticks();
257 		}
258 	}
259 
260 	return num_completions;
261 }
262 
263 static int
264 bdev_nvme_poll_adminq(void *arg)
265 {
266 	int32_t rc;
267 	struct spdk_nvme_ctrlr *ctrlr = arg;
268 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
269 
270 	rc = spdk_nvme_ctrlr_process_admin_completions(ctrlr);
271 
272 	if (rc < 0) {
273 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
274 		assert(nvme_bdev_ctrlr != NULL);
275 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
276 	}
277 
278 	return rc;
279 }
280 
281 static int
282 bdev_nvme_destruct(void *ctx)
283 {
284 	struct nvme_bdev *nvme_disk = ctx;
285 
286 	nvme_bdev_detach_bdev_from_ns(nvme_disk);
287 
288 	free(nvme_disk->disk.name);
289 	free(nvme_disk);
290 
291 	return 0;
292 }
293 
294 static int
295 bdev_nvme_flush(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio,
296 		uint64_t offset, uint64_t nbytes)
297 {
298 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
299 
300 	return 0;
301 }
302 
303 static void
304 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
305 {
306 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
307 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
308 	struct spdk_bdev_io *bdev_io;
309 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
310 
311 	/* A NULL ctx means success. */
312 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
313 		status = SPDK_BDEV_IO_STATUS_FAILED;
314 	}
315 
316 	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
317 		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
318 		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
319 		spdk_bdev_io_complete(bdev_io, status);
320 	}
321 
322 	spdk_for_each_channel_continue(i, 0);
323 }
324 
325 static void
326 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
327 {
328 	/* We are using the for_each_channel cb_arg like a return code here. */
329 	/* If it is zero the reset succeeded, otherwise it failed. */
330 	void *cb_arg = NULL;
331 
332 	if (rc) {
333 		cb_arg = (void *)0x1;
334 		SPDK_ERRLOG("Resetting controller failed.\n");
335 	} else {
336 		SPDK_NOTICELOG("Controller reset was successful.\n");
337 	}
338 
339 	pthread_mutex_lock(&g_bdev_nvme_mutex);
340 	nvme_bdev_ctrlr->resetting = false;
341 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
342 	/* Make sure we clear any pending resets before returning. */
343 	spdk_for_each_channel(nvme_bdev_ctrlr,
344 			      _bdev_nvme_complete_pending_resets,
345 			      cb_arg, NULL);
346 }
347 
348 static void
349 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
350 {
351 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
352 	void *ctx = spdk_io_channel_iter_get_ctx(i);
353 	int rc = SPDK_BDEV_IO_STATUS_SUCCESS;
354 
355 	if (status) {
356 		rc = SPDK_BDEV_IO_STATUS_FAILED;
357 	}
358 	if (ctx) {
359 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(ctx), rc);
360 	}
361 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
362 }
363 
364 static void
365 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
366 {
367 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
368 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
369 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
370 	struct spdk_nvme_io_qpair_opts opts;
371 
372 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
373 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
374 	opts.create_only = true;
375 
376 	nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
377 	if (!nvme_ch->qpair) {
378 		spdk_for_each_channel_continue(i, -1);
379 		return;
380 	}
381 
382 	assert(nvme_ch->group != NULL);
383 	if (spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair) != 0) {
384 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
385 		spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
386 		spdk_for_each_channel_continue(i, -1);
387 		return;
388 	}
389 
390 	if (spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair)) {
391 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
392 		spdk_nvme_poll_group_remove(nvme_ch->group->group, nvme_ch->qpair);
393 		spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
394 		spdk_for_each_channel_continue(i, -1);
395 		return;
396 	}
397 
398 	spdk_for_each_channel_continue(i, 0);
399 }
400 
401 static void
402 _bdev_nvme_reset(struct spdk_io_channel_iter *i, int status)
403 {
404 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
405 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
406 	int rc;
407 
408 	if (status) {
409 		if (bio) {
410 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
411 		}
412 		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
413 		return;
414 	}
415 
416 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
417 	if (rc != 0) {
418 		if (bio) {
419 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
420 		}
421 		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
422 		return;
423 	}
424 
425 	/* Recreate all of the I/O queue pairs */
426 	spdk_for_each_channel(nvme_bdev_ctrlr,
427 			      _bdev_nvme_reset_create_qpair,
428 			      bio,
429 			      _bdev_nvme_reset_create_qpairs_done);
430 
431 
432 }
433 
434 static void
435 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
436 {
437 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
438 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
439 	int rc;
440 
441 	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
442 	if (!rc) {
443 		nvme_ch->qpair = NULL;
444 	}
445 
446 	spdk_for_each_channel_continue(i, rc);
447 }
448 
449 static int
450 bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio)
451 {
452 	struct spdk_io_channel *ch;
453 	struct nvme_io_channel *nvme_ch;
454 
455 	pthread_mutex_lock(&g_bdev_nvme_mutex);
456 	if (nvme_bdev_ctrlr->destruct) {
457 		/* Don't bother resetting if the controller is in the process of being destructed. */
458 		if (bio) {
459 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
460 		}
461 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
462 		return 0;
463 	}
464 
465 	if (!nvme_bdev_ctrlr->resetting) {
466 		nvme_bdev_ctrlr->resetting = true;
467 	} else {
468 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
469 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
470 		/*
471 		 * The internal reset calls won't be queued. This is on purpose so that we don't
472 		 * interfere with the app framework reset strategy, i.e. we are deferring to the
473 		 * upper level. If it is already in the middle of a reset, we won't try to schedule another one.
474 		 */
475 		if (bio) {
476 			ch = spdk_get_io_channel(nvme_bdev_ctrlr);
477 			assert(ch != NULL);
478 			nvme_ch = spdk_io_channel_get_ctx(ch);
479 			TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, spdk_bdev_io_from_ctx(bio), module_link);
480 			spdk_put_io_channel(ch);
481 		}
482 		return 0;
483 	}
484 
485 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
486 	/* First, delete all NVMe I/O queue pairs. */
487 	spdk_for_each_channel(nvme_bdev_ctrlr,
488 			      _bdev_nvme_reset_destroy_qpair,
489 			      bio,
490 			      _bdev_nvme_reset);
491 
492 	return 0;
493 }
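
/*
 * Reset sequencing, a summary sketch of the flow implemented above:
 *
 *	bdev_nvme_reset()
 *	  -> spdk_for_each_channel(_bdev_nvme_reset_destroy_qpair)	free every I/O qpair
 *	  -> _bdev_nvme_reset()						spdk_nvme_ctrlr_reset()
 *	  -> spdk_for_each_channel(_bdev_nvme_reset_create_qpair)	re-allocate and reconnect qpairs
 *	  -> _bdev_nvme_reset_create_qpairs_done()			complete the reset I/O and pending resets
 *
 * Only one reset runs at a time; reset I/Os that arrive while one is in
 * progress are parked on the channel's pending_resets list and completed with
 * the first reset's status.
 */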
494 
495 static int
496 bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
497 		struct nvme_bdev_io *bio,
498 		uint64_t offset_blocks,
499 		uint64_t num_blocks);
500 
501 static void
502 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
503 		     bool success)
504 {
505 	int ret;
506 
507 	if (!success) {
508 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
509 		return;
510 	}
511 
512 	ret = bdev_nvme_readv((struct nvme_bdev *)bdev_io->bdev->ctxt,
513 			      ch,
514 			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
515 			      bdev_io->u.bdev.iovs,
516 			      bdev_io->u.bdev.iovcnt,
517 			      bdev_io->u.bdev.md_buf,
518 			      bdev_io->u.bdev.num_blocks,
519 			      bdev_io->u.bdev.offset_blocks);
520 
521 	if (spdk_likely(ret == 0)) {
522 		return;
523 	} else if (ret == -ENOMEM) {
524 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
525 	} else {
526 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
527 	}
528 }
529 
530 static int
531 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
532 {
533 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
534 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
535 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
536 
537 	if (nvme_ch->qpair == NULL) {
538 		/* The device is currently resetting */
539 		return -1;
540 	}
541 
542 	switch (bdev_io->type) {
543 	case SPDK_BDEV_IO_TYPE_READ:
544 		spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
545 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
546 		return 0;
547 
548 	case SPDK_BDEV_IO_TYPE_WRITE:
549 		return bdev_nvme_writev(nbdev,
550 					ch,
551 					nbdev_io,
552 					bdev_io->u.bdev.iovs,
553 					bdev_io->u.bdev.iovcnt,
554 					bdev_io->u.bdev.md_buf,
555 					bdev_io->u.bdev.num_blocks,
556 					bdev_io->u.bdev.offset_blocks);
557 
558 	case SPDK_BDEV_IO_TYPE_COMPARE:
559 		return bdev_nvme_comparev(nbdev,
560 					  ch,
561 					  nbdev_io,
562 					  bdev_io->u.bdev.iovs,
563 					  bdev_io->u.bdev.iovcnt,
564 					  bdev_io->u.bdev.md_buf,
565 					  bdev_io->u.bdev.num_blocks,
566 					  bdev_io->u.bdev.offset_blocks);
567 
568 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
569 		return bdev_nvme_comparev_and_writev(nbdev,
570 						     ch,
571 						     nbdev_io,
572 						     bdev_io->u.bdev.iovs,
573 						     bdev_io->u.bdev.iovcnt,
574 						     bdev_io->u.bdev.fused_iovs,
575 						     bdev_io->u.bdev.fused_iovcnt,
576 						     bdev_io->u.bdev.md_buf,
577 						     bdev_io->u.bdev.num_blocks,
578 						     bdev_io->u.bdev.offset_blocks);
579 
580 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
581 		return bdev_nvme_unmap(nbdev,
582 				       ch,
583 				       nbdev_io,
584 				       bdev_io->u.bdev.offset_blocks,
585 				       bdev_io->u.bdev.num_blocks);
586 
587 	case SPDK_BDEV_IO_TYPE_UNMAP:
588 		return bdev_nvme_unmap(nbdev,
589 				       ch,
590 				       nbdev_io,
591 				       bdev_io->u.bdev.offset_blocks,
592 				       bdev_io->u.bdev.num_blocks);
593 
594 	case SPDK_BDEV_IO_TYPE_RESET:
595 		return bdev_nvme_reset(nbdev->nvme_bdev_ctrlr, nbdev_io);
596 
597 	case SPDK_BDEV_IO_TYPE_FLUSH:
598 		return bdev_nvme_flush(nbdev,
599 				       nbdev_io,
600 				       bdev_io->u.bdev.offset_blocks,
601 				       bdev_io->u.bdev.num_blocks);
602 
603 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
604 		return bdev_nvme_admin_passthru(nbdev,
605 						ch,
606 						nbdev_io,
607 						&bdev_io->u.nvme_passthru.cmd,
608 						bdev_io->u.nvme_passthru.buf,
609 						bdev_io->u.nvme_passthru.nbytes);
610 
611 	case SPDK_BDEV_IO_TYPE_NVME_IO:
612 		return bdev_nvme_io_passthru(nbdev,
613 					     ch,
614 					     nbdev_io,
615 					     &bdev_io->u.nvme_passthru.cmd,
616 					     bdev_io->u.nvme_passthru.buf,
617 					     bdev_io->u.nvme_passthru.nbytes);
618 
619 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
620 		return bdev_nvme_io_passthru_md(nbdev,
621 						ch,
622 						nbdev_io,
623 						&bdev_io->u.nvme_passthru.cmd,
624 						bdev_io->u.nvme_passthru.buf,
625 						bdev_io->u.nvme_passthru.nbytes,
626 						bdev_io->u.nvme_passthru.md_buf,
627 						bdev_io->u.nvme_passthru.md_len);
628 
629 	default:
630 		return -EINVAL;
631 	}
632 	return 0;
633 }
634 
635 static void
636 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
637 {
638 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
639 
640 	if (spdk_unlikely(rc != 0)) {
641 		if (rc == -ENOMEM) {
642 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
643 		} else {
644 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
645 		}
646 	}
647 }
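
/*
 * Error mapping for the submit path above: -ENOMEM (typically an exhausted
 * qpair request pool) is reported as SPDK_BDEV_IO_STATUS_NOMEM so that the
 * generic bdev layer queues the I/O and retries it later; any other non-zero
 * return value is treated as a hard failure.
 */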
648 
649 static bool
650 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
651 {
652 	struct nvme_bdev *nbdev = ctx;
653 	const struct spdk_nvme_ctrlr_data *cdata;
654 
655 	switch (io_type) {
656 	case SPDK_BDEV_IO_TYPE_READ:
657 	case SPDK_BDEV_IO_TYPE_WRITE:
658 	case SPDK_BDEV_IO_TYPE_RESET:
659 	case SPDK_BDEV_IO_TYPE_FLUSH:
660 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
661 	case SPDK_BDEV_IO_TYPE_NVME_IO:
662 		return true;
663 
664 	case SPDK_BDEV_IO_TYPE_COMPARE:
665 		return spdk_nvme_ns_supports_compare(nbdev->nvme_ns->ns);
666 
667 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
668 		return spdk_nvme_ns_get_md_size(nbdev->nvme_ns->ns) ? true : false;
669 
670 	case SPDK_BDEV_IO_TYPE_UNMAP:
671 		cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_bdev_ctrlr->ctrlr);
672 		return cdata->oncs.dsm;
673 
674 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
675 		cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_bdev_ctrlr->ctrlr);
676 		/*
677 		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
678 		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
679 		 */
680 		if (cdata->oncs.dsm &&
681 		    spdk_nvme_ns_get_dealloc_logical_block_read_value(nbdev->nvme_ns->ns) ==
682 		    SPDK_NVME_DEALLOC_READ_00) {
683 			return true;
684 		}
685 		/*
686 		 * The NVMe controller write_zeroes function is currently not used by our driver.
687 		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
688 		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
689 		 */
690 		return false;
691 
692 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
693 		if (spdk_nvme_ctrlr_get_flags(nbdev->nvme_bdev_ctrlr->ctrlr) &
694 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
695 			return true;
696 		}
697 		return false;
698 
699 	default:
700 		return false;
701 	}
702 }
703 
704 static int
705 bdev_nvme_create_cb(void *io_device, void *ctx_buf)
706 {
707 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
708 	struct nvme_io_channel *ch = ctx_buf;
709 	struct spdk_nvme_io_qpair_opts opts;
710 	struct spdk_io_channel *pg_ch = NULL;
711 	int rc;
712 
713 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
714 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
715 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
716 	opts.create_only = true;
717 	g_opts.io_queue_requests = opts.io_queue_requests;
718 
719 	ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
720 
721 	if (ch->qpair == NULL) {
722 		return -1;
723 	}
724 
725 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
726 		if (bdev_ocssd_create_io_channel(ch)) {
727 			goto err;
728 		}
729 	}
730 
731 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
732 	if (!pg_ch) {
733 		goto err;
734 	}
735 
736 	ch->group = spdk_io_channel_get_ctx(pg_ch);
737 	if (spdk_nvme_poll_group_add(ch->group->group, ch->qpair) != 0) {
738 		goto err;
739 	}
740 
741 	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, ch->qpair);
742 	if (rc) {
743 		spdk_nvme_poll_group_remove(ch->group->group, ch->qpair);
744 		goto err;
745 	}
746 
747 #ifdef SPDK_CONFIG_VTUNE
748 	ch->group->collect_spin_stat = true;
749 #else
750 	ch->group->collect_spin_stat = false;
751 #endif
752 
753 	TAILQ_INIT(&ch->pending_resets);
754 	return 0;
755 
756 err:
757 	if (pg_ch) {
758 		spdk_put_io_channel(pg_ch);
759 	}
760 	spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
761 	return -1;
762 }
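
/*
 * Channel creation sketch: each spdk_get_io_channel() on the controller lands
 * here on the calling thread, so every thread gets its own qpair, created
 * unconnected (opts.create_only = true), added to the thread's shared poll
 * group, and only then connected.  A consumer might do, for example:
 *
 *	struct spdk_io_channel *ch = spdk_get_io_channel(nvme_bdev_ctrlr);
 *	struct spdk_nvme_qpair *qpair = spdk_bdev_nvme_get_io_qpair(ch);
 *	...
 *	spdk_put_io_channel(ch);
 */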
763 
764 static void
765 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
766 {
767 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
768 	struct nvme_io_channel *ch = ctx_buf;
769 	struct nvme_bdev_poll_group *group;
770 
771 	group = ch->group;
772 	assert(group != NULL);
773 
774 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
775 		bdev_ocssd_destroy_io_channel(ch);
776 	}
777 
778 	if (ch->qpair != NULL) {
779 		spdk_nvme_poll_group_remove(group->group, ch->qpair);
780 	}
781 	spdk_put_io_channel(spdk_io_channel_from_ctx(group));
782 
783 	spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
784 }
785 
786 static int
787 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
788 {
789 	struct nvme_bdev_poll_group *group = ctx_buf;
790 
791 	group->group = spdk_nvme_poll_group_create(group);
792 	if (group->group == NULL) {
793 		return -1;
794 	}
795 
796 	group->poller = spdk_poller_register(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
797 
798 	if (group->poller == NULL) {
799 		spdk_nvme_poll_group_destroy(group->group);
800 		return -1;
801 	}
802 
803 	return 0;
804 }
805 
806 static void
807 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
808 {
809 	struct nvme_bdev_poll_group *group = ctx_buf;
810 
811 	spdk_poller_unregister(&group->poller);
812 	if (spdk_nvme_poll_group_destroy(group->group)) {
813 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
814 		assert(false);
815 	}
816 }
817 
818 static struct spdk_io_channel *
819 bdev_nvme_get_io_channel(void *ctx)
820 {
821 	struct nvme_bdev *nvme_bdev = ctx;
822 
823 	return spdk_get_io_channel(nvme_bdev->nvme_bdev_ctrlr);
824 }
825 
826 static int
827 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
828 {
829 	struct nvme_bdev *nvme_bdev = ctx;
830 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_bdev->nvme_bdev_ctrlr;
831 	const struct spdk_nvme_ctrlr_data *cdata;
832 	struct spdk_nvme_ns *ns;
833 	union spdk_nvme_vs_register vs;
834 	union spdk_nvme_csts_register csts;
835 	char buf[128];
836 
837 	cdata = spdk_nvme_ctrlr_get_data(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
838 	vs = spdk_nvme_ctrlr_get_regs_vs(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
839 	csts = spdk_nvme_ctrlr_get_regs_csts(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
840 	ns = nvme_bdev->nvme_ns->ns;
841 
842 	spdk_json_write_named_object_begin(w, "nvme");
843 
844 	if (nvme_bdev_ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
845 		spdk_json_write_named_string(w, "pci_address", nvme_bdev_ctrlr->trid.traddr);
846 	}
847 
848 	spdk_json_write_named_object_begin(w, "trid");
849 
850 	nvme_bdev_dump_trid_json(&nvme_bdev_ctrlr->trid, w);
851 
852 	spdk_json_write_object_end(w);
853 
854 #ifdef SPDK_CONFIG_NVME_CUSE
855 	size_t cuse_name_size = 128;
856 	char cuse_name[cuse_name_size];
857 
858 	int rc = spdk_nvme_cuse_get_ns_name(nvme_bdev->nvme_bdev_ctrlr->ctrlr, spdk_nvme_ns_get_id(ns),
859 					    cuse_name, &cuse_name_size);
860 	if (rc == 0) {
861 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
862 	}
863 #endif
864 
865 	spdk_json_write_named_object_begin(w, "ctrlr_data");
866 
867 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
868 
869 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
870 	spdk_str_trim(buf);
871 	spdk_json_write_named_string(w, "model_number", buf);
872 
873 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
874 	spdk_str_trim(buf);
875 	spdk_json_write_named_string(w, "serial_number", buf);
876 
877 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
878 	spdk_str_trim(buf);
879 	spdk_json_write_named_string(w, "firmware_revision", buf);
880 
881 	spdk_json_write_named_object_begin(w, "oacs");
882 
883 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
884 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
885 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
886 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
887 
888 	spdk_json_write_object_end(w);
889 
890 	spdk_json_write_object_end(w);
891 
892 	spdk_json_write_named_object_begin(w, "vs");
893 
894 	spdk_json_write_name(w, "nvme_version");
895 	if (vs.bits.ter) {
896 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
897 	} else {
898 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
899 	}
900 
901 	spdk_json_write_object_end(w);
902 
903 	spdk_json_write_named_object_begin(w, "csts");
904 
905 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
906 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
907 
908 	spdk_json_write_object_end(w);
909 
910 	spdk_json_write_named_object_begin(w, "ns_data");
911 
912 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
913 
914 	spdk_json_write_object_end(w);
915 
916 	if (cdata->oacs.security) {
917 		spdk_json_write_named_object_begin(w, "security");
918 
919 		spdk_json_write_named_bool(w, "opal", spdk_opal_supported(nvme_bdev_ctrlr->opal_dev));
920 
921 		spdk_json_write_object_end(w);
922 	}
923 
924 	spdk_json_write_object_end(w);
925 
926 	return 0;
927 }
928 
929 static void
930 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
931 {
932 	/* No config per bdev needed */
933 }
934 
935 static uint64_t
936 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
937 {
938 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
939 	struct nvme_bdev_poll_group *group = nvme_ch->group;
940 	uint64_t spin_time;
941 
942 	if (!group || !group->collect_spin_stat) {
943 		return 0;
944 	}
945 
946 	if (group->end_ticks != 0) {
947 		group->spin_ticks += (group->end_ticks - group->start_ticks);
948 		group->end_ticks = 0;
949 	}
950 
951 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
952 	group->start_ticks = 0;
953 	group->spin_ticks = 0;
954 
955 	return spin_time;
956 }
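
/*
 * Spin-time arithmetic example (hypothetical numbers): with a 2.4 GHz tick
 * rate and 2,400,000 accumulated spin ticks, the value returned above is
 * 2,400,000 * 1,000,000 / 2,400,000,000 = 1000, i.e. 1000 microseconds spent
 * polling without finding completions since the previous query.
 */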
957 
958 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
959 	.destruct		= bdev_nvme_destruct,
960 	.submit_request		= bdev_nvme_submit_request,
961 	.io_type_supported	= bdev_nvme_io_type_supported,
962 	.get_io_channel		= bdev_nvme_get_io_channel,
963 	.dump_info_json		= bdev_nvme_dump_info_json,
964 	.write_config_json	= bdev_nvme_write_config_json,
965 	.get_spin_time		= bdev_nvme_get_spin_time,
966 };
967 
968 static void
969 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
970 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
971 {
972 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
973 	struct nvme_bdev	*bdev;
974 	struct spdk_nvme_ns	*ns;
975 	const struct spdk_uuid	*uuid;
976 	const struct spdk_nvme_ctrlr_data *cdata;
977 	const struct spdk_nvme_ns_data *nsdata;
978 	int			rc;
979 
980 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
981 
982 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
983 	if (!ns) {
984 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Invalid NS %d\n", nvme_ns->id);
985 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -EINVAL);
986 		return;
987 	}
988 
989 	bdev = calloc(1, sizeof(*bdev));
990 	if (!bdev) {
991 		SPDK_ERRLOG("bdev calloc() failed\n");
992 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
993 		return;
994 	}
995 
996 	bdev->nvme_bdev_ctrlr = nvme_bdev_ctrlr;
997 	nvme_ns->ns = ns;
998 	bdev->nvme_ns = nvme_ns;
999 
1000 	bdev->disk.name = spdk_sprintf_alloc("%sn%d", nvme_bdev_ctrlr->name, spdk_nvme_ns_get_id(ns));
1001 	if (!bdev->disk.name) {
1002 		free(bdev);
1003 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
1004 		return;
1005 	}
1006 	bdev->disk.product_name = "NVMe disk";
1007 
1008 	bdev->disk.write_cache = 0;
1009 	if (cdata->vwc.present) {
1010 		/* Enable if the Volatile Write Cache exists */
1011 		bdev->disk.write_cache = 1;
1012 	}
1013 	bdev->disk.blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1014 	bdev->disk.blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1015 	bdev->disk.optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1016 
1017 	uuid = spdk_nvme_ns_get_uuid(ns);
1018 	if (uuid != NULL) {
1019 		bdev->disk.uuid = *uuid;
1020 	}
1021 
1022 	nsdata = spdk_nvme_ns_get_data(ns);
1023 
1024 	bdev->disk.md_len = spdk_nvme_ns_get_md_size(ns);
1025 	if (bdev->disk.md_len != 0) {
1026 		bdev->disk.md_interleave = nsdata->flbas.extended;
1027 		bdev->disk.dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1028 		if (bdev->disk.dif_type != SPDK_DIF_DISABLE) {
1029 			bdev->disk.dif_is_head_of_md = nsdata->dps.md_start;
1030 			bdev->disk.dif_check_flags = nvme_bdev_ctrlr->prchk_flags;
1031 		}
1032 	}
1033 
1034 	if (!bdev_nvme_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
1035 		bdev->disk.acwu = 0;
1036 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1037 		bdev->disk.acwu = nsdata->nacwu;
1038 	} else {
1039 		bdev->disk.acwu = cdata->acwu;
1040 	}
1041 
1042 	bdev->disk.ctxt = bdev;
1043 	bdev->disk.fn_table = &nvmelib_fn_table;
1044 	bdev->disk.module = &nvme_if;
1045 	rc = spdk_bdev_register(&bdev->disk);
1046 	if (rc) {
1047 		free(bdev->disk.name);
1048 		free(bdev);
1049 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1050 		return;
1051 	}
1052 
1053 	nvme_bdev_attach_bdev_to_ns(nvme_ns, bdev);
1054 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, 0);
1055 }
1056 
1057 static bool
1058 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1059 		 struct spdk_nvme_ctrlr_opts *opts)
1060 {
1061 	struct nvme_probe_skip_entry *entry;
1062 
1063 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1064 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1065 			return false;
1066 		}
1067 	}
1068 
1069 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1070 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1071 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1072 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1073 
1074 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attaching to %s\n", trid->traddr);
1075 
1076 	return true;
1077 }
1078 
1079 static bool
1080 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1081 	 struct spdk_nvme_ctrlr_opts *opts)
1082 {
1083 	struct nvme_probe_ctx *ctx = cb_ctx;
1084 
1085 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Probing device %s\n", trid->traddr);
1086 
1087 	if (nvme_bdev_ctrlr_get(trid)) {
1088 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n",
1089 			    trid->traddr);
1090 		return false;
1091 	}
1092 
1093 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1094 		bool claim_device = false;
1095 		size_t i;
1096 
1097 		for (i = 0; i < ctx->count; i++) {
1098 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1099 				claim_device = true;
1100 				break;
1101 			}
1102 		}
1103 
1104 		if (!claim_device) {
1105 			SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Not claiming device at %s\n", trid->traddr);
1106 			return false;
1107 		}
1108 	}
1109 
1110 	if (ctx->hostnqn) {
1111 		snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", ctx->hostnqn);
1112 	}
1113 
1114 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1115 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1116 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1117 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1118 
1119 	return true;
1120 }
1121 
1122 static void
1123 spdk_nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1124 {
1125 	struct spdk_nvme_ctrlr *ctrlr = ctx;
1126 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1127 
1128 	if (spdk_nvme_cpl_is_error(cpl)) {
1129 		SPDK_WARNLOG("Abort failed. Resetting controller.\n");
1130 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1131 		assert(nvme_bdev_ctrlr != NULL);
1132 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1133 	}
1134 }
1135 
1136 static void
1137 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1138 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1139 {
1140 	int rc;
1141 	union spdk_nvme_csts_register csts;
1142 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1143 
1144 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1145 
1146 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1147 	if (csts.bits.cfs) {
1148 		SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1149 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1150 		assert(nvme_bdev_ctrlr != NULL);
1151 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1152 		return;
1153 	}
1154 
1155 	switch (g_opts.action_on_timeout) {
1156 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1157 		if (qpair) {
1158 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1159 						       spdk_nvme_abort_cpl, ctrlr);
1160 			if (rc == 0) {
1161 				return;
1162 			}
1163 
1164 			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
1165 		}
1166 
1167 	/* FALLTHROUGH */
1168 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1169 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1170 		assert(nvme_bdev_ctrlr != NULL);
1171 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1172 		break;
1173 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1174 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "No action for nvme controller timeout.\n");
1175 		break;
1176 	default:
1177 		SPDK_ERRLOG("An invalid timeout action value was specified.\n");
1178 		break;
1179 	}
1180 }
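
/*
 * Timeout policy summary: a Controller Fatal Status always forces a reset.
 * Otherwise the configured action applies: ABORT first tries
 * spdk_nvme_ctrlr_cmd_abort() for the timed-out command and falls through to a
 * full controller reset if the abort cannot be submitted (or if the timeout
 * happened on the admin queue, where qpair is NULL); RESET resets immediately;
 * NONE only logs.
 */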
1181 
1182 void
1183 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1184 {
1185 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1186 	nvme_bdev_ctrlr->ref--;
1187 
1188 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
1189 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1190 		nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1191 		return;
1192 	}
1193 
1194 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1195 }
1196 
1197 static void
1198 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns)
1199 {
1200 	struct nvme_bdev *bdev, *tmp;
1201 
1202 	TAILQ_FOREACH_SAFE(bdev, &ns->bdevs, tailq, tmp) {
1203 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1204 	}
1205 
1206 	ns->populated = false;
1207 
1208 	nvme_ctrlr_depopulate_namespace_done(ns->ctrlr);
1209 }
1210 
1211 static void nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns,
1212 		struct nvme_async_probe_ctx *ctx)
1213 {
1214 	g_populate_namespace_fn[ns->type](ctrlr, ns, ctx);
1215 }
1216 
1217 static void nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns)
1218 {
1219 	g_depopulate_namespace_fn[ns->type](ns);
1220 }
1221 
1222 void
1223 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1224 				   struct nvme_bdev_ns *ns, int rc)
1225 {
1226 	if (rc == 0) {
1227 		ns->populated = true;
1228 		pthread_mutex_lock(&g_bdev_nvme_mutex);
1229 		ns->ctrlr->ref++;
1230 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1231 	} else {
1232 		memset(ns, 0, sizeof(*ns));
1233 	}
1234 
1235 	if (ctx) {
1236 		ctx->populates_in_progress--;
1237 		if (ctx->populates_in_progress == 0) {
1238 			nvme_ctrlr_populate_namespaces_done(ctx);
1239 		}
1240 	}
1241 }
1242 
1243 static void
1244 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1245 			       struct nvme_async_probe_ctx *ctx)
1246 {
1247 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1248 	struct nvme_bdev_ns	*ns;
1249 	struct spdk_nvme_ns	*nvme_ns;
1250 	struct nvme_bdev	*bdev;
1251 	uint32_t		i;
1252 	int			rc;
1253 	uint64_t		num_sectors;
1254 	bool			ns_is_active;
1255 
1256 	if (ctx) {
1257 		/* Initialize this count to 1 to handle the populate functions
1258 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1259 		 */
1260 		ctx->populates_in_progress = 1;
1261 	}
1262 
1263 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1264 		uint32_t	nsid = i + 1;
1265 
1266 		ns = nvme_bdev_ctrlr->namespaces[i];
1267 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1268 
1269 		if (ns->populated && ns_is_active && ns->type == NVME_BDEV_NS_STANDARD) {
1270 			/* NS is still there but attributes may have changed */
1271 			nvme_ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1272 			num_sectors = spdk_nvme_ns_get_num_sectors(nvme_ns);
1273 			bdev = TAILQ_FIRST(&ns->bdevs);
1274 			if (bdev->disk.blockcnt != num_sectors) {
1275 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %lu, new size %lu\n",
1276 					       nsid,
1277 					       bdev->disk.name,
1278 					       bdev->disk.blockcnt,
1279 					       num_sectors);
1280 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1281 				if (rc != 0) {
1282 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1283 						    bdev->disk.name, rc);
1284 				}
1285 			}
1286 		}
1287 
1288 		if (!ns->populated && ns_is_active) {
1289 			ns->id = nsid;
1290 			ns->ctrlr = nvme_bdev_ctrlr;
1291 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1292 				ns->type = NVME_BDEV_NS_OCSSD;
1293 			} else {
1294 				ns->type = NVME_BDEV_NS_STANDARD;
1295 			}
1296 
1297 			TAILQ_INIT(&ns->bdevs);
1298 
1299 			if (ctx) {
1300 				ctx->populates_in_progress++;
1301 			}
1302 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, ns, ctx);
1303 		}
1304 
1305 		if (ns->populated && !ns_is_active) {
1306 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
1307 		}
1308 	}
1309 
1310 	if (ctx) {
1311 		/* Decrement this count now that the loop is over to account
1312 		 * for the one we started with.  If the count is then 0, we
1313 		 * know any populate_namespace functions completed immediately,
1314 		 * so we'll kick the callback here.
1315 		 */
1316 		ctx->populates_in_progress--;
1317 		if (ctx->populates_in_progress == 0) {
1318 			nvme_ctrlr_populate_namespaces_done(ctx);
1319 		}
1320 	}
1321 
1322 }
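
/*
 * Bookkeeping example for populates_in_progress, assuming a hypothetical
 * controller with two active namespaces that both populate asynchronously:
 *
 *	ctx->populates_in_progress = 1		guard taken before the loop
 *	                           +1, +1	one per nvme_ctrlr_populate_namespace()
 *	                           -1, -1	as each namespace finishes
 *	                           -1		guard dropped after the loop
 *	                           == 0		nvme_ctrlr_populate_namespaces_done()
 *
 * The initial 1 keeps the done callback from firing while the loop is still
 * dispatching populate calls that may complete inline.
 */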
1323 
1324 static void
1325 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1326 {
1327 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1328 	union spdk_nvme_async_event_completion	event;
1329 
1330 	if (spdk_nvme_cpl_is_error(cpl)) {
1331 		SPDK_WARNLOG("AER request execution failed\n");
1332 		return;
1333 	}
1334 
1335 	event.raw = cpl->cdw0;
1336 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1337 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1338 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1339 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1340 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1341 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1342 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1343 	}
1344 }
1345 
1346 static int
1347 create_ctrlr(struct spdk_nvme_ctrlr *ctrlr,
1348 	     const char *name,
1349 	     const struct spdk_nvme_transport_id *trid,
1350 	     uint32_t prchk_flags)
1351 {
1352 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1353 	uint32_t i;
1354 	int rc;
1355 
1356 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1357 	if (nvme_bdev_ctrlr == NULL) {
1358 		SPDK_ERRLOG("Failed to allocate device struct\n");
1359 		return -ENOMEM;
1360 	}
1361 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1362 	nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1363 	if (!nvme_bdev_ctrlr->namespaces) {
1364 		SPDK_ERRLOG("Failed to allocate block namespaces array\n");
1365 		free(nvme_bdev_ctrlr);
1366 		return -ENOMEM;
1367 	}
1368 
1369 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1370 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1371 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1372 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1373 			for (; i > 0; i--) {
1374 				free(nvme_bdev_ctrlr->namespaces[i - 1]);
1375 			}
1376 			free(nvme_bdev_ctrlr->namespaces);
1377 			free(nvme_bdev_ctrlr);
1378 			return -ENOMEM;
1379 		}
1380 	}
1381 
1382 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1383 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1384 	nvme_bdev_ctrlr->ref = 0;
1385 	nvme_bdev_ctrlr->trid = *trid;
1386 	nvme_bdev_ctrlr->name = strdup(name);
1387 	if (nvme_bdev_ctrlr->name == NULL) {
1388 		free(nvme_bdev_ctrlr->namespaces);
1389 		free(nvme_bdev_ctrlr);
1390 		return -ENOMEM;
1391 	}
1392 
1393 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1394 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1395 		if (spdk_unlikely(rc != 0)) {
1396 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1397 			free(nvme_bdev_ctrlr->name);
1398 			free(nvme_bdev_ctrlr->namespaces);
1399 			free(nvme_bdev_ctrlr);
1400 			return rc;
1401 		}
1402 	}
1403 
1404 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1405 
1406 	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
1407 				sizeof(struct nvme_io_channel),
1408 				name);
1409 
1410 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, ctrlr,
1411 					       g_opts.nvme_adminq_poll_period_us);
1412 
1413 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1414 
1415 	if (g_opts.timeout_us > 0) {
1416 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1417 				timeout_cb, NULL);
1418 	}
1419 
1420 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1421 
1422 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1423 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1424 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1425 		if (nvme_bdev_ctrlr->opal_dev == NULL) {
1426 			SPDK_ERRLOG("Failed to initialize Opal\n");
1427 		}
1428 	}
1429 	return 0;
1430 }
1431 
1432 static void
1433 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1434 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1435 {
1436 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1437 	struct nvme_probe_ctx *ctx = cb_ctx;
1438 	char *name = NULL;
1439 	uint32_t prchk_flags = 0;
1440 	size_t i;
1441 
1442 	if (ctx) {
1443 		for (i = 0; i < ctx->count; i++) {
1444 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1445 				prchk_flags = ctx->prchk_flags[i];
1446 				name = strdup(ctx->names[i]);
1447 				break;
1448 			}
1449 		}
1450 	} else {
1451 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1452 	}
1453 	if (!name) {
1454 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1455 		return;
1456 	}
1457 
1458 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attached to %s (%s)\n", trid->traddr, name);
1459 
1460 	create_ctrlr(ctrlr, name, trid, prchk_flags);
1461 
1462 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid);
1463 	if (!nvme_bdev_ctrlr) {
1464 		SPDK_ERRLOG("Failed to find new NVMe controller\n");
1465 		free(name);
1466 		return;
1467 	}
1468 
1469 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1470 
1471 	free(name);
1472 }
1473 
1474 static void
1475 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
1476 {
1477 	uint32_t i;
1478 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1479 	struct nvme_bdev_ns *ns;
1480 
1481 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1482 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
1483 		if (nvme_bdev_ctrlr->ctrlr == ctrlr) {
1484 			/* The controller's destruction was already started */
1485 			if (nvme_bdev_ctrlr->destruct) {
1486 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1487 				return;
1488 			}
1489 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
1490 			for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1491 				uint32_t	nsid = i + 1;
1492 
1493 				ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1494 				if (ns->populated) {
1495 					assert(ns->id == nsid);
1496 					nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
1497 				}
1498 			}
1499 
1500 			pthread_mutex_lock(&g_bdev_nvme_mutex);
1501 			nvme_bdev_ctrlr->destruct = true;
1502 			if (nvme_bdev_ctrlr->ref == 0) {
1503 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1504 				nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1505 			} else {
1506 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1507 			}
1508 			return;
1509 		}
1510 	}
1511 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1512 }
1513 
1514 static int
1515 bdev_nvme_hotplug(void *arg)
1516 {
1517 	struct spdk_nvme_transport_id trid_pcie;
1518 	int done;
1519 
1520 	if (!g_hotplug_probe_ctx) {
1521 		memset(&trid_pcie, 0, sizeof(trid_pcie));
1522 		spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
1523 
1524 		g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
1525 				      hotplug_probe_cb,
1526 				      attach_cb, remove_cb);
1527 		if (!g_hotplug_probe_ctx) {
1528 			return -1;
1529 		}
1530 	}
1531 
1532 	done = spdk_nvme_probe_poll_async(g_hotplug_probe_ctx);
1533 	if (done != -EAGAIN) {
1534 		g_hotplug_probe_ctx = NULL;
1535 		return 1;
1536 	}
1537 
1538 	return -1;
1539 }
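
/*
 * Hotplug polling sketch: spdk_nvme_probe_async() is started once for the PCIe
 * transport and each poller tick drives it with spdk_nvme_probe_poll_async(),
 * which keeps returning -EAGAIN until the enumeration pass finishes.  At that
 * point the context is dropped and a fresh probe starts on the next tick, so
 * newly inserted or removed devices are picked up via attach_cb()/remove_cb().
 */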
1540 
1541 void
1542 spdk_bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
1543 {
1544 	*opts = g_opts;
1545 }
1546 
1547 int
1548 spdk_bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
1549 {
1550 	if (g_bdev_nvme_init_thread != NULL) {
1551 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1552 			return -EPERM;
1553 		}
1554 	}
1555 
1556 	g_opts = *opts;
1557 
1558 	return 0;
1559 }
1560 
1561 struct set_nvme_hotplug_ctx {
1562 	uint64_t period_us;
1563 	bool enabled;
1564 	spdk_msg_fn fn;
1565 	void *fn_ctx;
1566 };
1567 
1568 static void
1569 set_nvme_hotplug_period_cb(void *_ctx)
1570 {
1571 	struct set_nvme_hotplug_ctx *ctx = _ctx;
1572 
1573 	spdk_poller_unregister(&g_hotplug_poller);
1574 	if (ctx->enabled) {
1575 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
1576 	}
1577 
1578 	g_nvme_hotplug_poll_period_us = ctx->period_us;
1579 	g_nvme_hotplug_enabled = ctx->enabled;
1580 	if (ctx->fn) {
1581 		ctx->fn(ctx->fn_ctx);
1582 	}
1583 
1584 	free(ctx);
1585 }
1586 
1587 int
1588 spdk_bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
1589 {
1590 	struct set_nvme_hotplug_ctx *ctx;
1591 
1592 	if (enabled == true && !spdk_process_is_primary()) {
1593 		return -EPERM;
1594 	}
1595 
1596 	ctx = calloc(1, sizeof(*ctx));
1597 	if (ctx == NULL) {
1598 		return -ENOMEM;
1599 	}
1600 
1601 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
1602 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
1603 	ctx->enabled = enabled;
1604 	ctx->fn = cb;
1605 	ctx->fn_ctx = cb_ctx;
1606 
1607 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
1608 	return 0;
1609 }
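
/*
 * Usage example, assuming the standard scripts/rpc.py helper (option spelling
 * may differ between SPDK releases):
 *
 *	./scripts/rpc.py bdev_nvme_set_hotplug -e -r 100000	enable, poll every 100 ms
 *	./scripts/rpc.py bdev_nvme_set_hotplug -d		disable again
 *
 * The poll period is clamped to NVME_HOTPLUG_POLL_PERIOD_MAX and falls back to
 * NVME_HOTPLUG_POLL_PERIOD_DEFAULT when 0 is passed.
 */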
1610 
1611 static void
1612 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1613 {
1614 	if (ctx->cb_fn) {
1615 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1616 	}
1617 
1618 	free(ctx);
1619 }
1620 
1621 static void
1622 nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx)
1623 {
1624 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1625 	struct nvme_bdev_ns	*ns;
1626 	struct nvme_bdev	*nvme_bdev, *tmp;
1627 	uint32_t		i, nsid;
1628 	size_t			j;
1629 
1630 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
1631 	assert(nvme_bdev_ctrlr != NULL);
1632 
1633 	/*
1634 	 * Report the new bdevs that were created in this call.
1635 	 * There can be more than one bdev per NVMe controller.
1636 	 */
1637 	j = 0;
1638 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1639 		nsid = i + 1;
1640 		ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1641 		if (!ns->populated) {
1642 			continue;
1643 		}
1644 		assert(ns->id == nsid);
1645 		TAILQ_FOREACH_SAFE(nvme_bdev, &ns->bdevs, tailq, tmp) {
1646 			if (j < ctx->count) {
1647 				ctx->names[j] = nvme_bdev->disk.name;
1648 				j++;
1649 			} else {
1650 				SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
1651 					    ctx->count);
1652 				populate_namespaces_cb(ctx, 0, -ERANGE);
1653 				return;
1654 			}
1655 		}
1656 	}
1657 
1658 	populate_namespaces_cb(ctx, j, 0);
1659 }
1660 
1661 static void
1662 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1663 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1664 {
1665 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
1666 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1667 	struct nvme_async_probe_ctx *ctx;
1668 	int rc;
1669 
1670 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
1671 
1672 	spdk_poller_unregister(&ctx->poller);
1673 
1674 	rc = create_ctrlr(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags);
1675 	if (rc) {
1676 		SPDK_ERRLOG("Failed to create new device\n");
1677 		populate_namespaces_cb(ctx, 0, rc);
1678 		return;
1679 	}
1680 
1681 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
1682 	assert(nvme_bdev_ctrlr != NULL);
1683 
1684 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
1685 }
1686 
1687 static int
1688 bdev_nvme_async_poll(void *arg)
1689 {
1690 	struct nvme_async_probe_ctx	*ctx = arg;
1691 	int				rc;
1692 
1693 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
1694 	if (spdk_unlikely(rc != -EAGAIN && rc != 0)) {
1695 		spdk_poller_unregister(&ctx->poller);
1696 		free(ctx);
1697 	}
1698 
1699 	return 1;
1700 }
1701 
1702 int
1703 spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid,
1704 		      struct spdk_nvme_host_id *hostid,
1705 		      const char *base_name,
1706 		      const char **names,
1707 		      uint32_t count,
1708 		      const char *hostnqn,
1709 		      uint32_t prchk_flags,
1710 		      spdk_bdev_create_nvme_fn cb_fn,
1711 		      void *cb_ctx)
1712 {
1713 	struct nvme_probe_skip_entry	*entry, *tmp;
1714 	struct nvme_async_probe_ctx	*ctx;
1715 
1716 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
1717 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
1718 		return -EEXIST;
1719 	}
1720 
1721 	if (nvme_bdev_ctrlr_get_by_name(base_name)) {
1722 		SPDK_ERRLOG("A controller with the provided name (%s) already exists.\n", base_name);
1723 		return -EEXIST;
1724 	}
1725 
1726 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1727 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
1728 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1729 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
1730 				free(entry);
1731 				break;
1732 			}
1733 		}
1734 	}
1735 
1736 	ctx = calloc(1, sizeof(*ctx));
1737 	if (!ctx) {
1738 		return -ENOMEM;
1739 	}
1740 	ctx->base_name = base_name;
1741 	ctx->names = names;
1742 	ctx->count = count;
1743 	ctx->cb_fn = cb_fn;
1744 	ctx->cb_ctx = cb_ctx;
1745 	ctx->prchk_flags = prchk_flags;
1746 	ctx->trid = *trid;
1747 
1748 	spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
1749 	ctx->opts.transport_retry_count = g_opts.retry_count;
1750 
1751 	if (hostnqn) {
1752 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
1753 	}
1754 
1755 	if (hostid->hostaddr[0] != '\0') {
1756 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
1757 	}
1758 
1759 	if (hostid->hostsvcid[0] != '\0') {
1760 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
1761 	}
1762 
1763 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
1764 	if (ctx->probe_ctx == NULL) {
1765 		SPDK_ERRLOG("No controller was found with the provided trid (traddr: %s)\n", trid->traddr);
1766 		free(ctx);
1767 		return -ENODEV;
1768 	}
1769 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
1770 
1771 	return 0;
1772 }
1773 
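/*
 * Detach the named controller. For PCIe controllers the trid is also added to
 * the skip list so that a later probe (e.g. the hotplug poller) does not
 * automatically re-attach the device that is being removed.
 */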
1774 int
1775 spdk_bdev_nvme_delete(const char *name)
1776 {
1777 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL;
1778 	struct nvme_probe_skip_entry *entry;
1779 
1780 	if (name == NULL) {
1781 		return -EINVAL;
1782 	}
1783 
1784 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
1785 	if (nvme_bdev_ctrlr == NULL) {
1786 		SPDK_ERRLOG("Failed to find NVMe controller\n");
1787 		return -ENODEV;
1788 	}
1789 
1790 	if (nvme_bdev_ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
1791 		entry = calloc(1, sizeof(*entry));
1792 		if (!entry) {
1793 			return -ENOMEM;
1794 		}
1795 		entry->trid = nvme_bdev_ctrlr->trid;
1796 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
1797 	}
1798 
1799 	remove_cb(NULL, nvme_bdev_ctrlr->ctrlr);
1800 	return 0;
1801 }
1802 
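/*
 * Module initialization: registers the poll group io_device and, if a legacy
 * [Nvme] configuration section is present, parses its options, connects any
 * fabric TransportID entries synchronously, probes local PCIe controllers, and
 * finally arms the hotplug poller.
 */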
1803 static int
1804 bdev_nvme_library_init(void)
1805 {
1806 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1807 	struct spdk_conf_section *sp;
1808 	const char *val;
1809 	int rc = 0;
1810 	int64_t intval = 0;
1811 	size_t i;
1812 	struct nvme_probe_ctx *probe_ctx = NULL;
1813 	int retry_count;
1814 	uint32_t local_nvme_num = 0;
1815 	int64_t hotplug_period;
1816 	bool hotplug_enabled = g_nvme_hotplug_enabled;
1817 
1818 	g_bdev_nvme_init_thread = spdk_get_thread();
1819 
1820 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
1821 				bdev_nvme_poll_group_destroy_cb,
1822 				sizeof(struct nvme_bdev_poll_group),  "bdev_nvme_poll_groups");
1823 
1824 	sp = spdk_conf_find_section(NULL, "Nvme");
1825 	if (sp == NULL) {
1826 		goto end;
1827 	}
1828 
1829 	probe_ctx = calloc(1, sizeof(*probe_ctx));
1830 	if (probe_ctx == NULL) {
1831 		SPDK_ERRLOG("Failed to allocate probe_ctx\n");
1832 		rc = -1;
1833 		goto end;
1834 	}
1835 
1836 	retry_count = spdk_conf_section_get_intval(sp, "RetryCount");
1837 	if (retry_count >= 0) {
1838 		g_opts.retry_count = retry_count;
1839 	}
1840 
1841 	val = spdk_conf_section_get_val(sp, "TimeoutUsec");
1842 	if (val != NULL) {
1843 		intval = spdk_strtoll(val, 10);
1844 		if (intval < 0) {
1845 			SPDK_ERRLOG("Invalid TimeoutUsec value\n");
1846 			rc = -1;
1847 			goto end;
1848 		}
1849 	}
1850 
1851 	g_opts.timeout_us = intval;
1852 
1853 	if (g_opts.timeout_us > 0) {
1854 		val = spdk_conf_section_get_val(sp, "ActionOnTimeout");
1855 		if (val != NULL) {
1856 			if (!strcasecmp(val, "Reset")) {
1857 				g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET;
1858 			} else if (!strcasecmp(val, "Abort")) {
1859 				g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT;
1860 			}
1861 		}
1862 	}
1863 
1864 	intval = spdk_conf_section_get_intval(sp, "AdminPollRate");
1865 	if (intval > 0) {
1866 		g_opts.nvme_adminq_poll_period_us = intval;
1867 	}
1868 
1869 	intval = spdk_conf_section_get_intval(sp, "IOPollRate");
1870 	if (intval > 0) {
1871 		g_opts.nvme_ioq_poll_period_us = intval;
1872 	}
1873 
1874 	if (spdk_process_is_primary()) {
1875 		hotplug_enabled = spdk_conf_section_get_boolval(sp, "HotplugEnable", false);
1876 	}
1877 
1878 	hotplug_period = spdk_conf_section_get_intval(sp, "HotplugPollRate");
1879 	if (hotplug_period < 0) {
1880 		hotplug_period = 0;
1881 	}
1882 
1883 	g_nvme_hostnqn = spdk_conf_section_get_val(sp, "HostNQN");
1884 	probe_ctx->hostnqn = g_nvme_hostnqn;
1885 
1886 	g_opts.delay_cmd_submit = spdk_conf_section_get_boolval(sp, "DelayCmdSubmit",
1887 				  SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT);
1888 
1889 	for (i = 0; i < NVME_MAX_CONTROLLERS; i++) {
1890 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 0);
1891 		if (val == NULL) {
1892 			break;
1893 		}
1894 
1895 		rc = spdk_nvme_transport_id_parse(&probe_ctx->trids[i], val);
1896 		if (rc < 0) {
1897 			SPDK_ERRLOG("Unable to parse TransportID: %s\n", val);
1898 			rc = -1;
1899 			goto end;
1900 		}
1901 
1902 		rc = spdk_nvme_host_id_parse(&probe_ctx->hostids[i], val);
1903 		if (rc < 0) {
1904 			SPDK_ERRLOG("Unable to parse HostID: %s\n", val);
1905 			rc = -1;
1906 			goto end;
1907 		}
1908 
1909 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 1);
1910 		if (val == NULL) {
1911 			SPDK_ERRLOG("No name provided for TransportID\n");
1912 			rc = -1;
1913 			goto end;
1914 		}
1915 
1916 		probe_ctx->names[i] = val;
1917 
1918 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 2);
1919 		if (val != NULL) {
1920 			rc = spdk_nvme_prchk_flags_parse(&probe_ctx->prchk_flags[i], val);
1921 			if (rc < 0) {
1922 				SPDK_ERRLOG("Unable to parse prchk: %s\n", val);
1923 				rc = -1;
1924 				goto end;
1925 			}
1926 		}
1927 
1928 		probe_ctx->count++;
1929 
1930 		if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) {
1931 			struct spdk_nvme_ctrlr *ctrlr;
1932 			struct spdk_nvme_ctrlr_opts opts;
1933 
1934 			if (nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) {
1935 				SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n",
1936 					    probe_ctx->trids[i].traddr);
1937 				rc = -1;
1938 				goto end;
1939 			}
1940 
1941 			if (probe_ctx->trids[i].subnqn[0] == '\0') {
1942 				SPDK_ERRLOG("Need to provide subsystem nqn\n");
1943 				rc = -1;
1944 				goto end;
1945 			}
1946 
1947 			spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
1948 			opts.transport_retry_count = g_opts.retry_count;
1949 
1950 			if (probe_ctx->hostnqn != NULL) {
1951 				snprintf(opts.hostnqn, sizeof(opts.hostnqn), "%s", probe_ctx->hostnqn);
1952 			}
1953 
1954 			if (probe_ctx->hostids[i].hostaddr[0] != '\0') {
1955 				snprintf(opts.src_addr, sizeof(opts.src_addr), "%s", probe_ctx->hostids[i].hostaddr);
1956 			}
1957 
1958 			if (probe_ctx->hostids[i].hostsvcid[0] != '\0') {
1959 				snprintf(opts.src_svcid, sizeof(opts.src_svcid), "%s", probe_ctx->hostids[i].hostsvcid);
1960 			}
1961 
1962 			ctrlr = spdk_nvme_connect(&probe_ctx->trids[i], &opts, sizeof(opts));
1963 			if (ctrlr == NULL) {
1964 				SPDK_ERRLOG("Unable to connect to provided trid (traddr: %s)\n",
1965 					    probe_ctx->trids[i].traddr);
1966 				rc = -1;
1967 				goto end;
1968 			}
1969 
1970 			rc = create_ctrlr(ctrlr, probe_ctx->names[i], &probe_ctx->trids[i], 0);
1971 			if (rc) {
1972 				goto end;
1973 			}
1974 
1975 			nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&probe_ctx->trids[i]);
1976 			if (!nvme_bdev_ctrlr) {
1977 				SPDK_ERRLOG("Failed to find new NVMe controller\n");
1978 				rc = -ENODEV;
1979 				goto end;
1980 			}
1981 
1982 			nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1983 		} else {
1984 			local_nvme_num++;
1985 		}
1986 	}
1987 
1988 	if (local_nvme_num > 0) {
1989 		/* Probe locally attached (PCIe) NVMe devices. */
1990 		if (spdk_nvme_probe(NULL, probe_ctx, probe_cb, attach_cb, remove_cb)) {
1991 			rc = -1;
1992 			goto end;
1993 		}
1994 
1995 		for (i = 0; i < probe_ctx->count; i++) {
1996 			if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) {
1997 				continue;
1998 			}
1999 
2000 			if (!nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) {
2001 				SPDK_ERRLOG("NVMe SSD \"%s\" could not be found.\n", probe_ctx->trids[i].traddr);
2002 				SPDK_ERRLOG("Check PCIe BDF and that it is attached to UIO/VFIO driver.\n");
2003 			}
2004 		}
2005 	}
2006 
2007 	rc = spdk_bdev_nvme_set_hotplug(hotplug_enabled, hotplug_period, NULL, NULL);
2008 	if (rc) {
2009 		SPDK_ERRLOG("Failed to set up hotplug (%d): %s\n", rc, spdk_strerror(rc));
2010 		rc = -1;
2011 	}
2012 end:
2013 	free(probe_ctx);
2014 	return rc;
2015 }
2016 
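/*
 * Module shutdown: stop hotplug processing, depopulate every namespace, and mark
 * each controller for destruction. The module finish is completed immediately if
 * no controllers remain; otherwise it completes when the last controller's
 * destruct path runs.
 */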
2017 static void
2018 bdev_nvme_library_fini(void)
2019 {
2020 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
2021 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2022 	struct nvme_bdev_ns *ns;
2023 	uint32_t i;
2024 
2025 	spdk_poller_unregister(&g_hotplug_poller);
2026 	free(g_hotplug_probe_ctx);
2027 
2028 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2029 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2030 		free(entry);
2031 	}
2032 
2033 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2034 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
2035 		if (nvme_bdev_ctrlr->destruct) {
2036 			/* This controller's destruction was already started
2037 			 * before the application started shutting down
2038 			 */
2039 			continue;
2040 		}
2041 
2042 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2043 
2044 		for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
2045 			uint32_t nsid = i + 1;
2046 
2047 			ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
2048 			if (ns->populated) {
2049 				assert(ns->id == nsid);
2050 				nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
2051 			}
2052 		}
2053 
2054 		pthread_mutex_lock(&g_bdev_nvme_mutex);
2055 		nvme_bdev_ctrlr->destruct = true;
2056 
2057 		if (nvme_bdev_ctrlr->ref == 0) {
2058 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
2059 			nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
2060 			pthread_mutex_lock(&g_bdev_nvme_mutex);
2061 		}
2062 	}
2063 
2064 	g_bdev_nvme_module_finish = true;
2065 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2066 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2067 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
2068 		spdk_bdev_module_finish_done();
2069 		return;
2070 	}
2071 
2072 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2073 }
2074 
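/*
 * When the controller reports a protection-information error, re-verify the data
 * buffer in software (DIF for interleaved metadata, DIX for separate metadata) to
 * pinpoint and log the failing block. The result is informational only.
 */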
2075 static void
2076 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
2077 {
2078 	struct spdk_bdev *bdev = bdev_io->bdev;
2079 	struct spdk_dif_ctx dif_ctx;
2080 	struct spdk_dif_error err_blk = {};
2081 	int rc;
2082 
2083 	rc = spdk_dif_ctx_init(&dif_ctx,
2084 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2085 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2086 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2087 	if (rc != 0) {
2088 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2089 		return;
2090 	}
2091 
2092 	if (bdev->md_interleave) {
2093 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2094 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2095 	} else {
2096 		struct iovec md_iov = {
2097 			.iov_base	= bdev_io->u.bdev.md_buf,
2098 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2099 		};
2100 
2101 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2102 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2103 	}
2104 
2105 	if (rc != 0) {
2106 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2107 			    err_blk.err_type, err_blk.err_offset);
2108 	} else {
2109 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2110 	}
2111 }
2112 
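/*
 * Completion of the follow-up read that was issued with PI checking disabled.
 * Run the software PI verification on the returned data, then complete the
 * original I/O with the completion status that was saved from the first read.
 */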
2113 static void
2114 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2115 {
2116 	struct nvme_bdev_io *bio = ref;
2117 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2118 
2119 	if (spdk_nvme_cpl_is_success(cpl)) {
2120 		/* Run PI verification for read data buffer. */
2121 		bdev_nvme_verify_pi_error(bdev_io);
2122 	}
2123 
2124 	/* Return original completion status */
2125 	spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
2126 					  bio->cpl.status.sc);
2127 }
2128 
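/*
 * Read completion. If the controller flagged a PI error, save the status and
 * re-issue the read without PI checking so the error can be verified in
 * software; otherwise complete the I/O with the controller's status.
 */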
2129 static void
2130 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2131 {
2132 	struct nvme_bdev_io *bio = ref;
2133 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2134 	int ret;
2135 
2136 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2137 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2138 			    cpl->status.sct, cpl->status.sc);
2139 
2140 		/* Save completion status to use after verifying PI error. */
2141 		bio->cpl = *cpl;
2142 
2143 		/* Read without PI checking to verify PI error. */
2144 		ret = bdev_nvme_no_pi_readv((struct nvme_bdev *)bdev_io->bdev->ctxt,
2145 					    spdk_bdev_io_get_io_channel(bdev_io),
2146 					    bio,
2147 					    bdev_io->u.bdev.iovs,
2148 					    bdev_io->u.bdev.iovcnt,
2149 					    bdev_io->u.bdev.md_buf,
2150 					    bdev_io->u.bdev.num_blocks,
2151 					    bdev_io->u.bdev.offset_blocks);
2152 		if (ret == 0) {
2153 			return;
2154 		}
2155 	}
2156 
2157 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2158 }
2159 
2160 static void
2161 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2162 {
2163 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2164 
2165 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2166 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2167 			    cpl->status.sct, cpl->status.sc);
2168 		/* Run PI verification for write data buffer if PI error is detected. */
2169 		bdev_nvme_verify_pi_error(bdev_io);
2170 	}
2171 
2172 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2173 }
2174 
2175 static void
2176 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2177 {
2178 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2179 
2180 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2181 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2182 			    cpl->status.sct, cpl->status.sc);
2183 		/* Run PI verification for compare data buffer if PI error is detected. */
2184 		bdev_nvme_verify_pi_error(bdev_io);
2185 	}
2186 
2187 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2188 }
2189 
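/*
 * Shared completion callback for both halves of a fused compare-and-write.
 * The compare completion is only stashed in bio->cpl; the write completion
 * finishes the bdev I/O, reporting the compare status instead when the
 * compare itself failed.
 */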
2190 static void
2191 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2192 {
2193 	struct nvme_bdev_io *bio = ref;
2194 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2195 
2196 	/* Compare operation completion */
2197 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2198 		/* Save compare result for write callback */
2199 		bio->cpl = *cpl;
2200 		return;
2201 	}
2202 
2203 	/* Write operation completion */
2204 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2205 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2206 		 * complete the IO with the compare operation's status.
2207 		 */
2208 		if (!spdk_nvme_cpl_is_error(cpl)) {
2209 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2210 		}
2211 
2212 		spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2213 	} else {
2214 		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2215 	}
2216 }
2217 
2218 static void
2219 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2220 {
2221 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2222 
2223 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2224 }
2225 
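/*
 * Admin commands complete on the thread that polls the controller's admin queue,
 * so bdev_nvme_admin_passthru_done() forwards the saved completion back to the
 * originating thread, where this function finishes the bdev I/O.
 */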
2226 static void
2227 bdev_nvme_admin_passthru_completion(void *ctx)
2228 {
2229 	struct nvme_bdev_io *bio = ctx;
2230 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2231 
2232 	spdk_bdev_io_complete_nvme_status(bdev_io,
2233 					  bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2234 }
2235 
2236 static void
2237 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2238 {
2239 	struct nvme_bdev_io *bio = ref;
2240 
2241 	bio->cpl = *cpl;
2242 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2243 }
2244 
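/*
 * SGL callbacks used by the queued I/O paths: reset_sgl positions the iovec
 * cursor at an absolute byte offset, and next_sge hands out one contiguous
 * segment at a time, advancing the cursor as it goes.
 */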
2245 static void
2246 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2247 {
2248 	struct nvme_bdev_io *bio = ref;
2249 	struct iovec *iov;
2250 
2251 	bio->iov_offset = sgl_offset;
2252 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2253 		iov = &bio->iovs[bio->iovpos];
2254 		if (bio->iov_offset < iov->iov_len) {
2255 			break;
2256 		}
2257 
2258 		bio->iov_offset -= iov->iov_len;
2259 	}
2260 }
2261 
2262 static int
2263 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2264 {
2265 	struct nvme_bdev_io *bio = ref;
2266 	struct iovec *iov;
2267 
2268 	assert(bio->iovpos < bio->iovcnt);
2269 
2270 	iov = &bio->iovs[bio->iovpos];
2271 
2272 	*address = iov->iov_base;
2273 	*length = iov->iov_len;
2274 
2275 	if (bio->iov_offset) {
2276 		assert(bio->iov_offset <= iov->iov_len);
2277 		*address += bio->iov_offset;
2278 		*length -= bio->iov_offset;
2279 	}
2280 
2281 	bio->iov_offset += *length;
2282 	if (bio->iov_offset == iov->iov_len) {
2283 		bio->iovpos++;
2284 		bio->iov_offset = 0;
2285 	}
2286 
2287 	return 0;
2288 }
2289 
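/*
 * Equivalent SGL callbacks for the second (write) buffer of a fused
 * compare-and-write command.
 */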
2290 static void
2291 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2292 {
2293 	struct nvme_bdev_io *bio = ref;
2294 	struct iovec *iov;
2295 
2296 	bio->fused_iov_offset = sgl_offset;
2297 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2298 		iov = &bio->fused_iovs[bio->fused_iovpos];
2299 		if (bio->fused_iov_offset < iov->iov_len) {
2300 			break;
2301 		}
2302 
2303 		bio->fused_iov_offset -= iov->iov_len;
2304 	}
2305 }
2306 
2307 static int
2308 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2309 {
2310 	struct nvme_bdev_io *bio = ref;
2311 	struct iovec *iov;
2312 
2313 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2314 
2315 	iov = &bio->fused_iovs[bio->fused_iovpos];
2316 
2317 	*address = iov->iov_base;
2318 	*length = iov->iov_len;
2319 
2320 	if (bio->fused_iov_offset) {
2321 		assert(bio->fused_iov_offset <= iov->iov_len);
2322 		*address += bio->fused_iov_offset;
2323 		*length -= bio->fused_iov_offset;
2324 	}
2325 
2326 	bio->fused_iov_offset += *length;
2327 	if (bio->fused_iov_offset == iov->iov_len) {
2328 		bio->fused_iovpos++;
2329 		bio->fused_iov_offset = 0;
2330 	}
2331 
2332 	return 0;
2333 }
2334 
2335 static int
2336 bdev_nvme_no_pi_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2337 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2338 		      void *md, uint64_t lba_count, uint64_t lba)
2339 {
2340 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2341 	int rc;
2342 
2343 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
2344 		      lba_count, lba);
2345 
2346 	bio->iovs = iov;
2347 	bio->iovcnt = iovcnt;
2348 	bio->iovpos = 0;
2349 	bio->iov_offset = 0;
2350 
2351 	rc = spdk_nvme_ns_cmd_readv_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2352 					    bdev_nvme_no_pi_readv_done, bio, 0,
2353 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2354 					    md, 0, 0);
2355 
2356 	if (rc != 0 && rc != -ENOMEM) {
2357 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2358 	}
2359 	return rc;
2360 }
2361 
2362 static int
2363 bdev_nvme_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2364 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2365 		void *md, uint64_t lba_count, uint64_t lba)
2366 {
2367 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2368 	int rc;
2369 
2370 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2371 		      lba_count, lba);
2372 
2373 	bio->iovs = iov;
2374 	bio->iovcnt = iovcnt;
2375 	bio->iovpos = 0;
2376 	bio->iov_offset = 0;
2377 
2378 	rc = spdk_nvme_ns_cmd_readv_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2379 					    bdev_nvme_readv_done, bio, nbdev->disk.dif_check_flags,
2380 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2381 					    md, 0, 0);
2382 
2383 	if (rc != 0 && rc != -ENOMEM) {
2384 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2385 	}
2386 	return rc;
2387 }
2388 
2389 static int
2390 bdev_nvme_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2391 		 struct nvme_bdev_io *bio,
2392 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba)
2393 {
2394 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2395 	int rc;
2396 
2397 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2398 		      lba_count, lba);
2399 
2400 	bio->iovs = iov;
2401 	bio->iovcnt = iovcnt;
2402 	bio->iovpos = 0;
2403 	bio->iov_offset = 0;
2404 
2405 	rc = spdk_nvme_ns_cmd_writev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2406 					     bdev_nvme_writev_done, bio, nbdev->disk.dif_check_flags,
2407 					     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2408 					     md, 0, 0);
2409 
2410 	if (rc != 0 && rc != -ENOMEM) {
2411 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2412 	}
2413 	return rc;
2414 }
2415 
2416 static int
2417 bdev_nvme_comparev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2418 		   struct nvme_bdev_io *bio,
2419 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba)
2420 {
2421 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2422 	int rc;
2423 
2424 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2425 		      lba_count, lba);
2426 
2427 	bio->iovs = iov;
2428 	bio->iovcnt = iovcnt;
2429 	bio->iovpos = 0;
2430 	bio->iov_offset = 0;
2431 
2432 	rc = spdk_nvme_ns_cmd_comparev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2433 					       bdev_nvme_comparev_done, bio, nbdev->disk.dif_check_flags,
2434 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2435 					       md, 0, 0);
2436 
2437 	if (rc != 0 && rc != -ENOMEM) {
2438 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
2439 	}
2440 	return rc;
2441 }
2442 
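/*
 * Submit a fused compare-and-write: the compare is sent with FUSE_FIRST and the
 * write with FUSE_SECOND. first_fused_submitted tracks which half has been
 * queued so that an -ENOMEM retry only resubmits the half that is still missing.
 */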
2443 static int
2444 bdev_nvme_comparev_and_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2445 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
2446 			      int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba)
2447 {
2448 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2449 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2450 	uint32_t flags = nbdev->disk.dif_check_flags;
2451 	int rc;
2452 
2453 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2454 		      lba_count, lba);
2455 
2456 	bio->iovs = cmp_iov;
2457 	bio->iovcnt = cmp_iovcnt;
2458 	bio->iovpos = 0;
2459 	bio->iov_offset = 0;
2460 	bio->fused_iovs = write_iov;
2461 	bio->fused_iovcnt = write_iovcnt;
2462 	bio->fused_iovpos = 0;
2463 	bio->fused_iov_offset = 0;
2464 
2465 	if (bdev_io->num_retries == 0) {
2466 		bio->first_fused_submitted = false;
2467 	}
2468 
2469 	if (!bio->first_fused_submitted) {
2470 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2471 		memset(&bio->cpl, 0, sizeof(bio->cpl));
2472 
2473 		rc = spdk_nvme_ns_cmd_comparev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2474 						       bdev_nvme_comparev_and_writev_done, bio, flags,
2475 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
2476 		if (rc == 0) {
2477 			bio->first_fused_submitted = true;
2478 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2479 		} else {
2480 			if (rc != -ENOMEM) {
2481 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
2482 			}
2483 			return rc;
2484 		}
2485 	}
2486 
2487 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
2488 
2489 	rc = spdk_nvme_ns_cmd_writev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2490 					     bdev_nvme_comparev_and_writev_done, bio, flags,
2491 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
2492 	if (rc != 0 && rc != -ENOMEM) {
2493 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
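		/* Submission errors other than -ENOMEM are only logged; rc is cleared
		 * below because the compare half of the fused pair has already been
		 * queued at this point.
		 */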
2494 		rc = 0;
2495 	}
2496 
2497 	return rc;
2498 }
2499 
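/*
 * Translate a bdev unmap into an NVMe Dataset Management (deallocate) command,
 * splitting the request into DSM ranges of at most
 * SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks each.
 */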
2500 static int
2501 bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2502 		struct nvme_bdev_io *bio,
2503 		uint64_t offset_blocks,
2504 		uint64_t num_blocks)
2505 {
2506 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2507 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
2508 	struct spdk_nvme_dsm_range *range;
2509 	uint64_t offset, remaining;
2510 	uint64_t num_ranges_u64;
2511 	uint16_t num_ranges;
2512 	int rc;
2513 
2514 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
2515 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2516 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
2517 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
2518 		return -EINVAL;
2519 	}
2520 	num_ranges = (uint16_t)num_ranges_u64;
2521 
2522 	offset = offset_blocks;
2523 	remaining = num_blocks;
2524 	range = &dsm_ranges[0];
2525 
2526 	/* Fill max-size ranges until the remaining blocks fit into one range */
2527 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
2528 		range->attributes.raw = 0;
2529 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2530 		range->starting_lba = offset;
2531 
2532 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2533 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2534 		range++;
2535 	}
2536 
2537 	/* Final range describes the remaining blocks */
2538 	range->attributes.raw = 0;
2539 	range->length = remaining;
2540 	range->starting_lba = offset;
2541 
2542 	rc = spdk_nvme_ns_cmd_dataset_management(nbdev->nvme_ns->ns, nvme_ch->qpair,
2543 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
2544 			dsm_ranges, num_ranges,
2545 			bdev_nvme_queued_done, bio);
2546 
2547 	return rc;
2548 }
2549 
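/*
 * Submit a raw admin command on behalf of the bdev user. The submitting thread
 * is remembered so the completion can be delivered back to it (see
 * bdev_nvme_admin_passthru_completion()).
 */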
2550 static int
2551 bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2552 			 struct nvme_bdev_io *bio,
2553 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2554 {
2555 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
2556 
2557 	if (nbytes > max_xfer_size) {
2558 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2559 		return -EINVAL;
2560 	}
2561 
2562 	bio->orig_thread = spdk_io_channel_get_thread(ch);
2563 
2564 	return spdk_nvme_ctrlr_cmd_admin_raw(nbdev->nvme_bdev_ctrlr->ctrlr, cmd, buf,
2565 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
2566 }
2567 
2568 static int
2569 bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2570 		      struct nvme_bdev_io *bio,
2571 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2572 {
2573 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2574 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
2575 
2576 	if (nbytes > max_xfer_size) {
2577 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2578 		return -EINVAL;
2579 	}
2580 
2581 	/*
2582 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2583 	 * so fill it out automatically.
2584 	 */
2585 	cmd->nsid = spdk_nvme_ns_get_id(nbdev->nvme_ns->ns);
2586 
2587 	return spdk_nvme_ctrlr_cmd_io_raw(nbdev->nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
2588 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
2589 }
2590 
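/*
 * Raw I/O passthru with a separate metadata buffer; md_len must match the
 * namespace metadata size for the number of sectors implied by nbytes.
 */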
2591 static int
2592 bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2593 			 struct nvme_bdev_io *bio,
2594 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
2595 {
2596 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2597 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(nbdev->nvme_ns->ns);
2598 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
2599 
2600 	if (nbytes > max_xfer_size) {
2601 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2602 		return -EINVAL;
2603 	}
2604 
2605 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(nbdev->nvme_ns->ns)) {
2606 		SPDK_ERRLOG("invalid metadata buffer size\n");
2607 		return -EINVAL;
2608 	}
2609 
2610 	/*
2611 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2612 	 * so fill it out automatically.
2613 	 */
2614 	cmd->nsid = spdk_nvme_ns_get_id(nbdev->nvme_ns->ns);
2615 
2616 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(nbdev->nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
2617 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
2618 }
2619 
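/*
 * Write the currently attached controllers and the global module options back
 * out in the legacy INI-style configuration format.
 */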
2620 static void
2621 bdev_nvme_get_spdk_running_config(FILE *fp)
2622 {
2623 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
2624 
2625 	fprintf(fp, "\n[Nvme]");
2626 	fprintf(fp, "\n"
2627 		"# NVMe Device Whitelist\n"
2628 		"# Users may specify which NVMe devices to claim by their transport id.\n"
2629 		"# See spdk_nvme_transport_id_parse() in spdk/nvme.h for the correct format.\n"
2630 		"# The second argument is the assigned name, which can be referenced from\n"
2631 		"# other sections in the configuration file. For NVMe devices, a namespace\n"
2632 		"# is automatically appended to each name in the format <YourName>nY, where\n"
2633 		"# Y is the NSID (starts at 1).\n");
2634 
2635 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2636 		const char *trtype;
2637 		const char *prchk_flags;
2638 
2639 		trtype = spdk_nvme_transport_id_trtype_str(nvme_bdev_ctrlr->trid.trtype);
2640 		if (!trtype) {
2641 			continue;
2642 		}
2643 
2644 		if (nvme_bdev_ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
2645 			fprintf(fp, "TransportID \"trtype:%s traddr:%s\" %s\n",
2646 				trtype,
2647 				nvme_bdev_ctrlr->trid.traddr, nvme_bdev_ctrlr->name);
2648 		} else {
2649 			const char *adrfam;
2650 
2651 			adrfam = spdk_nvme_transport_id_adrfam_str(nvme_bdev_ctrlr->trid.adrfam);
2652 			prchk_flags = spdk_nvme_prchk_flags_str(nvme_bdev_ctrlr->prchk_flags);
2653 
2654 			if (adrfam) {
2655 				fprintf(fp, "TransportID \"trtype:%s adrfam:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
2656 					trtype,	adrfam,
2657 					nvme_bdev_ctrlr->trid.traddr, nvme_bdev_ctrlr->trid.trsvcid,
2658 					nvme_bdev_ctrlr->trid.subnqn, nvme_bdev_ctrlr->name);
2659 			} else {
2660 				fprintf(fp, "TransportID \"trtype:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
2661 					trtype,
2662 					nvme_bdev_ctrlr->trid.traddr, nvme_bdev_ctrlr->trid.trsvcid,
2663 					nvme_bdev_ctrlr->trid.subnqn, nvme_bdev_ctrlr->name);
2664 			}
2665 
2666 			if (prchk_flags) {
2667 				fprintf(fp, " \"%s\"\n", prchk_flags);
2668 			} else {
2669 				fprintf(fp, "\n");
2670 			}
2671 		}
2672 	}
2673 
2674 	fprintf(fp, "\n"
2675 		"# The number of attempts per I/O when an I/O fails. Do not include\n"
2676 		"# this key to get the default behavior.\n");
2677 	fprintf(fp, "RetryCount %d\n", g_opts.retry_count);
2678 	fprintf(fp, "\n"
2679 		"# Timeout for each command, in microseconds. If 0, don't track timeouts.\n");
2680 	fprintf(fp, "TimeoutUsec %"PRIu64"\n", g_opts.timeout_us);
2681 
2682 	fprintf(fp, "\n"
2683 		"# Action to take on command time out. Only valid when Timeout is greater\n"
2684 		"# than 0. This may be 'Reset' to reset the controller, 'Abort' to abort\n"
2685 		"# the command, or 'None' to just print a message but do nothing.\n"
2686 		"# Admin command timeouts will always result in a reset.\n");
2687 	switch (g_opts.action_on_timeout) {
2688 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
2689 		fprintf(fp, "ActionOnTimeout None\n");
2690 		break;
2691 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
2692 		fprintf(fp, "ActionOnTimeout Reset\n");
2693 		break;
2694 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
2695 		fprintf(fp, "ActionOnTimeout Abort\n");
2696 		break;
2697 	}
2698 
2699 	fprintf(fp, "\n"
2700 		"# Set how often the admin queue is polled for asynchronous events.\n"
2701 		"# Units in microseconds.\n");
2702 	fprintf(fp, "AdminPollRate %"PRIu64"\n", g_opts.nvme_adminq_poll_period_us);
2703 	fprintf(fp, "IOPollRate %" PRIu64"\n", g_opts.nvme_ioq_poll_period_us);
2704 	fprintf(fp, "\n"
2705 		"# Handling of hotplug (runtime insert and remove) events is disabled by default.\n"
2706 		"# Users can set this to Yes to enable it.\n"
2707 		"# Default: No\n");
2708 	fprintf(fp, "HotplugEnable %s\n", g_nvme_hotplug_enabled ? "Yes" : "No");
2709 	fprintf(fp, "\n"
2710 		"# Set how often the hotplug is processed for insert and remove events.\n"
2711 		"# Units in microseconds.\n");
2712 	fprintf(fp, "HotplugPollRate %"PRIu64"\n", g_nvme_hotplug_poll_period_us);
2713 	if (g_nvme_hostnqn) {
2714 		fprintf(fp, "HostNQN %s\n",  g_nvme_hostnqn);
2715 	}
2716 	fprintf(fp, "DelayCmdSubmit %s\n", g_opts.delay_cmd_submit ? "True" : "False");
2717 
2718 	fprintf(fp, "\n");
2719 }
2720 
2721 static void
2722 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
2723 {
2724 	/* nop */
2725 }
2726 
2727 static void
2728 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
2729 {
2730 	g_config_json_namespace_fn[ns->type](w, ns);
2731 }
2732 
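/*
 * Emit the JSON-RPC calls (bdev_nvme_set_options, one bdev_nvme_attach_controller
 * per controller, and bdev_nvme_set_hotplug last) needed to recreate the current
 * configuration.
 */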
2733 static int
2734 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
2735 {
2736 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
2737 	struct spdk_nvme_transport_id	*trid;
2738 	const char			*action;
2739 	uint32_t			nsid;
2740 
2741 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
2742 		action = "reset";
2743 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
2744 		action = "abort";
2745 	} else {
2746 		action = "none";
2747 	}
2748 
2749 	spdk_json_write_object_begin(w);
2750 
2751 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
2752 
2753 	spdk_json_write_named_object_begin(w, "params");
2754 	spdk_json_write_named_string(w, "action_on_timeout", action);
2755 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
2756 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
2757 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
2758 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
2759 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
2760 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
2761 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
2762 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
2763 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
2764 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
2765 	spdk_json_write_object_end(w);
2766 
2767 	spdk_json_write_object_end(w);
2768 
2769 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2770 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2771 		trid = &nvme_bdev_ctrlr->trid;
2772 
2773 		spdk_json_write_object_begin(w);
2774 
2775 		spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
2776 
2777 		spdk_json_write_named_object_begin(w, "params");
2778 		spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
2779 		nvme_bdev_dump_trid_json(trid, w);
2780 		spdk_json_write_named_bool(w, "prchk_reftag",
2781 					   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
2782 		spdk_json_write_named_bool(w, "prchk_guard",
2783 					   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
2784 
2785 		spdk_json_write_object_end(w);
2786 
2787 		spdk_json_write_object_end(w);
2788 
2789 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
2790 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
2791 				continue;
2792 			}
2793 
2794 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
2795 		}
2796 	}
2797 
2798 	/* Dump this as the last entry so that all NVMe bdevs have a chance to be
2799 	 * constructed before the hotplug poller is enabled.
2800 	 */
2801 	spdk_json_write_object_begin(w);
2802 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
2803 
2804 	spdk_json_write_named_object_begin(w, "params");
2805 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
2806 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
2807 	spdk_json_write_object_end(w);
2808 
2809 	spdk_json_write_object_end(w);
2810 
2811 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2812 	return 0;
2813 }
2814 
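/*
 * Return the spdk_nvme_ctrlr that backs the given bdev, or NULL if the bdev does
 * not belong to this module.
 */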
2815 struct spdk_nvme_ctrlr *
2816 spdk_bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
2817 {
2818 	if (!bdev || bdev->module != &nvme_if) {
2819 		return NULL;
2820 	}
2821 
2822 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_bdev_ctrlr->ctrlr;
2823 }
2824 
2825 SPDK_LOG_REGISTER_COMPONENT("bdev_nvme", SPDK_LOG_BDEV_NVME)
2826