xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 800b18d028b95f92738315af634b8101a4c84031)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/config.h"
40 #include "spdk/conf.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/nvme.h"
45 #include "spdk/nvme_ocssd.h"
46 #include "spdk/thread.h"
47 #include "spdk/string.h"
48 #include "spdk/likely.h"
49 #include "spdk/util.h"
50 
51 #include "spdk/bdev_module.h"
52 #include "spdk_internal/log.h"
53 
54 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
55 
56 static void bdev_nvme_get_spdk_running_config(FILE *fp);
57 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
58 
59 struct nvme_bdev_io {
60 	/** array of iovecs to transfer. */
61 	struct iovec *iovs;
62 
63 	/** Number of iovecs in iovs array. */
64 	int iovcnt;
65 
66 	/** Current iovec position. */
67 	int iovpos;
68 
69 	/** Offset in current iovec. */
70 	uint32_t iov_offset;
71 
72 	/** array of iovecs for the fused (second) command, e.g. the write portion of a compare-and-write. */
73 	struct iovec *fused_iovs;
74 
75 	/** Number of iovecs in fused_iovs array. */
76 	int fused_iovcnt;
77 
78 	/** Current position in fused_iovs. */
79 	int fused_iovpos;
80 
81 	/** Offset in current fused iovec. */
82 	uint32_t fused_iov_offset;
83 
84 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
85 	struct spdk_nvme_cpl cpl;
86 
87 	/** Originating thread */
88 	struct spdk_thread *orig_thread;
89 
90 	/** Keeps track if first of fused commands was submitted */
91 	bool first_fused_submitted;
92 };
93 
94 struct nvme_probe_ctx {
95 	size_t count;
96 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
97 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
98 	const char *names[NVME_MAX_CONTROLLERS];
99 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
100 	const char *hostnqn;
101 };
102 
103 struct nvme_probe_skip_entry {
104 	struct spdk_nvme_transport_id		trid;
105 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
106 };
107 /* All the controllers deleted by users via RPC are skipped by the hotplug monitor */
108 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
109 			g_skipped_nvme_ctrlrs);
110 
111 static struct spdk_bdev_nvme_opts g_opts = {
112 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
113 	.timeout_us = 0,
114 	.retry_count = 4,
115 	.arbitration_burst = 0,
116 	.low_priority_weight = 0,
117 	.medium_priority_weight = 0,
118 	.high_priority_weight = 0,
119 	.nvme_adminq_poll_period_us = 10000ULL,
120 	.nvme_ioq_poll_period_us = 0,
121 	.io_queue_requests = 0,
122 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
123 };
124 
125 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
126 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
127 
128 static int g_hot_insert_nvme_controller_index = 0;
129 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
130 static bool g_nvme_hotplug_enabled = false;
131 static struct spdk_thread *g_bdev_nvme_init_thread;
132 static struct spdk_poller *g_hotplug_poller;
133 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
134 static char *g_nvme_hostnqn = NULL;
135 
136 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
137 		struct nvme_async_probe_ctx *ctx);
138 static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx);
139 static int bdev_nvme_library_init(void);
140 static void bdev_nvme_library_fini(void);
141 static int bdev_nvme_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
142 			   struct nvme_bdev_io *bio,
143 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
144 static int bdev_nvme_no_pi_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
145 				 struct nvme_bdev_io *bio,
146 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
147 static int bdev_nvme_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
148 			    struct nvme_bdev_io *bio,
149 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
150 static int bdev_nvme_comparev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
151 			      struct nvme_bdev_io *bio,
152 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
153 static int bdev_nvme_comparev_and_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
154 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
155 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba);
156 static int bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
157 				    struct nvme_bdev_io *bio,
158 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
159 static int bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
160 				 struct nvme_bdev_io *bio,
161 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
162 static int bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
163 				    struct nvme_bdev_io *bio,
164 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
165 static int bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio);
166 static int bdev_nvme_abort(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
167 			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
168 
169 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
170 				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
171 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
172 		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
173 
174 static populate_namespace_fn g_populate_namespace_fn[] = {
175 	NULL,
176 	nvme_ctrlr_populate_standard_namespace,
177 	bdev_ocssd_populate_namespace,
178 };
179 
180 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *ns);
181 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns);
182 
183 static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
184 	NULL,
185 	nvme_ctrlr_depopulate_standard_namespace,
186 	bdev_ocssd_depopulate_namespace,
187 };
188 
189 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns);
190 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
191 		struct nvme_bdev_ns *ns);
192 
193 static config_json_namespace_fn g_config_json_namespace_fn[] = {
194 	NULL,
195 	nvme_ctrlr_config_json_standard_namespace,
196 	bdev_ocssd_namespace_config_json,
197 };
198 
199 struct spdk_nvme_qpair *
200 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
201 {
202 	struct nvme_io_channel *nvme_ch;
203 
204 	nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
205 
206 	return nvme_ch->qpair;
207 }
208 
209 static int
210 bdev_nvme_get_ctx_size(void)
211 {
212 	return sizeof(struct nvme_bdev_io);
213 }
214 
215 static struct spdk_bdev_module nvme_if = {
216 	.name = "nvme",
217 	.async_fini = true,
218 	.module_init = bdev_nvme_library_init,
219 	.module_fini = bdev_nvme_library_fini,
220 	.config_text = bdev_nvme_get_spdk_running_config,
221 	.config_json = bdev_nvme_config_json,
222 	.get_ctx_size = bdev_nvme_get_ctx_size,
223 
224 };
225 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
226 
227 static void
228 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
229 {
230 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "qpair %p is disconnected, attempting reconnect.\n", qpair);
231 	/*
232 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
233 	 * reconnect a qpair and we will stop getting a callback for this one.
234 	 */
235 	spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
236 }
237 
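/*
 * Per poll group I/O completion poller. Processes completions for every qpair
 * in the group; when spin statistics are enabled (VTune builds), time spent
 * polling without completions is accumulated as spin time.
 */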
238 static int
239 bdev_nvme_poll(void *arg)
240 {
241 	struct nvme_bdev_poll_group *group = arg;
242 	int64_t num_completions;
243 
244 	if (group->collect_spin_stat && group->start_ticks == 0) {
245 		group->start_ticks = spdk_get_ticks();
246 	}
247 
248 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
249 			  bdev_nvme_disconnected_qpair_cb);
250 	if (group->collect_spin_stat) {
251 		if (num_completions > 0) {
252 			if (group->end_ticks != 0) {
253 				group->spin_ticks += (group->end_ticks - group->start_ticks);
254 				group->end_ticks = 0;
255 			}
256 			group->start_ticks = 0;
257 		} else {
258 			group->end_ticks = spdk_get_ticks();
259 		}
260 	}
261 
262 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
263 }
264 
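/*
 * Admin queue poller for a controller. A negative return value from admin
 * completion processing indicates a controller failure and triggers a reset.
 */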
265 static int
266 bdev_nvme_poll_adminq(void *arg)
267 {
268 	int32_t rc;
269 	struct spdk_nvme_ctrlr *ctrlr = arg;
270 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
271 
272 	rc = spdk_nvme_ctrlr_process_admin_completions(ctrlr);
273 
274 	if (rc < 0) {
275 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
276 		assert(nvme_bdev_ctrlr != NULL);
277 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
278 	}
279 
280 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
281 }
282 
283 static int
284 bdev_nvme_destruct(void *ctx)
285 {
286 	struct nvme_bdev *nvme_disk = ctx;
287 
288 	nvme_bdev_detach_bdev_from_ns(nvme_disk);
289 
290 	free(nvme_disk->disk.name);
291 	free(nvme_disk);
292 
293 	return 0;
294 }
295 
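/*
 * Flush is completed immediately without sending a command to the controller;
 * the offset and length arguments are currently ignored.
 */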
296 static int
297 bdev_nvme_flush(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio,
298 		uint64_t offset, uint64_t nbytes)
299 {
300 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
301 
302 	return 0;
303 }
304 
305 static void
306 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
307 {
308 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
309 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
310 	struct spdk_bdev_io *bdev_io;
311 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
312 
313 	/* A NULL ctx means success. */
314 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
315 		status = SPDK_BDEV_IO_STATUS_FAILED;
316 	}
317 
318 	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
319 		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
320 		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
321 		spdk_bdev_io_complete(bdev_io, status);
322 	}
323 
324 	spdk_for_each_channel_continue(i, 0);
325 }
326 
327 static void
328 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
329 {
330 	/* we are using the for_each_channel cb_arg like a return code here. */
331 	/* If it's zero, we succeeded, otherwise, the reset failed. */
332 	void *cb_arg = NULL;
333 
334 	if (rc) {
335 		cb_arg = (void *)0x1;
336 		SPDK_ERRLOG("Resetting controller failed.\n");
337 	} else {
338 		SPDK_NOTICELOG("Resetting controller was successful.\n");
339 	}
340 
341 	pthread_mutex_lock(&g_bdev_nvme_mutex);
342 	nvme_bdev_ctrlr->resetting = false;
343 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
344 	/* Make sure we clear any pending resets before returning. */
345 	spdk_for_each_channel(nvme_bdev_ctrlr,
346 			      _bdev_nvme_complete_pending_resets,
347 			      cb_arg, NULL);
348 }
349 
350 static void
351 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
352 {
353 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
354 	void *ctx = spdk_io_channel_iter_get_ctx(i);
355 	int rc = SPDK_BDEV_IO_STATUS_SUCCESS;
356 
357 	if (status) {
358 		rc = SPDK_BDEV_IO_STATUS_FAILED;
359 	}
360 	if (ctx) {
361 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(ctx), rc);
362 	}
363 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
364 }
365 
366 static void
367 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
368 {
369 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
370 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
371 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
372 	struct spdk_nvme_io_qpair_opts opts;
373 
374 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
375 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
376 	opts.create_only = true;
377 
378 	nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
379 	if (!nvme_ch->qpair) {
380 		spdk_for_each_channel_continue(i, -1);
381 		return;
382 	}
383 
384 	assert(nvme_ch->group != NULL);
385 	if (spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair) != 0) {
386 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
387 		spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
388 		spdk_for_each_channel_continue(i, -1);
389 		return;
390 	}
391 
392 	if (spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair)) {
393 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
394 		spdk_nvme_poll_group_remove(nvme_ch->group->group, nvme_ch->qpair);
395 		spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
396 		spdk_for_each_channel_continue(i, -1);
397 		return;
398 	}
399 
400 	spdk_for_each_channel_continue(i, 0);
401 }
402 
403 static void
404 _bdev_nvme_reset(struct spdk_io_channel_iter *i, int status)
405 {
406 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
407 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
408 	int rc;
409 
410 	if (status) {
411 		if (bio) {
412 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
413 		}
414 		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
415 		return;
416 	}
417 
418 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
419 	if (rc != 0) {
420 		if (bio) {
421 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
422 		}
423 		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
424 		return;
425 	}
426 
427 	/* Recreate all of the I/O queue pairs */
428 	spdk_for_each_channel(nvme_bdev_ctrlr,
429 			      _bdev_nvme_reset_create_qpair,
430 			      bio,
431 			      _bdev_nvme_reset_create_qpairs_done);
432 
433 
434 }
435 
436 static void
437 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
438 {
439 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
440 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
441 	int rc;
442 
443 	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
444 	if (!rc) {
445 		nvme_ch->qpair = NULL;
446 	}
447 
448 	spdk_for_each_channel_continue(i, rc);
449 }
450 
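/*
 * Reset flow: destroy every I/O qpair on all channels, reset the controller,
 * then recreate and reconnect the qpairs. Only one reset can be in progress at
 * a time; reset I/Os that arrive meanwhile are queued on the channel and
 * completed once the in-progress reset finishes.
 */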
451 static int
452 bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio)
453 {
454 	struct spdk_io_channel *ch;
455 	struct nvme_io_channel *nvme_ch;
456 
457 	pthread_mutex_lock(&g_bdev_nvme_mutex);
458 	if (nvme_bdev_ctrlr->destruct) {
459 		/* Don't bother resetting if the controller is in the process of being destructed. */
460 		if (bio) {
461 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
462 		}
463 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
464 		return 0;
465 	}
466 
467 	if (!nvme_bdev_ctrlr->resetting) {
468 		nvme_bdev_ctrlr->resetting = true;
469 	} else {
470 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
471 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
472 		/*
473 		 * The internal reset calls won't be queued. This is on purpose so that we don't
474 		 * interfere with the app framework reset strategy. i.e. we are deferring to the
475 		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
476 		 */
477 		if (bio) {
478 			ch = spdk_get_io_channel(nvme_bdev_ctrlr);
479 			assert(ch != NULL);
480 			nvme_ch = spdk_io_channel_get_ctx(ch);
481 			TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, spdk_bdev_io_from_ctx(bio), module_link);
482 			spdk_put_io_channel(ch);
483 		}
484 		return 0;
485 	}
486 
487 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
488 	/* First, delete all NVMe I/O queue pairs. */
489 	spdk_for_each_channel(nvme_bdev_ctrlr,
490 			      _bdev_nvme_reset_destroy_qpair,
491 			      bio,
492 			      _bdev_nvme_reset);
493 
494 	return 0;
495 }
496 
497 static int
498 bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
499 		struct nvme_bdev_io *bio,
500 		uint64_t offset_blocks,
501 		uint64_t num_blocks);
502 
503 static void
504 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
505 		     bool success)
506 {
507 	int ret;
508 
509 	if (!success) {
510 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
511 		return;
512 	}
513 
514 	ret = bdev_nvme_readv((struct nvme_bdev *)bdev_io->bdev->ctxt,
515 			      ch,
516 			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
517 			      bdev_io->u.bdev.iovs,
518 			      bdev_io->u.bdev.iovcnt,
519 			      bdev_io->u.bdev.md_buf,
520 			      bdev_io->u.bdev.num_blocks,
521 			      bdev_io->u.bdev.offset_blocks);
522 
523 	if (spdk_likely(ret == 0)) {
524 		return;
525 	} else if (ret == -ENOMEM) {
526 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
527 	} else {
528 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
529 	}
530 }
531 
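/*
 * Dispatch a bdev I/O to the matching NVMe command handler. Returns 0 on
 * successful submission, -ENOMEM if the request should be retried later, or
 * another negative value on failure (including when the qpair is gone because
 * the controller is resetting).
 */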
532 static int
533 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
534 {
535 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
536 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
537 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
538 	struct nvme_bdev_io *nbdev_io_to_abort;
539 
540 	if (nvme_ch->qpair == NULL) {
541 		/* The device is currently resetting */
542 		return -1;
543 	}
544 
545 	switch (bdev_io->type) {
546 	case SPDK_BDEV_IO_TYPE_READ:
547 		spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
548 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
549 		return 0;
550 
551 	case SPDK_BDEV_IO_TYPE_WRITE:
552 		return bdev_nvme_writev(nbdev,
553 					ch,
554 					nbdev_io,
555 					bdev_io->u.bdev.iovs,
556 					bdev_io->u.bdev.iovcnt,
557 					bdev_io->u.bdev.md_buf,
558 					bdev_io->u.bdev.num_blocks,
559 					bdev_io->u.bdev.offset_blocks);
560 
561 	case SPDK_BDEV_IO_TYPE_COMPARE:
562 		return bdev_nvme_comparev(nbdev,
563 					  ch,
564 					  nbdev_io,
565 					  bdev_io->u.bdev.iovs,
566 					  bdev_io->u.bdev.iovcnt,
567 					  bdev_io->u.bdev.md_buf,
568 					  bdev_io->u.bdev.num_blocks,
569 					  bdev_io->u.bdev.offset_blocks);
570 
571 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
572 		return bdev_nvme_comparev_and_writev(nbdev,
573 						     ch,
574 						     nbdev_io,
575 						     bdev_io->u.bdev.iovs,
576 						     bdev_io->u.bdev.iovcnt,
577 						     bdev_io->u.bdev.fused_iovs,
578 						     bdev_io->u.bdev.fused_iovcnt,
579 						     bdev_io->u.bdev.md_buf,
580 						     bdev_io->u.bdev.num_blocks,
581 						     bdev_io->u.bdev.offset_blocks);
582 
583 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
584 		return bdev_nvme_unmap(nbdev,
585 				       ch,
586 				       nbdev_io,
587 				       bdev_io->u.bdev.offset_blocks,
588 				       bdev_io->u.bdev.num_blocks);
589 
590 	case SPDK_BDEV_IO_TYPE_UNMAP:
591 		return bdev_nvme_unmap(nbdev,
592 				       ch,
593 				       nbdev_io,
594 				       bdev_io->u.bdev.offset_blocks,
595 				       bdev_io->u.bdev.num_blocks);
596 
597 	case SPDK_BDEV_IO_TYPE_RESET:
598 		return bdev_nvme_reset(nbdev->nvme_bdev_ctrlr, nbdev_io);
599 
600 	case SPDK_BDEV_IO_TYPE_FLUSH:
601 		return bdev_nvme_flush(nbdev,
602 				       nbdev_io,
603 				       bdev_io->u.bdev.offset_blocks,
604 				       bdev_io->u.bdev.num_blocks);
605 
606 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
607 		return bdev_nvme_admin_passthru(nbdev,
608 						ch,
609 						nbdev_io,
610 						&bdev_io->u.nvme_passthru.cmd,
611 						bdev_io->u.nvme_passthru.buf,
612 						bdev_io->u.nvme_passthru.nbytes);
613 
614 	case SPDK_BDEV_IO_TYPE_NVME_IO:
615 		return bdev_nvme_io_passthru(nbdev,
616 					     ch,
617 					     nbdev_io,
618 					     &bdev_io->u.nvme_passthru.cmd,
619 					     bdev_io->u.nvme_passthru.buf,
620 					     bdev_io->u.nvme_passthru.nbytes);
621 
622 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
623 		return bdev_nvme_io_passthru_md(nbdev,
624 						ch,
625 						nbdev_io,
626 						&bdev_io->u.nvme_passthru.cmd,
627 						bdev_io->u.nvme_passthru.buf,
628 						bdev_io->u.nvme_passthru.nbytes,
629 						bdev_io->u.nvme_passthru.md_buf,
630 						bdev_io->u.nvme_passthru.md_len);
631 
632 	case SPDK_BDEV_IO_TYPE_ABORT:
633 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
634 		return bdev_nvme_abort(nbdev,
635 				       ch,
636 				       nbdev_io,
637 				       nbdev_io_to_abort);
638 
639 	default:
640 		return -EINVAL;
641 	}
642 	return 0;
643 }
644 
645 static void
646 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
647 {
648 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
649 
650 	if (spdk_unlikely(rc != 0)) {
651 		if (rc == -ENOMEM) {
652 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
653 		} else {
654 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
655 		}
656 	}
657 }
658 
659 static bool
660 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
661 {
662 	struct nvme_bdev *nbdev = ctx;
663 	const struct spdk_nvme_ctrlr_data *cdata;
664 
665 	switch (io_type) {
666 	case SPDK_BDEV_IO_TYPE_READ:
667 	case SPDK_BDEV_IO_TYPE_WRITE:
668 	case SPDK_BDEV_IO_TYPE_RESET:
669 	case SPDK_BDEV_IO_TYPE_FLUSH:
670 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
671 	case SPDK_BDEV_IO_TYPE_NVME_IO:
672 	case SPDK_BDEV_IO_TYPE_ABORT:
673 		return true;
674 
675 	case SPDK_BDEV_IO_TYPE_COMPARE:
676 		return spdk_nvme_ns_supports_compare(nbdev->nvme_ns->ns);
677 
678 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
679 		return spdk_nvme_ns_get_md_size(nbdev->nvme_ns->ns) ? true : false;
680 
681 	case SPDK_BDEV_IO_TYPE_UNMAP:
682 		cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_bdev_ctrlr->ctrlr);
683 		return cdata->oncs.dsm;
684 
685 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
686 		cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_bdev_ctrlr->ctrlr);
687 		/*
688 		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
689 		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
690 		 */
691 		if (cdata->oncs.dsm &&
692 		    spdk_nvme_ns_get_dealloc_logical_block_read_value(nbdev->nvme_ns->ns) ==
693 		    SPDK_NVME_DEALLOC_READ_00) {
694 			return true;
695 		}
696 		/*
697 		 * The NVMe controller write_zeroes function is currently not used by our driver.
698 		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
699 		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
700 		 */
701 		return false;
702 
703 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
704 		if (spdk_nvme_ctrlr_get_flags(nbdev->nvme_bdev_ctrlr->ctrlr) &
705 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
706 			return true;
707 		}
708 		return false;
709 
710 	default:
711 		return false;
712 	}
713 }
714 
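/*
 * I/O channel create callback: allocate an I/O qpair for the controller, add
 * it to the per-thread poll group, and connect it. OCSSD controllers also get
 * an OCSSD-specific channel.
 */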
715 static int
716 bdev_nvme_create_cb(void *io_device, void *ctx_buf)
717 {
718 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
719 	struct nvme_io_channel *ch = ctx_buf;
720 	struct spdk_nvme_io_qpair_opts opts;
721 	struct spdk_io_channel *pg_ch = NULL;
722 	int rc;
723 
724 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
725 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
726 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
727 	opts.create_only = true;
728 	g_opts.io_queue_requests = opts.io_queue_requests;
729 
730 	ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
731 
732 	if (ch->qpair == NULL) {
733 		return -1;
734 	}
735 
736 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
737 		if (bdev_ocssd_create_io_channel(ch)) {
738 			goto err;
739 		}
740 	}
741 
742 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
743 	if (!pg_ch) {
744 		goto err;
745 	}
746 
747 	ch->group = spdk_io_channel_get_ctx(pg_ch);
748 	if (spdk_nvme_poll_group_add(ch->group->group, ch->qpair) != 0) {
749 		goto err;
750 	}
751 
752 	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, ch->qpair);
753 	if (rc) {
754 		spdk_nvme_poll_group_remove(ch->group->group, ch->qpair);
755 		goto err;
756 	}
757 
758 #ifdef SPDK_CONFIG_VTUNE
759 	ch->group->collect_spin_stat = true;
760 #else
761 	ch->group->collect_spin_stat = false;
762 #endif
763 
764 	TAILQ_INIT(&ch->pending_resets);
765 	return 0;
766 
767 err:
768 	if (pg_ch) {
769 		spdk_put_io_channel(pg_ch);
770 	}
771 	spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
772 	return -1;
773 }
774 
775 static void
776 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
777 {
778 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
779 	struct nvme_io_channel *ch = ctx_buf;
780 	struct nvme_bdev_poll_group *group;
781 
782 	group = ch->group;
783 	assert(group != NULL);
784 
785 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
786 		bdev_ocssd_destroy_io_channel(ch);
787 	}
788 
789 	if (ch->qpair != NULL) {
790 		spdk_nvme_poll_group_remove(group->group, ch->qpair);
791 	}
792 	spdk_put_io_channel(spdk_io_channel_from_ctx(group));
793 
794 	spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
795 }
796 
797 static int
798 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
799 {
800 	struct nvme_bdev_poll_group *group = ctx_buf;
801 
802 	group->group = spdk_nvme_poll_group_create(group);
803 	if (group->group == NULL) {
804 		return -1;
805 	}
806 
807 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
808 
809 	if (group->poller == NULL) {
810 		spdk_nvme_poll_group_destroy(group->group);
811 		return -1;
812 	}
813 
814 	return 0;
815 }
816 
817 static void
818 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
819 {
820 	struct nvme_bdev_poll_group *group = ctx_buf;
821 
822 	spdk_poller_unregister(&group->poller);
823 	if (spdk_nvme_poll_group_destroy(group->group)) {
824 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.");
825 		assert(false);
826 	}
827 }
828 
829 static struct spdk_io_channel *
830 bdev_nvme_get_io_channel(void *ctx)
831 {
832 	struct nvme_bdev *nvme_bdev = ctx;
833 
834 	return spdk_get_io_channel(nvme_bdev->nvme_bdev_ctrlr);
835 }
836 
837 static int
838 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
839 {
840 	struct nvme_bdev *nvme_bdev = ctx;
841 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_bdev->nvme_bdev_ctrlr;
842 	const struct spdk_nvme_ctrlr_data *cdata;
843 	struct spdk_nvme_ns *ns;
844 	union spdk_nvme_vs_register vs;
845 	union spdk_nvme_csts_register csts;
846 	char buf[128];
847 
848 	cdata = spdk_nvme_ctrlr_get_data(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
849 	vs = spdk_nvme_ctrlr_get_regs_vs(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
850 	csts = spdk_nvme_ctrlr_get_regs_csts(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
851 	ns = nvme_bdev->nvme_ns->ns;
852 
853 	spdk_json_write_named_object_begin(w, "nvme");
854 
855 	if (nvme_bdev_ctrlr->trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
856 		spdk_json_write_named_string(w, "pci_address", nvme_bdev_ctrlr->trid->traddr);
857 	}
858 
859 	spdk_json_write_named_object_begin(w, "trid");
860 
861 	nvme_bdev_dump_trid_json(nvme_bdev_ctrlr->trid, w);
862 
863 	spdk_json_write_object_end(w);
864 
865 #ifdef SPDK_CONFIG_NVME_CUSE
866 	size_t cuse_name_size = 128;
867 	char cuse_name[cuse_name_size];
868 
869 	int rc = spdk_nvme_cuse_get_ns_name(nvme_bdev->nvme_bdev_ctrlr->ctrlr, spdk_nvme_ns_get_id(ns),
870 					    cuse_name, &cuse_name_size);
871 	if (rc == 0) {
872 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
873 	}
874 #endif
875 
876 	spdk_json_write_named_object_begin(w, "ctrlr_data");
877 
878 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
879 
880 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
881 	spdk_str_trim(buf);
882 	spdk_json_write_named_string(w, "model_number", buf);
883 
884 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
885 	spdk_str_trim(buf);
886 	spdk_json_write_named_string(w, "serial_number", buf);
887 
888 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
889 	spdk_str_trim(buf);
890 	spdk_json_write_named_string(w, "firmware_revision", buf);
891 
892 	spdk_json_write_named_object_begin(w, "oacs");
893 
894 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
895 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
896 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
897 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
898 
899 	spdk_json_write_object_end(w);
900 
901 	spdk_json_write_object_end(w);
902 
903 	spdk_json_write_named_object_begin(w, "vs");
904 
905 	spdk_json_write_name(w, "nvme_version");
906 	if (vs.bits.ter) {
907 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
908 	} else {
909 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
910 	}
911 
912 	spdk_json_write_object_end(w);
913 
914 	spdk_json_write_named_object_begin(w, "csts");
915 
916 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
917 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
918 
919 	spdk_json_write_object_end(w);
920 
921 	spdk_json_write_named_object_begin(w, "ns_data");
922 
923 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
924 
925 	spdk_json_write_object_end(w);
926 
927 	if (cdata->oacs.security) {
928 		spdk_json_write_named_object_begin(w, "security");
929 
930 		spdk_json_write_named_bool(w, "opal", nvme_bdev_ctrlr->opal_dev ? true : false);
931 
932 		spdk_json_write_object_end(w);
933 	}
934 
935 	spdk_json_write_object_end(w);
936 
937 	return 0;
938 }
939 
940 static void
941 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
942 {
943 	/* No config per bdev needed */
944 }
945 
946 static uint64_t
947 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
948 {
949 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
950 	struct nvme_bdev_poll_group *group = nvme_ch->group;
951 	uint64_t spin_time;
952 
953 	if (!group || !group->collect_spin_stat) {
954 		return 0;
955 	}
956 
957 	if (group->end_ticks != 0) {
958 		group->spin_ticks += (group->end_ticks - group->start_ticks);
959 		group->end_ticks = 0;
960 	}
961 
962 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
963 	group->start_ticks = 0;
964 	group->spin_ticks = 0;
965 
966 	return spin_time;
967 }
968 
969 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
970 	.destruct		= bdev_nvme_destruct,
971 	.submit_request		= bdev_nvme_submit_request,
972 	.io_type_supported	= bdev_nvme_io_type_supported,
973 	.get_io_channel		= bdev_nvme_get_io_channel,
974 	.dump_info_json		= bdev_nvme_dump_info_json,
975 	.write_config_json	= bdev_nvme_write_config_json,
976 	.get_spin_time		= bdev_nvme_get_spin_time,
977 };
978 
979 static void
980 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
981 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
982 {
983 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
984 	struct nvme_bdev	*bdev;
985 	struct spdk_nvme_ns	*ns;
986 	const struct spdk_uuid	*uuid;
987 	const struct spdk_nvme_ctrlr_data *cdata;
988 	const struct spdk_nvme_ns_data *nsdata;
989 	int			rc;
990 
991 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
992 
993 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
994 	if (!ns) {
995 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Invalid NS %d\n", nvme_ns->id);
996 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -EINVAL);
997 		return;
998 	}
999 
1000 	bdev = calloc(1, sizeof(*bdev));
1001 	if (!bdev) {
1002 		SPDK_ERRLOG("bdev calloc() failed\n");
1003 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
1004 		return;
1005 	}
1006 
1007 	bdev->nvme_bdev_ctrlr = nvme_bdev_ctrlr;
1008 	nvme_ns->ns = ns;
1009 	bdev->nvme_ns = nvme_ns;
1010 
1011 	bdev->disk.name = spdk_sprintf_alloc("%sn%d", nvme_bdev_ctrlr->name, spdk_nvme_ns_get_id(ns));
1012 	if (!bdev->disk.name) {
1013 		free(bdev);
1014 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
1015 		return;
1016 	}
1017 	bdev->disk.product_name = "NVMe disk";
1018 
1019 	bdev->disk.write_cache = 0;
1020 	if (cdata->vwc.present) {
1021 		/* Enable if the Volatile Write Cache exists */
1022 		bdev->disk.write_cache = 1;
1023 	}
1024 	bdev->disk.blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1025 	bdev->disk.blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1026 	bdev->disk.optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1027 
1028 	uuid = spdk_nvme_ns_get_uuid(ns);
1029 	if (uuid != NULL) {
1030 		bdev->disk.uuid = *uuid;
1031 	}
1032 
1033 	nsdata = spdk_nvme_ns_get_data(ns);
1034 
1035 	bdev->disk.md_len = spdk_nvme_ns_get_md_size(ns);
1036 	if (bdev->disk.md_len != 0) {
1037 		bdev->disk.md_interleave = nsdata->flbas.extended;
1038 		bdev->disk.dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1039 		if (bdev->disk.dif_type != SPDK_DIF_DISABLE) {
1040 			bdev->disk.dif_is_head_of_md = nsdata->dps.md_start;
1041 			bdev->disk.dif_check_flags = nvme_bdev_ctrlr->prchk_flags;
1042 		}
1043 	}
1044 
1045 	if (!bdev_nvme_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
1046 		bdev->disk.acwu = 0;
1047 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1048 		bdev->disk.acwu = nsdata->nacwu;
1049 	} else {
1050 		bdev->disk.acwu = cdata->acwu;
1051 	}
1052 
1053 	bdev->disk.ctxt = bdev;
1054 	bdev->disk.fn_table = &nvmelib_fn_table;
1055 	bdev->disk.module = &nvme_if;
1056 	rc = spdk_bdev_register(&bdev->disk);
1057 	if (rc) {
1058 		free(bdev->disk.name);
1059 		free(bdev);
1060 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1061 		return;
1062 	}
1063 
1064 	nvme_bdev_attach_bdev_to_ns(nvme_ns, bdev);
1065 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, 0);
1066 }
1067 
1068 static bool
1069 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1070 		 struct spdk_nvme_ctrlr_opts *opts)
1071 {
1072 	struct nvme_probe_skip_entry *entry;
1073 
1074 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1075 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1076 			return false;
1077 		}
1078 	}
1079 
1080 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1081 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1082 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1083 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1084 
1085 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attaching to %s\n", trid->traddr);
1086 
1087 	return true;
1088 }
1089 
1090 static bool
1091 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1092 	 struct spdk_nvme_ctrlr_opts *opts)
1093 {
1094 	struct nvme_probe_ctx *ctx = cb_ctx;
1095 
1096 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Probing device %s\n", trid->traddr);
1097 
1098 	if (nvme_bdev_ctrlr_get(trid)) {
1099 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n",
1100 			    trid->traddr);
1101 		return false;
1102 	}
1103 
1104 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1105 		bool claim_device = false;
1106 		size_t i;
1107 
1108 		for (i = 0; i < ctx->count; i++) {
1109 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1110 				claim_device = true;
1111 				break;
1112 			}
1113 		}
1114 
1115 		if (!claim_device) {
1116 			SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Not claiming device at %s\n", trid->traddr);
1117 			return false;
1118 		}
1119 	}
1120 
1121 	if (ctx->hostnqn) {
1122 		snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", ctx->hostnqn);
1123 	}
1124 
1125 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1126 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1127 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1128 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1129 
1130 	return true;
1131 }
1132 
1133 static void
1134 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1135 {
1136 	struct spdk_nvme_ctrlr *ctrlr = ctx;
1137 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1138 
1139 	if (spdk_nvme_cpl_is_error(cpl)) {
1140 		SPDK_WARNLOG("Abort failed. Resetting controller.\n");
1141 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1142 		assert(nvme_bdev_ctrlr != NULL);
1143 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1144 	}
1145 }
1146 
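/*
 * Command timeout handler. A controller reporting fatal status is always
 * reset; otherwise the configured timeout action (abort the command, reset
 * the controller, or do nothing) is applied.
 */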
1147 static void
1148 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1149 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1150 {
1151 	int rc;
1152 	union spdk_nvme_csts_register csts;
1153 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1154 
1155 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1156 
1157 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1158 	if (csts.bits.cfs) {
1159 		SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1160 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1161 		assert(nvme_bdev_ctrlr != NULL);
1162 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1163 		return;
1164 	}
1165 
1166 	switch (g_opts.action_on_timeout) {
1167 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1168 		if (qpair) {
1169 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1170 						       nvme_abort_cpl, ctrlr);
1171 			if (rc == 0) {
1172 				return;
1173 			}
1174 
1175 			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
1176 		}
1177 
1178 	/* FALLTHROUGH */
1179 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1180 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1181 		assert(nvme_bdev_ctrlr != NULL);
1182 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1183 		break;
1184 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1185 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "No action for nvme controller timeout.\n");
1186 		break;
1187 	default:
1188 		SPDK_ERRLOG("An invalid timeout action value was found.\n");
1189 		break;
1190 	}
1191 }
1192 
1193 void
1194 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1195 {
1196 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1197 	nvme_bdev_ctrlr->ref--;
1198 
1199 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
1200 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1201 		nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1202 		return;
1203 	}
1204 
1205 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1206 }
1207 
1208 static void
1209 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns)
1210 {
1211 	struct nvme_bdev *bdev, *tmp;
1212 
1213 	TAILQ_FOREACH_SAFE(bdev, &ns->bdevs, tailq, tmp) {
1214 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1215 	}
1216 
1217 	ns->populated = false;
1218 
1219 	nvme_ctrlr_depopulate_namespace_done(ns->ctrlr);
1220 }
1221 
1222 static void nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns,
1223 		struct nvme_async_probe_ctx *ctx)
1224 {
1225 	g_populate_namespace_fn[ns->type](ctrlr, ns, ctx);
1226 }
1227 
1228 static void nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns)
1229 {
1230 	g_depopulate_namespace_fn[ns->type](ns);
1231 }
1232 
1233 void
1234 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1235 				   struct nvme_bdev_ns *ns, int rc)
1236 {
1237 	if (rc == 0) {
1238 		ns->populated = true;
1239 		pthread_mutex_lock(&g_bdev_nvme_mutex);
1240 		ns->ctrlr->ref++;
1241 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1242 	} else {
1243 		memset(ns, 0, sizeof(*ns));
1244 	}
1245 
1246 	if (ctx) {
1247 		ctx->populates_in_progress--;
1248 		if (ctx->populates_in_progress == 0) {
1249 			nvme_ctrlr_populate_namespaces_done(ctx);
1250 		}
1251 	}
1252 }
1253 
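/*
 * Walk all namespaces of the controller: resize bdevs whose namespace capacity
 * changed, create bdevs for newly active namespaces, and depopulate namespaces
 * that are no longer active. When called from the async probe path, ctx tracks
 * outstanding populate operations so the completion callback fires only after
 * all of them have finished.
 */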
1254 static void
1255 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1256 			       struct nvme_async_probe_ctx *ctx)
1257 {
1258 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1259 	struct nvme_bdev_ns	*ns;
1260 	struct spdk_nvme_ns	*nvme_ns;
1261 	struct nvme_bdev	*bdev;
1262 	uint32_t		i;
1263 	int			rc;
1264 	uint64_t		num_sectors;
1265 	bool			ns_is_active;
1266 
1267 	if (ctx) {
1268 		/* Initialize this count to 1 to handle the populate functions
1269 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1270 		 */
1271 		ctx->populates_in_progress = 1;
1272 	}
1273 
1274 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1275 		uint32_t	nsid = i + 1;
1276 
1277 		ns = nvme_bdev_ctrlr->namespaces[i];
1278 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1279 
1280 		if (ns->populated && ns_is_active && ns->type == NVME_BDEV_NS_STANDARD) {
1281 			/* NS is still there but attributes may have changed */
1282 			nvme_ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1283 			num_sectors = spdk_nvme_ns_get_num_sectors(nvme_ns);
1284 			bdev = TAILQ_FIRST(&ns->bdevs);
1285 			if (bdev->disk.blockcnt != num_sectors) {
1286 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
1287 					       nsid,
1288 					       bdev->disk.name,
1289 					       bdev->disk.blockcnt,
1290 					       num_sectors);
1291 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1292 				if (rc != 0) {
1293 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1294 						    bdev->disk.name, rc);
1295 				}
1296 			}
1297 		}
1298 
1299 		if (!ns->populated && ns_is_active) {
1300 			ns->id = nsid;
1301 			ns->ctrlr = nvme_bdev_ctrlr;
1302 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1303 				ns->type = NVME_BDEV_NS_OCSSD;
1304 			} else {
1305 				ns->type = NVME_BDEV_NS_STANDARD;
1306 			}
1307 
1308 			TAILQ_INIT(&ns->bdevs);
1309 
1310 			if (ctx) {
1311 				ctx->populates_in_progress++;
1312 			}
1313 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, ns, ctx);
1314 		}
1315 
1316 		if (ns->populated && !ns_is_active) {
1317 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
1318 		}
1319 	}
1320 
1321 	if (ctx) {
1322 		/* Decrement this count now that the loop is over to account
1323 		 * for the one we started with.  If the count is then 0, we
1324 		 * know any populate_namespace functions completed immediately,
1325 		 * so we'll kick the callback here.
1326 		 */
1327 		ctx->populates_in_progress--;
1328 		if (ctx->populates_in_progress == 0) {
1329 			nvme_ctrlr_populate_namespaces_done(ctx);
1330 		}
1331 	}
1332 
1333 }
1334 
1335 static void
1336 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1337 {
1338 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1339 	union spdk_nvme_async_event_completion	event;
1340 
1341 	if (spdk_nvme_cpl_is_error(cpl)) {
1342 		SPDK_WARNLOG("AER request execute failed\n");
1343 		return;
1344 	}
1345 
1346 	event.raw = cpl->cdw0;
1347 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1348 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1349 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1350 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1351 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1352 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1353 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1354 	}
1355 }
1356 
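/*
 * Allocate and register an nvme_bdev_ctrlr for a newly attached controller:
 * set up per-namespace tracking, register the I/O device, start the admin
 * queue poller, and install the timeout and AER callbacks. Opal is initialized
 * when security send/receive is supported.
 */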
1357 static int
1358 create_ctrlr(struct spdk_nvme_ctrlr *ctrlr,
1359 	     const char *name,
1360 	     const struct spdk_nvme_transport_id *trid,
1361 	     uint32_t prchk_flags)
1362 {
1363 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1364 	uint32_t i;
1365 	int rc;
1366 
1367 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1368 	if (nvme_bdev_ctrlr == NULL) {
1369 		SPDK_ERRLOG("Failed to allocate device struct\n");
1370 		return -ENOMEM;
1371 	}
1372 
1373 	nvme_bdev_ctrlr->trid = calloc(1, sizeof(*nvme_bdev_ctrlr->trid));
1374 	if (nvme_bdev_ctrlr->trid == NULL) {
1375 		SPDK_ERRLOG("Failed to allocate device trid struct\n");
1376 		free(nvme_bdev_ctrlr);
1377 		return -ENOMEM;
1378 	}
1379 
1380 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1381 	nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1382 	if (!nvme_bdev_ctrlr->namespaces) {
1383 		SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
1384 		free(nvme_bdev_ctrlr->trid);
1385 		free(nvme_bdev_ctrlr);
1386 		return -ENOMEM;
1387 	}
1388 
1389 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1390 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1391 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1392 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1393 			for (; i > 0; i--) {
1394 				free(nvme_bdev_ctrlr->namespaces[i - 1]);
1395 			}
1396 			free(nvme_bdev_ctrlr->namespaces);
1397 			free(nvme_bdev_ctrlr->trid);
1398 			free(nvme_bdev_ctrlr);
1399 			return -ENOMEM;
1400 		}
1401 	}
1402 
1403 	nvme_bdev_ctrlr->thread = spdk_get_thread();
1404 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1405 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1406 	nvme_bdev_ctrlr->ref = 0;
1407 	*nvme_bdev_ctrlr->trid = *trid;
1408 	nvme_bdev_ctrlr->name = strdup(name);
1409 	if (nvme_bdev_ctrlr->name == NULL) {
1410 		free(nvme_bdev_ctrlr->namespaces);
1411 		free(nvme_bdev_ctrlr->trid);
1412 		free(nvme_bdev_ctrlr);
1413 		return -ENOMEM;
1414 	}
1415 
1416 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1417 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1418 		if (spdk_unlikely(rc != 0)) {
1419 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1420 			free(nvme_bdev_ctrlr->name);
1421 			free(nvme_bdev_ctrlr->namespaces);
1422 			free(nvme_bdev_ctrlr->trid);
1423 			free(nvme_bdev_ctrlr);
1424 			return rc;
1425 		}
1426 	}
1427 
1428 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1429 
1430 	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
1431 				sizeof(struct nvme_io_channel),
1432 				name);
1433 
1434 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, ctrlr,
1435 					       g_opts.nvme_adminq_poll_period_us);
1436 
1437 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1438 
1439 	if (g_opts.timeout_us > 0) {
1440 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1441 				timeout_cb, NULL);
1442 	}
1443 
1444 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1445 
1446 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1447 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1448 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1449 		if (nvme_bdev_ctrlr->opal_dev == NULL) {
1450 			SPDK_ERRLOG("Failed to initialize Opal\n");
1451 		}
1452 	}
1453 	return 0;
1454 }
1455 
1456 static void
1457 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1458 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1459 {
1460 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1461 	struct nvme_probe_ctx *ctx = cb_ctx;
1462 	char *name = NULL;
1463 	uint32_t prchk_flags = 0;
1464 	size_t i;
1465 
1466 	if (ctx) {
1467 		for (i = 0; i < ctx->count; i++) {
1468 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1469 				prchk_flags = ctx->prchk_flags[i];
1470 				name = strdup(ctx->names[i]);
1471 				break;
1472 			}
1473 		}
1474 	} else {
1475 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1476 	}
1477 	if (!name) {
1478 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1479 		return;
1480 	}
1481 
1482 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attached to %s (%s)\n", trid->traddr, name);
1483 
1484 	create_ctrlr(ctrlr, name, trid, prchk_flags);
1485 
1486 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid);
1487 	if (!nvme_bdev_ctrlr) {
1488 		SPDK_ERRLOG("Failed to find new NVMe controller\n");
1489 		free(name);
1490 		return;
1491 	}
1492 
1493 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1494 
1495 	free(name);
1496 }
1497 
1498 static void
1499 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
1500 {
1501 	uint32_t i;
1502 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1503 	struct nvme_bdev_ns *ns;
1504 
1505 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1506 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
1507 		if (nvme_bdev_ctrlr->ctrlr == ctrlr) {
1508 			/* The controller's destruction was already started */
1509 			if (nvme_bdev_ctrlr->destruct) {
1510 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1511 				return;
1512 			}
1513 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
1514 			for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1515 				uint32_t	nsid = i + 1;
1516 
1517 				ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1518 				if (ns->populated) {
1519 					assert(ns->id == nsid);
1520 					nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
1521 				}
1522 			}
1523 
1524 			pthread_mutex_lock(&g_bdev_nvme_mutex);
1525 			nvme_bdev_ctrlr->destruct = true;
1526 			if (nvme_bdev_ctrlr->ref == 0) {
1527 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1528 				nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1529 			} else {
1530 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1531 			}
1532 			return;
1533 		}
1534 	}
1535 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1536 }
1537 
1538 static int
1539 bdev_nvme_hotplug(void *arg)
1540 {
1541 	struct spdk_nvme_transport_id trid_pcie;
1542 	int done;
1543 
1544 	if (!g_hotplug_probe_ctx) {
1545 		memset(&trid_pcie, 0, sizeof(trid_pcie));
1546 		spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
1547 
1548 		g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
1549 				      hotplug_probe_cb,
1550 				      attach_cb, remove_cb);
1551 		if (!g_hotplug_probe_ctx) {
1552 			return SPDK_POLLER_BUSY;
1553 		}
1554 	}
1555 
1556 	done = spdk_nvme_probe_poll_async(g_hotplug_probe_ctx);
1557 	if (done != -EAGAIN) {
1558 		g_hotplug_probe_ctx = NULL;
1559 	}
1560 
1561 	return SPDK_POLLER_BUSY;
1562 }
1563 
1564 void
1565 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
1566 {
1567 	*opts = g_opts;
1568 }
1569 
1570 int
1571 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
1572 {
1573 	if (g_bdev_nvme_init_thread != NULL) {
1574 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1575 			return -EPERM;
1576 		}
1577 	}
1578 
1579 	g_opts = *opts;
1580 
1581 	return 0;
1582 }
1583 
1584 struct set_nvme_hotplug_ctx {
1585 	uint64_t period_us;
1586 	bool enabled;
1587 	spdk_msg_fn fn;
1588 	void *fn_ctx;
1589 };
1590 
1591 static void
1592 set_nvme_hotplug_period_cb(void *_ctx)
1593 {
1594 	struct set_nvme_hotplug_ctx *ctx = _ctx;
1595 
1596 	spdk_poller_unregister(&g_hotplug_poller);
1597 	if (ctx->enabled) {
1598 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
1599 	}
1600 
1601 	g_nvme_hotplug_poll_period_us = ctx->period_us;
1602 	g_nvme_hotplug_enabled = ctx->enabled;
1603 	if (ctx->fn) {
1604 		ctx->fn(ctx->fn_ctx);
1605 	}
1606 
1607 	free(ctx);
1608 }
1609 
1610 int
1611 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
1612 {
1613 	struct set_nvme_hotplug_ctx *ctx;
1614 
1615 	if (enabled == true && !spdk_process_is_primary()) {
1616 		return -EPERM;
1617 	}
1618 
1619 	ctx = calloc(1, sizeof(*ctx));
1620 	if (ctx == NULL) {
1621 		return -ENOMEM;
1622 	}
1623 
1624 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
1625 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
1626 	ctx->enabled = enabled;
1627 	ctx->fn = cb;
1628 	ctx->fn_ctx = cb_ctx;
1629 
1630 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
1631 	return 0;
1632 }
1633 
1634 static void
1635 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1636 {
1637 	if (ctx->cb_fn) {
1638 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1639 	}
1640 
1641 	free(ctx);
1642 }
1643 
1644 static void
1645 nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx)
1646 {
1647 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1648 	struct nvme_bdev_ns	*ns;
1649 	struct nvme_bdev	*nvme_bdev, *tmp;
1650 	uint32_t		i, nsid;
1651 	size_t			j;
1652 
1653 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
1654 	assert(nvme_bdev_ctrlr != NULL);
1655 
1656 	/*
1657 	 * Report the new bdevs that were created in this call.
1658 	 * There can be more than one bdev per NVMe controller.
1659 	 */
1660 	j = 0;
1661 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1662 		nsid = i + 1;
1663 		ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1664 		if (!ns->populated) {
1665 			continue;
1666 		}
1667 		assert(ns->id == nsid);
1668 		TAILQ_FOREACH_SAFE(nvme_bdev, &ns->bdevs, tailq, tmp) {
1669 			if (j < ctx->count) {
1670 				ctx->names[j] = nvme_bdev->disk.name;
1671 				j++;
1672 			} else {
1673 				SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
1674 					    ctx->count);
1675 				populate_namespaces_cb(ctx, 0, -ERANGE);
1676 				return;
1677 			}
1678 		}
1679 	}
1680 
1681 	populate_namespaces_cb(ctx, j, 0);
1682 }
1683 
1684 static void
1685 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1686 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1687 {
1688 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
1689 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1690 	struct nvme_async_probe_ctx *ctx;
1691 	int rc;
1692 
1693 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
1694 
1695 	spdk_poller_unregister(&ctx->poller);
1696 
1697 	rc = create_ctrlr(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags);
1698 	if (rc) {
1699 		SPDK_ERRLOG("Failed to create new device\n");
1700 		populate_namespaces_cb(ctx, 0, rc);
1701 		return;
1702 	}
1703 
1704 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
1705 	assert(nvme_bdev_ctrlr != NULL);
1706 
1707 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
1708 }
1709 
1710 static int
1711 bdev_nvme_async_poll(void *arg)
1712 {
1713 	struct nvme_async_probe_ctx	*ctx = arg;
1714 	int				rc;
1715 
1716 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
1717 	if (spdk_unlikely(rc != -EAGAIN && rc != 0)) {
1718 		spdk_poller_unregister(&ctx->poller);
1719 		free(ctx);
1720 	}
1721 
1722 	return SPDK_POLLER_BUSY;
1723 }
1724 
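/*
 * Start an asynchronous attach to the controller described by trid. Once the
 * connect completes, namespaces are populated and cb_fn is invoked with the
 * names of the bdevs that were created (up to count entries in names).
 */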
1725 int
1726 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
1727 		 struct spdk_nvme_host_id *hostid,
1728 		 const char *base_name,
1729 		 const char **names,
1730 		 uint32_t count,
1731 		 const char *hostnqn,
1732 		 uint32_t prchk_flags,
1733 		 spdk_bdev_create_nvme_fn cb_fn,
1734 		 void *cb_ctx)
1735 {
1736 	struct nvme_probe_skip_entry	*entry, *tmp;
1737 	struct nvme_async_probe_ctx	*ctx;
1738 
1739 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
1740 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
1741 		return -EEXIST;
1742 	}
1743 
1744 	if (nvme_bdev_ctrlr_get_by_name(base_name)) {
1745 		SPDK_ERRLOG("A controller with the provided name (%s) already exists.\n", base_name);
1746 		return -EEXIST;
1747 	}
1748 
1749 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1750 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
1751 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1752 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
1753 				free(entry);
1754 				break;
1755 			}
1756 		}
1757 	}
1758 
1759 	ctx = calloc(1, sizeof(*ctx));
1760 	if (!ctx) {
1761 		return -ENOMEM;
1762 	}
1763 	ctx->base_name = base_name;
1764 	ctx->names = names;
1765 	ctx->count = count;
1766 	ctx->cb_fn = cb_fn;
1767 	ctx->cb_ctx = cb_ctx;
1768 	ctx->prchk_flags = prchk_flags;
1769 	ctx->trid = *trid;
1770 
1771 	spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
1772 	ctx->opts.transport_retry_count = g_opts.retry_count;
1773 
1774 	if (hostnqn) {
1775 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
1776 	}
1777 
1778 	if (hostid->hostaddr[0] != '\0') {
1779 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
1780 	}
1781 
1782 	if (hostid->hostsvcid[0] != '\0') {
1783 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
1784 	}
1785 
1786 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
1787 	if (ctx->probe_ctx == NULL) {
1788 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
1789 		free(ctx);
1790 		return -ENODEV;
1791 	}
1792 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
1793 
1794 	return 0;
1795 }
1796 
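/*
 * Tear down the named controller by running the same path as a hot-remove.
 * For PCIe controllers the trid is added to g_skipped_nvme_ctrlrs first so
 * that the hotplug probe does not immediately re-attach the device.
 */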
1797 int
1798 bdev_nvme_delete(const char *name)
1799 {
1800 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL;
1801 	struct nvme_probe_skip_entry *entry;
1802 
1803 	if (name == NULL) {
1804 		return -EINVAL;
1805 	}
1806 
1807 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
1808 	if (nvme_bdev_ctrlr == NULL) {
1809 		SPDK_ERRLOG("Failed to find NVMe controller with name: %s\n", name);
1810 		return -ENODEV;
1811 	}
1812 
1813 	if (nvme_bdev_ctrlr->trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1814 		entry = calloc(1, sizeof(*entry));
1815 		if (!entry) {
1816 			return -ENOMEM;
1817 		}
1818 		entry->trid = *nvme_bdev_ctrlr->trid;
1819 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
1820 	}
1821 
1822 	remove_cb(NULL, nvme_bdev_ctrlr->ctrlr);
1823 	return 0;
1824 }
1825 
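/*
 * Module init: registers the poll group io_device and, if a legacy [Nvme]
 * config section is present, applies its options (RetryCount, TimeoutUsec,
 * ActionOnTimeout, AdminPollRate, IOPollRate, HotplugEnable, HotplugPollRate,
 * HostNQN, DelayCmdSubmit) and attaches the controllers listed as TransportID
 * entries: fabric controllers are connected synchronously here, while local
 * PCIe controllers are discovered through spdk_nvme_probe().
 */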
1826 static int
1827 bdev_nvme_library_init(void)
1828 {
1829 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1830 	struct spdk_conf_section *sp;
1831 	const char *val;
1832 	int rc = 0;
1833 	int64_t intval = 0;
1834 	size_t i;
1835 	struct nvme_probe_ctx *probe_ctx = NULL;
1836 	int retry_count;
1837 	uint32_t local_nvme_num = 0;
1838 	int64_t hotplug_period;
1839 	bool hotplug_enabled = g_nvme_hotplug_enabled;
1840 
1841 	g_bdev_nvme_init_thread = spdk_get_thread();
1842 
1843 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
1844 				bdev_nvme_poll_group_destroy_cb,
1845 				sizeof(struct nvme_bdev_poll_group),  "bdev_nvme_poll_groups");
1846 
1847 	sp = spdk_conf_find_section(NULL, "Nvme");
1848 	if (sp == NULL) {
1849 		goto end;
1850 	}
1851 
1852 	probe_ctx = calloc(1, sizeof(*probe_ctx));
1853 	if (probe_ctx == NULL) {
1854 		SPDK_ERRLOG("Failed to allocate probe_ctx\n");
1855 		rc = -1;
1856 		goto end;
1857 	}
1858 
1859 	retry_count = spdk_conf_section_get_intval(sp, "RetryCount");
1860 	if (retry_count >= 0) {
1861 		g_opts.retry_count = retry_count;
1862 	}
1863 
1864 	val = spdk_conf_section_get_val(sp, "TimeoutUsec");
1865 	if (val != NULL) {
1866 		intval = spdk_strtoll(val, 10);
1867 		if (intval < 0) {
1868 			SPDK_ERRLOG("Invalid TimeoutUsec value\n");
1869 			rc = -1;
1870 			goto end;
1871 		}
1872 	}
1873 
1874 	g_opts.timeout_us = intval;
1875 
1876 	if (g_opts.timeout_us > 0) {
1877 		val = spdk_conf_section_get_val(sp, "ActionOnTimeout");
1878 		if (val != NULL) {
1879 			if (!strcasecmp(val, "Reset")) {
1880 				g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET;
1881 			} else if (!strcasecmp(val, "Abort")) {
1882 				g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT;
1883 			}
1884 		}
1885 	}
1886 
1887 	intval = spdk_conf_section_get_intval(sp, "AdminPollRate");
1888 	if (intval > 0) {
1889 		g_opts.nvme_adminq_poll_period_us = intval;
1890 	}
1891 
1892 	intval = spdk_conf_section_get_intval(sp, "IOPollRate");
1893 	if (intval > 0) {
1894 		g_opts.nvme_ioq_poll_period_us = intval;
1895 	}
1896 
1897 	if (spdk_process_is_primary()) {
1898 		hotplug_enabled = spdk_conf_section_get_boolval(sp, "HotplugEnable", false);
1899 	}
1900 
1901 	hotplug_period = spdk_conf_section_get_intval(sp, "HotplugPollRate");
1902 	if (hotplug_period < 0) {
1903 		hotplug_period = 0;
1904 	}
1905 
1906 	g_nvme_hostnqn = spdk_conf_section_get_val(sp, "HostNQN");
1907 	probe_ctx->hostnqn = g_nvme_hostnqn;
1908 
1909 	g_opts.delay_cmd_submit = spdk_conf_section_get_boolval(sp, "DelayCmdSubmit",
1910 				  SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT);
1911 
1912 	for (i = 0; i < NVME_MAX_CONTROLLERS; i++) {
1913 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 0);
1914 		if (val == NULL) {
1915 			break;
1916 		}
1917 
1918 		rc = spdk_nvme_transport_id_parse(&probe_ctx->trids[i], val);
1919 		if (rc < 0) {
1920 			SPDK_ERRLOG("Unable to parse TransportID: %s\n", val);
1921 			rc = -1;
1922 			goto end;
1923 		}
1924 
1925 		rc = spdk_nvme_host_id_parse(&probe_ctx->hostids[i], val);
1926 		if (rc < 0) {
1927 			SPDK_ERRLOG("Unable to parse HostID: %s\n", val);
1928 			rc = -1;
1929 			goto end;
1930 		}
1931 
1932 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 1);
1933 		if (val == NULL) {
1934 			SPDK_ERRLOG("No name provided for TransportID\n");
1935 			rc = -1;
1936 			goto end;
1937 		}
1938 
1939 		probe_ctx->names[i] = val;
1940 
1941 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 2);
1942 		if (val != NULL) {
1943 			rc = spdk_nvme_prchk_flags_parse(&probe_ctx->prchk_flags[i], val);
1944 			if (rc < 0) {
1945 				SPDK_ERRLOG("Unable to parse prchk: %s\n", val);
1946 				rc = -1;
1947 				goto end;
1948 			}
1949 		}
1950 
1951 		probe_ctx->count++;
1952 
1953 		if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) {
1954 			struct spdk_nvme_ctrlr *ctrlr;
1955 			struct spdk_nvme_ctrlr_opts opts;
1956 
1957 			if (nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) {
1958 				SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n",
1959 					    probe_ctx->trids[i].traddr);
1960 				rc = -1;
1961 				goto end;
1962 			}
1963 
1964 			if (probe_ctx->trids[i].subnqn[0] == '\0') {
1965 				SPDK_ERRLOG("Need to provide subsystem nqn\n");
1966 				rc = -1;
1967 				goto end;
1968 			}
1969 
1970 			spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
1971 			opts.transport_retry_count = g_opts.retry_count;
1972 
1973 			if (probe_ctx->hostnqn != NULL) {
1974 				snprintf(opts.hostnqn, sizeof(opts.hostnqn), "%s", probe_ctx->hostnqn);
1975 			}
1976 
1977 			if (probe_ctx->hostids[i].hostaddr[0] != '\0') {
1978 				snprintf(opts.src_addr, sizeof(opts.src_addr), "%s", probe_ctx->hostids[i].hostaddr);
1979 			}
1980 
1981 			if (probe_ctx->hostids[i].hostsvcid[0] != '\0') {
1982 				snprintf(opts.src_svcid, sizeof(opts.src_svcid), "%s", probe_ctx->hostids[i].hostsvcid);
1983 			}
1984 
1985 			ctrlr = spdk_nvme_connect(&probe_ctx->trids[i], &opts, sizeof(opts));
1986 			if (ctrlr == NULL) {
1987 				SPDK_ERRLOG("Unable to connect to provided trid (traddr: %s)\n",
1988 					    probe_ctx->trids[i].traddr);
1989 				rc = -1;
1990 				goto end;
1991 			}
1992 
1993 			rc = create_ctrlr(ctrlr, probe_ctx->names[i], &probe_ctx->trids[i], 0);
1994 			if (rc) {
1995 				goto end;
1996 			}
1997 
1998 			nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&probe_ctx->trids[i]);
1999 			if (!nvme_bdev_ctrlr) {
2000 				SPDK_ERRLOG("Failed to find new NVMe controller\n");
2001 				rc = -ENODEV;
2002 				goto end;
2003 			}
2004 
2005 			nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
2006 		} else {
2007 			local_nvme_num++;
2008 		}
2009 	}
2010 
2011 	if (local_nvme_num > 0) {
2012 		/* Probe locally attached (PCIe) NVMe devices */
2013 		if (spdk_nvme_probe(NULL, probe_ctx, probe_cb, attach_cb, remove_cb)) {
2014 			rc = -1;
2015 			goto end;
2016 		}
2017 
2018 		for (i = 0; i < probe_ctx->count; i++) {
2019 			if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) {
2020 				continue;
2021 			}
2022 
2023 			if (!nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) {
2024 				SPDK_ERRLOG("NVMe SSD \"%s\" could not be found.\n", probe_ctx->trids[i].traddr);
2025 				SPDK_ERRLOG("Check PCIe BDF and that it is attached to UIO/VFIO driver.\n");
2026 			}
2027 		}
2028 	}
2029 
2030 	rc = bdev_nvme_set_hotplug(hotplug_enabled, hotplug_period, NULL, NULL);
2031 	if (rc) {
2032 		SPDK_ERRLOG("Failed to setup hotplug (%d): %s\n", rc, spdk_strerror(-rc));
2033 		rc = -1;
2034 	}
2035 end:
2036 	free(probe_ctx);
2037 	return rc;
2038 }
2039 
2040 static void
2041 bdev_nvme_library_fini(void)
2042 {
2043 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
2044 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2045 	struct nvme_bdev_ns *ns;
2046 	uint32_t i;
2047 
2048 	spdk_poller_unregister(&g_hotplug_poller);
2049 	free(g_hotplug_probe_ctx);
2050 
2051 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2052 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2053 		free(entry);
2054 	}
2055 
2056 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2057 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
2058 		if (nvme_bdev_ctrlr->destruct) {
2059 			/* This controller's destruction was already started
2060 			 * before the application started shutting down
2061 			 */
2062 			continue;
2063 		}
2064 
2065 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2066 
2067 		for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
2068 			uint32_t nsid = i + 1;
2069 
2070 			ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
2071 			if (ns->populated) {
2072 				assert(ns->id == nsid);
2073 				nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
2074 			}
2075 		}
2076 
2077 		pthread_mutex_lock(&g_bdev_nvme_mutex);
2078 		nvme_bdev_ctrlr->destruct = true;
2079 
2080 		if (nvme_bdev_ctrlr->ref == 0) {
2081 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
2082 			nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
2083 			pthread_mutex_lock(&g_bdev_nvme_mutex);
2084 		}
2085 	}
2086 
2087 	g_bdev_nvme_module_finish = true;
2088 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2089 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2090 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
2091 		spdk_bdev_module_finish_done();
2092 		return;
2093 	}
2094 
2095 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2096 }
2097 
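/*
 * After the controller reported a protection information error, re-run the PI
 * check in software (spdk_dif_verify for interleaved metadata, spdk_dix_verify
 * for separate metadata) to locate and log the failing block. This only logs;
 * the callers still complete the I/O with the original status.
 */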
2098 static void
2099 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
2100 {
2101 	struct spdk_bdev *bdev = bdev_io->bdev;
2102 	struct spdk_dif_ctx dif_ctx;
2103 	struct spdk_dif_error err_blk = {};
2104 	int rc;
2105 
2106 	rc = spdk_dif_ctx_init(&dif_ctx,
2107 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2108 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2109 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2110 	if (rc != 0) {
2111 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2112 		return;
2113 	}
2114 
2115 	if (bdev->md_interleave) {
2116 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2117 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2118 	} else {
2119 		struct iovec md_iov = {
2120 			.iov_base	= bdev_io->u.bdev.md_buf,
2121 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2122 		};
2123 
2124 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2125 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2126 	}
2127 
2128 	if (rc != 0) {
2129 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2130 			    err_blk.err_type, err_blk.err_offset);
2131 	} else {
2132 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2133 	}
2134 }
2135 
2136 static void
2137 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2138 {
2139 	struct nvme_bdev_io *bio = ref;
2140 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2141 
2142 	if (spdk_nvme_cpl_is_success(cpl)) {
2143 		/* Run PI verification for read data buffer. */
2144 		bdev_nvme_verify_pi_error(bdev_io);
2145 	}
2146 
2147 	/* Return original completion status */
2148 	spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
2149 					  bio->cpl.status.sc);
2150 }
2151 
2152 static void
2153 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2154 {
2155 	struct nvme_bdev_io *bio = ref;
2156 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2157 	int ret;
2158 
2159 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2160 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2161 			    cpl->status.sct, cpl->status.sc);
2162 
2163 		/* Save completion status to use after verifying PI error. */
2164 		bio->cpl = *cpl;
2165 
2166 		/* Read without PI checking to verify PI error. */
2167 		ret = bdev_nvme_no_pi_readv((struct nvme_bdev *)bdev_io->bdev->ctxt,
2168 					    spdk_bdev_io_get_io_channel(bdev_io),
2169 					    bio,
2170 					    bdev_io->u.bdev.iovs,
2171 					    bdev_io->u.bdev.iovcnt,
2172 					    bdev_io->u.bdev.md_buf,
2173 					    bdev_io->u.bdev.num_blocks,
2174 					    bdev_io->u.bdev.offset_blocks);
2175 		if (ret == 0) {
2176 			return;
2177 		}
2178 	}
2179 
2180 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2181 }
2182 
2183 static void
2184 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2185 {
2186 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2187 
2188 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2189 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2190 			    cpl->status.sct, cpl->status.sc);
2191 		/* Run PI verification for write data buffer if PI error is detected. */
2192 		bdev_nvme_verify_pi_error(bdev_io);
2193 	}
2194 
2195 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2196 }
2197 
2198 static void
2199 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2200 {
2201 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2202 
2203 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2204 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2205 			    cpl->status.sct, cpl->status.sc);
2206 		/* Run PI verification for compare data buffer if PI error is detected. */
2207 		bdev_nvme_verify_pi_error(bdev_io);
2208 	}
2209 
2210 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2211 }
2212 
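/*
 * Completion for the fused compare-and-write. The compare completion is only
 * stashed in bio->cpl; the write completion finishes the bdev_io, using the
 * saved compare status if the compare failed, otherwise the write status.
 */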
2213 static void
2214 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2215 {
2216 	struct nvme_bdev_io *bio = ref;
2217 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2218 
2219 	/* Compare operation completion */
2220 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2221 		/* Save compare result for write callback */
2222 		bio->cpl = *cpl;
2223 		return;
2224 	}
2225 
2226 	/* Write operation completion */
2227 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2228 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2229 		 * complete the IO with the compare operation's status.
2230 		 */
2231 		if (!spdk_nvme_cpl_is_error(cpl)) {
2232 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2233 		}
2234 
2235 		spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2236 	} else {
2237 		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2238 	}
2239 }
2240 
2241 static void
2242 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2243 {
2244 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2245 
2246 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2247 }
2248 
2249 static void
2250 bdev_nvme_admin_passthru_completion(void *ctx)
2251 {
2252 	struct nvme_bdev_io *bio = ctx;
2253 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2254 
2255 	spdk_bdev_io_complete_nvme_status(bdev_io,
2256 					  bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2257 }
2258 
2259 static void
2260 bdev_nvme_abort_completion(void *ctx)
2261 {
2262 	struct nvme_bdev_io *bio = ctx;
2263 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2264 
2265 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
2266 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2267 	} else {
2268 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2269 	}
2270 }
2271 
2272 static void
2273 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
2274 {
2275 	struct nvme_bdev_io *bio = ref;
2276 
2277 	bio->cpl = *cpl;
2278 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2279 }
2280 
2281 static void
2282 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2283 {
2284 	struct nvme_bdev_io *bio = ref;
2285 
2286 	bio->cpl = *cpl;
2287 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2288 }
2289 
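/*
 * reset_sgl/next_sge callbacks used by the NVMe driver to walk bio->iovs when
 * building the command's scatter-gather list. bio->iovpos and bio->iov_offset
 * track the current position within the iovec array.
 */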
2290 static void
2291 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2292 {
2293 	struct nvme_bdev_io *bio = ref;
2294 	struct iovec *iov;
2295 
2296 	bio->iov_offset = sgl_offset;
2297 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2298 		iov = &bio->iovs[bio->iovpos];
2299 		if (bio->iov_offset < iov->iov_len) {
2300 			break;
2301 		}
2302 
2303 		bio->iov_offset -= iov->iov_len;
2304 	}
2305 }
2306 
2307 static int
2308 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2309 {
2310 	struct nvme_bdev_io *bio = ref;
2311 	struct iovec *iov;
2312 
2313 	assert(bio->iovpos < bio->iovcnt);
2314 
2315 	iov = &bio->iovs[bio->iovpos];
2316 
2317 	*address = iov->iov_base;
2318 	*length = iov->iov_len;
2319 
2320 	if (bio->iov_offset) {
2321 		assert(bio->iov_offset <= iov->iov_len);
2322 		*address += bio->iov_offset;
2323 		*length -= bio->iov_offset;
2324 	}
2325 
2326 	bio->iov_offset += *length;
2327 	if (bio->iov_offset == iov->iov_len) {
2328 		bio->iovpos++;
2329 		bio->iov_offset = 0;
2330 	}
2331 
2332 	return 0;
2333 }
2334 
2335 static void
2336 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2337 {
2338 	struct nvme_bdev_io *bio = ref;
2339 	struct iovec *iov;
2340 
2341 	bio->fused_iov_offset = sgl_offset;
2342 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2343 		iov = &bio->fused_iovs[bio->fused_iovpos];
2344 		if (bio->fused_iov_offset < iov->iov_len) {
2345 			break;
2346 		}
2347 
2348 		bio->fused_iov_offset -= iov->iov_len;
2349 	}
2350 }
2351 
2352 static int
2353 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2354 {
2355 	struct nvme_bdev_io *bio = ref;
2356 	struct iovec *iov;
2357 
2358 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2359 
2360 	iov = &bio->fused_iovs[bio->fused_iovpos];
2361 
2362 	*address = iov->iov_base;
2363 	*length = iov->iov_len;
2364 
2365 	if (bio->fused_iov_offset) {
2366 		assert(bio->fused_iov_offset <= iov->iov_len);
2367 		*address += bio->fused_iov_offset;
2368 		*length -= bio->fused_iov_offset;
2369 	}
2370 
2371 	bio->fused_iov_offset += *length;
2372 	if (bio->fused_iov_offset == iov->iov_len) {
2373 		bio->fused_iovpos++;
2374 		bio->fused_iov_offset = 0;
2375 	}
2376 
2377 	return 0;
2378 }
2379 
2380 static int
2381 bdev_nvme_no_pi_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2382 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2383 		      void *md, uint64_t lba_count, uint64_t lba)
2384 {
2385 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2386 	int rc;
2387 
2388 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
2389 		      lba_count, lba);
2390 
2391 	bio->iovs = iov;
2392 	bio->iovcnt = iovcnt;
2393 	bio->iovpos = 0;
2394 	bio->iov_offset = 0;
2395 
2396 	rc = spdk_nvme_ns_cmd_readv_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2397 					    bdev_nvme_no_pi_readv_done, bio, 0,
2398 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2399 					    md, 0, 0);
2400 
2401 	if (rc != 0 && rc != -ENOMEM) {
2402 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2403 	}
2404 	return rc;
2405 }
2406 
2407 static int
2408 bdev_nvme_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2409 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2410 		void *md, uint64_t lba_count, uint64_t lba)
2411 {
2412 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2413 	int rc;
2414 
2415 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2416 		      lba_count, lba);
2417 
2418 	bio->iovs = iov;
2419 	bio->iovcnt = iovcnt;
2420 	bio->iovpos = 0;
2421 	bio->iov_offset = 0;
2422 
2423 	rc = spdk_nvme_ns_cmd_readv_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2424 					    bdev_nvme_readv_done, bio, nbdev->disk.dif_check_flags,
2425 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2426 					    md, 0, 0);
2427 
2428 	if (rc != 0 && rc != -ENOMEM) {
2429 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2430 	}
2431 	return rc;
2432 }
2433 
2434 static int
2435 bdev_nvme_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2436 		 struct nvme_bdev_io *bio,
2437 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba)
2438 {
2439 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2440 	int rc;
2441 
2442 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2443 		      lba_count, lba);
2444 
2445 	bio->iovs = iov;
2446 	bio->iovcnt = iovcnt;
2447 	bio->iovpos = 0;
2448 	bio->iov_offset = 0;
2449 
2450 	rc = spdk_nvme_ns_cmd_writev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2451 					     bdev_nvme_writev_done, bio, nbdev->disk.dif_check_flags,
2452 					     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2453 					     md, 0, 0);
2454 
2455 	if (rc != 0 && rc != -ENOMEM) {
2456 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2457 	}
2458 	return rc;
2459 }
2460 
2461 static int
2462 bdev_nvme_comparev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2463 		   struct nvme_bdev_io *bio,
2464 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba)
2465 {
2466 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2467 	int rc;
2468 
2469 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2470 		      lba_count, lba);
2471 
2472 	bio->iovs = iov;
2473 	bio->iovcnt = iovcnt;
2474 	bio->iovpos = 0;
2475 	bio->iov_offset = 0;
2476 
2477 	rc = spdk_nvme_ns_cmd_comparev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2478 					       bdev_nvme_comparev_done, bio, nbdev->disk.dif_check_flags,
2479 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2480 					       md, 0, 0);
2481 
2482 	if (rc != 0 && rc != -ENOMEM) {
2483 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
2484 	}
2485 	return rc;
2486 }
2487 
2488 static int
2489 bdev_nvme_comparev_and_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2490 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
2491 			      int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba)
2492 {
2493 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2494 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2495 	uint32_t flags = nbdev->disk.dif_check_flags;
2496 	int rc;
2497 
2498 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2499 		      lba_count, lba);
2500 
2501 	bio->iovs = cmp_iov;
2502 	bio->iovcnt = cmp_iovcnt;
2503 	bio->iovpos = 0;
2504 	bio->iov_offset = 0;
2505 	bio->fused_iovs = write_iov;
2506 	bio->fused_iovcnt = write_iovcnt;
2507 	bio->fused_iovpos = 0;
2508 	bio->fused_iov_offset = 0;
2509 
2510 	if (bdev_io->num_retries == 0) {
2511 		bio->first_fused_submitted = false;
2512 	}
2513 
2514 	if (!bio->first_fused_submitted) {
2515 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2516 		memset(&bio->cpl, 0, sizeof(bio->cpl));
2517 
2518 		rc = spdk_nvme_ns_cmd_comparev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2519 						       bdev_nvme_comparev_and_writev_done, bio, flags,
2520 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
2521 		if (rc == 0) {
2522 			bio->first_fused_submitted = true;
2523 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2524 		} else {
2525 			if (rc != -ENOMEM) {
2526 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
2527 			}
2528 			return rc;
2529 		}
2530 	}
2531 
2532 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
2533 
2534 	rc = spdk_nvme_ns_cmd_writev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2535 					     bdev_nvme_comparev_and_writev_done, bio, flags,
2536 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
2537 	if (rc != 0 && rc != -ENOMEM) {
2538 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
2539 		rc = 0;
2540 	}
2541 
2542 	return rc;
2543 }
2544 
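/*
 * Translate an unmap request into a Dataset Management deallocate command.
 * The request is split into ranges of at most
 * SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks and rejected with
 * -EINVAL if it would need more than SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES ranges.
 */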
2545 static int
2546 bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2547 		struct nvme_bdev_io *bio,
2548 		uint64_t offset_blocks,
2549 		uint64_t num_blocks)
2550 {
2551 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2552 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
2553 	struct spdk_nvme_dsm_range *range;
2554 	uint64_t offset, remaining;
2555 	uint64_t num_ranges_u64;
2556 	uint16_t num_ranges;
2557 	int rc;
2558 
2559 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
2560 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2561 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
2562 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
2563 		return -EINVAL;
2564 	}
2565 	num_ranges = (uint16_t)num_ranges_u64;
2566 
2567 	offset = offset_blocks;
2568 	remaining = num_blocks;
2569 	range = &dsm_ranges[0];
2570 
2571 	/* Fill max-size ranges until the remaining blocks fit into one range */
2572 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
2573 		range->attributes.raw = 0;
2574 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2575 		range->starting_lba = offset;
2576 
2577 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2578 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2579 		range++;
2580 	}
2581 
2582 	/* Final range describes the remaining blocks */
2583 	range->attributes.raw = 0;
2584 	range->length = remaining;
2585 	range->starting_lba = offset;
2586 
2587 	rc = spdk_nvme_ns_cmd_dataset_management(nbdev->nvme_ns->ns, nvme_ch->qpair,
2588 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
2589 			dsm_ranges, num_ranges,
2590 			bdev_nvme_queued_done, bio);
2591 
2592 	return rc;
2593 }
2594 
2595 static int
2596 bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2597 			 struct nvme_bdev_io *bio,
2598 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2599 {
2600 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
2601 
2602 	if (nbytes > max_xfer_size) {
2603 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2604 		return -EINVAL;
2605 	}
2606 
2607 	bio->orig_thread = spdk_io_channel_get_thread(ch);
2608 
2609 	return spdk_nvme_ctrlr_cmd_admin_raw(nbdev->nvme_bdev_ctrlr->ctrlr, cmd, buf,
2610 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
2611 }
2612 
2613 static int
2614 bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2615 		      struct nvme_bdev_io *bio,
2616 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2617 {
2618 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2619 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
2620 
2621 	if (nbytes > max_xfer_size) {
2622 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2623 		return -EINVAL;
2624 	}
2625 
2626 	/*
2627 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2628 	 * so fill it out automatically.
2629 	 */
2630 	cmd->nsid = spdk_nvme_ns_get_id(nbdev->nvme_ns->ns);
2631 
2632 	return spdk_nvme_ctrlr_cmd_io_raw(nbdev->nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
2633 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
2634 }
2635 
2636 static int
2637 bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2638 			 struct nvme_bdev_io *bio,
2639 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
2640 {
2641 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2642 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(nbdev->nvme_ns->ns);
2643 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
2644 
2645 	if (nbytes > max_xfer_size) {
2646 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2647 		return -EINVAL;
2648 	}
2649 
2650 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(nbdev->nvme_ns->ns)) {
2651 		SPDK_ERRLOG("invalid metadata buffer size\n");
2652 		return -EINVAL;
2653 	}
2654 
2655 	/*
2656 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2657 	 * so fill it out automatically.
2658 	 */
2659 	cmd->nsid = spdk_nvme_ns_get_id(nbdev->nvme_ns->ns);
2660 
2661 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(nbdev->nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
2662 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
2663 }
2664 
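/*
 * Runs on the controller's thread: try to abort the target command on the
 * admin qpair. If it is not found there either, the abort is completed as
 * failed back on the originating thread.
 */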
2665 static void
2666 bdev_nvme_abort_admin_cmd(void *ctx)
2667 {
2668 	struct nvme_bdev_io *bio = ctx;
2669 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2670 	struct nvme_bdev *nbdev;
2671 	struct nvme_bdev_io *bio_to_abort;
2672 	int rc;
2673 
2674 	nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2675 	bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
2676 
2677 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nbdev->nvme_bdev_ctrlr->ctrlr,
2678 					   NULL,
2679 					   bio_to_abort,
2680 					   bdev_nvme_abort_done, bio);
2681 	if (rc == -ENOENT) {
2682 		/* If no matching command was found on the admin qpair, fail the abort:
2683 		 * set bit 0 of cdw0 ("command not aborted") so the completion handler fails the I/O.
2684 		 */
2685 		bio->cpl.cdw0 |= 1U;
2686 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
2687 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2688 
2689 		spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2690 	}
2691 }
2692 
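/*
 * Abort the bdev I/O identified by bio_to_abort. The abort is first attempted
 * on this channel's I/O qpair; if the command is not found there (-ENOENT),
 * the target may be an admin command and the abort is retried on the admin
 * queue via the controller thread.
 */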
2693 static int
2694 bdev_nvme_abort(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2695 		struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort)
2696 {
2697 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2698 	int rc;
2699 
2700 	bio->orig_thread = spdk_io_channel_get_thread(ch);
2701 
2702 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nbdev->nvme_bdev_ctrlr->ctrlr,
2703 					   nvme_ch->qpair,
2704 					   bio_to_abort,
2705 					   bdev_nvme_abort_done, bio);
2706 	if (rc == -ENOENT) {
2707 		/* If no matching command was found on the I/O qpair, the target may be
2708 		 * an admin command. Forward the abort to the controller thread so that
2709 		 * only a single thread attempts to abort admin commands.
2710 		 */
2711 		spdk_thread_send_msg(nbdev->nvme_bdev_ctrlr->thread,
2712 				     bdev_nvme_abort_admin_cmd, bio);
2713 		rc = 0;
2714 	}
2715 
2716 	return rc;
2717 }
2718 
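/*
 * Write the legacy INI-style [Nvme] section reflecting the current runtime
 * configuration (attached controllers and module options).
 */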
2719 static void
2720 bdev_nvme_get_spdk_running_config(FILE *fp)
2721 {
2722 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
2723 
2724 	fprintf(fp, "\n[Nvme]");
2725 	fprintf(fp, "\n"
2726 		"# NVMe Device Whitelist\n"
2727 		"# Users may specify which NVMe devices to claim by their transport id.\n"
2728 		"# See spdk_nvme_transport_id_parse() in spdk/nvme.h for the correct format.\n"
2729 		"# The second argument is the assigned name, which can be referenced from\n"
2730 		"# other sections in the configuration file. For NVMe devices, a namespace\n"
2731 		"# is automatically appended to each name in the format <YourName>nY, where\n"
2732 		"# Y is the NSID (starts at 1).\n");
2733 
2734 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2735 		const char *trtype;
2736 		const char *prchk_flags;
2737 
2738 		trtype = spdk_nvme_transport_id_trtype_str(nvme_bdev_ctrlr->trid->trtype);
2739 		if (!trtype) {
2740 			continue;
2741 		}
2742 
2743 		if (nvme_bdev_ctrlr->trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2744 			fprintf(fp, "TransportID \"trtype:%s traddr:%s\" %s\n",
2745 				trtype,
2746 				nvme_bdev_ctrlr->trid->traddr, nvme_bdev_ctrlr->name);
2747 		} else {
2748 			const char *adrfam;
2749 
2750 			adrfam = spdk_nvme_transport_id_adrfam_str(nvme_bdev_ctrlr->trid->adrfam);
2751 			prchk_flags = spdk_nvme_prchk_flags_str(nvme_bdev_ctrlr->prchk_flags);
2752 
2753 			if (adrfam) {
2754 				fprintf(fp, "TransportID \"trtype:%s adrfam:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
2755 					trtype,	adrfam,
2756 					nvme_bdev_ctrlr->trid->traddr, nvme_bdev_ctrlr->trid->trsvcid,
2757 					nvme_bdev_ctrlr->trid->subnqn, nvme_bdev_ctrlr->name);
2758 			} else {
2759 				fprintf(fp, "TransportID \"trtype:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
2760 					trtype,
2761 					nvme_bdev_ctrlr->trid->traddr, nvme_bdev_ctrlr->trid->trsvcid,
2762 					nvme_bdev_ctrlr->trid->subnqn, nvme_bdev_ctrlr->name);
2763 			}
2764 
2765 			if (prchk_flags) {
2766 				fprintf(fp, " \"%s\"\n", prchk_flags);
2767 			} else {
2768 				fprintf(fp, "\n");
2769 			}
2770 		}
2771 	}
2772 
2773 	fprintf(fp, "\n"
2774 		"# The number of attempts per I/O when an I/O fails. Do not include\n"
2775 		"# this key to get the default behavior.\n");
2776 	fprintf(fp, "RetryCount %d\n", g_opts.retry_count);
2777 	fprintf(fp, "\n"
2778 		"# Timeout for each command, in microseconds. If 0, don't track timeouts.\n");
2779 	fprintf(fp, "TimeoutUsec %"PRIu64"\n", g_opts.timeout_us);
2780 
2781 	fprintf(fp, "\n"
2782 		"# Action to take on command time out. Only valid when Timeout is greater\n"
2783 		"# than 0. This may be 'Reset' to reset the controller, 'Abort' to abort\n"
2784 		"# the command, or 'None' to just print a message but do nothing.\n"
2785 		"# Admin command timeouts will always result in a reset.\n");
2786 	switch (g_opts.action_on_timeout) {
2787 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
2788 		fprintf(fp, "ActionOnTimeout None\n");
2789 		break;
2790 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
2791 		fprintf(fp, "ActionOnTimeout Reset\n");
2792 		break;
2793 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
2794 		fprintf(fp, "ActionOnTimeout Abort\n");
2795 		break;
2796 	}
2797 
2798 	fprintf(fp, "\n"
2799 		"# Set how often the admin queue is polled for asynchronous events.\n"
2800 		"# Units in microseconds.\n");
2801 	fprintf(fp, "AdminPollRate %"PRIu64"\n", g_opts.nvme_adminq_poll_period_us);
2802 	fprintf(fp, "IOPollRate %" PRIu64"\n", g_opts.nvme_ioq_poll_period_us);
2803 	fprintf(fp, "\n"
2804 		"# Disable handling of hotplug (runtime insert and remove) events,\n"
2805 		"# users can set it to Yes if they want to enable it.\n"
2806 		"# Default: No\n");
2807 	fprintf(fp, "HotplugEnable %s\n", g_nvme_hotplug_enabled ? "Yes" : "No");
2808 	fprintf(fp, "\n"
2809 		"# Set how often hotplug insert and remove events are processed.\n"
2810 		"# Units in microseconds.\n");
2811 	fprintf(fp, "HotplugPollRate %"PRIu64"\n", g_nvme_hotplug_poll_period_us);
2812 	if (g_nvme_hostnqn) {
2813 		fprintf(fp, "HostNQN %s\n",  g_nvme_hostnqn);
2814 	}
2815 	fprintf(fp, "DelayCmdSubmit %s\n", g_opts.delay_cmd_submit ? "True" : "False");
2816 
2817 	fprintf(fp, "\n");
2818 }
2819 
2820 static void
2821 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
2822 {
2823 	/* nop */
2824 }
2825 
2826 static void
2827 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
2828 {
2829 	g_config_json_namespace_fn[ns->type](w, ns);
2830 }
2831 
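/*
 * Emit the JSON-RPC calls needed to recreate the current configuration:
 * bdev_nvme_set_options, one bdev_nvme_attach_controller per controller plus
 * any per-namespace configuration, and bdev_nvme_set_hotplug last.
 */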
2832 static int
2833 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
2834 {
2835 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
2836 	struct spdk_nvme_transport_id	*trid;
2837 	const char			*action;
2838 	uint32_t			nsid;
2839 
2840 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
2841 		action = "reset";
2842 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
2843 		action = "abort";
2844 	} else {
2845 		action = "none";
2846 	}
2847 
2848 	spdk_json_write_object_begin(w);
2849 
2850 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
2851 
2852 	spdk_json_write_named_object_begin(w, "params");
2853 	spdk_json_write_named_string(w, "action_on_timeout", action);
2854 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
2855 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
2856 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
2857 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
2858 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
2859 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
2860 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
2861 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
2862 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
2863 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
2864 	spdk_json_write_object_end(w);
2865 
2866 	spdk_json_write_object_end(w);
2867 
2868 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2869 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2870 		trid = nvme_bdev_ctrlr->trid;
2871 
2872 		spdk_json_write_object_begin(w);
2873 
2874 		spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
2875 
2876 		spdk_json_write_named_object_begin(w, "params");
2877 		spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
2878 		nvme_bdev_dump_trid_json(trid, w);
2879 		spdk_json_write_named_bool(w, "prchk_reftag",
2880 					   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
2881 		spdk_json_write_named_bool(w, "prchk_guard",
2882 					   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
2883 
2884 		spdk_json_write_object_end(w);
2885 
2886 		spdk_json_write_object_end(w);
2887 
2888 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
2889 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
2890 				continue;
2891 			}
2892 
2893 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
2894 		}
2895 	}
2896 
2897 	/* Dump this RPC last so that all NVMe bdevs have a chance to be constructed
2898 	 * before the hotplug poller is enabled.
2899 	 */
2900 	spdk_json_write_object_begin(w);
2901 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
2902 
2903 	spdk_json_write_named_object_begin(w, "params");
2904 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
2905 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
2906 	spdk_json_write_object_end(w);
2907 
2908 	spdk_json_write_object_end(w);
2909 
2910 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2911 	return 0;
2912 }
2913 
2914 struct spdk_nvme_ctrlr *
2915 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
2916 {
2917 	if (!bdev || bdev->module != &nvme_if) {
2918 		return NULL;
2919 	}
2920 
2921 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_bdev_ctrlr->ctrlr;
2922 }
2923 
2924 SPDK_LOG_REGISTER_COMPONENT("bdev_nvme", SPDK_LOG_BDEV_NVME)
2925