xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 1fa071d332db21bf893d581a8e93b425ba788a24)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/config.h"
40 #include "spdk/conf.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/nvme.h"
45 #include "spdk/nvme_ocssd.h"
46 #include "spdk/thread.h"
47 #include "spdk/string.h"
48 #include "spdk/likely.h"
49 #include "spdk/util.h"
50 
51 #include "spdk/bdev_module.h"
52 #include "spdk_internal/log.h"
53 
54 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
55 
56 static void bdev_nvme_get_spdk_running_config(FILE *fp);
57 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
58 
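/*
 * Per-I/O context kept in the driver_ctx area of each spdk_bdev_io handled by
 * this module (sized via bdev_nvme_get_ctx_size()).
 */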
59 struct nvme_bdev_io {
60 	/** array of iovecs to transfer. */
61 	struct iovec *iovs;
62 
63 	/** Number of iovecs in iovs array. */
64 	int iovcnt;
65 
66 	/** Current iovec position. */
67 	int iovpos;
68 
69 	/** Offset in current iovec. */
70 	uint32_t iov_offset;
71 
72 	/** Array of iovecs for the second command of a fused operation (i.e. the write of a compare-and-write). */
73 	struct iovec *fused_iovs;
74 
75 	/** Number of iovecs in fused_iovs array. */
76 	int fused_iovcnt;
77 
78 	/** Current iovec position. */
79 	int fused_iovpos;
80 
81 	/** Offset in current iovec. */
82 	uint32_t fused_iov_offset;
83 
84 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
85 	struct spdk_nvme_cpl cpl;
86 
87 	/** Originating thread */
88 	struct spdk_thread *orig_thread;
89 
90 	/** Tracks whether the first of the fused commands has been submitted. */
91 	bool first_fused_submitted;
92 };
93 
94 struct nvme_probe_ctx {
95 	size_t count;
96 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
97 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
98 	const char *names[NVME_MAX_CONTROLLERS];
99 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
100 	const char *hostnqn;
101 };
102 
103 struct nvme_probe_skip_entry {
104 	struct spdk_nvme_transport_id		trid;
105 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
106 };
107 /* All controllers deleted by users via RPC are skipped by the hotplug monitor */
108 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
109 			g_skipped_nvme_ctrlrs);
110 
111 static struct spdk_bdev_nvme_opts g_opts = {
112 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
113 	.timeout_us = 0,
114 	.retry_count = 4,
115 	.arbitration_burst = 0,
116 	.low_priority_weight = 0,
117 	.medium_priority_weight = 0,
118 	.high_priority_weight = 0,
119 	.nvme_adminq_poll_period_us = 10000ULL,
120 	.nvme_ioq_poll_period_us = 0,
121 	.io_queue_requests = 0,
122 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
123 };
124 
125 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
126 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
127 
128 static int g_hot_insert_nvme_controller_index = 0;
129 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
130 static bool g_nvme_hotplug_enabled = false;
131 static struct spdk_thread *g_bdev_nvme_init_thread;
132 static struct spdk_poller *g_hotplug_poller;
133 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
134 static char *g_nvme_hostnqn = NULL;
135 
136 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
137 		struct nvme_async_probe_ctx *ctx);
138 static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx);
139 static int bdev_nvme_library_init(void);
140 static void bdev_nvme_library_fini(void);
141 static int bdev_nvme_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
142 			   struct nvme_bdev_io *bio,
143 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
144 static int bdev_nvme_no_pi_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
145 				 struct nvme_bdev_io *bio,
146 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
147 static int bdev_nvme_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
148 			    struct nvme_bdev_io *bio,
149 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
150 static int bdev_nvme_comparev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
151 			      struct nvme_bdev_io *bio,
152 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
153 static int bdev_nvme_comparev_and_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
154 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
155 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba);
156 static int bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
157 				    struct nvme_bdev_io *bio,
158 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
159 static int bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
160 				 struct nvme_bdev_io *bio,
161 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
162 static int bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
163 				    struct nvme_bdev_io *bio,
164 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
165 static int bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio);
166 
167 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
168 				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
169 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
170 		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
171 
172 static populate_namespace_fn g_populate_namespace_fn[] = {
173 	NULL,
174 	nvme_ctrlr_populate_standard_namespace,
175 	bdev_ocssd_populate_namespace,
176 };
177 
178 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *ns);
179 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns);
180 
181 static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
182 	NULL,
183 	nvme_ctrlr_depopulate_standard_namespace,
184 	bdev_ocssd_depopulate_namespace,
185 };
186 
187 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns);
188 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
189 		struct nvme_bdev_ns *ns);
190 
191 static config_json_namespace_fn g_config_json_namespace_fn[] = {
192 	NULL,
193 	nvme_ctrlr_config_json_standard_namespace,
194 	bdev_ocssd_namespace_config_json,
195 };
196 
197 struct spdk_nvme_qpair *
198 spdk_bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
199 {
200 	struct nvme_io_channel *nvme_ch;
201 
202 	nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
203 
204 	return nvme_ch->qpair;
205 }
206 
207 static int
208 bdev_nvme_get_ctx_size(void)
209 {
210 	return sizeof(struct nvme_bdev_io);
211 }
212 
213 static struct spdk_bdev_module nvme_if = {
214 	.name = "nvme",
215 	.async_fini = true,
216 	.module_init = bdev_nvme_library_init,
217 	.module_fini = bdev_nvme_library_fini,
218 	.config_text = bdev_nvme_get_spdk_running_config,
219 	.config_json = bdev_nvme_config_json,
220 	.get_ctx_size = bdev_nvme_get_ctx_size,
221 
222 };
223 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
224 
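/*
 * I/O queue pair poller: drains completions on this channel's qpair and, when
 * spin-stat collection is enabled (VTune builds), tracks time spent polling
 * without receiving completions.
 */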
225 static int
226 bdev_nvme_poll(void *arg)
227 {
228 	struct nvme_io_channel *ch = arg;
229 	int32_t num_completions;
230 
231 	if (ch->qpair == NULL) {
232 		return -1;
233 	}
234 
235 	if (ch->collect_spin_stat && ch->start_ticks == 0) {
236 		ch->start_ticks = spdk_get_ticks();
237 	}
238 
239 	num_completions = spdk_nvme_qpair_process_completions(ch->qpair, 0);
240 
241 	if (ch->collect_spin_stat) {
242 		if (num_completions > 0) {
243 			if (ch->end_ticks != 0) {
244 				ch->spin_ticks += (ch->end_ticks - ch->start_ticks);
245 				ch->end_ticks = 0;
246 			}
247 			ch->start_ticks = 0;
248 		} else {
249 			ch->end_ticks = spdk_get_ticks();
250 		}
251 	}
252 
253 	return num_completions;
254 }
255 
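/*
 * Admin queue poller: processes admin completions for the controller and
 * triggers a controller reset if completion processing reports an error.
 */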
256 static int
257 bdev_nvme_poll_adminq(void *arg)
258 {
259 	int32_t rc;
260 	struct spdk_nvme_ctrlr *ctrlr = arg;
261 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
262 
263 	rc = spdk_nvme_ctrlr_process_admin_completions(ctrlr);
264 
265 	if (rc < 0) {
266 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
267 		assert(nvme_bdev_ctrlr != NULL);
268 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
269 	}
270 
271 	return rc;
272 }
273 
274 static int
275 bdev_nvme_destruct(void *ctx)
276 {
277 	struct nvme_bdev *nvme_disk = ctx;
278 
279 	nvme_bdev_detach_bdev_from_ns(nvme_disk);
280 
281 	free(nvme_disk->disk.name);
282 	free(nvme_disk);
283 
284 	return 0;
285 }
286 
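/*
 * Flush requests are completed immediately without issuing an NVMe flush
 * command to the device.
 */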
287 static int
288 bdev_nvme_flush(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio,
289 		uint64_t offset, uint64_t nbytes)
290 {
291 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
292 
293 	return 0;
294 }
295 
296 static void
297 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
298 {
299 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
300 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
301 	struct spdk_bdev_io *bdev_io;
302 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
303 
304 	/* A NULL ctx means success. */
305 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
306 		status = SPDK_BDEV_IO_STATUS_FAILED;
307 	}
308 
309 	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
310 		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
311 		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
312 		spdk_bdev_io_complete(bdev_io, status);
313 	}
314 
315 	spdk_for_each_channel_continue(i, 0);
316 }
317 
318 static void
319 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
320 {
321 	/* We are using the for_each_channel cb_arg like a return code here:
322 	 * if it is zero the reset succeeded, otherwise it failed. */
323 	void *cb_arg = NULL;
324 
325 	if (rc) {
326 		cb_arg = (void *)0x1;
327 		SPDK_ERRLOG("Resetting controller failed.\n");
328 	} else {
329 		SPDK_NOTICELOG("Controller reset was successful.\n");
330 	}
331 
332 	pthread_mutex_lock(&g_bdev_nvme_mutex);
333 	nvme_bdev_ctrlr->resetting = false;
334 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
335 	/* Make sure we clear any pending resets before returning. */
336 	spdk_for_each_channel(nvme_bdev_ctrlr,
337 			      _bdev_nvme_complete_pending_resets,
338 			      cb_arg, NULL);
339 }
340 
341 static void
342 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
343 {
344 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
345 	void *ctx = spdk_io_channel_iter_get_ctx(i);
346 	int rc = SPDK_BDEV_IO_STATUS_SUCCESS;
347 
348 	if (status) {
349 		rc = SPDK_BDEV_IO_STATUS_FAILED;
350 	}
351 	if (ctx) {
352 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(ctx), rc);
353 	}
354 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
355 }
356 
357 static void
358 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
359 {
360 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
361 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
362 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
363 	struct spdk_nvme_io_qpair_opts opts;
364 
365 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
366 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
367 
368 	nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
369 	if (!nvme_ch->qpair) {
370 		spdk_for_each_channel_continue(i, -1);
371 		return;
372 	}
373 
374 	spdk_for_each_channel_continue(i, 0);
375 }
376 
377 static void
378 _bdev_nvme_reset(struct spdk_io_channel_iter *i, int status)
379 {
380 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
381 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
382 	int rc;
383 
384 	if (status) {
385 		if (bio) {
386 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
387 		}
388 		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
389 		return;
390 	}
391 
392 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
393 	if (rc != 0) {
394 		if (bio) {
395 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
396 		}
397 		_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
398 		return;
399 	}
400 
401 	/* Recreate all of the I/O queue pairs */
402 	spdk_for_each_channel(nvme_bdev_ctrlr,
403 			      _bdev_nvme_reset_create_qpair,
404 			      bio,
405 			      _bdev_nvme_reset_create_qpairs_done);
406 
407 
408 }
409 
410 static void
411 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
412 {
413 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
414 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
415 	int rc;
416 
417 	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
418 	if (!rc) {
419 		nvme_ch->qpair = NULL;
420 	}
421 
422 	spdk_for_each_channel_continue(i, rc);
423 }
424 
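/*
 * Start a controller reset: destroy every I/O qpair on each channel, reset the
 * controller, recreate the qpairs, then complete any resets that were queued
 * while this one was running. If a reset is already in progress, a
 * bdev-initiated reset (bio != NULL) is queued on the channel's pending_resets
 * list instead of starting a new one.
 */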
425 static int
426 bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio)
427 {
428 	struct spdk_io_channel *ch;
429 	struct nvme_io_channel *nvme_ch;
430 
431 	pthread_mutex_lock(&g_bdev_nvme_mutex);
432 	if (nvme_bdev_ctrlr->destruct) {
433 		/* Don't bother resetting if the controller is in the process of being destructed. */
434 		if (bio) {
435 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
436 		}
437 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
438 		return 0;
439 	}
440 
441 	if (!nvme_bdev_ctrlr->resetting) {
442 		nvme_bdev_ctrlr->resetting = true;
443 	} else {
444 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
445 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
446 		/*
447 		 * Internal reset calls are not queued. This is deliberate, so that we don't
448 		 * interfere with the app framework's reset strategy, i.e. we defer to the
449 		 * upper level. If it is in the middle of a reset, we won't try to schedule another one.
450 		 */
451 		if (bio) {
452 			ch = spdk_get_io_channel(nvme_bdev_ctrlr);
453 			assert(ch != NULL);
454 			nvme_ch = spdk_io_channel_get_ctx(ch);
455 			TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, spdk_bdev_io_from_ctx(bio), module_link);
456 			spdk_put_io_channel(ch);
457 		}
458 		return 0;
459 	}
460 
461 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
462 	/* First, delete all NVMe I/O queue pairs. */
463 	spdk_for_each_channel(nvme_bdev_ctrlr,
464 			      _bdev_nvme_reset_destroy_qpair,
465 			      bio,
466 			      _bdev_nvme_reset);
467 
468 	return 0;
469 }
470 
471 static int
472 bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
473 		struct nvme_bdev_io *bio,
474 		uint64_t offset_blocks,
475 		uint64_t num_blocks);
476 
477 static void
478 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
479 		     bool success)
480 {
481 	int ret;
482 
483 	if (!success) {
484 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
485 		return;
486 	}
487 
488 	ret = bdev_nvme_readv((struct nvme_bdev *)bdev_io->bdev->ctxt,
489 			      ch,
490 			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
491 			      bdev_io->u.bdev.iovs,
492 			      bdev_io->u.bdev.iovcnt,
493 			      bdev_io->u.bdev.md_buf,
494 			      bdev_io->u.bdev.num_blocks,
495 			      bdev_io->u.bdev.offset_blocks);
496 
497 	if (spdk_likely(ret == 0)) {
498 		return;
499 	} else if (ret == -ENOMEM) {
500 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
501 	} else {
502 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
503 	}
504 }
505 
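/*
 * Translate a generic bdev I/O into the corresponding NVMe command submission.
 * Returns 0 on successful submission, -ENOMEM if the request should be retried
 * later, or another negative value on failure; the wrapper below maps the
 * return code to a bdev I/O completion status.
 */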
506 static int
507 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
508 {
509 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
510 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
511 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
512 
513 	if (nvme_ch->qpair == NULL) {
514 		/* The device is currently resetting */
515 		return -1;
516 	}
517 
518 	switch (bdev_io->type) {
519 	case SPDK_BDEV_IO_TYPE_READ:
520 		spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
521 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
522 		return 0;
523 
524 	case SPDK_BDEV_IO_TYPE_WRITE:
525 		return bdev_nvme_writev(nbdev,
526 					ch,
527 					nbdev_io,
528 					bdev_io->u.bdev.iovs,
529 					bdev_io->u.bdev.iovcnt,
530 					bdev_io->u.bdev.md_buf,
531 					bdev_io->u.bdev.num_blocks,
532 					bdev_io->u.bdev.offset_blocks);
533 
534 	case SPDK_BDEV_IO_TYPE_COMPARE:
535 		return bdev_nvme_comparev(nbdev,
536 					  ch,
537 					  nbdev_io,
538 					  bdev_io->u.bdev.iovs,
539 					  bdev_io->u.bdev.iovcnt,
540 					  bdev_io->u.bdev.md_buf,
541 					  bdev_io->u.bdev.num_blocks,
542 					  bdev_io->u.bdev.offset_blocks);
543 
544 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
545 		return bdev_nvme_comparev_and_writev(nbdev,
546 						     ch,
547 						     nbdev_io,
548 						     bdev_io->u.bdev.iovs,
549 						     bdev_io->u.bdev.iovcnt,
550 						     bdev_io->u.bdev.fused_iovs,
551 						     bdev_io->u.bdev.fused_iovcnt,
552 						     bdev_io->u.bdev.md_buf,
553 						     bdev_io->u.bdev.num_blocks,
554 						     bdev_io->u.bdev.offset_blocks);
555 
556 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
557 		return bdev_nvme_unmap(nbdev,
558 				       ch,
559 				       nbdev_io,
560 				       bdev_io->u.bdev.offset_blocks,
561 				       bdev_io->u.bdev.num_blocks);
562 
563 	case SPDK_BDEV_IO_TYPE_UNMAP:
564 		return bdev_nvme_unmap(nbdev,
565 				       ch,
566 				       nbdev_io,
567 				       bdev_io->u.bdev.offset_blocks,
568 				       bdev_io->u.bdev.num_blocks);
569 
570 	case SPDK_BDEV_IO_TYPE_RESET:
571 		return bdev_nvme_reset(nbdev->nvme_bdev_ctrlr, nbdev_io);
572 
573 	case SPDK_BDEV_IO_TYPE_FLUSH:
574 		return bdev_nvme_flush(nbdev,
575 				       nbdev_io,
576 				       bdev_io->u.bdev.offset_blocks,
577 				       bdev_io->u.bdev.num_blocks);
578 
579 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
580 		return bdev_nvme_admin_passthru(nbdev,
581 						ch,
582 						nbdev_io,
583 						&bdev_io->u.nvme_passthru.cmd,
584 						bdev_io->u.nvme_passthru.buf,
585 						bdev_io->u.nvme_passthru.nbytes);
586 
587 	case SPDK_BDEV_IO_TYPE_NVME_IO:
588 		return bdev_nvme_io_passthru(nbdev,
589 					     ch,
590 					     nbdev_io,
591 					     &bdev_io->u.nvme_passthru.cmd,
592 					     bdev_io->u.nvme_passthru.buf,
593 					     bdev_io->u.nvme_passthru.nbytes);
594 
595 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
596 		return bdev_nvme_io_passthru_md(nbdev,
597 						ch,
598 						nbdev_io,
599 						&bdev_io->u.nvme_passthru.cmd,
600 						bdev_io->u.nvme_passthru.buf,
601 						bdev_io->u.nvme_passthru.nbytes,
602 						bdev_io->u.nvme_passthru.md_buf,
603 						bdev_io->u.nvme_passthru.md_len);
604 
605 	default:
606 		return -EINVAL;
607 	}
608 	return 0;
609 }
610 
611 static void
612 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
613 {
614 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
615 
616 	if (spdk_unlikely(rc != 0)) {
617 		if (rc == -ENOMEM) {
618 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
619 		} else {
620 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
621 		}
622 	}
623 }
624 
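/*
 * Report which bdev I/O types this namespace supports, based on namespace and
 * controller capabilities (e.g. DSM for UNMAP, compare and fused command support).
 */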
625 static bool
626 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
627 {
628 	struct nvme_bdev *nbdev = ctx;
629 	const struct spdk_nvme_ctrlr_data *cdata;
630 
631 	switch (io_type) {
632 	case SPDK_BDEV_IO_TYPE_READ:
633 	case SPDK_BDEV_IO_TYPE_WRITE:
634 	case SPDK_BDEV_IO_TYPE_RESET:
635 	case SPDK_BDEV_IO_TYPE_FLUSH:
636 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
637 	case SPDK_BDEV_IO_TYPE_NVME_IO:
638 		return true;
639 
640 	case SPDK_BDEV_IO_TYPE_COMPARE:
641 		return spdk_nvme_ns_supports_compare(nbdev->nvme_ns->ns);
642 
643 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
644 		return spdk_nvme_ns_get_md_size(nbdev->nvme_ns->ns) ? true : false;
645 
646 	case SPDK_BDEV_IO_TYPE_UNMAP:
647 		cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_bdev_ctrlr->ctrlr);
648 		return cdata->oncs.dsm;
649 
650 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
651 		cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_bdev_ctrlr->ctrlr);
652 		/*
653 		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
654 		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
655 		 */
656 		if (cdata->oncs.dsm &&
657 		    spdk_nvme_ns_get_dealloc_logical_block_read_value(nbdev->nvme_ns->ns) ==
658 		    SPDK_NVME_DEALLOC_READ_00) {
659 			return true;
660 		}
661 		/*
662 		 * The NVMe controller write_zeroes function is currently not used by our driver.
663 		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
664 		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
665 		 */
666 		return false;
667 
668 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
669 		if (spdk_nvme_ctrlr_get_flags(nbdev->nvme_bdev_ctrlr->ctrlr) &
670 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
671 			return true;
672 		}
673 		return false;
674 
675 	default:
676 		return false;
677 	}
678 }
679 
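/*
 * I/O channel constructor: allocates an I/O qpair for the controller, sets up
 * OCSSD channel state when applicable, and registers the per-channel poller.
 */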
680 static int
681 bdev_nvme_create_cb(void *io_device, void *ctx_buf)
682 {
683 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
684 	struct nvme_io_channel *ch = ctx_buf;
685 	struct spdk_nvme_io_qpair_opts opts;
686 
687 #ifdef SPDK_CONFIG_VTUNE
688 	ch->collect_spin_stat = true;
689 #else
690 	ch->collect_spin_stat = false;
691 #endif
692 
693 	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
694 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
695 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
696 	g_opts.io_queue_requests = opts.io_queue_requests;
697 
698 	ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
699 
700 	if (ch->qpair == NULL) {
701 		return -1;
702 	}
703 
704 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
705 		if (bdev_ocssd_create_io_channel(ch)) {
706 			spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
707 			return -1;
708 		}
709 	}
710 
711 	ch->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, ch, g_opts.nvme_ioq_poll_period_us);
712 
713 	TAILQ_INIT(&ch->pending_resets);
714 	return 0;
715 }
716 
717 static void
718 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
719 {
720 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
721 	struct nvme_io_channel *ch = ctx_buf;
722 
723 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
724 		bdev_ocssd_destroy_io_channel(ch);
725 	}
726 
727 	spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
728 	spdk_poller_unregister(&ch->poller);
729 }
730 
731 static struct spdk_io_channel *
732 bdev_nvme_get_io_channel(void *ctx)
733 {
734 	struct nvme_bdev *nvme_bdev = ctx;
735 
736 	return spdk_get_io_channel(nvme_bdev->nvme_bdev_ctrlr);
737 }
738 
739 static int
740 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
741 {
742 	struct nvme_bdev *nvme_bdev = ctx;
743 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_bdev->nvme_bdev_ctrlr;
744 	const struct spdk_nvme_ctrlr_data *cdata;
745 	struct spdk_nvme_ns *ns;
746 	union spdk_nvme_vs_register vs;
747 	union spdk_nvme_csts_register csts;
748 	char buf[128];
749 
750 	cdata = spdk_nvme_ctrlr_get_data(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
751 	vs = spdk_nvme_ctrlr_get_regs_vs(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
752 	csts = spdk_nvme_ctrlr_get_regs_csts(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
753 	ns = nvme_bdev->nvme_ns->ns;
754 
755 	spdk_json_write_named_object_begin(w, "nvme");
756 
757 	if (nvme_bdev_ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
758 		spdk_json_write_named_string(w, "pci_address", nvme_bdev_ctrlr->trid.traddr);
759 	}
760 
761 	spdk_json_write_named_object_begin(w, "trid");
762 
763 	nvme_bdev_dump_trid_json(&nvme_bdev_ctrlr->trid, w);
764 
765 	spdk_json_write_object_end(w);
766 
767 #ifdef SPDK_CONFIG_NVME_CUSE
768 	char *cuse_device;
769 
770 	cuse_device = spdk_nvme_cuse_get_ns_name(nvme_bdev->nvme_bdev_ctrlr->ctrlr,
771 			spdk_nvme_ns_get_id(ns));
772 	if (cuse_device) {
773 		spdk_json_write_named_string(w, "cuse_device", cuse_device);
774 	}
775 #endif
776 
777 	spdk_json_write_named_object_begin(w, "ctrlr_data");
778 
779 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
780 
781 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
782 	spdk_str_trim(buf);
783 	spdk_json_write_named_string(w, "model_number", buf);
784 
785 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
786 	spdk_str_trim(buf);
787 	spdk_json_write_named_string(w, "serial_number", buf);
788 
789 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
790 	spdk_str_trim(buf);
791 	spdk_json_write_named_string(w, "firmware_revision", buf);
792 
793 	spdk_json_write_named_object_begin(w, "oacs");
794 
795 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
796 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
797 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
798 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
799 
800 	spdk_json_write_object_end(w);
801 
802 	spdk_json_write_object_end(w);
803 
804 	spdk_json_write_named_object_begin(w, "vs");
805 
806 	spdk_json_write_name(w, "nvme_version");
807 	if (vs.bits.ter) {
808 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
809 	} else {
810 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
811 	}
812 
813 	spdk_json_write_object_end(w);
814 
815 	spdk_json_write_named_object_begin(w, "csts");
816 
817 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
818 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
819 
820 	spdk_json_write_object_end(w);
821 
822 	spdk_json_write_named_object_begin(w, "ns_data");
823 
824 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
825 
826 	spdk_json_write_object_end(w);
827 
828 	if (cdata->oacs.security) {
829 		spdk_json_write_named_object_begin(w, "security");
830 
831 		spdk_json_write_named_bool(w, "opal", spdk_opal_supported(nvme_bdev_ctrlr->opal_dev));
832 
833 		spdk_json_write_object_end(w);
834 	}
835 
836 	spdk_json_write_object_end(w);
837 
838 	return 0;
839 }
840 
841 static void
842 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
843 {
844 	/* No config per bdev needed */
845 }
846 
847 static uint64_t
848 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
849 {
850 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
851 	uint64_t spin_time;
852 
853 	if (!nvme_ch->collect_spin_stat) {
854 		return 0;
855 	}
856 
857 	if (nvme_ch->end_ticks != 0) {
858 		nvme_ch->spin_ticks += (nvme_ch->end_ticks - nvme_ch->start_ticks);
859 		nvme_ch->end_ticks = 0;
860 	}
861 
862 	spin_time = (nvme_ch->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
863 	nvme_ch->start_ticks = 0;
864 	nvme_ch->spin_ticks = 0;
865 
866 	return spin_time;
867 }
868 
869 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
870 	.destruct		= bdev_nvme_destruct,
871 	.submit_request		= bdev_nvme_submit_request,
872 	.io_type_supported	= bdev_nvme_io_type_supported,
873 	.get_io_channel		= bdev_nvme_get_io_channel,
874 	.dump_info_json		= bdev_nvme_dump_info_json,
875 	.write_config_json	= bdev_nvme_write_config_json,
876 	.get_spin_time		= bdev_nvme_get_spin_time,
877 };
878 
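/*
 * Create and register an nvme_bdev for one active standard (non-OCSSD)
 * namespace, filling in block size and count, UUID, metadata/PI and ACWU
 * settings from the namespace and controller data.
 */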
879 static void
880 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
881 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
882 {
883 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
884 	struct nvme_bdev	*bdev;
885 	struct spdk_nvme_ns	*ns;
886 	const struct spdk_uuid	*uuid;
887 	const struct spdk_nvme_ctrlr_data *cdata;
888 	const struct spdk_nvme_ns_data *nsdata;
889 	int			rc;
890 
891 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
892 
893 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
894 	if (!ns) {
895 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Invalid NS %d\n", nvme_ns->id);
896 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -EINVAL);
897 		return;
898 	}
899 
900 	bdev = calloc(1, sizeof(*bdev));
901 	if (!bdev) {
902 		SPDK_ERRLOG("bdev calloc() failed\n");
903 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
904 		return;
905 	}
906 
907 	bdev->nvme_bdev_ctrlr = nvme_bdev_ctrlr;
908 	nvme_ns->ns = ns;
909 	bdev->nvme_ns = nvme_ns;
910 
911 	bdev->disk.name = spdk_sprintf_alloc("%sn%d", nvme_bdev_ctrlr->name, spdk_nvme_ns_get_id(ns));
912 	if (!bdev->disk.name) {
913 		free(bdev);
914 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
915 		return;
916 	}
917 	bdev->disk.product_name = "NVMe disk";
918 
919 	bdev->disk.write_cache = 0;
920 	if (cdata->vwc.present) {
921 		/* Enable if the Volatile Write Cache exists */
922 		bdev->disk.write_cache = 1;
923 	}
924 	bdev->disk.blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
925 	bdev->disk.blockcnt = spdk_nvme_ns_get_num_sectors(ns);
926 	bdev->disk.optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
927 
928 	uuid = spdk_nvme_ns_get_uuid(ns);
929 	if (uuid != NULL) {
930 		bdev->disk.uuid = *uuid;
931 	}
932 
933 	nsdata = spdk_nvme_ns_get_data(ns);
934 
935 	bdev->disk.md_len = spdk_nvme_ns_get_md_size(ns);
936 	if (bdev->disk.md_len != 0) {
937 		bdev->disk.md_interleave = nsdata->flbas.extended;
938 		bdev->disk.dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
939 		if (bdev->disk.dif_type != SPDK_DIF_DISABLE) {
940 			bdev->disk.dif_is_head_of_md = nsdata->dps.md_start;
941 			bdev->disk.dif_check_flags = nvme_bdev_ctrlr->prchk_flags;
942 		}
943 	}
944 
945 	if (!bdev_nvme_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
946 		bdev->disk.acwu = 0;
947 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
948 		bdev->disk.acwu = nsdata->nacwu;
949 	} else {
950 		bdev->disk.acwu = cdata->acwu;
951 	}
952 
953 	bdev->disk.ctxt = bdev;
954 	bdev->disk.fn_table = &nvmelib_fn_table;
955 	bdev->disk.module = &nvme_if;
956 	rc = spdk_bdev_register(&bdev->disk);
957 	if (rc) {
958 		free(bdev->disk.name);
959 		free(bdev);
960 		nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
961 		return;
962 	}
963 
964 	nvme_bdev_attach_bdev_to_ns(nvme_ns, bdev);
965 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, 0);
966 }
967 
968 static bool
969 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
970 		 struct spdk_nvme_ctrlr_opts *opts)
971 {
972 	struct nvme_probe_skip_entry *entry;
973 
974 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
975 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
976 			return false;
977 		}
978 	}
979 
980 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
981 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
982 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
983 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
984 
985 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attaching to %s\n", trid->traddr);
986 
987 	return true;
988 }
989 
990 static bool
991 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
992 	 struct spdk_nvme_ctrlr_opts *opts)
993 {
994 	struct nvme_probe_ctx *ctx = cb_ctx;
995 
996 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Probing device %s\n", trid->traddr);
997 
998 	if (nvme_bdev_ctrlr_get(trid)) {
999 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n",
1000 			    trid->traddr);
1001 		return false;
1002 	}
1003 
1004 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1005 		bool claim_device = false;
1006 		size_t i;
1007 
1008 		for (i = 0; i < ctx->count; i++) {
1009 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1010 				claim_device = true;
1011 				break;
1012 			}
1013 		}
1014 
1015 		if (!claim_device) {
1016 			SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Not claiming device at %s\n", trid->traddr);
1017 			return false;
1018 		}
1019 	}
1020 
1021 	if (ctx->hostnqn) {
1022 		snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", ctx->hostnqn);
1023 	}
1024 
1025 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1026 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1027 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1028 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1029 
1030 	return true;
1031 }
1032 
1033 static void
1034 spdk_nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1035 {
1036 	struct spdk_nvme_ctrlr *ctrlr = ctx;
1037 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1038 
1039 	if (spdk_nvme_cpl_is_error(cpl)) {
1040 		SPDK_WARNLOG("Abort failed. Resetting controller.\n");
1041 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1042 		assert(nvme_bdev_ctrlr != NULL);
1043 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1044 	}
1045 }
1046 
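/*
 * Command timeout handler. A controller reporting fatal status is always
 * reset; otherwise the configured action_on_timeout decides between aborting
 * the timed-out command, resetting the controller, or doing nothing.
 */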
1047 static void
1048 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1049 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1050 {
1051 	int rc;
1052 	union spdk_nvme_csts_register csts;
1053 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1054 
1055 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1056 
1057 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1058 	if (csts.bits.cfs) {
1059 		SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1060 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1061 		assert(nvme_bdev_ctrlr != NULL);
1062 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1063 		return;
1064 	}
1065 
1066 	switch (g_opts.action_on_timeout) {
1067 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1068 		if (qpair) {
1069 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1070 						       spdk_nvme_abort_cpl, ctrlr);
1071 			if (rc == 0) {
1072 				return;
1073 			}
1074 
1075 			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
1076 		}
1077 
1078 	/* FALLTHROUGH */
1079 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1080 		nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
1081 		assert(nvme_bdev_ctrlr != NULL);
1082 		bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1083 		break;
1084 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1085 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "No action for nvme controller timeout.\n");
1086 		break;
1087 	default:
1088 		SPDK_ERRLOG("An invalid timeout action value is found.\n");
1089 		break;
1090 	}
1091 }
1092 
1093 void
1094 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1095 {
1096 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1097 	nvme_bdev_ctrlr->ref--;
1098 
1099 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
1100 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1101 		nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1102 		return;
1103 	}
1104 
1105 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1106 }
1107 
1108 static void
1109 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns)
1110 {
1111 	struct nvme_bdev *bdev, *tmp;
1112 
1113 	TAILQ_FOREACH_SAFE(bdev, &ns->bdevs, tailq, tmp) {
1114 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1115 	}
1116 
1117 	ns->populated = false;
1118 
1119 	nvme_ctrlr_depopulate_namespace_done(ns->ctrlr);
1120 }
1121 
1122 static void nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns,
1123 		struct nvme_async_probe_ctx *ctx)
1124 {
1125 	g_populate_namespace_fn[ns->type](ctrlr, ns, ctx);
1126 }
1127 
1128 static void nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns)
1129 {
1130 	g_depopulate_namespace_fn[ns->type](ns);
1131 }
1132 
1133 void
1134 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1135 				   struct nvme_bdev_ns *ns, int rc)
1136 {
1137 	if (rc == 0) {
1138 		ns->populated = true;
1139 		pthread_mutex_lock(&g_bdev_nvme_mutex);
1140 		ns->ctrlr->ref++;
1141 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1142 	} else {
1143 		memset(ns, 0, sizeof(*ns));
1144 	}
1145 
1146 	if (ctx) {
1147 		ctx->populates_in_progress--;
1148 		if (ctx->populates_in_progress == 0) {
1149 			nvme_ctrlr_populate_namespaces_done(ctx);
1150 		}
1151 	}
1152 }
1153 
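/*
 * Walk every namespace ID on the controller: resize bdevs whose namespace size
 * changed, populate bdevs for newly active namespaces, and depopulate bdevs
 * for namespaces that are no longer active.
 */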
1154 static void
1155 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1156 			       struct nvme_async_probe_ctx *ctx)
1157 {
1158 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1159 	struct nvme_bdev_ns	*ns;
1160 	struct spdk_nvme_ns	*nvme_ns;
1161 	struct nvme_bdev	*bdev;
1162 	uint32_t		i;
1163 	int			rc;
1164 	uint64_t		num_sectors;
1165 	bool			ns_is_active;
1166 
1167 	if (ctx) {
1168 		/* Initialize this count to 1 to handle the populate functions
1169 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1170 		 */
1171 		ctx->populates_in_progress = 1;
1172 	}
1173 
1174 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1175 		uint32_t	nsid = i + 1;
1176 
1177 		ns = nvme_bdev_ctrlr->namespaces[i];
1178 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1179 
1180 		if (ns->populated && ns_is_active && ns->type == NVME_BDEV_NS_STANDARD) {
1181 			/* NS is still there but attributes may have changed */
1182 			nvme_ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1183 			num_sectors = spdk_nvme_ns_get_num_sectors(nvme_ns);
1184 			bdev = TAILQ_FIRST(&ns->bdevs);
1185 			if (bdev->disk.blockcnt != num_sectors) {
1186 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %lu, new size %lu\n",
1187 					       nsid,
1188 					       bdev->disk.name,
1189 					       bdev->disk.blockcnt,
1190 					       num_sectors);
1191 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1192 				if (rc != 0) {
1193 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1194 						    bdev->disk.name, rc);
1195 				}
1196 			}
1197 		}
1198 
1199 		if (!ns->populated && ns_is_active) {
1200 			ns->id = nsid;
1201 			ns->ctrlr = nvme_bdev_ctrlr;
1202 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1203 				ns->type = NVME_BDEV_NS_OCSSD;
1204 			} else {
1205 				ns->type = NVME_BDEV_NS_STANDARD;
1206 			}
1207 
1208 			TAILQ_INIT(&ns->bdevs);
1209 
1210 			if (ctx) {
1211 				ctx->populates_in_progress++;
1212 			}
1213 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, ns, ctx);
1214 		}
1215 
1216 		if (ns->populated && !ns_is_active) {
1217 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
1218 		}
1219 	}
1220 
1221 	if (ctx) {
1222 		/* Decrement this count now that the loop is over to account
1223 		 * for the one we started with.  If the count is then 0, we
1224 		 * know any populate_namespace functions completed immediately,
1225 		 * so we'll kick the callback here.
1226 		 */
1227 		ctx->populates_in_progress--;
1228 		if (ctx->populates_in_progress == 0) {
1229 			nvme_ctrlr_populate_namespaces_done(ctx);
1230 		}
1231 	}
1232 
1233 }
1234 
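/*
 * Asynchronous Event Request callback: re-scans namespaces on attribute-change
 * notices and forwards OCSSD chunk notifications to the OCSSD layer.
 */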
1235 static void
1236 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1237 {
1238 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1239 	union spdk_nvme_async_event_completion	event;
1240 
1241 	if (spdk_nvme_cpl_is_error(cpl)) {
1242 		SPDK_WARNLOG("AER request execution failed\n");
1243 		return;
1244 	}
1245 
1246 	event.raw = cpl->cdw0;
1247 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1248 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1249 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1250 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1251 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1252 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1253 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1254 	}
1255 }
1256 
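/*
 * Allocate and initialize an nvme_bdev_ctrlr for a newly attached controller:
 * the per-namespace array, OCSSD state when supported, the io_device
 * registration, the admin queue poller, timeout/AER callbacks, and optional
 * Opal support.
 */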
1257 static int
1258 create_ctrlr(struct spdk_nvme_ctrlr *ctrlr,
1259 	     const char *name,
1260 	     const struct spdk_nvme_transport_id *trid,
1261 	     uint32_t prchk_flags)
1262 {
1263 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1264 	uint32_t i;
1265 	int rc;
1266 
1267 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1268 	if (nvme_bdev_ctrlr == NULL) {
1269 		SPDK_ERRLOG("Failed to allocate device struct\n");
1270 		return -ENOMEM;
1271 	}
1272 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1273 	nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1274 	if (!nvme_bdev_ctrlr->namespaces) {
1275 		SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
1276 		free(nvme_bdev_ctrlr);
1277 		return -ENOMEM;
1278 	}
1279 
1280 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1281 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1282 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1283 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1284 			for (; i > 0; i--) {
1285 				free(nvme_bdev_ctrlr->namespaces[i - 1]);
1286 			}
1287 			free(nvme_bdev_ctrlr->namespaces);
1288 			free(nvme_bdev_ctrlr);
1289 			return -ENOMEM;
1290 		}
1291 	}
1292 
1293 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1294 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1295 	nvme_bdev_ctrlr->ref = 0;
1296 	nvme_bdev_ctrlr->trid = *trid;
1297 	nvme_bdev_ctrlr->name = strdup(name);
1298 	if (nvme_bdev_ctrlr->name == NULL) {
		/* Also free the per-namespace structs allocated above to avoid leaking them. */
		for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
			free(nvme_bdev_ctrlr->namespaces[i]);
		}
1299 		free(nvme_bdev_ctrlr->namespaces);
1300 		free(nvme_bdev_ctrlr);
1301 		return -ENOMEM;
1302 	}
1303 
1304 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1305 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1306 		if (spdk_unlikely(rc != 0)) {
1307 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1308 			free(nvme_bdev_ctrlr->name);
			/* Also free the per-namespace structs allocated above to avoid leaking them. */
			for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
				free(nvme_bdev_ctrlr->namespaces[i]);
			}
1309 			free(nvme_bdev_ctrlr->namespaces);
1310 			free(nvme_bdev_ctrlr);
1311 			return rc;
1312 		}
1313 	}
1314 
1315 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1316 
1317 	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
1318 				sizeof(struct nvme_io_channel),
1319 				name);
1320 
1321 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, ctrlr,
1322 					       g_opts.nvme_adminq_poll_period_us);
1323 
1324 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1325 
1326 	if (g_opts.timeout_us > 0) {
1327 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1328 				timeout_cb, NULL);
1329 	}
1330 
1331 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1332 
1333 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1334 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1335 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1336 		if (nvme_bdev_ctrlr->opal_dev == NULL) {
1337 			SPDK_ERRLOG("Failed to initialize Opal\n");
1338 		}
1339 	}
1340 	return 0;
1341 }
1342 
1343 static void
1344 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1345 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1346 {
1347 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1348 	struct nvme_probe_ctx *ctx = cb_ctx;
1349 	char *name = NULL;
1350 	uint32_t prchk_flags = 0;
1351 	size_t i;
1352 
1353 	if (ctx) {
1354 		for (i = 0; i < ctx->count; i++) {
1355 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1356 				prchk_flags = ctx->prchk_flags[i];
1357 				name = strdup(ctx->names[i]);
1358 				break;
1359 			}
1360 		}
1361 	} else {
1362 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1363 	}
1364 	if (!name) {
1365 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1366 		return;
1367 	}
1368 
1369 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attached to %s (%s)\n", trid->traddr, name);
1370 
1371 	create_ctrlr(ctrlr, name, trid, prchk_flags);
1372 
1373 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid);
1374 	if (!nvme_bdev_ctrlr) {
1375 		SPDK_ERRLOG("Failed to find new NVMe controller\n");
1376 		free(name);
1377 		return;
1378 	}
1379 
1380 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1381 
1382 	free(name);
1383 }
1384 
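/*
 * Hot-remove callback (also invoked from spdk_bdev_nvme_delete): depopulates
 * all namespaces of the removed controller and destructs it once the last
 * reference is dropped.
 */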
1385 static void
1386 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
1387 {
1388 	uint32_t i;
1389 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1390 	struct nvme_bdev_ns *ns;
1391 
1392 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1393 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
1394 		if (nvme_bdev_ctrlr->ctrlr == ctrlr) {
1395 			/* The controller's destruction was already started */
1396 			if (nvme_bdev_ctrlr->destruct) {
1397 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1398 				return;
1399 			}
1400 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
1401 			for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1402 				uint32_t	nsid = i + 1;
1403 
1404 				ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1405 				if (ns->populated) {
1406 					assert(ns->id == nsid);
1407 					nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
1408 				}
1409 			}
1410 
1411 			pthread_mutex_lock(&g_bdev_nvme_mutex);
1412 			nvme_bdev_ctrlr->destruct = true;
1413 			if (nvme_bdev_ctrlr->ref == 0) {
1414 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1415 				nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1416 			} else {
1417 				pthread_mutex_unlock(&g_bdev_nvme_mutex);
1418 			}
1419 			return;
1420 		}
1421 	}
1422 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1423 }
1424 
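/*
 * Hotplug poller: starts (or continues polling) an asynchronous probe of the
 * PCIe transport so newly inserted controllers are attached and removed ones
 * detached.
 */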
1425 static int
1426 bdev_nvme_hotplug(void *arg)
1427 {
1428 	struct spdk_nvme_transport_id trid_pcie;
1429 	int done;
1430 
1431 	if (!g_hotplug_probe_ctx) {
1432 		memset(&trid_pcie, 0, sizeof(trid_pcie));
1433 		spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
1434 
1435 		g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
1436 				      hotplug_probe_cb,
1437 				      attach_cb, remove_cb);
1438 		if (!g_hotplug_probe_ctx) {
1439 			return -1;
1440 		}
1441 	}
1442 
1443 	done = spdk_nvme_probe_poll_async(g_hotplug_probe_ctx);
1444 	if (done != -EAGAIN) {
1445 		g_hotplug_probe_ctx = NULL;
1446 		return 1;
1447 	}
1448 
1449 	return -1;
1450 }
1451 
1452 void
1453 spdk_bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
1454 {
1455 	*opts = g_opts;
1456 }
1457 
1458 int
1459 spdk_bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
1460 {
1461 	if (g_bdev_nvme_init_thread != NULL) {
1462 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1463 			return -EPERM;
1464 		}
1465 	}
1466 
1467 	g_opts = *opts;
1468 
1469 	return 0;
1470 }
1471 
1472 struct set_nvme_hotplug_ctx {
1473 	uint64_t period_us;
1474 	bool enabled;
1475 	spdk_msg_fn fn;
1476 	void *fn_ctx;
1477 };
1478 
1479 static void
1480 set_nvme_hotplug_period_cb(void *_ctx)
1481 {
1482 	struct set_nvme_hotplug_ctx *ctx = _ctx;
1483 
1484 	spdk_poller_unregister(&g_hotplug_poller);
1485 	if (ctx->enabled) {
1486 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
1487 	}
1488 
1489 	g_nvme_hotplug_poll_period_us = ctx->period_us;
1490 	g_nvme_hotplug_enabled = ctx->enabled;
1491 	if (ctx->fn) {
1492 		ctx->fn(ctx->fn_ctx);
1493 	}
1494 
1495 	free(ctx);
1496 }
1497 
1498 int
1499 spdk_bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
1500 {
1501 	struct set_nvme_hotplug_ctx *ctx;
1502 
1503 	if (enabled == true && !spdk_process_is_primary()) {
1504 		return -EPERM;
1505 	}
1506 
1507 	ctx = calloc(1, sizeof(*ctx));
1508 	if (ctx == NULL) {
1509 		return -ENOMEM;
1510 	}
1511 
1512 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
1513 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
1514 	ctx->enabled = enabled;
1515 	ctx->fn = cb;
1516 	ctx->fn_ctx = cb_ctx;
1517 
1518 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
1519 	return 0;
1520 }
1521 
1522 static void
1523 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1524 {
1525 	if (ctx->cb_fn) {
1526 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1527 	}
1528 
1529 	free(ctx);
1530 }
1531 
1532 static void
1533 nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx)
1534 {
1535 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1536 	struct nvme_bdev_ns	*ns;
1537 	struct nvme_bdev	*nvme_bdev, *tmp;
1538 	uint32_t		i, nsid;
1539 	size_t			j;
1540 
1541 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
1542 
1543 	/*
1544 	 * Report the new bdevs that were created in this call.
1545 	 * There can be more than one bdev per NVMe controller.
1546 	 */
1547 	j = 0;
1548 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1549 		nsid = i + 1;
1550 		ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1551 		if (!ns->populated) {
1552 			continue;
1553 		}
1554 		assert(ns->id == nsid);
1555 		TAILQ_FOREACH_SAFE(nvme_bdev, &ns->bdevs, tailq, tmp) {
1556 			if (j < ctx->count) {
1557 				ctx->names[j] = nvme_bdev->disk.name;
1558 				j++;
1559 			} else {
1560 				SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
1561 					    ctx->count);
1562 				populate_namespaces_cb(ctx, 0, -ERANGE);
1563 				return;
1564 			}
1565 		}
1566 	}
1567 
1568 	populate_namespaces_cb(ctx, j, 0);
1569 }
1570 
1571 static void
1572 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1573 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1574 {
1575 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
1576 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1577 	struct nvme_async_probe_ctx *ctx;
1578 	int rc;
1579 
1580 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
1581 
1582 	spdk_poller_unregister(&ctx->poller);
1583 
1584 	rc = create_ctrlr(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags);
1585 	if (rc) {
1586 		SPDK_ERRLOG("Failed to create new device\n");
1587 		populate_namespaces_cb(ctx, 0, rc);
1588 		return;
1589 	}
1590 
1591 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
1592 	assert(nvme_bdev_ctrlr != NULL);
1593 
1594 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
1595 }
1596 
1597 static int
1598 bdev_nvme_async_poll(void *arg)
1599 {
1600 	struct nvme_async_probe_ctx	*ctx = arg;
1601 	int				rc;
1602 
1603 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
1604 	if (spdk_unlikely(rc != -EAGAIN && rc != 0)) {
1605 		spdk_poller_unregister(&ctx->poller);
1606 		free(ctx);
1607 	}
1608 
1609 	return 1;
1610 }
1611 
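/*
 * Attach the controller described by trid/hostid asynchronously and create a
 * bdev for each active namespace. On completion cb_fn is called with the
 * number of bdevs created, and the caller-provided names[] array (of size
 * count) receives their names. Illustrative call, with hypothetical
 * caller-side names:
 *
 *	const char *names[32];
 *	rc = spdk_bdev_nvme_create(&trid, &hostid, "Nvme0", names, 32,
 *				   NULL, 0, my_attach_done_cb, my_ctx);
 */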
1612 int
1613 spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid,
1614 		      struct spdk_nvme_host_id *hostid,
1615 		      const char *base_name,
1616 		      const char **names,
1617 		      uint32_t count,
1618 		      const char *hostnqn,
1619 		      uint32_t prchk_flags,
1620 		      spdk_bdev_create_nvme_fn cb_fn,
1621 		      void *cb_ctx)
1622 {
1623 	struct nvme_probe_skip_entry	*entry, *tmp;
1624 	struct nvme_async_probe_ctx	*ctx;
1625 
1626 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
1627 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
1628 		return -EEXIST;
1629 	}
1630 
1631 	if (nvme_bdev_ctrlr_get_by_name(base_name)) {
1632 		SPDK_ERRLOG("A controller with the provided name (%s) already exists.\n", base_name);
1633 		return -EEXIST;
1634 	}
1635 
1636 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1637 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
1638 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1639 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
1640 				free(entry);
1641 				break;
1642 			}
1643 		}
1644 	}
1645 
1646 	ctx = calloc(1, sizeof(*ctx));
1647 	if (!ctx) {
1648 		return -ENOMEM;
1649 	}
1650 	ctx->base_name = base_name;
1651 	ctx->names = names;
1652 	ctx->count = count;
1653 	ctx->cb_fn = cb_fn;
1654 	ctx->cb_ctx = cb_ctx;
1655 	ctx->prchk_flags = prchk_flags;
1656 	ctx->trid = *trid;
1657 
1658 	spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
1659 	ctx->opts.transport_retry_count = g_opts.retry_count;
1660 
1661 	if (hostnqn) {
1662 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
1663 	}
1664 
1665 	if (hostid->hostaddr[0] != '\0') {
1666 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
1667 	}
1668 
1669 	if (hostid->hostsvcid[0] != '\0') {
1670 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
1671 	}
1672 
1673 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
1674 	if (ctx->probe_ctx == NULL) {
1675 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
1676 		free(ctx);
1677 		return -ENODEV;
1678 	}
1679 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
1680 
1681 	return 0;
1682 }
1683 
1684 int
1685 spdk_bdev_nvme_delete(const char *name)
1686 {
1687 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL;
1688 	struct nvme_probe_skip_entry *entry;
1689 
1690 	if (name == NULL) {
1691 		return -EINVAL;
1692 	}
1693 
1694 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
1695 	if (nvme_bdev_ctrlr == NULL) {
1696 		SPDK_ERRLOG("Failed to find NVMe controller\n");
1697 		return -ENODEV;
1698 	}
1699 
1700 	if (nvme_bdev_ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
1701 		entry = calloc(1, sizeof(*entry));
1702 		if (!entry) {
1703 			return -ENOMEM;
1704 		}
1705 		entry->trid = nvme_bdev_ctrlr->trid;
1706 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
1707 	}
1708 
1709 	remove_cb(NULL, nvme_bdev_ctrlr->ctrlr);
1710 	return 0;
1711 }
1712 
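/*
 * Module initialization: parses the legacy "[Nvme]" configuration section
 * (retry count, timeouts, poll rates, hotplug settings, TransportID entries)
 * and connects the controllers listed there.
 */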
1713 static int
1714 bdev_nvme_library_init(void)
1715 {
1716 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1717 	struct spdk_conf_section *sp;
1718 	const char *val;
1719 	int rc = 0;
1720 	int64_t intval = 0;
1721 	size_t i;
1722 	struct nvme_probe_ctx *probe_ctx = NULL;
1723 	int retry_count;
1724 	uint32_t local_nvme_num = 0;
1725 	int64_t hotplug_period;
1726 	bool hotplug_enabled = g_nvme_hotplug_enabled;
1727 
1728 	g_bdev_nvme_init_thread = spdk_get_thread();
1729 
1730 	sp = spdk_conf_find_section(NULL, "Nvme");
1731 	if (sp == NULL) {
1732 		goto end;
1733 	}
1734 
1735 	probe_ctx = calloc(1, sizeof(*probe_ctx));
1736 	if (probe_ctx == NULL) {
1737 		SPDK_ERRLOG("Failed to allocate probe_ctx\n");
1738 		rc = -1;
1739 		goto end;
1740 	}
1741 
1742 	retry_count = spdk_conf_section_get_intval(sp, "RetryCount");
1743 	if (retry_count >= 0) {
1744 		g_opts.retry_count = retry_count;
1745 	}
1746 
1747 	val = spdk_conf_section_get_val(sp, "TimeoutUsec");
1748 	if (val != NULL) {
1749 		intval = spdk_strtoll(val, 10);
1750 		if (intval < 0) {
1751 			SPDK_ERRLOG("Invalid TimeoutUsec value\n");
1752 			rc = -1;
1753 			goto end;
1754 		}
1755 	}
1756 
1757 	g_opts.timeout_us = intval;
1758 
1759 	if (g_opts.timeout_us > 0) {
1760 		val = spdk_conf_section_get_val(sp, "ActionOnTimeout");
1761 		if (val != NULL) {
1762 			if (!strcasecmp(val, "Reset")) {
1763 				g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET;
1764 			} else if (!strcasecmp(val, "Abort")) {
1765 				g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT;
1766 			}
1767 		}
1768 	}
1769 
1770 	intval = spdk_conf_section_get_intval(sp, "AdminPollRate");
1771 	if (intval > 0) {
1772 		g_opts.nvme_adminq_poll_period_us = intval;
1773 	}
1774 
1775 	intval = spdk_conf_section_get_intval(sp, "IOPollRate");
1776 	if (intval > 0) {
1777 		g_opts.nvme_ioq_poll_period_us = intval;
1778 	}
1779 
1780 	if (spdk_process_is_primary()) {
1781 		hotplug_enabled = spdk_conf_section_get_boolval(sp, "HotplugEnable", false);
1782 	}
1783 
1784 	hotplug_period = spdk_conf_section_get_intval(sp, "HotplugPollRate");
1785 	if (hotplug_period < 0) {
1786 		hotplug_period = 0;
1787 	}
1788 
1789 	g_nvme_hostnqn = spdk_conf_section_get_val(sp, "HostNQN");
1790 	probe_ctx->hostnqn = g_nvme_hostnqn;
1791 
1792 	g_opts.delay_cmd_submit = spdk_conf_section_get_boolval(sp, "DelayCmdSubmit",
1793 				  SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT);
1794 
1795 	for (i = 0; i < NVME_MAX_CONTROLLERS; i++) {
1796 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 0);
1797 		if (val == NULL) {
1798 			break;
1799 		}
1800 
1801 		rc = spdk_nvme_transport_id_parse(&probe_ctx->trids[i], val);
1802 		if (rc < 0) {
1803 			SPDK_ERRLOG("Unable to parse TransportID: %s\n", val);
1804 			rc = -1;
1805 			goto end;
1806 		}
1807 
1808 		rc = spdk_nvme_host_id_parse(&probe_ctx->hostids[i], val);
1809 		if (rc < 0) {
1810 			SPDK_ERRLOG("Unable to parse HostID: %s\n", val);
1811 			rc = -1;
1812 			goto end;
1813 		}
1814 
1815 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 1);
1816 		if (val == NULL) {
1817 			SPDK_ERRLOG("No name provided for TransportID\n");
1818 			rc = -1;
1819 			goto end;
1820 		}
1821 
1822 		probe_ctx->names[i] = val;
1823 
1824 		val = spdk_conf_section_get_nmval(sp, "TransportID", i, 2);
1825 		if (val != NULL) {
1826 			rc = spdk_nvme_prchk_flags_parse(&probe_ctx->prchk_flags[i], val);
1827 			if (rc < 0) {
1828 				SPDK_ERRLOG("Unable to parse prchk: %s\n", val);
1829 				rc = -1;
1830 				goto end;
1831 			}
1832 		}
1833 
1834 		probe_ctx->count++;
1835 
1836 		if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) {
1837 			struct spdk_nvme_ctrlr *ctrlr;
1838 			struct spdk_nvme_ctrlr_opts opts;
1839 
1840 			if (nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) {
1841 				SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n",
1842 					    probe_ctx->trids[i].traddr);
1843 				rc = -1;
1844 				goto end;
1845 			}
1846 
1847 			if (probe_ctx->trids[i].subnqn[0] == '\0') {
1848 				SPDK_ERRLOG("Need to provide subsystem nqn\n");
1849 				rc = -1;
1850 				goto end;
1851 			}
1852 
1853 			spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
1854 			opts.transport_retry_count = g_opts.retry_count;
1855 
1856 			if (probe_ctx->hostnqn != NULL) {
1857 				snprintf(opts.hostnqn, sizeof(opts.hostnqn), "%s", probe_ctx->hostnqn);
1858 			}
1859 
1860 			if (probe_ctx->hostids[i].hostaddr[0] != '\0') {
1861 				snprintf(opts.src_addr, sizeof(opts.src_addr), "%s", probe_ctx->hostids[i].hostaddr);
1862 			}
1863 
1864 			if (probe_ctx->hostids[i].hostsvcid[0] != '\0') {
1865 				snprintf(opts.src_svcid, sizeof(opts.src_svcid), "%s", probe_ctx->hostids[i].hostsvcid);
1866 			}
1867 
1868 			ctrlr = spdk_nvme_connect(&probe_ctx->trids[i], &opts, sizeof(opts));
1869 			if (ctrlr == NULL) {
1870 				SPDK_ERRLOG("Unable to connect to provided trid (traddr: %s)\n",
1871 					    probe_ctx->trids[i].traddr);
1872 				rc = -1;
1873 				goto end;
1874 			}
1875 
1876 			rc = create_ctrlr(ctrlr, probe_ctx->names[i], &probe_ctx->trids[i], 0);
1877 			if (rc) {
1878 				goto end;
1879 			}
1880 
1881 			nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&probe_ctx->trids[i]);
1882 			if (!nvme_bdev_ctrlr) {
1883 				SPDK_ERRLOG("Failed to find new NVMe controller\n");
1884 				rc = -ENODEV;
1885 				goto end;
1886 			}
1887 
1888 			nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1889 		} else {
1890 			local_nvme_num++;
1891 		}
1892 	}
1893 
1894 	if (local_nvme_num > 0) {
1895 		/* Probe locally attached PCIe NVMe devices. */
1896 		if (spdk_nvme_probe(NULL, probe_ctx, probe_cb, attach_cb, remove_cb)) {
1897 			rc = -1;
1898 			goto end;
1899 		}
1900 
1901 		for (i = 0; i < probe_ctx->count; i++) {
1902 			if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) {
1903 				continue;
1904 			}
1905 
1906 			if (!nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) {
1907 				SPDK_ERRLOG("NVMe SSD \"%s\" could not be found.\n", probe_ctx->trids[i].traddr);
1908 				SPDK_ERRLOG("Check PCIe BDF and that it is attached to UIO/VFIO driver.\n");
1909 			}
1910 		}
1911 	}
1912 
1913 	rc = spdk_bdev_nvme_set_hotplug(hotplug_enabled, hotplug_period, NULL, NULL);
1914 	if (rc) {
1915 		SPDK_ERRLOG("Failed to setup hotplug (%d): %s\n", rc, spdk_strerror(rc));
1916 		rc = -1;
1917 	}
1918 end:
1919 	free(probe_ctx);
1920 	return rc;
1921 }
1922 
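/*
 * Module teardown: stop hotplug handling, free the skip list, depopulate every
 * namespace, and destruct controllers that have no remaining references.
 */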
1923 static void
1924 bdev_nvme_library_fini(void)
1925 {
1926 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
1927 	struct nvme_probe_skip_entry *entry, *entry_tmp;
1928 	struct nvme_bdev_ns *ns;
1929 	uint32_t i;
1930 
1931 	spdk_poller_unregister(&g_hotplug_poller);
1932 	free(g_hotplug_probe_ctx);
1933 
1934 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
1935 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
1936 		free(entry);
1937 	}
1938 
1939 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1940 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
1941 		if (nvme_bdev_ctrlr->destruct) {
1942 			/* This controller's destruction was already started
1943 			 * before the application started shutting down
1944 			 */
1945 			continue;
1946 		}
1947 
1948 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1949 
1950 		for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1951 			uint32_t nsid = i + 1;
1952 
1953 			ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1954 			if (ns->populated) {
1955 				assert(ns->id == nsid);
1956 				nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
1957 			}
1958 		}
1959 
1960 		pthread_mutex_lock(&g_bdev_nvme_mutex);
1961 		nvme_bdev_ctrlr->destruct = true;
1962 
1963 		if (nvme_bdev_ctrlr->ref == 0) {
1964 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
1965 			nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1966 			pthread_mutex_lock(&g_bdev_nvme_mutex);
1967 		}
1968 	}
1969 
1970 	g_bdev_nvme_module_finish = true;
1971 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1972 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1973 		spdk_bdev_module_finish_done();
1974 		return;
1975 	}
1976 
1977 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1978 }
1979 
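/*
 * Re-verify protection information in software over the bdev I/O buffers to pinpoint
 * the block that triggered a controller-reported PI error.
 */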
1980 static void
1981 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
1982 {
1983 	struct spdk_bdev *bdev = bdev_io->bdev;
1984 	struct spdk_dif_ctx dif_ctx;
1985 	struct spdk_dif_error err_blk = {};
1986 	int rc;
1987 
1988 	rc = spdk_dif_ctx_init(&dif_ctx,
1989 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
1990 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
1991 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
1992 	if (rc != 0) {
1993 		SPDK_ERRLOG("Initialization of DIF context failed\n");
1994 		return;
1995 	}
1996 
1997 	if (bdev->md_interleave) {
1998 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1999 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2000 	} else {
2001 		struct iovec md_iov = {
2002 			.iov_base	= bdev_io->u.bdev.md_buf,
2003 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2004 		};
2005 
2006 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2007 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2008 	}
2009 
2010 	if (rc != 0) {
2011 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2012 			    err_blk.err_type, err_blk.err_offset);
2013 	} else {
2014 		SPDK_ERRLOG("Controller reported a PI error, but software verification did not detect one.\n");
2015 	}
2016 }
2017 
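/*
 * Completion for the PI-check-disabled re-read issued after a read failed with a PI
 * error: verify the data in software, then complete the bdev I/O with the original
 * (saved) completion status.
 */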
2018 static void
2019 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2020 {
2021 	struct nvme_bdev_io *bio = ref;
2022 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2023 
2024 	if (spdk_nvme_cpl_is_success(cpl)) {
2025 		/* Run PI verification for read data buffer. */
2026 		bdev_nvme_verify_pi_error(bdev_io);
2027 	}
2028 
2029 	/* Return original completion status */
2030 	spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
2031 					  bio->cpl.status.sc);
2032 }
2033 
2034 static void
2035 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2036 {
2037 	struct nvme_bdev_io *bio = ref;
2038 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2039 	int ret;
2040 
2041 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2042 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2043 			    cpl->status.sct, cpl->status.sc);
2044 
2045 		/* Save completion status to use after verifying PI error. */
2046 		bio->cpl = *cpl;
2047 
2048 		/* Read without PI checking to verify PI error. */
2049 		ret = bdev_nvme_no_pi_readv((struct nvme_bdev *)bdev_io->bdev->ctxt,
2050 					    spdk_bdev_io_get_io_channel(bdev_io),
2051 					    bio,
2052 					    bdev_io->u.bdev.iovs,
2053 					    bdev_io->u.bdev.iovcnt,
2054 					    bdev_io->u.bdev.md_buf,
2055 					    bdev_io->u.bdev.num_blocks,
2056 					    bdev_io->u.bdev.offset_blocks);
2057 		if (ret == 0) {
2058 			return;
2059 		}
2060 	}
2061 
2062 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2063 }
2064 
2065 static void
2066 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2067 {
2068 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2069 
2070 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2071 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2072 			    cpl->status.sct, cpl->status.sc);
2073 		/* Run PI verification for write data buffer if PI error is detected. */
2074 		bdev_nvme_verify_pi_error(bdev_io);
2075 	}
2076 
2077 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2078 }
2079 
2080 static void
2081 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2082 {
2083 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2084 
2085 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2086 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2087 			    cpl->status.sct, cpl->status.sc);
2088 		/* Run PI verification for compare data buffer if PI error is detected. */
2089 		bdev_nvme_verify_pi_error(bdev_io);
2090 	}
2091 
2092 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2093 }
2094 
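/*
 * Shared completion callback for both halves of a fused compare-and-write.  The
 * compare completion is saved in bio->cpl; the write completion then determines the
 * final status reported to the bdev layer.
 */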
2095 static void
2096 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2097 {
2098 	struct nvme_bdev_io *bio = ref;
2099 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2100 
2101 	/* Compare operation completion */
2102 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2103 		/* Save compare result for write callback */
2104 		bio->cpl = *cpl;
2105 		return;
2106 	}
2107 
2108 	/* Write operation completion */
2109 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2110 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2111 		 * complete the IO with the compare operation's status.
2112 		 */
2113 		if (!spdk_nvme_cpl_is_error(cpl)) {
2114 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2115 		}
2116 
2117 		spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2118 	} else {
2119 		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2120 	}
2121 }
2122 
2123 static void
2124 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2125 {
2126 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2127 
2128 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2129 }
2130 
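/*
 * Admin passthru completions may arrive on a different thread, so the saved status is
 * forwarded back to the originating thread before the bdev I/O is completed.
 */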
2131 static void
2132 bdev_nvme_admin_passthru_completion(void *ctx)
2133 {
2134 	struct nvme_bdev_io *bio = ctx;
2135 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2136 
2137 	spdk_bdev_io_complete_nvme_status(bdev_io,
2138 					  bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2139 }
2140 
2141 static void
2142 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2143 {
2144 	struct nvme_bdev_io *bio = ref;
2145 
2146 	bio->cpl = *cpl;
2147 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2148 }
2149 
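/*
 * reset_sgl/next_sge callbacks handed to the NVMe driver so it can walk the
 * caller-supplied iovec array while building each command's data transfer.
 */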
2150 static void
2151 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2152 {
2153 	struct nvme_bdev_io *bio = ref;
2154 	struct iovec *iov;
2155 
2156 	bio->iov_offset = sgl_offset;
2157 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2158 		iov = &bio->iovs[bio->iovpos];
2159 		if (bio->iov_offset < iov->iov_len) {
2160 			break;
2161 		}
2162 
2163 		bio->iov_offset -= iov->iov_len;
2164 	}
2165 }
2166 
2167 static int
2168 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2169 {
2170 	struct nvme_bdev_io *bio = ref;
2171 	struct iovec *iov;
2172 
2173 	assert(bio->iovpos < bio->iovcnt);
2174 
2175 	iov = &bio->iovs[bio->iovpos];
2176 
2177 	*address = iov->iov_base;
2178 	*length = iov->iov_len;
2179 
2180 	if (bio->iov_offset) {
2181 		assert(bio->iov_offset <= iov->iov_len);
2182 		*address += bio->iov_offset;
2183 		*length -= bio->iov_offset;
2184 	}
2185 
2186 	bio->iov_offset += *length;
2187 	if (bio->iov_offset == iov->iov_len) {
2188 		bio->iovpos++;
2189 		bio->iov_offset = 0;
2190 	}
2191 
2192 	return 0;
2193 }
2194 
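/* Equivalent SGL walk callbacks, but over the write half of a fused compare-and-write. */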
2195 static void
2196 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2197 {
2198 	struct nvme_bdev_io *bio = ref;
2199 	struct iovec *iov;
2200 
2201 	bio->fused_iov_offset = sgl_offset;
2202 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2203 		iov = &bio->fused_iovs[bio->fused_iovpos];
2204 		if (bio->fused_iov_offset < iov->iov_len) {
2205 			break;
2206 		}
2207 
2208 		bio->fused_iov_offset -= iov->iov_len;
2209 	}
2210 }
2211 
2212 static int
2213 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2214 {
2215 	struct nvme_bdev_io *bio = ref;
2216 	struct iovec *iov;
2217 
2218 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2219 
2220 	iov = &bio->fused_iovs[bio->fused_iovpos];
2221 
2222 	*address = iov->iov_base;
2223 	*length = iov->iov_len;
2224 
2225 	if (bio->fused_iov_offset) {
2226 		assert(bio->fused_iov_offset <= iov->iov_len);
2227 		*address += bio->fused_iov_offset;
2228 		*length -= bio->fused_iov_offset;
2229 	}
2230 
2231 	bio->fused_iov_offset += *length;
2232 	if (bio->fused_iov_offset == iov->iov_len) {
2233 		bio->fused_iovpos++;
2234 		bio->fused_iov_offset = 0;
2235 	}
2236 
2237 	return 0;
2238 }
2239 
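/*
 * Read with all PI check flags cleared; used to re-fetch the data after a PI error so
 * bdev_nvme_verify_pi_error() can examine it.
 */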
2240 static int
2241 bdev_nvme_no_pi_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2242 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2243 		      void *md, uint64_t lba_count, uint64_t lba)
2244 {
2245 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2246 	int rc;
2247 
2248 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "read %lu blocks with offset %#lx without PI check\n",
2249 		      lba_count, lba);
2250 
2251 	bio->iovs = iov;
2252 	bio->iovcnt = iovcnt;
2253 	bio->iovpos = 0;
2254 	bio->iov_offset = 0;
2255 
2256 	rc = spdk_nvme_ns_cmd_readv_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2257 					    bdev_nvme_no_pi_readv_done, bio, 0,
2258 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2259 					    md, 0, 0);
2260 
2261 	if (rc != 0 && rc != -ENOMEM) {
2262 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2263 	}
2264 	return rc;
2265 }
2266 
2267 static int
2268 bdev_nvme_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2269 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2270 		void *md, uint64_t lba_count, uint64_t lba)
2271 {
2272 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2273 	int rc;
2274 
2275 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "read %lu blocks with offset %#lx\n",
2276 		      lba_count, lba);
2277 
2278 	bio->iovs = iov;
2279 	bio->iovcnt = iovcnt;
2280 	bio->iovpos = 0;
2281 	bio->iov_offset = 0;
2282 
2283 	rc = spdk_nvme_ns_cmd_readv_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2284 					    bdev_nvme_readv_done, bio, nbdev->disk.dif_check_flags,
2285 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2286 					    md, 0, 0);
2287 
2288 	if (rc != 0 && rc != -ENOMEM) {
2289 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2290 	}
2291 	return rc;
2292 }
2293 
2294 static int
2295 bdev_nvme_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2296 		 struct nvme_bdev_io *bio,
2297 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba)
2298 {
2299 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2300 	int rc;
2301 
2302 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "write %lu blocks with offset %#lx\n",
2303 		      lba_count, lba);
2304 
2305 	bio->iovs = iov;
2306 	bio->iovcnt = iovcnt;
2307 	bio->iovpos = 0;
2308 	bio->iov_offset = 0;
2309 
2310 	rc = spdk_nvme_ns_cmd_writev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2311 					     bdev_nvme_writev_done, bio, nbdev->disk.dif_check_flags,
2312 					     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2313 					     md, 0, 0);
2314 
2315 	if (rc != 0 && rc != -ENOMEM) {
2316 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2317 	}
2318 	return rc;
2319 }
2320 
2321 static int
2322 bdev_nvme_comparev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2323 		   struct nvme_bdev_io *bio,
2324 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba)
2325 {
2326 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2327 	int rc;
2328 
2329 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "compare %lu blocks with offset %#lx\n",
2330 		      lba_count, lba);
2331 
2332 	bio->iovs = iov;
2333 	bio->iovcnt = iovcnt;
2334 	bio->iovpos = 0;
2335 	bio->iov_offset = 0;
2336 
2337 	rc = spdk_nvme_ns_cmd_comparev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2338 					       bdev_nvme_comparev_done, bio, nbdev->disk.dif_check_flags,
2339 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2340 					       md, 0, 0);
2341 
2342 	if (rc != 0 && rc != -ENOMEM) {
2343 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
2344 	}
2345 	return rc;
2346 }
2347 
2348 static int
2349 bdev_nvme_comparev_and_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2350 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
2351 			      int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba)
2352 {
2353 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2354 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2355 	uint32_t flags = nbdev->disk.dif_check_flags;
2356 	int rc;
2357 
2358 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "compare and write %lu blocks with offset %#lx\n",
2359 		      lba_count, lba);
2360 
2361 	bio->iovs = cmp_iov;
2362 	bio->iovcnt = cmp_iovcnt;
2363 	bio->iovpos = 0;
2364 	bio->iov_offset = 0;
2365 	bio->fused_iovs = write_iov;
2366 	bio->fused_iovcnt = write_iovcnt;
2367 	bio->fused_iovpos = 0;
2368 	bio->fused_iov_offset = 0;
2369 
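	/*
	 * first_fused_submitted persists across -ENOMEM resubmissions of this I/O, so
	 * the compare (first fused command) is only issued once; it is cleared here
	 * only for a brand new I/O (num_retries == 0).
	 */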
2370 	if (bdev_io->num_retries == 0) {
2371 		bio->first_fused_submitted = false;
2372 	}
2373 
2374 	if (!bio->first_fused_submitted) {
2375 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2376 		memset(&bio->cpl, 0, sizeof(bio->cpl));
2377 
2378 		rc = spdk_nvme_ns_cmd_comparev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2379 						       bdev_nvme_comparev_and_writev_done, bio, flags,
2380 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
2381 		if (rc == 0) {
2382 			bio->first_fused_submitted = true;
2383 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2384 		} else {
2385 			if (rc != -ENOMEM) {
2386 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
2387 			}
2388 			return rc;
2389 		}
2390 	}
2391 
2392 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
2393 
2394 	rc = spdk_nvme_ns_cmd_writev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
2395 					     bdev_nvme_comparev_and_writev_done, bio, flags,
2396 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
2397 	if (rc != 0 && rc != -ENOMEM) {
2398 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
2399 		rc = 0;
2400 	}
2401 
2402 	return rc;
2403 }
2404 
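/*
 * Translate a bdev unmap into one Dataset Management (deallocate) command, splitting
 * the LBA range into at most SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES ranges of up to
 * SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks each.
 */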
2405 static int
2406 bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2407 		struct nvme_bdev_io *bio,
2408 		uint64_t offset_blocks,
2409 		uint64_t num_blocks)
2410 {
2411 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2412 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
2413 	struct spdk_nvme_dsm_range *range;
2414 	uint64_t offset, remaining;
2415 	uint64_t num_ranges_u64;
2416 	uint16_t num_ranges;
2417 	int rc;
2418 
2419 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
2420 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2421 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
2422 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
2423 		return -EINVAL;
2424 	}
2425 	num_ranges = (uint16_t)num_ranges_u64;
2426 
2427 	offset = offset_blocks;
2428 	remaining = num_blocks;
2429 	range = &dsm_ranges[0];
2430 
2431 	/* Fill max-size ranges until the remaining blocks fit into one range */
2432 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
2433 		range->attributes.raw = 0;
2434 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2435 		range->starting_lba = offset;
2436 
2437 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2438 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2439 		range++;
2440 	}
2441 
2442 	/* Final range describes the remaining blocks */
2443 	range->attributes.raw = 0;
2444 	range->length = remaining;
2445 	range->starting_lba = offset;
2446 
2447 	rc = spdk_nvme_ns_cmd_dataset_management(nbdev->nvme_ns->ns, nvme_ch->qpair,
2448 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
2449 			dsm_ranges, num_ranges,
2450 			bdev_nvme_queued_done, bio);
2451 
2452 	return rc;
2453 }
2454 
2455 static int
2456 bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2457 			 struct nvme_bdev_io *bio,
2458 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2459 {
2460 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
2461 
2462 	if (nbytes > max_xfer_size) {
2463 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2464 		return -EINVAL;
2465 	}
2466 
2467 	bio->orig_thread = spdk_io_channel_get_thread(ch);
2468 
2469 	return spdk_nvme_ctrlr_cmd_admin_raw(nbdev->nvme_bdev_ctrlr->ctrlr, cmd, buf,
2470 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
2471 }
2472 
2473 static int
2474 bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2475 		      struct nvme_bdev_io *bio,
2476 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2477 {
2478 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2479 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
2480 
2481 	if (nbytes > max_xfer_size) {
2482 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2483 		return -EINVAL;
2484 	}
2485 
2486 	/*
2487 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2488 	 * so fill it out automatically.
2489 	 */
2490 	cmd->nsid = spdk_nvme_ns_get_id(nbdev->nvme_ns->ns);
2491 
2492 	return spdk_nvme_ctrlr_cmd_io_raw(nbdev->nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
2493 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
2494 }
2495 
2496 static int
2497 bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
2498 			 struct nvme_bdev_io *bio,
2499 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
2500 {
2501 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2502 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(nbdev->nvme_ns->ns);
2503 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
2504 
2505 	if (nbytes > max_xfer_size) {
2506 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2507 		return -EINVAL;
2508 	}
2509 
2510 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(nbdev->nvme_ns->ns)) {
2511 		SPDK_ERRLOG("invalid meta data buffer size\n");
2512 		return -EINVAL;
2513 	}
2514 
2515 	/*
2516 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2517 	 * so fill it out automatically.
2518 	 */
2519 	cmd->nsid = spdk_nvme_ns_get_id(nbdev->nvme_ns->ns);
2520 
2521 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(nbdev->nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
2522 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
2523 }
2524 
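/* Write the current module options back out as a legacy INI-style [Nvme] section. */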
2525 static void
2526 bdev_nvme_get_spdk_running_config(FILE *fp)
2527 {
2528 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
2529 
2530 	fprintf(fp, "\n[Nvme]");
2531 	fprintf(fp, "\n"
2532 		"# NVMe Device Whitelist\n"
2533 		"# Users may specify which NVMe devices to claim by their transport id.\n"
2534 		"# See spdk_nvme_transport_id_parse() in spdk/nvme.h for the correct format.\n"
2535 		"# The second argument is the assigned name, which can be referenced from\n"
2536 		"# other sections in the configuration file. For NVMe devices, a namespace\n"
2537 		"# is automatically appended to each name in the format <YourName>nY, where\n"
2538 		"# Y is the NSID (starts at 1).\n");
2539 
2540 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2541 		const char *trtype;
2542 		const char *prchk_flags;
2543 
2544 		trtype = spdk_nvme_transport_id_trtype_str(nvme_bdev_ctrlr->trid.trtype);
2545 		if (!trtype) {
2546 			continue;
2547 		}
2548 
2549 		if (nvme_bdev_ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
2550 			fprintf(fp, "TransportID \"trtype:%s traddr:%s\" %s\n",
2551 				trtype,
2552 				nvme_bdev_ctrlr->trid.traddr, nvme_bdev_ctrlr->name);
2553 		} else {
2554 			const char *adrfam;
2555 
2556 			adrfam = spdk_nvme_transport_id_adrfam_str(nvme_bdev_ctrlr->trid.adrfam);
2557 			prchk_flags = spdk_nvme_prchk_flags_str(nvme_bdev_ctrlr->prchk_flags);
2558 
2559 			if (adrfam) {
2560 				fprintf(fp, "TransportID \"trtype:%s adrfam:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
2561 					trtype,	adrfam,
2562 					nvme_bdev_ctrlr->trid.traddr, nvme_bdev_ctrlr->trid.trsvcid,
2563 					nvme_bdev_ctrlr->trid.subnqn, nvme_bdev_ctrlr->name);
2564 			} else {
2565 				fprintf(fp, "TransportID \"trtype:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
2566 					trtype,
2567 					nvme_bdev_ctrlr->trid.traddr, nvme_bdev_ctrlr->trid.trsvcid,
2568 					nvme_bdev_ctrlr->trid.subnqn, nvme_bdev_ctrlr->name);
2569 			}
2570 
2571 			if (prchk_flags) {
2572 				fprintf(fp, " \"%s\"\n", prchk_flags);
2573 			} else {
2574 				fprintf(fp, "\n");
2575 			}
2576 		}
2577 	}
2578 
2579 	fprintf(fp, "\n"
2580 		"# The number of attempts per I/O when an I/O fails. Do not include\n"
2581 		"# this key to get the default behavior.\n");
2582 	fprintf(fp, "RetryCount %d\n", g_opts.retry_count);
2583 	fprintf(fp, "\n"
2584 		"# Timeout for each command, in microseconds. If 0, don't track timeouts.\n");
2585 	fprintf(fp, "TimeoutUsec %"PRIu64"\n", g_opts.timeout_us);
2586 
2587 	fprintf(fp, "\n"
2588 		"# Action to take on command time out. Only valid when Timeout is greater\n"
2589 		"# than 0. This may be 'Reset' to reset the controller, 'Abort' to abort\n"
2590 		"# the command, or 'None' to just print a message but do nothing.\n"
2591 		"# Admin command timeouts will always result in a reset.\n");
2592 	switch (g_opts.action_on_timeout) {
2593 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
2594 		fprintf(fp, "ActionOnTimeout None\n");
2595 		break;
2596 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
2597 		fprintf(fp, "ActionOnTimeout Reset\n");
2598 		break;
2599 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
2600 		fprintf(fp, "ActionOnTimeout Abort\n");
2601 		break;
2602 	}
2603 
2604 	fprintf(fp, "\n"
2605 		"# Set how often the admin queue is polled for asynchronous events.\n"
2606 		"# Units in microseconds.\n");
2607 	fprintf(fp, "AdminPollRate %"PRIu64"\n", g_opts.nvme_adminq_poll_period_us);
2608 	fprintf(fp, "IOPollRate %" PRIu64"\n", g_opts.nvme_ioq_poll_period_us);
2609 	fprintf(fp, "\n"
2610 		"# Handling of hotplug (runtime insert and remove) events is disabled by default.\n"
2611 		"# Set this to Yes to enable it.\n"
2612 		"# Default: No\n");
2613 	fprintf(fp, "HotplugEnable %s\n", g_nvme_hotplug_enabled ? "Yes" : "No");
2614 	fprintf(fp, "\n"
2615 		"# Set how often the hotplug is processed for insert and remove events.\n"
2616 		"# Units in microseconds.\n");
2617 	fprintf(fp, "HotplugPollRate %"PRIu64"\n", g_nvme_hotplug_poll_period_us);
2618 	if (g_nvme_hostnqn) {
2619 		fprintf(fp, "HostNQN %s\n",  g_nvme_hostnqn);
2620 	}
2621 	fprintf(fp, "DelayCmdSubmit %s\n", g_opts.delay_cmd_submit ? "True" : "False");
2622 
2623 	fprintf(fp, "\n");
2624 }
2625 
2626 static void
2627 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
2628 {
2629 	/* nop */
2630 }
2631 
2632 static void
2633 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
2634 {
2635 	g_config_json_namespace_fn[ns->type](w, ns);
2636 }
2637 
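/*
 * Emit the JSON-RPC calls (bdev_nvme_set_options, bdev_nvme_attach_controller and
 * bdev_nvme_set_hotplug) required to recreate the current configuration.
 */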
2638 static int
2639 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
2640 {
2641 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
2642 	struct spdk_nvme_transport_id	*trid;
2643 	const char			*action;
2644 	uint32_t			nsid;
2645 
2646 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
2647 		action = "reset";
2648 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
2649 		action = "abort";
2650 	} else {
2651 		action = "none";
2652 	}
2653 
2654 	spdk_json_write_object_begin(w);
2655 
2656 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
2657 
2658 	spdk_json_write_named_object_begin(w, "params");
2659 	spdk_json_write_named_string(w, "action_on_timeout", action);
2660 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
2661 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
2662 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
2663 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
2664 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
2665 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
2666 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
2667 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
2668 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
2669 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
2670 	spdk_json_write_object_end(w);
2671 
2672 	spdk_json_write_object_end(w);
2673 
2674 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2675 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2676 		trid = &nvme_bdev_ctrlr->trid;
2677 
2678 		spdk_json_write_object_begin(w);
2679 
2680 		spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
2681 
2682 		spdk_json_write_named_object_begin(w, "params");
2683 		spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
2684 		nvme_bdev_dump_trid_json(trid, w);
2685 		spdk_json_write_named_bool(w, "prchk_reftag",
2686 					   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
2687 		spdk_json_write_named_bool(w, "prchk_guard",
2688 					   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
2689 
2690 		spdk_json_write_object_end(w);
2691 
2692 		spdk_json_write_object_end(w);
2693 
2694 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
2695 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
2696 				continue;
2697 			}
2698 
2699 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
2700 		}
2701 	}
2702 
2703 	/* Dump this as the last parameter to give all NVMe bdevs a chance to be constructed
2704 	 * before enabling the hotplug poller.
2705 	 */
2706 	spdk_json_write_object_begin(w);
2707 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
2708 
2709 	spdk_json_write_named_object_begin(w, "params");
2710 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
2711 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
2712 	spdk_json_write_object_end(w);
2713 
2714 	spdk_json_write_object_end(w);
2715 
2716 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2717 	return 0;
2718 }
2719 
2720 struct spdk_nvme_ctrlr *
2721 spdk_bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
2722 {
2723 	if (!bdev || bdev->module != &nvme_if) {
2724 		return NULL;
2725 	}
2726 
2727 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_bdev_ctrlr->ctrlr;
2728 }
2729 
2730 SPDK_LOG_REGISTER_COMPONENT("bdev_nvme", SPDK_LOG_BDEV_NVME)
2731