xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision b30d57cdad6d2bc75cc1e4e2ebbcebcb0d98dcfa)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/config.h"
40 #include "spdk/endian.h"
41 #include "spdk/bdev.h"
42 #include "spdk/json.h"
43 #include "spdk/nvme.h"
44 #include "spdk/nvme_ocssd.h"
45 #include "spdk/thread.h"
46 #include "spdk/string.h"
47 #include "spdk/util.h"
48 
49 #include "spdk/bdev_module.h"
50 #include "spdk/log.h"
51 
52 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
53 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
54 
55 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
56 
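/*
 * Per-I/O context for this module.  Its size is reported to the bdev layer via
 * bdev_nvme_get_ctx_size(), so every spdk_bdev_io submitted to an NVMe bdev carries
 * one of these in its driver_ctx area.
 */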
57 struct nvme_bdev_io {
58 	/** Array of iovecs to transfer. */
59 	struct iovec *iovs;
60 
61 	/** Number of iovecs in iovs array. */
62 	int iovcnt;
63 
64 	/** Current iovec position. */
65 	int iovpos;
66 
67 	/** Offset in current iovec. */
68 	uint32_t iov_offset;
69 
70 	/** Array of iovecs for the second command of a fused operation (e.g. the write half of compare-and-write). */
71 	struct iovec *fused_iovs;
72 
73 	/** Number of iovecs in the fused_iovs array. */
74 	int fused_iovcnt;
75 
76 	/** Current iovec position within fused_iovs. */
77 	int fused_iovpos;
78 
79 	/** Offset in the current fused iovec. */
80 	uint32_t fused_iov_offset;
81 
82 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
83 	struct spdk_nvme_cpl cpl;
84 
85 	/** Originating thread */
86 	struct spdk_thread *orig_thread;
87 
88 	/** Tracks whether the first command of a fused pair has been submitted */
89 	bool first_fused_submitted;
90 };
91 
92 struct nvme_probe_ctx {
93 	size_t count;
94 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
95 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
96 	const char *names[NVME_MAX_CONTROLLERS];
97 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
98 	const char *hostnqn;
99 };
100 
101 struct nvme_probe_skip_entry {
102 	struct spdk_nvme_transport_id		trid;
103 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
104 };
105 /* All controllers deleted by users via RPC are skipped by the hotplug monitor */
106 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
107 			g_skipped_nvme_ctrlrs);
108 
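/*
 * Module-wide defaults.  These can be overridden through bdev_nvme_set_opts(), but only
 * while no NVMe bdev controllers exist yet (see bdev_nvme_set_opts() below).
 */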
109 static struct spdk_bdev_nvme_opts g_opts = {
110 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
111 	.timeout_us = 0,
112 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
113 	.retry_count = 4,
114 	.arbitration_burst = 0,
115 	.low_priority_weight = 0,
116 	.medium_priority_weight = 0,
117 	.high_priority_weight = 0,
118 	.nvme_adminq_poll_period_us = 10000ULL,
119 	.nvme_ioq_poll_period_us = 0,
120 	.io_queue_requests = 0,
121 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
122 };
123 
124 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
125 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
126 
127 static int g_hot_insert_nvme_controller_index = 0;
128 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
129 static bool g_nvme_hotplug_enabled = false;
130 static struct spdk_thread *g_bdev_nvme_init_thread;
131 static struct spdk_poller *g_hotplug_poller;
132 static struct spdk_poller *g_hotplug_probe_poller;
133 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
134 
135 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
136 		struct nvme_async_probe_ctx *ctx);
137 static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx);
138 static int bdev_nvme_library_init(void);
139 static void bdev_nvme_library_fini(void);
140 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
141 			   struct nvme_bdev_io *bio,
142 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
143 			   uint32_t flags);
144 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
145 				 struct nvme_bdev_io *bio,
146 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
147 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
148 			    struct nvme_bdev_io *bio,
149 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
150 			    uint32_t flags);
151 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
152 			      struct nvme_bdev_io *bio,
153 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
154 			      uint32_t flags);
155 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
156 		struct spdk_nvme_qpair *qpair,
157 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
158 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
159 		uint32_t flags);
160 static int bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch,
161 				    struct nvme_bdev_io *bio,
162 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
163 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
164 				 struct nvme_bdev_io *bio,
165 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
166 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
167 				    struct nvme_bdev_io *bio,
168 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
169 static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch,
170 			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
171 static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio);
172 static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove);
173 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
174 
175 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
176 				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
177 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
178 		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
179 
180 static populate_namespace_fn g_populate_namespace_fn[] = {
181 	NULL,
182 	nvme_ctrlr_populate_standard_namespace,
183 	bdev_ocssd_populate_namespace,
184 };
185 
186 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns);
187 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns);
188 
189 static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
190 	NULL,
191 	nvme_ctrlr_depopulate_standard_namespace,
192 	bdev_ocssd_depopulate_namespace,
193 };
194 
195 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
196 		struct nvme_bdev_ns *nvme_ns);
197 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
198 		struct nvme_bdev_ns *nvme_ns);
199 
200 static config_json_namespace_fn g_config_json_namespace_fn[] = {
201 	NULL,
202 	nvme_ctrlr_config_json_standard_namespace,
203 	bdev_ocssd_namespace_config_json,
204 };
205 
206 struct spdk_nvme_qpair *
207 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
208 {
209 	struct nvme_io_channel *nvme_ch;
210 
211 	nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
212 
213 	return nvme_ch->qpair;
214 }
215 
216 static int
217 bdev_nvme_get_ctx_size(void)
218 {
219 	return sizeof(struct nvme_bdev_io);
220 }
221 
222 static struct spdk_bdev_module nvme_if = {
223 	.name = "nvme",
224 	.async_fini = true,
225 	.module_init = bdev_nvme_library_init,
226 	.module_fini = bdev_nvme_library_fini,
227 	.config_json = bdev_nvme_config_json,
228 	.get_ctx_size = bdev_nvme_get_ctx_size,
229 
230 };
231 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
232 
233 static void
234 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
235 {
236 	int rc;
237 
238 	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
239 	/*
240 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
241 	 * reconnect a qpair and we will stop getting a callback for this one.
242 	 */
243 	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
244 	if (rc != 0) {
245 		SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
246 	}
247 }
248 
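/*
 * Poller that processes completions for all I/O qpairs in a poll group.  The
 * start_ticks/end_ticks/spin_ticks bookkeeping feeds bdev_nvme_get_spin_time() and is
 * only active when collect_spin_stat is set (i.e. when SPDK is built with VTune support).
 */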
249 static int
250 bdev_nvme_poll(void *arg)
251 {
252 	struct nvme_bdev_poll_group *group = arg;
253 	int64_t num_completions;
254 
255 	if (group->collect_spin_stat && group->start_ticks == 0) {
256 		group->start_ticks = spdk_get_ticks();
257 	}
258 
259 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
260 			  bdev_nvme_disconnected_qpair_cb);
261 	if (group->collect_spin_stat) {
262 		if (num_completions > 0) {
263 			if (group->end_ticks != 0) {
264 				group->spin_ticks += (group->end_ticks - group->start_ticks);
265 				group->end_ticks = 0;
266 			}
267 			group->start_ticks = 0;
268 		} else {
269 			group->end_ticks = spdk_get_ticks();
270 		}
271 	}
272 
273 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
274 }
275 
276 static int
277 bdev_nvme_poll_adminq(void *arg)
278 {
279 	int32_t rc;
280 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
281 
282 	assert(nvme_bdev_ctrlr != NULL);
283 
284 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr);
285 	if (rc < 0) {
286 		bdev_nvme_failover(nvme_bdev_ctrlr, false);
287 	}
288 
289 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
290 }
291 
292 static int
293 bdev_nvme_destruct(void *ctx)
294 {
295 	struct nvme_bdev *nvme_disk = ctx;
296 	struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns;
297 
298 	pthread_mutex_lock(&g_bdev_nvme_mutex);
299 	TAILQ_REMOVE(&nvme_ns->bdevs, nvme_disk, tailq);
300 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
301 
302 	nvme_bdev_ns_detach(nvme_ns);
303 
304 	free(nvme_disk->disk.name);
305 	free(nvme_disk);
306 
307 	return 0;
308 }
309 
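/* Note: no NVMe Flush command is issued here; the request is completed immediately
 * with success.
 */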
310 static int
311 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
312 		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
313 {
314 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
315 
316 	return 0;
317 }
318 
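/*
 * Allocate and connect an I/O qpair for this channel.  The qpair is created with
 * create_only set so that it can be added to the channel's poll group before it is
 * connected.
 */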
319 static int
320 bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch)
321 {
322 	struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr;
323 	struct spdk_nvme_io_qpair_opts opts;
324 	int rc;
325 
326 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
327 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
328 	opts.create_only = true;
329 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
330 	g_opts.io_queue_requests = opts.io_queue_requests;
331 
332 	nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
333 	if (nvme_ch->qpair == NULL) {
334 		return -1;
335 	}
336 
337 	assert(nvme_ch->group != NULL);
338 
339 	rc = spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair);
340 	if (rc != 0) {
341 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
342 		goto err;
343 	}
344 
345 	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, nvme_ch->qpair);
346 	if (rc != 0) {
347 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
348 		goto err;
349 	}
350 
351 	return 0;
352 
353 err:
354 	spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
355 
356 	return rc;
357 }
358 
359 static void
360 _bdev_nvme_reset_destruct_ctrlr(struct spdk_io_channel_iter *i, int status)
361 {
362 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
363 
364 	spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_do_destruct,
365 			     nvme_bdev_ctrlr);
366 }
367 
368 static void
369 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
370 {
371 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
372 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
373 	struct spdk_bdev_io *bdev_io;
374 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
375 
376 	/* A NULL ctx means success. */
377 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
378 		status = SPDK_BDEV_IO_STATUS_FAILED;
379 	}
380 
381 	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
382 		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
383 		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
384 		spdk_bdev_io_complete(bdev_io, status);
385 	}
386 
387 	spdk_for_each_channel_continue(i, 0);
388 }
389 
390 static void
391 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
392 {
393 	/* We are using the for_each_channel cb_arg like a return code here:
394 	 * if it is zero the reset succeeded, otherwise it failed. */
395 	void *cb_arg = NULL;
396 	struct nvme_bdev_ctrlr_trid *curr_trid;
397 	bool do_destruct = false;
398 
399 	if (rc) {
400 		cb_arg = (void *)0x1;
401 		SPDK_ERRLOG("Resetting controller failed.\n");
402 	} else {
403 		SPDK_NOTICELOG("Resetting controller successful.\n");
404 	}
405 
406 	pthread_mutex_lock(&g_bdev_nvme_mutex);
407 	nvme_bdev_ctrlr->resetting = false;
408 	nvme_bdev_ctrlr->failover_in_progress = false;
409 
410 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
411 	assert(curr_trid != NULL);
412 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
413 
414 	curr_trid->is_failed = cb_arg != NULL;
415 
416 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
417 		/* Destruct ctrlr after clearing pending resets. */
418 		do_destruct = true;
419 	}
420 
421 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
422 	/* Make sure we clear any pending resets before returning. */
423 	spdk_for_each_channel(nvme_bdev_ctrlr,
424 			      _bdev_nvme_complete_pending_resets,
425 			      cb_arg,
426 			      do_destruct ? _bdev_nvme_reset_destruct_ctrlr : NULL);
427 }
428 
429 static void
430 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
431 {
432 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
433 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
434 	int rc = SPDK_BDEV_IO_STATUS_SUCCESS;
435 
436 	if (status) {
437 		rc = SPDK_BDEV_IO_STATUS_FAILED;
438 	}
439 	if (bio) {
440 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), rc);
441 	}
442 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
443 }
444 
445 static void
446 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
447 {
448 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
449 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
450 	int rc;
451 
452 	rc = bdev_nvme_create_qpair(nvme_ch);
453 
454 	spdk_for_each_channel_continue(i, rc);
455 }
456 
457 static void
458 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
459 {
460 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
461 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
462 	int rc;
463 
464 	if (status) {
465 		rc = status;
466 		goto err;
467 	}
468 
469 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
470 	if (rc != 0) {
471 		goto err;
472 	}
473 
474 	/* Recreate all of the I/O queue pairs */
475 	spdk_for_each_channel(nvme_bdev_ctrlr,
476 			      _bdev_nvme_reset_create_qpair,
477 			      bio,
478 			      _bdev_nvme_reset_create_qpairs_done);
479 	return;
480 
481 err:
482 	if (bio) {
483 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
484 	}
485 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
486 }
487 
488 static void
489 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
490 {
491 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
492 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
493 	int rc;
494 
495 	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
496 	if (!rc) {
497 		nvme_ch->qpair = NULL;
498 	}
499 
500 	spdk_for_each_channel_continue(i, rc);
501 }
502 
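/*
 * Controller reset sequence:
 *  1) destroy the I/O qpair on every channel (_bdev_nvme_reset_destroy_qpair),
 *  2) reset the controller itself (_bdev_nvme_reset_ctrlr),
 *  3) recreate the I/O qpairs (_bdev_nvme_reset_create_qpair),
 *  4) complete any queued resets and, if needed, destruct the controller
 *     (_bdev_nvme_complete_pending_resets / _bdev_nvme_reset_destruct_ctrlr).
 */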
503 static int
504 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, void *ctx)
505 {
506 	pthread_mutex_lock(&g_bdev_nvme_mutex);
507 	if (nvme_bdev_ctrlr->destruct) {
508 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
509 		return -EBUSY;
510 	}
511 
512 	if (nvme_bdev_ctrlr->resetting) {
513 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
514 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
515 		return -EAGAIN;
516 	}
517 
518 	nvme_bdev_ctrlr->resetting = true;
519 
520 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
521 	/* First, delete all NVMe I/O queue pairs. */
522 	spdk_for_each_channel(nvme_bdev_ctrlr,
523 			      _bdev_nvme_reset_destroy_qpair,
524 			      ctx,
525 			      _bdev_nvme_reset_ctrlr);
526 
527 	return 0;
528 }
529 
530 static int
531 bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio)
532 {
533 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
534 	int rc;
535 
536 	rc = _bdev_nvme_reset(nvme_ch->ctrlr, bio);
537 	if (rc == -EBUSY) {
538 		/* Don't bother resetting if the controller is in the process of being destructed. */
539 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
540 		return 0;
541 	} else if (rc == -EAGAIN) {
542 		/*
543 		 * The reset is queued only when it comes from the app framework. This is intentional so
544 		 * that we don't interfere with the app framework's reset strategy, i.e. we defer to the
545 		 * upper layer. If it is already in the middle of a reset, we won't try to schedule another one.
546 		 */
547 		TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link);
548 		return 0;
549 	} else {
550 		return rc;
551 	}
552 }
553 
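/*
 * Fail over to the next transport ID registered for this controller, if any.  The current
 * trid is marked failed and, unless 'remove' is set, moved to the tail of the list so that
 * connections are retried round robin.  The qpair teardown and controller reset path is
 * shared with _bdev_nvme_reset().
 */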
554 static int
555 bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
556 {
557 	struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
558 	int rc = 0;
559 
560 	pthread_mutex_lock(&g_bdev_nvme_mutex);
561 	if (nvme_bdev_ctrlr->destruct) {
562 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
563 		/* Don't bother resetting if the controller is in the process of being destructed. */
564 		return 0;
565 	}
566 
567 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
568 	assert(curr_trid);
569 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
570 	next_trid = TAILQ_NEXT(curr_trid, link);
571 
572 	if (nvme_bdev_ctrlr->resetting) {
573 		if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) {
574 			rc = -EAGAIN;
575 		}
576 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
577 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
578 		return rc;
579 	}
580 
581 	nvme_bdev_ctrlr->resetting = true;
582 	curr_trid->is_failed = true;
583 
584 	if (next_trid) {
585 		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
586 
587 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
588 			       curr_trid->trid.trsvcid,	next_trid->trid.traddr, next_trid->trid.trsvcid);
589 
590 		nvme_bdev_ctrlr->failover_in_progress = true;
591 		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
592 		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
593 		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
594 		assert(rc == 0);
595 		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link);
596 		if (!remove) {
597 			/** Shuffle the old trid to the end of the list and use the new one.
598 			 * Allows for round robin through multiple connections.
599 			 */
600 			TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link);
601 		} else {
602 			free(curr_trid);
603 		}
604 	}
605 
606 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
607 	/* First, delete all NVMe I/O queue pairs. */
608 	spdk_for_each_channel(nvme_bdev_ctrlr,
609 			      _bdev_nvme_reset_destroy_qpair,
610 			      NULL,
611 			      _bdev_nvme_reset_ctrlr);
612 
613 	return 0;
614 }
615 
616 static int
617 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
618 		struct nvme_bdev_io *bio,
619 		uint64_t offset_blocks,
620 		uint64_t num_blocks);
621 
622 static void
623 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
624 		     bool success)
625 {
626 	struct spdk_bdev *bdev = bdev_io->bdev;
627 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
628 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
629 	struct nvme_bdev_ns *nvme_ns;
630 	struct spdk_nvme_qpair *qpair;
631 	int ret;
632 
633 	if (!success) {
634 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
635 		return;
636 	}
637 
638 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
639 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
640 		return;
641 	}
642 
643 	ret = bdev_nvme_readv(nvme_ns->ns,
644 			      qpair,
645 			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
646 			      bdev_io->u.bdev.iovs,
647 			      bdev_io->u.bdev.iovcnt,
648 			      bdev_io->u.bdev.md_buf,
649 			      bdev_io->u.bdev.num_blocks,
650 			      bdev_io->u.bdev.offset_blocks,
651 			      bdev->dif_check_flags);
652 
653 	if (spdk_likely(ret == 0)) {
654 		return;
655 	} else if (ret == -ENOMEM) {
656 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
657 	} else {
658 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
659 	}
660 }
661 
662 static int
663 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
664 {
665 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
666 	struct spdk_bdev *bdev = bdev_io->bdev;
667 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
668 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
669 	struct nvme_bdev_io *nbdev_io_to_abort;
670 	struct nvme_bdev_ns *nvme_ns;
671 	struct spdk_nvme_qpair *qpair;
672 
673 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
674 		return -1;
675 	}
676 
677 	switch (bdev_io->type) {
678 	case SPDK_BDEV_IO_TYPE_READ:
679 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
680 			return bdev_nvme_readv(nvme_ns->ns,
681 					       qpair,
682 					       nbdev_io,
683 					       bdev_io->u.bdev.iovs,
684 					       bdev_io->u.bdev.iovcnt,
685 					       bdev_io->u.bdev.md_buf,
686 					       bdev_io->u.bdev.num_blocks,
687 					       bdev_io->u.bdev.offset_blocks,
688 					       bdev->dif_check_flags);
689 		} else {
690 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
691 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
692 			return 0;
693 		}
694 
695 	case SPDK_BDEV_IO_TYPE_WRITE:
696 		return bdev_nvme_writev(nvme_ns->ns,
697 					qpair,
698 					nbdev_io,
699 					bdev_io->u.bdev.iovs,
700 					bdev_io->u.bdev.iovcnt,
701 					bdev_io->u.bdev.md_buf,
702 					bdev_io->u.bdev.num_blocks,
703 					bdev_io->u.bdev.offset_blocks,
704 					bdev->dif_check_flags);
705 
706 	case SPDK_BDEV_IO_TYPE_COMPARE:
707 		return bdev_nvme_comparev(nvme_ns->ns,
708 					  qpair,
709 					  nbdev_io,
710 					  bdev_io->u.bdev.iovs,
711 					  bdev_io->u.bdev.iovcnt,
712 					  bdev_io->u.bdev.md_buf,
713 					  bdev_io->u.bdev.num_blocks,
714 					  bdev_io->u.bdev.offset_blocks,
715 					  bdev->dif_check_flags);
716 
717 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
718 		return bdev_nvme_comparev_and_writev(nvme_ns->ns,
719 						     qpair,
720 						     nbdev_io,
721 						     bdev_io->u.bdev.iovs,
722 						     bdev_io->u.bdev.iovcnt,
723 						     bdev_io->u.bdev.fused_iovs,
724 						     bdev_io->u.bdev.fused_iovcnt,
725 						     bdev_io->u.bdev.md_buf,
726 						     bdev_io->u.bdev.num_blocks,
727 						     bdev_io->u.bdev.offset_blocks,
728 						     bdev->dif_check_flags);
729 
730 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
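		/* WRITE_ZEROES is serviced with an NVMe Deallocate (unmap).  Support for it is only
		 * claimed when deallocated blocks are guaranteed to read back as zeroes, see
		 * bdev_nvme_io_type_supported().
		 */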
731 		return bdev_nvme_unmap(nvme_ns->ns,
732 				       qpair,
733 				       nbdev_io,
734 				       bdev_io->u.bdev.offset_blocks,
735 				       bdev_io->u.bdev.num_blocks);
736 
737 	case SPDK_BDEV_IO_TYPE_UNMAP:
738 		return bdev_nvme_unmap(nvme_ns->ns,
739 				       qpair,
740 				       nbdev_io,
741 				       bdev_io->u.bdev.offset_blocks,
742 				       bdev_io->u.bdev.num_blocks);
743 
744 	case SPDK_BDEV_IO_TYPE_RESET:
745 		return bdev_nvme_reset(nvme_ch, nbdev_io);
746 
747 	case SPDK_BDEV_IO_TYPE_FLUSH:
748 		return bdev_nvme_flush(nvme_ns->ns,
749 				       qpair,
750 				       nbdev_io,
751 				       bdev_io->u.bdev.offset_blocks,
752 				       bdev_io->u.bdev.num_blocks);
753 
754 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
755 		return bdev_nvme_admin_passthru(nvme_ch,
756 						nbdev_io,
757 						&bdev_io->u.nvme_passthru.cmd,
758 						bdev_io->u.nvme_passthru.buf,
759 						bdev_io->u.nvme_passthru.nbytes);
760 
761 	case SPDK_BDEV_IO_TYPE_NVME_IO:
762 		return bdev_nvme_io_passthru(nvme_ns->ns,
763 					     qpair,
764 					     nbdev_io,
765 					     &bdev_io->u.nvme_passthru.cmd,
766 					     bdev_io->u.nvme_passthru.buf,
767 					     bdev_io->u.nvme_passthru.nbytes);
768 
769 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
770 		return bdev_nvme_io_passthru_md(nvme_ns->ns,
771 						qpair,
772 						nbdev_io,
773 						&bdev_io->u.nvme_passthru.cmd,
774 						bdev_io->u.nvme_passthru.buf,
775 						bdev_io->u.nvme_passthru.nbytes,
776 						bdev_io->u.nvme_passthru.md_buf,
777 						bdev_io->u.nvme_passthru.md_len);
778 
779 	case SPDK_BDEV_IO_TYPE_ABORT:
780 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
781 		return bdev_nvme_abort(nvme_ch,
782 				       nbdev_io,
783 				       nbdev_io_to_abort);
784 
785 	default:
786 		return -EINVAL;
787 	}
788 	return 0;
789 }
790 
791 static void
792 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
793 {
794 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
795 
796 	if (spdk_unlikely(rc != 0)) {
797 		if (rc == -ENOMEM) {
798 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
799 		} else {
800 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
801 		}
802 	}
803 }
804 
805 static bool
806 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
807 {
808 	struct nvme_bdev *nbdev = ctx;
809 	struct nvme_bdev_ns *nvme_ns;
810 	struct spdk_nvme_ns *ns;
811 	struct spdk_nvme_ctrlr *ctrlr;
812 	const struct spdk_nvme_ctrlr_data *cdata;
813 
814 	nvme_ns = nvme_bdev_to_bdev_ns(nbdev);
815 	assert(nvme_ns != NULL);
816 	ns = nvme_ns->ns;
817 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
818 
819 	switch (io_type) {
820 	case SPDK_BDEV_IO_TYPE_READ:
821 	case SPDK_BDEV_IO_TYPE_WRITE:
822 	case SPDK_BDEV_IO_TYPE_RESET:
823 	case SPDK_BDEV_IO_TYPE_FLUSH:
824 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
825 	case SPDK_BDEV_IO_TYPE_NVME_IO:
826 	case SPDK_BDEV_IO_TYPE_ABORT:
827 		return true;
828 
829 	case SPDK_BDEV_IO_TYPE_COMPARE:
830 		return spdk_nvme_ns_supports_compare(ns);
831 
832 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
833 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
834 
835 	case SPDK_BDEV_IO_TYPE_UNMAP:
836 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
837 		return cdata->oncs.dsm;
838 
839 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
840 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
841 		/*
842 		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
843 		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
844 		 */
845 		if (cdata->oncs.dsm &&
846 		    spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) ==
847 		    SPDK_NVME_DEALLOC_READ_00) {
848 			return true;
849 		}
850 		/*
851 		 * The NVMe controller write_zeroes function is currently not used by our driver.
852 		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
853 		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
854 		 */
855 		return false;
856 
857 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
858 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
859 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
860 			return true;
861 		}
862 		return false;
863 
864 	default:
865 		return false;
866 	}
867 }
868 
869 static int
870 bdev_nvme_create_cb(void *io_device, void *ctx_buf)
871 {
872 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
873 	struct nvme_io_channel *nvme_ch = ctx_buf;
874 	struct spdk_io_channel *pg_ch = NULL;
875 	int rc;
876 
877 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
878 		rc = bdev_ocssd_create_io_channel(nvme_ch);
879 		if (rc != 0) {
880 			return rc;
881 		}
882 	}
883 
884 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
885 	if (!pg_ch) {
886 		rc = -1;
887 		goto err_pg_ch;
888 	}
889 
890 	nvme_ch->group = spdk_io_channel_get_ctx(pg_ch);
891 
892 #ifdef SPDK_CONFIG_VTUNE
893 	nvme_ch->group->collect_spin_stat = true;
894 #else
895 	nvme_ch->group->collect_spin_stat = false;
896 #endif
897 
898 	TAILQ_INIT(&nvme_ch->pending_resets);
899 
900 	nvme_ch->ctrlr = nvme_bdev_ctrlr;
901 
902 	rc = bdev_nvme_create_qpair(nvme_ch);
903 	if (rc != 0) {
904 		goto err_qpair;
905 	}
906 
907 	return 0;
908 
909 err_qpair:
910 	spdk_put_io_channel(pg_ch);
911 err_pg_ch:
912 	if (nvme_ch->ocssd_ch) {
913 		bdev_ocssd_destroy_io_channel(nvme_ch);
914 	}
915 
916 	return rc;
917 }
918 
919 static void
920 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
921 {
922 	struct nvme_io_channel *nvme_ch = ctx_buf;
923 
924 	assert(nvme_ch->group != NULL);
925 
926 	if (nvme_ch->ocssd_ch != NULL) {
927 		bdev_ocssd_destroy_io_channel(nvme_ch);
928 	}
929 
930 	spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
931 
932 	spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group));
933 }
934 
935 static int
936 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
937 {
938 	struct nvme_bdev_poll_group *group = ctx_buf;
939 
940 	group->group = spdk_nvme_poll_group_create(group);
941 	if (group->group == NULL) {
942 		return -1;
943 	}
944 
945 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
946 
947 	if (group->poller == NULL) {
948 		spdk_nvme_poll_group_destroy(group->group);
949 		return -1;
950 	}
951 
952 	return 0;
953 }
954 
955 static void
956 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
957 {
958 	struct nvme_bdev_poll_group *group = ctx_buf;
959 
960 	spdk_poller_unregister(&group->poller);
961 	if (spdk_nvme_poll_group_destroy(group->group)) {
962 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
963 		assert(false);
964 	}
965 }
966 
967 static struct spdk_io_channel *
968 bdev_nvme_get_io_channel(void *ctx)
969 {
970 	struct nvme_bdev *nvme_bdev = ctx;
971 
972 	return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr);
973 }
974 
975 static void *
976 bdev_nvme_get_module_ctx(void *ctx)
977 {
978 	struct nvme_bdev *nvme_bdev = ctx;
979 
980 	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
981 }
982 
983 static int
984 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
985 {
986 	struct nvme_bdev *nvme_bdev = ctx;
987 	struct nvme_bdev_ns *nvme_ns;
988 	struct spdk_nvme_ns *ns;
989 	struct spdk_nvme_ctrlr *ctrlr;
990 	const struct spdk_nvme_ctrlr_data *cdata;
991 	const struct spdk_nvme_transport_id *trid;
992 	union spdk_nvme_vs_register vs;
993 	union spdk_nvme_csts_register csts;
994 	char buf[128];
995 
996 	nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev);
997 	assert(nvme_ns != NULL);
998 	ns = nvme_ns->ns;
999 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1000 
1001 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1002 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
1003 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
1004 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1005 
1006 	spdk_json_write_named_object_begin(w, "nvme");
1007 
1008 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1009 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
1010 	}
1011 
1012 	spdk_json_write_named_object_begin(w, "trid");
1013 
1014 	nvme_bdev_dump_trid_json(trid, w);
1015 
1016 	spdk_json_write_object_end(w);
1017 
1018 #ifdef SPDK_CONFIG_NVME_CUSE
1019 	size_t cuse_name_size = 128;
1020 	char cuse_name[cuse_name_size];
1021 
1022 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
1023 					    cuse_name, &cuse_name_size);
1024 	if (rc == 0) {
1025 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
1026 	}
1027 #endif
1028 
1029 	spdk_json_write_named_object_begin(w, "ctrlr_data");
1030 
1031 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
1032 
1033 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
1034 	spdk_str_trim(buf);
1035 	spdk_json_write_named_string(w, "model_number", buf);
1036 
1037 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
1038 	spdk_str_trim(buf);
1039 	spdk_json_write_named_string(w, "serial_number", buf);
1040 
1041 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
1042 	spdk_str_trim(buf);
1043 	spdk_json_write_named_string(w, "firmware_revision", buf);
1044 
1045 	if (cdata->subnqn[0] != '\0') {
1046 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
1047 	}
1048 
1049 	spdk_json_write_named_object_begin(w, "oacs");
1050 
1051 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
1052 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
1053 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
1054 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
1055 
1056 	spdk_json_write_object_end(w);
1057 
1058 	spdk_json_write_object_end(w);
1059 
1060 	spdk_json_write_named_object_begin(w, "vs");
1061 
1062 	spdk_json_write_name(w, "nvme_version");
1063 	if (vs.bits.ter) {
1064 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
1065 	} else {
1066 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
1067 	}
1068 
1069 	spdk_json_write_object_end(w);
1070 
1071 	spdk_json_write_named_object_begin(w, "csts");
1072 
1073 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
1074 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
1075 
1076 	spdk_json_write_object_end(w);
1077 
1078 	spdk_json_write_named_object_begin(w, "ns_data");
1079 
1080 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
1081 
1082 	spdk_json_write_object_end(w);
1083 
1084 	if (cdata->oacs.security) {
1085 		spdk_json_write_named_object_begin(w, "security");
1086 
1087 		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);
1088 
1089 		spdk_json_write_object_end(w);
1090 	}
1091 
1092 	spdk_json_write_object_end(w);
1093 
1094 	return 0;
1095 }
1096 
1097 static void
1098 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1099 {
1100 	/* No config per bdev needed */
1101 }
1102 
1103 static uint64_t
1104 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
1105 {
1106 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
1107 	struct nvme_bdev_poll_group *group = nvme_ch->group;
1108 	uint64_t spin_time;
1109 
1110 	if (!group || !group->collect_spin_stat) {
1111 		return 0;
1112 	}
1113 
1114 	if (group->end_ticks != 0) {
1115 		group->spin_ticks += (group->end_ticks - group->start_ticks);
1116 		group->end_ticks = 0;
1117 	}
1118 
1119 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1120 	group->start_ticks = 0;
1121 	group->spin_ticks = 0;
1122 
1123 	return spin_time;
1124 }
1125 
1126 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1127 	.destruct		= bdev_nvme_destruct,
1128 	.submit_request		= bdev_nvme_submit_request,
1129 	.io_type_supported	= bdev_nvme_io_type_supported,
1130 	.get_io_channel		= bdev_nvme_get_io_channel,
1131 	.dump_info_json		= bdev_nvme_dump_info_json,
1132 	.write_config_json	= bdev_nvme_write_config_json,
1133 	.get_spin_time		= bdev_nvme_get_spin_time,
1134 	.get_module_ctx		= bdev_nvme_get_module_ctx,
1135 };
1136 
1137 static int
1138 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
1139 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
1140 		 uint32_t prchk_flags, void *ctx)
1141 {
1142 	const struct spdk_uuid		*uuid;
1143 	const struct spdk_nvme_ctrlr_data *cdata;
1144 	const struct spdk_nvme_ns_data	*nsdata;
1145 	int				rc;
1146 
1147 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1148 
1149 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
1150 	if (!disk->name) {
1151 		return -ENOMEM;
1152 	}
1153 	disk->product_name = "NVMe disk";
1154 
1155 	disk->write_cache = 0;
1156 	if (cdata->vwc.present) {
1157 		/* Enable if the Volatile Write Cache exists */
1158 		disk->write_cache = 1;
1159 	}
1160 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1161 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1162 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1163 
1164 	uuid = spdk_nvme_ns_get_uuid(ns);
1165 	if (uuid != NULL) {
1166 		disk->uuid = *uuid;
1167 	}
1168 
1169 	nsdata = spdk_nvme_ns_get_data(ns);
1170 
1171 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
1172 	if (disk->md_len != 0) {
1173 		disk->md_interleave = nsdata->flbas.extended;
1174 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1175 		if (disk->dif_type != SPDK_DIF_DISABLE) {
1176 			disk->dif_is_head_of_md = nsdata->dps.md_start;
1177 			disk->dif_check_flags = prchk_flags;
1178 		}
1179 	}
1180 
1181 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
1182 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
1183 		disk->acwu = 0;
1184 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1185 		disk->acwu = nsdata->nacwu;
1186 	} else {
1187 		disk->acwu = cdata->acwu;
1188 	}
1189 
1190 	disk->ctxt = ctx;
1191 	disk->fn_table = &nvmelib_fn_table;
1192 	disk->module = &nvme_if;
1193 	rc = spdk_bdev_register(disk);
1194 	if (rc) {
1195 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
1196 		free(disk->name);
1197 		return rc;
1198 	}
1199 
1200 	return 0;
1201 }
1202 
1203 static int
1204 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns)
1205 {
1206 	struct nvme_bdev *bdev;
1207 	int rc;
1208 
1209 	bdev = calloc(1, sizeof(*bdev));
1210 	if (!bdev) {
1211 		SPDK_ERRLOG("bdev calloc() failed\n");
1212 		return -ENOMEM;
1213 	}
1214 
1215 	bdev->nvme_ns = nvme_ns;
1216 	bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL;
1217 
1218 	rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr,
1219 			      nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev);
1220 	if (rc != 0) {
1221 		SPDK_ERRLOG("Failed to create NVMe disk\n");
1222 		free(bdev);
1223 		return rc;
1224 	}
1225 
1226 	nvme_ns->ref++;
1227 	TAILQ_INSERT_TAIL(&nvme_ns->bdevs, bdev, tailq);
1228 
1229 	return 0;
1230 }
1231 
1232 static void
1233 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1234 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
1235 {
1236 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1237 	struct spdk_nvme_ns	*ns;
1238 	int			rc = 0;
1239 
1240 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1241 	if (!ns) {
1242 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
1243 		rc = -EINVAL;
1244 		goto done;
1245 	}
1246 
1247 	nvme_ns->ns = ns;
1248 	nvme_ns->ref = 1;
1249 
1250 	rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns);
1251 done:
1252 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1253 }
1254 
1255 static bool
1256 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1257 		 struct spdk_nvme_ctrlr_opts *opts)
1258 {
1259 	struct nvme_probe_skip_entry *entry;
1260 
1261 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1262 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1263 			return false;
1264 		}
1265 	}
1266 
1267 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1268 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1269 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1270 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1271 
1272 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1273 
1274 	return true;
1275 }
1276 
1277 static void
1278 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1279 {
1280 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx;
1281 
1282 	if (spdk_nvme_cpl_is_error(cpl)) {
1283 		SPDK_WARNLOG("Abort failed. Resetting controller.\n");
1284 		_bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1285 	}
1286 }
1287 
1288 static void
1289 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1290 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1291 {
1292 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg;
1293 	union spdk_nvme_csts_register csts;
1294 	int rc;
1295 
1296 	assert(nvme_bdev_ctrlr->ctrlr == ctrlr);
1297 
1298 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1299 
1300 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
1301 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
1302 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
1303 	 * completion recursively.
1304 	 */
1305 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
1306 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1307 		if (csts.bits.cfs) {
1308 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1309 			_bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1310 			return;
1311 		}
1312 	}
1313 
1314 	switch (g_opts.action_on_timeout) {
1315 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1316 		if (qpair) {
1317 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1318 						       nvme_abort_cpl, nvme_bdev_ctrlr);
1319 			if (rc == 0) {
1320 				return;
1321 			}
1322 
1323 			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
1324 		}
1325 
1326 	/* FALLTHROUGH */
1327 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1328 		_bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
1329 		break;
1330 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1331 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1332 		break;
1333 	default:
1334 		SPDK_ERRLOG("An invalid timeout action value was found.\n");
1335 		break;
1336 	}
1337 }
1338 
1339 void
1340 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ns *nvme_ns)
1341 {
1342 	nvme_bdev_ns_detach(nvme_ns);
1343 }
1344 
1345 static void
1346 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns)
1347 {
1348 	struct nvme_bdev *bdev, *tmp;
1349 
1350 	TAILQ_FOREACH_SAFE(bdev, &nvme_ns->bdevs, tailq, tmp) {
1351 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1352 	}
1353 
1354 	nvme_ns->populated = false;
1355 
1356 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
1357 }
1358 
1359 static void
1360 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns,
1361 			      struct nvme_async_probe_ctx *ctx)
1362 {
1363 	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
1364 }
1365 
1366 static void
1367 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns)
1368 {
1369 	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
1370 }
1371 
1372 void
1373 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1374 				   struct nvme_bdev_ns *nvme_ns, int rc)
1375 {
1376 	if (rc == 0) {
1377 		nvme_ns->populated = true;
1378 		pthread_mutex_lock(&g_bdev_nvme_mutex);
1379 		nvme_ns->ctrlr->ref++;
1380 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1381 	} else {
1382 		memset(nvme_ns, 0, sizeof(*nvme_ns));
1383 	}
1384 
1385 	if (ctx) {
1386 		ctx->populates_in_progress--;
1387 		if (ctx->populates_in_progress == 0) {
1388 			nvme_ctrlr_populate_namespaces_done(ctx);
1389 		}
1390 	}
1391 }
1392 
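/*
 * Walk every namespace on the controller: resize bdevs for namespaces that are still
 * active, create bdevs for namespaces that became active, and depopulate namespaces that
 * went inactive.  Called with ctx == NULL from attach_cb() and from the AER handler, and
 * with a probe context when completion must be reported via
 * nvme_ctrlr_populate_namespaces_done().
 */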
1393 static void
1394 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1395 			       struct nvme_async_probe_ctx *ctx)
1396 {
1397 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1398 	struct nvme_bdev_ns	*nvme_ns;
1399 	struct spdk_nvme_ns	*ns;
1400 	struct nvme_bdev	*bdev;
1401 	uint32_t		i;
1402 	int			rc;
1403 	uint64_t		num_sectors;
1404 	bool			ns_is_active;
1405 
1406 	if (ctx) {
1407 		/* Initialize this count to 1 to handle the populate functions
1408 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1409 		 */
1410 		ctx->populates_in_progress = 1;
1411 	}
1412 
1413 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1414 		uint32_t	nsid = i + 1;
1415 
1416 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
1417 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1418 
1419 		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) {
1420 			/* NS is still there but attributes may have changed */
1421 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1422 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
1423 			bdev = TAILQ_FIRST(&nvme_ns->bdevs);
1424 			if (bdev->disk.blockcnt != num_sectors) {
1425 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
1426 					       nsid,
1427 					       bdev->disk.name,
1428 					       bdev->disk.blockcnt,
1429 					       num_sectors);
1430 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1431 				if (rc != 0) {
1432 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1433 						    bdev->disk.name, rc);
1434 				}
1435 			}
1436 		}
1437 
1438 		if (!nvme_ns->populated && ns_is_active) {
1439 			nvme_ns->id = nsid;
1440 			nvme_ns->ctrlr = nvme_bdev_ctrlr;
1441 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1442 				nvme_ns->type = NVME_BDEV_NS_OCSSD;
1443 			} else {
1444 				nvme_ns->type = NVME_BDEV_NS_STANDARD;
1445 			}
1446 
1447 			TAILQ_INIT(&nvme_ns->bdevs);
1448 
1449 			if (ctx) {
1450 				ctx->populates_in_progress++;
1451 			}
1452 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx);
1453 		}
1454 
1455 		if (nvme_ns->populated && !ns_is_active) {
1456 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1457 		}
1458 	}
1459 
1460 	if (ctx) {
1461 		/* Decrement this count now that the loop is over to account
1462 		 * for the one we started with.  If the count is then 0, we
1463 		 * know any populate_namespace functions completed immediately,
1464 		 * so we'll kick the callback here.
1465 		 */
1466 		ctx->populates_in_progress--;
1467 		if (ctx->populates_in_progress == 0) {
1468 			nvme_ctrlr_populate_namespaces_done(ctx);
1469 		}
1470 	}
1471 
1472 }
1473 
1474 static void
1475 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1476 {
1477 	uint32_t i;
1478 	struct nvme_bdev_ns *nvme_ns;
1479 
1480 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1481 		uint32_t nsid = i + 1;
1482 
1483 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1484 		if (nvme_ns->populated) {
1485 			assert(nvme_ns->id == nsid);
1486 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1487 		}
1488 	}
1489 }
1490 
1491 static void
1492 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1493 {
1494 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1495 	union spdk_nvme_async_event_completion	event;
1496 
1497 	if (spdk_nvme_cpl_is_error(cpl)) {
1498 		SPDK_WARNLOG("AER request execution failed\n");
1499 		return;
1500 	}
1501 
1502 	event.raw = cpl->cdw0;
1503 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1504 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1505 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1506 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1507 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1508 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1509 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1510 	}
1511 }
1512 
1513 static int
1514 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
1515 		       const char *name,
1516 		       const struct spdk_nvme_transport_id *trid,
1517 		       uint32_t prchk_flags)
1518 {
1519 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1520 	struct nvme_bdev_ctrlr_trid *trid_entry;
1521 	uint32_t i;
1522 	int rc;
1523 
1524 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1525 	if (nvme_bdev_ctrlr == NULL) {
1526 		SPDK_ERRLOG("Failed to allocate device struct\n");
1527 		return -ENOMEM;
1528 	}
1529 
1530 	TAILQ_INIT(&nvme_bdev_ctrlr->trids);
1531 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1532 	nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1533 	if (!nvme_bdev_ctrlr->namespaces) {
1534 		SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
1535 		rc = -ENOMEM;
1536 		goto err_alloc_namespaces;
1537 	}
1538 
1539 	trid_entry = calloc(1, sizeof(*trid_entry));
1540 	if (trid_entry == NULL) {
1541 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
1542 		rc = -ENOMEM;
1543 		goto err_alloc_trid;
1544 	}
1545 
1546 	trid_entry->trid = *trid;
1547 
1548 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1549 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1550 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1551 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1552 			rc = -ENOMEM;
1553 			goto err_alloc_namespace;
1554 		}
1555 	}
1556 
1557 	nvme_bdev_ctrlr->thread = spdk_get_thread();
1558 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1559 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1560 	nvme_bdev_ctrlr->ref = 1;
1561 	nvme_bdev_ctrlr->connected_trid = &trid_entry->trid;
1562 	nvme_bdev_ctrlr->name = strdup(name);
1563 	if (nvme_bdev_ctrlr->name == NULL) {
1564 		rc = -ENOMEM;
1565 		goto err_alloc_name;
1566 	}
1567 
1568 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1569 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1570 		if (spdk_unlikely(rc != 0)) {
1571 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1572 			goto err_init_ocssd;
1573 		}
1574 	}
1575 
1576 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1577 
1578 	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
1579 				sizeof(struct nvme_io_channel),
1580 				name);
1581 
1582 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr,
1583 					       g_opts.nvme_adminq_poll_period_us);
1584 
1585 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1586 
1587 	if (g_opts.timeout_us > 0) {
1588 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1589 				timeout_cb, nvme_bdev_ctrlr);
1590 	}
1591 
1592 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1593 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr);
1594 
1595 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1596 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1597 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1598 		if (nvme_bdev_ctrlr->opal_dev == NULL) {
1599 			SPDK_ERRLOG("Failed to initialize Opal\n");
1600 		}
1601 	}
1602 
1603 	TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link);
1604 	return 0;
1605 
1606 err_init_ocssd:
1607 	free(nvme_bdev_ctrlr->name);
1608 err_alloc_name:
1609 err_alloc_namespace:
1610 	for (; i > 0; i--) {
1611 		free(nvme_bdev_ctrlr->namespaces[i - 1]);
1612 	}
1613 	free(trid_entry);
1614 err_alloc_trid:
1615 	free(nvme_bdev_ctrlr->namespaces);
1616 err_alloc_namespaces:
1617 	free(nvme_bdev_ctrlr);
1618 	return rc;
1619 }
1620 
1621 static void
1622 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1623 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1624 {
1625 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1626 	struct nvme_probe_ctx *ctx = cb_ctx;
1627 	char *name = NULL;
1628 	uint32_t prchk_flags = 0;
1629 	size_t i;
1630 
1631 	if (ctx) {
1632 		for (i = 0; i < ctx->count; i++) {
1633 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1634 				prchk_flags = ctx->prchk_flags[i];
1635 				name = strdup(ctx->names[i]);
1636 				break;
1637 			}
1638 		}
1639 	} else {
1640 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1641 	}
1642 	if (!name) {
1643 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1644 		return;
1645 	}
1646 
1647 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
1648 
1649 	nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags);
1650 
1651 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid);
1652 	if (!nvme_bdev_ctrlr) {
1653 		SPDK_ERRLOG("Failed to find new NVMe controller\n");
1654 		free(name);
1655 		return;
1656 	}
1657 
1658 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1659 
1660 	free(name);
1661 }
1662 
1663 static void
1664 _nvme_bdev_ctrlr_destruct(void *ctx)
1665 {
1666 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx;
1667 
1668 	nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr);
1669 	nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1670 }
1671 
1672 static void
1673 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
1674 {
1675 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx;
1676 
1677 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1678 	assert(nvme_bdev_ctrlr->ctrlr == ctrlr);
1679 	/* The controller's destruction was already started */
1680 	if (nvme_bdev_ctrlr->destruct) {
1681 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
1682 		return;
1683 	}
1684 	nvme_bdev_ctrlr->destruct = true;
1685 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1686 	_nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1687 }
1688 
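/*
 * PCIe hotplug is driven by two pollers: bdev_nvme_hotplug() periodically starts an
 * asynchronous probe, and bdev_nvme_hotplug_probe() advances that probe until it finishes,
 * at which point it unregisters itself.
 */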
1689 static int
1690 bdev_nvme_hotplug_probe(void *arg)
1691 {
1692 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
1693 		g_hotplug_probe_ctx = NULL;
1694 		spdk_poller_unregister(&g_hotplug_probe_poller);
1695 	}
1696 
1697 	return SPDK_POLLER_BUSY;
1698 }
1699 
1700 static int
1701 bdev_nvme_hotplug(void *arg)
1702 {
1703 	struct spdk_nvme_transport_id trid_pcie;
1704 
1705 	if (g_hotplug_probe_ctx) {
1706 		return SPDK_POLLER_BUSY;
1707 	}
1708 
1709 	memset(&trid_pcie, 0, sizeof(trid_pcie));
1710 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
1711 
1712 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
1713 			      hotplug_probe_cb, attach_cb, NULL);
1714 
1715 	if (g_hotplug_probe_ctx) {
1716 		assert(g_hotplug_probe_poller == NULL);
1717 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
1718 	}
1719 
1720 	return SPDK_POLLER_BUSY;
1721 }
1722 
1723 void
1724 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
1725 {
1726 	*opts = g_opts;
1727 }
1728 
1729 int
1730 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
1731 {
1732 	if (g_bdev_nvme_init_thread != NULL) {
1733 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1734 			return -EPERM;
1735 		}
1736 	}
1737 
1738 	g_opts = *opts;
1739 
1740 	return 0;
1741 }
1742 
1743 struct set_nvme_hotplug_ctx {
1744 	uint64_t period_us;
1745 	bool enabled;
1746 	spdk_msg_fn fn;
1747 	void *fn_ctx;
1748 };
1749 
1750 static void
1751 set_nvme_hotplug_period_cb(void *_ctx)
1752 {
1753 	struct set_nvme_hotplug_ctx *ctx = _ctx;
1754 
1755 	spdk_poller_unregister(&g_hotplug_poller);
1756 	if (ctx->enabled) {
1757 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
1758 	}
1759 
1760 	g_nvme_hotplug_poll_period_us = ctx->period_us;
1761 	g_nvme_hotplug_enabled = ctx->enabled;
1762 	if (ctx->fn) {
1763 		ctx->fn(ctx->fn_ctx);
1764 	}
1765 
1766 	free(ctx);
1767 }
1768 
1769 int
1770 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
1771 {
1772 	struct set_nvme_hotplug_ctx *ctx;
1773 
1774 	if (enabled && !spdk_process_is_primary()) {
1775 		return -EPERM;
1776 	}
1777 
1778 	ctx = calloc(1, sizeof(*ctx));
1779 	if (ctx == NULL) {
1780 		return -ENOMEM;
1781 	}
1782 
1783 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
1784 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
1785 	ctx->enabled = enabled;
1786 	ctx->fn = cb;
1787 	ctx->fn_ctx = cb_ctx;
1788 
1789 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
1790 	return 0;
1791 }
1792 
1793 static void
1794 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1795 {
1796 	if (ctx->cb_fn) {
1797 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1798 	}
1799 
1800 	ctx->namespaces_populated = true;
1801 	if (ctx->probe_done) {
1802 		/* The probe was already completed, so we need to free the context
1803 		 * here.  This can happen for cases like OCSSD, where we need to
1804 		 * send additional commands to the SSD after attach.
1805 		 */
1806 		free(ctx);
1807 	}
1808 }
1809 
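/*
 * Called once namespace population for a newly attached controller has
 * finished. Collects the names of the bdevs created for each populated
 * namespace into ctx->names (up to ctx->count entries) and invokes the
 * user callback.
 */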
1810 static void
1811 nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx)
1812 {
1813 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1814 	struct nvme_bdev_ns	*nvme_ns;
1815 	struct nvme_bdev	*nvme_bdev, *tmp;
1816 	uint32_t		i, nsid;
1817 	size_t			j;
1818 
1819 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
1820 	assert(nvme_bdev_ctrlr != NULL);
1821 
1822 	/*
1823 	 * Report the new bdevs that were created in this call.
1824 	 * There can be more than one bdev per NVMe controller.
1825 	 */
1826 	j = 0;
1827 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1828 		nsid = i + 1;
1829 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1830 		if (!nvme_ns->populated) {
1831 			continue;
1832 		}
1833 		assert(nvme_ns->id == nsid);
1834 		TAILQ_FOREACH_SAFE(nvme_bdev, &nvme_ns->bdevs, tailq, tmp) {
1835 			if (j < ctx->count) {
1836 				ctx->names[j] = nvme_bdev->disk.name;
1837 				j++;
1838 			} else {
1839 				SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
1840 					    ctx->count);
1841 				populate_namespaces_cb(ctx, 0, -ERANGE);
1842 				return;
1843 			}
1844 		}
1845 	}
1846 
1847 	populate_namespaces_cb(ctx, j, 0);
1848 }
1849 
1850 static bool
1851 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
1852 {
1853 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
1854 
1855 	nsdata1 = spdk_nvme_ns_get_data(ns1);
1856 	nsdata2 = spdk_nvme_ns_get_data(ns2);
1857 
1858 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0;
1859 }
1860 
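/*
 * Register an additional transport ID as a failover path for an existing
 * controller. The new path must use the same transport type and subsystem NQN
 * and must expose the same namespaces (matched by NGUID) as the ones already
 * populated. The path is inserted before the first failed path, or at the
 * tail if none has failed.
 */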
1861 static int
1862 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_ctrlr *new_ctrlr,
1863 		   struct spdk_nvme_transport_id *trid)
1864 {
1865 	uint32_t			i, nsid;
1866 	struct nvme_bdev_ns		*nvme_ns;
1867 	struct spdk_nvme_ns		*new_ns;
1868 	struct nvme_bdev_ctrlr_trid	*new_trid, *tmp_trid;
1869 	int				rc = 0;
1870 
1871 	assert(nvme_bdev_ctrlr != NULL);
1872 
1873 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1874 		SPDK_ERRLOG("PCIe failover is not supported.\n");
1875 		return -ENOTSUP;
1876 	}
1877 
1878 	pthread_mutex_lock(&g_bdev_nvme_mutex);
1879 
1880 	/* Currently we only support failover to the same transport type. */
1881 	if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) {
1882 		rc = -EINVAL;
1883 		goto exit;
1884 	}
1885 
1886 	/* Currently we only support failover to the same NQN. */
1887 	if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
1888 		rc = -EINVAL;
1889 		goto exit;
1890 	}
1891 
1892 	/* Skip all the other checks if we've already registered this path. */
1893 	TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) {
1894 		if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) {
1895 			rc = -EEXIST;
1896 			goto exit;
1897 		}
1898 	}
1899 
1900 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) {
1901 		rc = -EINVAL;
1902 		goto exit;
1903 	}
1904 
1905 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1906 		nsid = i + 1;
1907 
1908 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
1909 		if (!nvme_ns->populated) {
1910 			continue;
1911 		}
1912 
1913 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
1914 		assert(new_ns != NULL);
1915 
1916 		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
1917 			rc = -EINVAL;
1918 			goto exit;
1919 		}
1920 	}
1921 
1922 	new_trid = calloc(1, sizeof(*new_trid));
1923 	if (new_trid == NULL) {
1924 		rc = -ENOMEM;
1925 		goto exit;
1926 	}
1927 	new_trid->trid = *trid;
1928 	new_trid->is_failed = false;
1929 
1930 	TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) {
1931 		if (tmp_trid->is_failed) {
1932 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
1933 			goto exit;
1934 		}
1935 	}
1936 
1937 	TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link);
1938 
1939 exit:
1940 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
1941 	return rc;
1942 }
1943 
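/*
 * Attach callback for spdk_nvme_connect_async(). If a controller with the same
 * name already exists, the new path is registered for failover and the
 * temporary connection is detached; otherwise a new nvme_bdev_ctrlr is created
 * and its namespaces are populated.
 */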
1944 static void
1945 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1946 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1947 {
1948 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
1949 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
1950 	struct nvme_async_probe_ctx *ctx;
1951 	int rc;
1952 
1953 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
1954 	ctx->ctrlr_attached = true;
1955 
1956 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
1957 	if (nvme_bdev_ctrlr) {
1958 		/* This is the case where a secondary path is added to an existing
1959 		 * nvme_bdev_ctrlr for failover. After checking whether it can access the same
1960 		 * namespaces as the primary path, it is disconnected until failover occurs.
1961 		 */
1962 		rc = bdev_nvme_add_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid);
1963 
1964 		spdk_nvme_detach(ctrlr);
1965 		goto exit;
1966 	}
1967 
1968 	rc = nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags);
1969 	if (rc) {
1970 		SPDK_ERRLOG("Failed to create new device\n");
1971 		goto exit;
1972 	}
1973 
1974 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
1975 	assert(nvme_bdev_ctrlr != NULL);
1976 
1977 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
1978 	return;
1979 
1980 exit:
1981 	populate_namespaces_cb(ctx, 0, rc);
1982 }
1983 
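/*
 * Poller that drives the asynchronous connect started by bdev_nvme_create().
 * Once the probe finishes, it reports -EIO if no controller was attached, or
 * frees the context if namespace population has already completed.
 */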
1984 static int
1985 bdev_nvme_async_poll(void *arg)
1986 {
1987 	struct nvme_async_probe_ctx	*ctx = arg;
1988 	int				rc;
1989 
1990 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
1991 	if (spdk_unlikely(rc != -EAGAIN)) {
1992 		ctx->probe_done = true;
1993 		spdk_poller_unregister(&ctx->poller);
1994 		if (!ctx->ctrlr_attached) {
1995 			/* The probe is done, but no controller was attached.
1996 			 * That means we had a failure, so report -EIO back to
1997 			 * the caller (usually the RPC). populate_namespaces_cb()
1998 			 * will take care of freeing the nvme_async_probe_ctx.
1999 			 */
2000 			populate_namespaces_cb(ctx, 0, -EIO);
2001 		} else if (ctx->namespaces_populated) {
2002 			/* The namespaces for the attached controller were all
2003 			 * populated and the response was already sent to the
2004 			 * caller (usually the RPC).  So free the context here.
2005 			 */
2006 			free(ctx);
2007 		}
2008 	}
2009 
2010 	return SPDK_POLLER_BUSY;
2011 }
2012 
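/*
 * Start an asynchronous attach of the controller described by trid/hostid.
 * The result is reported to cb_fn through populate_namespaces_cb() once the
 * names of the created bdevs are known.
 */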
2013 int
2014 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
2015 		 struct spdk_nvme_host_id *hostid,
2016 		 const char *base_name,
2017 		 const char **names,
2018 		 uint32_t count,
2019 		 const char *hostnqn,
2020 		 uint32_t prchk_flags,
2021 		 spdk_bdev_create_nvme_fn cb_fn,
2022 		 void *cb_ctx)
2023 {
2024 	struct nvme_probe_skip_entry	*entry, *tmp;
2025 	struct nvme_async_probe_ctx	*ctx;
2026 
2027 	/* TODO expand this check to include both the host and target TRIDs.
2028 	 * Only if both are the same should we fail.
2029 	 */
2030 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
2031 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
2032 		return -EEXIST;
2033 	}
2034 
2035 	ctx = calloc(1, sizeof(*ctx));
2036 	if (!ctx) {
2037 		return -ENOMEM;
2038 	}
2039 	ctx->base_name = base_name;
2040 	ctx->names = names;
2041 	ctx->count = count;
2042 	ctx->cb_fn = cb_fn;
2043 	ctx->cb_ctx = cb_ctx;
2044 	ctx->prchk_flags = prchk_flags;
2045 	ctx->trid = *trid;
2046 
2047 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2048 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
2049 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2050 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2051 				free(entry);
2052 				break;
2053 			}
2054 		}
2055 	}
2056 
2057 	spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
2058 	ctx->opts.transport_retry_count = g_opts.retry_count;
2059 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
2060 
2061 	if (hostnqn) {
2062 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
2063 	}
2064 
2065 	if (hostid->hostaddr[0] != '\0') {
2066 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
2067 	}
2068 
2069 	if (hostid->hostsvcid[0] != '\0') {
2070 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
2071 	}
2072 
2073 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
2074 	if (ctx->probe_ctx == NULL) {
2075 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
2076 		free(ctx);
2077 		return -ENODEV;
2078 	}
2079 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
2080 
2081 	return 0;
2082 }
2083 
2084 int
2085 bdev_nvme_remove_trid(const char *name, struct spdk_nvme_transport_id *trid)
2086 {
2087 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
2088 	struct nvme_bdev_ctrlr_trid	*ctrlr_trid, *tmp_trid;
2089 
2090 	if (name == NULL) {
2091 		return -EINVAL;
2092 	}
2093 
2094 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
2095 	if (nvme_bdev_ctrlr == NULL) {
2096 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2097 		return -ENODEV;
2098 	}
2099 
2100 	/* case 1: we are currently using the path to be removed. */
2101 	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
2102 		ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
2103 		assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid);
2104 		/* case 1A: the current path is the only path. */
2105 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
2106 			return bdev_nvme_delete(name);
2107 		}
2108 
2109 		/* case 1B: there is an alternative path. */
2110 		return bdev_nvme_failover(nvme_bdev_ctrlr, true);
2111 	}
2112 	/* case 2: we are not using the specified path. */
2113 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) {
2114 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
2115 			TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
2116 			free(ctrlr_trid);
2117 			return 0;
2118 		}
2119 	}
2120 
2121 	/* case 2A: the address isn't even in the registered list. */
2122 	return -ENXIO;
2123 }
2124 
2125 int
2126 bdev_nvme_delete(const char *name)
2127 {
2128 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
2129 	struct nvme_probe_skip_entry *entry;
2130 
2131 	if (name == NULL) {
2132 		return -EINVAL;
2133 	}
2134 
2135 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2136 
2137 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
2138 	if (nvme_bdev_ctrlr == NULL) {
2139 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2140 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2141 		return -ENODEV;
2142 	}
2143 
2144 	/* The controller's destruction was already started */
2145 	if (nvme_bdev_ctrlr->destruct) {
2146 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2147 		return 0;
2148 	}
2149 
2150 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2151 		entry = calloc(1, sizeof(*entry));
2152 		if (!entry) {
2153 			pthread_mutex_unlock(&g_bdev_nvme_mutex);
2154 			return -ENOMEM;
2155 		}
2156 		entry->trid = *nvme_bdev_ctrlr->connected_trid;
2157 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
2158 	}
2159 
2160 	nvme_bdev_ctrlr->destruct = true;
2161 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2162 
2163 	_nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
2164 
2165 	return 0;
2166 }
2167 
2168 static int
2169 bdev_nvme_library_init(void)
2170 {
2171 	g_bdev_nvme_init_thread = spdk_get_thread();
2172 
2173 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
2174 				bdev_nvme_poll_group_destroy_cb,
2175 				sizeof(struct nvme_bdev_poll_group),  "bdev_nvme_poll_groups");
2176 
2177 	return 0;
2178 }
2179 
2180 static void
2181 bdev_nvme_library_fini(void)
2182 {
2183 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
2184 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2185 
2186 	spdk_poller_unregister(&g_hotplug_poller);
2187 	free(g_hotplug_probe_ctx);
2188 
2189 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2190 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2191 		free(entry);
2192 	}
2193 
2194 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2195 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
2196 		if (nvme_bdev_ctrlr->destruct) {
2197 			/* This controller's destruction was already started
2198 			 * before the application started shutting down
2199 			 */
2200 			continue;
2201 		}
2202 		nvme_bdev_ctrlr->destruct = true;
2203 
2204 		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct,
2205 				     nvme_bdev_ctrlr);
2206 	}
2207 
2208 	g_bdev_nvme_module_finish = true;
2209 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2210 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2211 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
2212 		spdk_bdev_module_finish_done();
2213 		return;
2214 	}
2215 
2216 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2217 }
2218 
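/*
 * Re-run end-to-end data protection verification in software after the
 * controller reported a PI error, so that the failing block's error type and
 * offset can be logged.
 */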
2219 static void
2220 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
2221 {
2222 	struct spdk_bdev *bdev = bdev_io->bdev;
2223 	struct spdk_dif_ctx dif_ctx;
2224 	struct spdk_dif_error err_blk = {};
2225 	int rc;
2226 
2227 	rc = spdk_dif_ctx_init(&dif_ctx,
2228 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2229 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2230 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2231 	if (rc != 0) {
2232 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2233 		return;
2234 	}
2235 
2236 	if (bdev->md_interleave) {
2237 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2238 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2239 	} else {
2240 		struct iovec md_iov = {
2241 			.iov_base	= bdev_io->u.bdev.md_buf,
2242 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2243 		};
2244 
2245 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2246 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2247 	}
2248 
2249 	if (rc != 0) {
2250 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2251 			    err_blk.err_type, err_blk.err_offset);
2252 	} else {
2253 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2254 	}
2255 }
2256 
2257 static void
2258 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2259 {
2260 	struct nvme_bdev_io *bio = ref;
2261 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2262 
2263 	if (spdk_nvme_cpl_is_success(cpl)) {
2264 		/* Run PI verification for read data buffer. */
2265 		bdev_nvme_verify_pi_error(bdev_io);
2266 	}
2267 
2268 	/* Return original completion status */
2269 	spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
2270 					  bio->cpl.status.sc);
2271 }
2272 
2273 static void
2274 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2275 {
2276 	struct nvme_bdev_io *bio = ref;
2277 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2278 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2279 	struct nvme_io_channel *nvme_ch;
2280 	int ret;
2281 
2282 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2283 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2284 			    cpl->status.sct, cpl->status.sc);
2285 
2286 		/* Save completion status to use after verifying PI error. */
2287 		bio->cpl = *cpl;
2288 
2289 		nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2290 
2291 		/* Read without PI checking to verify PI error. */
2292 		ret = bdev_nvme_no_pi_readv(nbdev->nvme_ns->ns,
2293 					    nvme_ch->qpair,
2294 					    bio,
2295 					    bdev_io->u.bdev.iovs,
2296 					    bdev_io->u.bdev.iovcnt,
2297 					    bdev_io->u.bdev.md_buf,
2298 					    bdev_io->u.bdev.num_blocks,
2299 					    bdev_io->u.bdev.offset_blocks);
2300 		if (ret == 0) {
2301 			return;
2302 		}
2303 	}
2304 
2305 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2306 }
2307 
2308 static void
2309 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2310 {
2311 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2312 
2313 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2314 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2315 			    cpl->status.sct, cpl->status.sc);
2316 		/* Run PI verification for write data buffer if PI error is detected. */
2317 		bdev_nvme_verify_pi_error(bdev_io);
2318 	}
2319 
2320 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2321 }
2322 
2323 static void
2324 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2325 {
2326 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2327 
2328 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2329 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2330 			    cpl->status.sct, cpl->status.sc);
2331 		/* Run PI verification for compare data buffer if PI error is detected. */
2332 		bdev_nvme_verify_pi_error(bdev_io);
2333 	}
2334 
2335 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2336 }
2337 
2338 static void
2339 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2340 {
2341 	struct nvme_bdev_io *bio = ref;
2342 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2343 
2344 	/* Compare operation completion */
2345 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2346 		/* Save compare result for write callback */
2347 		bio->cpl = *cpl;
2348 		return;
2349 	}
2350 
2351 	/* Write operation completion */
2352 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2353 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2354 		 * complete the IO with the compare operation's status.
2355 		 */
2356 		if (!spdk_nvme_cpl_is_error(cpl)) {
2357 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2358 		}
2359 
2360 		spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2361 	} else {
2362 		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2363 	}
2364 }
2365 
2366 static void
2367 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2368 {
2369 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2370 
2371 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2372 }
2373 
2374 static void
2375 bdev_nvme_admin_passthru_completion(void *ctx)
2376 {
2377 	struct nvme_bdev_io *bio = ctx;
2378 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2379 
2380 	spdk_bdev_io_complete_nvme_status(bdev_io,
2381 					  bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2382 }
2383 
2384 static void
2385 bdev_nvme_abort_completion(void *ctx)
2386 {
2387 	struct nvme_bdev_io *bio = ctx;
2388 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2389 
2390 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
2391 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2392 	} else {
2393 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2394 	}
2395 }
2396 
2397 static void
2398 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
2399 {
2400 	struct nvme_bdev_io *bio = ref;
2401 
2402 	bio->cpl = *cpl;
2403 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2404 }
2405 
2406 static void
2407 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2408 {
2409 	struct nvme_bdev_io *bio = ref;
2410 
2411 	bio->cpl = *cpl;
2412 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2413 }
2414 
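/*
 * reset_sgl/next_sge callbacks used by the SGL-based read, write and compare
 * submissions below. reset_sgl positions the iovec cursor at sgl_offset and
 * next_sge returns the current segment while advancing the cursor.
 */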
2415 static void
2416 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2417 {
2418 	struct nvme_bdev_io *bio = ref;
2419 	struct iovec *iov;
2420 
2421 	bio->iov_offset = sgl_offset;
2422 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2423 		iov = &bio->iovs[bio->iovpos];
2424 		if (bio->iov_offset < iov->iov_len) {
2425 			break;
2426 		}
2427 
2428 		bio->iov_offset -= iov->iov_len;
2429 	}
2430 }
2431 
2432 static int
2433 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2434 {
2435 	struct nvme_bdev_io *bio = ref;
2436 	struct iovec *iov;
2437 
2438 	assert(bio->iovpos < bio->iovcnt);
2439 
2440 	iov = &bio->iovs[bio->iovpos];
2441 
2442 	*address = iov->iov_base;
2443 	*length = iov->iov_len;
2444 
2445 	if (bio->iov_offset) {
2446 		assert(bio->iov_offset <= iov->iov_len);
2447 		*address += bio->iov_offset;
2448 		*length -= bio->iov_offset;
2449 	}
2450 
2451 	bio->iov_offset += *length;
2452 	if (bio->iov_offset == iov->iov_len) {
2453 		bio->iovpos++;
2454 		bio->iov_offset = 0;
2455 	}
2456 
2457 	return 0;
2458 }
2459 
2460 static void
2461 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2462 {
2463 	struct nvme_bdev_io *bio = ref;
2464 	struct iovec *iov;
2465 
2466 	bio->fused_iov_offset = sgl_offset;
2467 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2468 		iov = &bio->fused_iovs[bio->fused_iovpos];
2469 		if (bio->fused_iov_offset < iov->iov_len) {
2470 			break;
2471 		}
2472 
2473 		bio->fused_iov_offset -= iov->iov_len;
2474 	}
2475 }
2476 
2477 static int
2478 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2479 {
2480 	struct nvme_bdev_io *bio = ref;
2481 	struct iovec *iov;
2482 
2483 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2484 
2485 	iov = &bio->fused_iovs[bio->fused_iovpos];
2486 
2487 	*address = iov->iov_base;
2488 	*length = iov->iov_len;
2489 
2490 	if (bio->fused_iov_offset) {
2491 		assert(bio->fused_iov_offset <= iov->iov_len);
2492 		*address += bio->fused_iov_offset;
2493 		*length -= bio->fused_iov_offset;
2494 	}
2495 
2496 	bio->fused_iov_offset += *length;
2497 	if (bio->fused_iov_offset == iov->iov_len) {
2498 		bio->fused_iovpos++;
2499 		bio->fused_iov_offset = 0;
2500 	}
2501 
2502 	return 0;
2503 }
2504 
2505 static int
2506 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2507 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2508 		      void *md, uint64_t lba_count, uint64_t lba)
2509 {
2510 	int rc;
2511 
2512 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
2513 		      lba_count, lba);
2514 
2515 	bio->iovs = iov;
2516 	bio->iovcnt = iovcnt;
2517 	bio->iovpos = 0;
2518 	bio->iov_offset = 0;
2519 
2520 	rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2521 					    bdev_nvme_no_pi_readv_done, bio, 0,
2522 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2523 					    md, 0, 0);
2524 
2525 	if (rc != 0 && rc != -ENOMEM) {
2526 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2527 	}
2528 	return rc;
2529 }
2530 
2531 static int
2532 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2533 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2534 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2535 {
2536 	int rc;
2537 
2538 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2539 		      lba_count, lba);
2540 
2541 	bio->iovs = iov;
2542 	bio->iovcnt = iovcnt;
2543 	bio->iovpos = 0;
2544 	bio->iov_offset = 0;
2545 
2546 	if (iovcnt == 1) {
2547 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
2548 						   lba_count,
2549 						   bdev_nvme_readv_done, bio,
2550 						   flags,
2551 						   0, 0);
2552 	} else {
2553 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2554 						    bdev_nvme_readv_done, bio, flags,
2555 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2556 						    md, 0, 0);
2557 	}
2558 
2559 	if (rc != 0 && rc != -ENOMEM) {
2560 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2561 	}
2562 	return rc;
2563 }
2564 
2565 static int
2566 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2567 		 struct nvme_bdev_io *bio,
2568 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2569 		 uint32_t flags)
2570 {
2571 	int rc;
2572 
2573 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2574 		      lba_count, lba);
2575 
2576 	bio->iovs = iov;
2577 	bio->iovcnt = iovcnt;
2578 	bio->iovpos = 0;
2579 	bio->iov_offset = 0;
2580 
2581 	if (iovcnt == 1) {
2582 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
2583 						    lba_count,
2584 						    bdev_nvme_writev_done, bio,
2585 						    flags,
2586 						    0, 0);
2587 	} else {
2588 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
2589 						     bdev_nvme_writev_done, bio, flags,
2590 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2591 						     md, 0, 0);
2592 	}
2593 
2594 	if (rc != 0 && rc != -ENOMEM) {
2595 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2596 	}
2597 	return rc;
2598 }
2599 
2600 static int
2601 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2602 		   struct nvme_bdev_io *bio,
2603 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2604 		   uint32_t flags)
2605 {
2606 	int rc;
2607 
2608 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2609 		      lba_count, lba);
2610 
2611 	bio->iovs = iov;
2612 	bio->iovcnt = iovcnt;
2613 	bio->iovpos = 0;
2614 	bio->iov_offset = 0;
2615 
2616 	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
2617 					       bdev_nvme_comparev_done, bio, flags,
2618 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2619 					       md, 0, 0);
2620 
2621 	if (rc != 0 && rc != -ENOMEM) {
2622 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
2623 	}
2624 	return rc;
2625 }
2626 
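/*
 * Submit a fused compare-and-write: the compare is sent with FUSE_FIRST and
 * the write with FUSE_SECOND. first_fused_submitted ensures the compare is not
 * resubmitted when the request is retried.
 */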
2627 static int
2628 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2629 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
2630 			      struct iovec *write_iov, int write_iovcnt,
2631 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2632 {
2633 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2634 	int rc;
2635 
2636 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2637 		      lba_count, lba);
2638 
2639 	bio->iovs = cmp_iov;
2640 	bio->iovcnt = cmp_iovcnt;
2641 	bio->iovpos = 0;
2642 	bio->iov_offset = 0;
2643 	bio->fused_iovs = write_iov;
2644 	bio->fused_iovcnt = write_iovcnt;
2645 	bio->fused_iovpos = 0;
2646 	bio->fused_iov_offset = 0;
2647 
2648 	if (bdev_io->num_retries == 0) {
2649 		bio->first_fused_submitted = false;
2650 	}
2651 
2652 	if (!bio->first_fused_submitted) {
2653 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2654 		memset(&bio->cpl, 0, sizeof(bio->cpl));
2655 
2656 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
2657 						       bdev_nvme_comparev_and_writev_done, bio, flags,
2658 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
2659 		if (rc == 0) {
2660 			bio->first_fused_submitted = true;
2661 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2662 		} else {
2663 			if (rc != -ENOMEM) {
2664 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
2665 			}
2666 			return rc;
2667 		}
2668 	}
2669 
2670 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
2671 
2672 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
2673 					     bdev_nvme_comparev_and_writev_done, bio, flags,
2674 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
2675 	if (rc != 0 && rc != -ENOMEM) {
2676 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
2677 		rc = 0;
2678 	}
2679 
2680 	return rc;
2681 }
2682 
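/*
 * Translate an unmap request into an NVMe Dataset Management (deallocate)
 * command, splitting the block range into ranges of at most
 * SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks each.
 */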
2683 static int
2684 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2685 		struct nvme_bdev_io *bio,
2686 		uint64_t offset_blocks,
2687 		uint64_t num_blocks)
2688 {
2689 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
2690 	struct spdk_nvme_dsm_range *range;
2691 	uint64_t offset, remaining;
2692 	uint64_t num_ranges_u64;
2693 	uint16_t num_ranges;
2694 	int rc;
2695 
2696 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
2697 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2698 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
2699 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
2700 		return -EINVAL;
2701 	}
2702 	num_ranges = (uint16_t)num_ranges_u64;
2703 
2704 	offset = offset_blocks;
2705 	remaining = num_blocks;
2706 	range = &dsm_ranges[0];
2707 
2708 	/* Fill max-size ranges until the remaining blocks fit into one range */
2709 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
2710 		range->attributes.raw = 0;
2711 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2712 		range->starting_lba = offset;
2713 
2714 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2715 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2716 		range++;
2717 	}
2718 
2719 	/* Final range describes the remaining blocks */
2720 	range->attributes.raw = 0;
2721 	range->length = remaining;
2722 	range->starting_lba = offset;
2723 
2724 	rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair,
2725 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
2726 			dsm_ranges, num_ranges,
2727 			bdev_nvme_queued_done, bio);
2728 
2729 	return rc;
2730 }
2731 
2732 static int
2733 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
2734 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2735 {
2736 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ch->ctrlr->ctrlr);
2737 
2738 	if (nbytes > max_xfer_size) {
2739 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2740 		return -EINVAL;
2741 	}
2742 
2743 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2744 
2745 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ch->ctrlr->ctrlr, cmd, buf,
2746 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
2747 }
2748 
2749 static int
2750 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2751 		      struct nvme_bdev_io *bio,
2752 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2753 {
2754 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
2755 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2756 
2757 	if (nbytes > max_xfer_size) {
2758 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2759 		return -EINVAL;
2760 	}
2761 
2762 	/*
2763 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2764 	 * so fill it out automatically.
2765 	 */
2766 	cmd->nsid = spdk_nvme_ns_get_id(ns);
2767 
2768 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
2769 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
2770 }
2771 
2772 static int
2773 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2774 			 struct nvme_bdev_io *bio,
2775 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
2776 {
2777 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
2778 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
2779 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2780 
2781 	if (nbytes > max_xfer_size) {
2782 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2783 		return -EINVAL;
2784 	}
2785 
2786 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
2787 		SPDK_ERRLOG("invalid metadata buffer size\n");
2788 		return -EINVAL;
2789 	}
2790 
2791 	/*
2792 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2793 	 * so fill it out automatically.
2794 	 */
2795 	cmd->nsid = spdk_nvme_ns_get_id(ns);
2796 
2797 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
2798 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
2799 }
2800 
2801 static void
2802 bdev_nvme_abort_admin_cmd(void *ctx)
2803 {
2804 	struct nvme_bdev_io *bio = ctx;
2805 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2806 	struct nvme_io_channel *nvme_ch;
2807 	struct nvme_bdev_io *bio_to_abort;
2808 	int rc;
2809 
2810 	nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2811 	bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
2812 
2813 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
2814 					   NULL,
2815 					   bio_to_abort,
2816 					   bdev_nvme_abort_done, bio);
2817 	if (rc == -ENOENT) {
2818 		/* If no matching admin command was found in the admin qpair, complete the
2819 		 * abort request as failed (bit 0 of cdw0 set to 1 means the command was not aborted).
2820 		 */
2821 		bio->cpl.cdw0 |= 1U;
2822 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
2823 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2824 
2825 		spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2826 	}
2827 }
2828 
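/*
 * Abort a queued I/O. The I/O qpair is tried first; if the command is not
 * found there, it may be an admin command and the abort is forwarded to the
 * controller's thread via bdev_nvme_abort_admin_cmd().
 */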
2829 static int
2830 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
2831 		struct nvme_bdev_io *bio_to_abort)
2832 {
2833 	int rc;
2834 
2835 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2836 
2837 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
2838 					   nvme_ch->qpair,
2839 					   bio_to_abort,
2840 					   bdev_nvme_abort_done, bio);
2841 	if (rc == -ENOENT) {
2842 		/* If no matching command was found in the I/O qpair, the target command
2843 		 * may be an admin command. Admin commands are aborted only from the
2844 		 * controller's thread to keep the abort flow simple.
2845 		 */
2846 		spdk_thread_send_msg(nvme_ch->ctrlr->thread,
2847 				     bdev_nvme_abort_admin_cmd, bio);
2848 		rc = 0;
2849 	}
2850 
2851 	return rc;
2852 }
2853 
2854 static void
2855 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
2856 		struct nvme_bdev_ns *nvme_ns)
2857 {
2858 	/* nop */
2859 }
2860 
2861 static void
2862 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns)
2863 {
2864 	g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns);
2865 }
2866 
2867 static void
2868 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
2869 {
2870 	const char	*action;
2871 
2872 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
2873 		action = "reset";
2874 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
2875 		action = "abort";
2876 	} else {
2877 		action = "none";
2878 	}
2879 
2880 	spdk_json_write_object_begin(w);
2881 
2882 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
2883 
2884 	spdk_json_write_named_object_begin(w, "params");
2885 	spdk_json_write_named_string(w, "action_on_timeout", action);
2886 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
2887 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
2888 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
2889 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
2890 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
2891 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
2892 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
2893 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
2894 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
2895 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
2896 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
2897 	spdk_json_write_object_end(w);
2898 
2899 	spdk_json_write_object_end(w);
2900 }
2901 
2902 static void
2903 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w,
2904 			    struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
2905 {
2906 	struct spdk_nvme_transport_id	*trid;
2907 
2908 	trid = nvme_bdev_ctrlr->connected_trid;
2909 
2910 	spdk_json_write_object_begin(w);
2911 
2912 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
2913 
2914 	spdk_json_write_named_object_begin(w, "params");
2915 	spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
2916 	nvme_bdev_dump_trid_json(trid, w);
2917 	spdk_json_write_named_bool(w, "prchk_reftag",
2918 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
2919 	spdk_json_write_named_bool(w, "prchk_guard",
2920 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
2921 
2922 	spdk_json_write_object_end(w);
2923 
2924 	spdk_json_write_object_end(w);
2925 }
2926 
2927 static void
2928 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
2929 {
2930 	spdk_json_write_object_begin(w);
2931 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
2932 
2933 	spdk_json_write_named_object_begin(w, "params");
2934 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
2935 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
2936 	spdk_json_write_object_end(w);
2937 
2938 	spdk_json_write_object_end(w);
2939 }
2940 
2941 static int
2942 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
2943 {
2944 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
2945 	uint32_t		nsid;
2946 
2947 	bdev_nvme_opts_config_json(w);
2948 
2949 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2950 
2951 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
2952 		nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr);
2953 
2954 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
2955 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
2956 				continue;
2957 			}
2958 
2959 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
2960 		}
2961 	}
2962 
2963 	/* Dump this as the last parameter to give all NVMe bdevs a chance to be
2964 	 * constructed before the hotplug poller is enabled.
2965 	 */
2966 	bdev_nvme_hotplug_config_json(w);
2967 
2968 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2969 	return 0;
2970 }
2971 
2972 struct spdk_nvme_ctrlr *
2973 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
2974 {
2975 	if (!bdev || bdev->module != &nvme_if) {
2976 		return NULL;
2977 	}
2978 
2979 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
2980 }
2981 
2982 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
2983