xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision de21d8f4e45b732c13ce5c7aa1872f73bffd38aa)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/accel_engine.h"
40 #include "spdk/config.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/nvme.h"
45 #include "spdk/nvme_ocssd.h"
46 #include "spdk/thread.h"
47 #include "spdk/string.h"
48 #include "spdk/util.h"
49 
50 #include "spdk/bdev_module.h"
51 #include "spdk/log.h"
52 
53 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
54 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
55 
56 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
57 
58 struct nvme_bdev_io {
59 	/** array of iovecs to transfer. */
60 	struct iovec *iovs;
61 
62 	/** Number of iovecs in iovs array. */
63 	int iovcnt;
64 
65 	/** Current iovec position. */
66 	int iovpos;
67 
68 	/** Offset in current iovec. */
69 	uint32_t iov_offset;
70 
71 	/** array of iovecs for the second (fused) command to transfer. */
72 	struct iovec *fused_iovs;
73 
74 	/** Number of iovecs in the fused_iovs array. */
75 	int fused_iovcnt;
76 
77 	/** Current iovec position in fused_iovs. */
78 	int fused_iovpos;
79 
80 	/** Offset in current fused iovec. */
81 	uint32_t fused_iov_offset;
82 
83 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
84 	struct spdk_nvme_cpl cpl;
85 
86 	/** Originating thread */
87 	struct spdk_thread *orig_thread;
88 
89 	/** Tracks whether the first of the two fused commands has been submitted. */
90 	bool first_fused_submitted;
91 };
92 
93 struct nvme_probe_ctx {
94 	size_t count;
95 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
96 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
97 	const char *names[NVME_MAX_CONTROLLERS];
98 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
99 	const char *hostnqn;
100 };
101 
102 struct nvme_probe_skip_entry {
103 	struct spdk_nvme_transport_id		trid;
104 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
105 };
106 /* All controllers deleted by users via RPC are skipped by the hotplug monitor. */
107 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
108 			g_skipped_nvme_ctrlrs);
109 
110 static struct spdk_bdev_nvme_opts g_opts = {
111 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
112 	.timeout_us = 0,
113 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
114 	.retry_count = 4,
115 	.arbitration_burst = 0,
116 	.low_priority_weight = 0,
117 	.medium_priority_weight = 0,
118 	.high_priority_weight = 0,
119 	.nvme_adminq_poll_period_us = 10000ULL,
120 	.nvme_ioq_poll_period_us = 0,
121 	.io_queue_requests = 0,
122 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
123 };
124 
125 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
126 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
127 
128 static int g_hot_insert_nvme_controller_index = 0;
129 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
130 static bool g_nvme_hotplug_enabled = false;
131 static struct spdk_thread *g_bdev_nvme_init_thread;
132 static struct spdk_poller *g_hotplug_poller;
133 static struct spdk_poller *g_hotplug_probe_poller;
134 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
135 
136 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
137 		struct nvme_async_probe_ctx *ctx);
138 static void nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
139 		struct nvme_async_probe_ctx *ctx);
140 static int bdev_nvme_library_init(void);
141 static void bdev_nvme_library_fini(void);
142 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
143 			   struct nvme_bdev_io *bio,
144 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
145 			   uint32_t flags);
146 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
147 				 struct nvme_bdev_io *bio,
148 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
149 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
150 			    struct nvme_bdev_io *bio,
151 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
152 			    uint32_t flags);
153 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
154 			      struct nvme_bdev_io *bio,
155 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
156 			      uint32_t flags);
157 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
158 		struct spdk_nvme_qpair *qpair,
159 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
160 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
161 		uint32_t flags);
162 static int bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch,
163 				    struct nvme_bdev_io *bio,
164 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
165 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
166 				 struct nvme_bdev_io *bio,
167 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
168 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
169 				    struct nvme_bdev_io *bio,
170 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
171 static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch,
172 			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
173 static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio);
174 static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove);
175 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
176 
177 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
178 				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
179 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
180 		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
181 
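/*
 * The dispatch tables below are indexed by the nvme_bdev_ns type value: a NULL
 * entry for the unknown/unused type, then the standard-namespace handler, then
 * the Open Channel SSD handler, matching how the types are assigned in
 * nvme_ctrlr_populate_namespaces().
 */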
182 static populate_namespace_fn g_populate_namespace_fn[] = {
183 	NULL,
184 	nvme_ctrlr_populate_standard_namespace,
185 	bdev_ocssd_populate_namespace,
186 };
187 
188 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns);
189 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns);
190 
191 static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
192 	NULL,
193 	nvme_ctrlr_depopulate_standard_namespace,
194 	bdev_ocssd_depopulate_namespace,
195 };
196 
197 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
198 		struct nvme_bdev_ns *nvme_ns);
199 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
200 		struct nvme_bdev_ns *nvme_ns);
201 
202 static config_json_namespace_fn g_config_json_namespace_fn[] = {
203 	NULL,
204 	nvme_ctrlr_config_json_standard_namespace,
205 	bdev_ocssd_namespace_config_json,
206 };
207 
208 struct spdk_nvme_qpair *
209 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
210 {
211 	struct nvme_io_channel *nvme_ch;
212 
213 	assert(ctrlr_io_ch != NULL);
214 
215 	nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
216 
217 	return nvme_ch->qpair;
218 }
219 
220 static int
221 bdev_nvme_get_ctx_size(void)
222 {
223 	return sizeof(struct nvme_bdev_io);
224 }
225 
226 static struct spdk_bdev_module nvme_if = {
227 	.name = "nvme",
228 	.async_fini = true,
229 	.module_init = bdev_nvme_library_init,
230 	.module_fini = bdev_nvme_library_fini,
231 	.config_json = bdev_nvme_config_json,
232 	.get_ctx_size = bdev_nvme_get_ctx_size,
233 
234 };
235 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
236 
237 static void
238 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
239 {
240 	int rc;
241 
242 	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
243 	/*
244 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
245 	 * reconnect a qpair and we will stop getting a callback for this one.
246 	 */
247 	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
248 	if (rc != 0) {
249 		SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
250 	}
251 }
252 
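/*
 * Poller registered per poll group.  Processes completions for every I/O qpair
 * in the spdk_nvme_poll_group and, when collect_spin_stat is set (VTune builds),
 * accumulates the time spent polling without finding any completions.
 */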
253 static int
254 bdev_nvme_poll(void *arg)
255 {
256 	struct nvme_bdev_poll_group *group = arg;
257 	int64_t num_completions;
258 
259 	if (group->collect_spin_stat && group->start_ticks == 0) {
260 		group->start_ticks = spdk_get_ticks();
261 	}
262 
263 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
264 			  bdev_nvme_disconnected_qpair_cb);
265 	if (group->collect_spin_stat) {
266 		if (num_completions > 0) {
267 			if (group->end_ticks != 0) {
268 				group->spin_ticks += (group->end_ticks - group->start_ticks);
269 				group->end_ticks = 0;
270 			}
271 			group->start_ticks = 0;
272 		} else {
273 			group->end_ticks = spdk_get_ticks();
274 		}
275 	}
276 
277 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
278 }
279 
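/*
 * Poller for the controller's admin queue.  A negative return from
 * spdk_nvme_ctrlr_process_admin_completions() indicates the controller has
 * failed, so a failover is triggered (which amounts to a controller reset
 * when no alternate path is configured).
 */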
280 static int
281 bdev_nvme_poll_adminq(void *arg)
282 {
283 	int32_t rc;
284 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
285 
286 	assert(nvme_bdev_ctrlr != NULL);
287 
288 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr);
289 	if (rc < 0) {
290 		bdev_nvme_failover(nvme_bdev_ctrlr, false);
291 	}
292 
293 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
294 }
295 
296 static int
297 bdev_nvme_destruct(void *ctx)
298 {
299 	struct nvme_bdev *nvme_disk = ctx;
300 	struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns;
301 
302 	nvme_ns->bdev = NULL;
303 
304 	nvme_bdev_ns_detach(nvme_ns);
305 
306 	free(nvme_disk->disk.name);
307 	free(nvme_disk);
308 
309 	return 0;
310 }
311 
312 static int
313 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
314 		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
315 {
316 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
317 
318 	return 0;
319 }
320 
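/*
 * Allocate and connect an I/O qpair for this channel.  The qpair is created
 * with create_only set so it can be added to the channel's poll group before
 * it is connected.
 */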
321 static int
322 bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch)
323 {
324 	struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr;
325 	struct spdk_nvme_io_qpair_opts opts;
326 	int rc;
327 
328 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
329 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
330 	opts.create_only = true;
331 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
332 	g_opts.io_queue_requests = opts.io_queue_requests;
333 
334 	nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
335 	if (nvme_ch->qpair == NULL) {
336 		return -1;
337 	}
338 
339 	assert(nvme_ch->group != NULL);
340 
341 	rc = spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair);
342 	if (rc != 0) {
343 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
344 		goto err;
345 	}
346 
347 	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, nvme_ch->qpair);
348 	if (rc != 0) {
349 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
350 		goto err;
351 	}
352 
353 	return 0;
354 
355 err:
356 	spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
357 
358 	return rc;
359 }
360 
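/*
 * Controller reset sequence (driven from _bdev_nvme_reset()/bdev_nvme_reset()):
 *   1. _bdev_nvme_reset_destroy_qpair  - free the I/O qpair on every channel
 *   2. _bdev_nvme_reset_ctrlr          - reset the NVMe controller itself
 *   3. _bdev_nvme_reset_create_qpair   - recreate the I/O qpair on every channel
 *   4. _bdev_nvme_reset_complete       - clear flags and flush queued resets
 */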
361 static void
362 _bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
363 {
364 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
365 
366 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
367 	if (nvme_bdev_ctrlr->destruct_after_reset) {
368 		assert(nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct);
369 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
370 
371 		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_do_destruct,
372 				     nvme_bdev_ctrlr);
373 	} else {
374 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
375 	}
376 }
377 
378 static void
379 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
380 {
381 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
382 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
383 	struct spdk_bdev_io *bdev_io;
384 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
385 
386 	/* A NULL ctx means success. */
387 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
388 		status = SPDK_BDEV_IO_STATUS_FAILED;
389 	}
390 
391 	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
392 		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
393 		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
394 		spdk_bdev_io_complete(bdev_io, status);
395 	}
396 
397 	spdk_for_each_channel_continue(i, 0);
398 }
399 
400 static void
401 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
402 {
403 	/* We are using the for_each_channel cb_arg like a return code here: */
404 	/* zero means the reset succeeded; non-zero means it failed. */
405 	void *cb_arg = NULL;
406 	struct nvme_bdev_ctrlr_trid *curr_trid;
407 
408 	if (rc) {
409 		cb_arg = (void *)0x1;
410 		SPDK_ERRLOG("Resetting controller failed.\n");
411 	} else {
412 		SPDK_NOTICELOG("Resetting controller successful.\n");
413 	}
414 
415 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
416 	nvme_bdev_ctrlr->resetting = false;
417 	nvme_bdev_ctrlr->failover_in_progress = false;
418 
419 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
420 	assert(curr_trid != NULL);
421 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
422 
423 	curr_trid->is_failed = cb_arg != NULL ? true : false;
424 
425 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
426 		/* Destruct ctrlr after clearing pending resets. */
427 		nvme_bdev_ctrlr->destruct_after_reset = true;
428 	}
429 
430 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
431 
432 	/* Make sure we clear any pending resets before returning. */
433 	spdk_for_each_channel(nvme_bdev_ctrlr,
434 			      _bdev_nvme_complete_pending_resets,
435 			      cb_arg,
436 			      _bdev_nvme_check_pending_destruct);
437 }
438 
439 static void
440 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
441 {
442 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
443 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
444 	int rc = SPDK_BDEV_IO_STATUS_SUCCESS;
445 
446 	if (status) {
447 		rc = SPDK_BDEV_IO_STATUS_FAILED;
448 	}
449 	if (bio) {
450 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), rc);
451 	}
452 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
453 }
454 
455 static void
456 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
457 {
458 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
459 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
460 	int rc;
461 
462 	rc = bdev_nvme_create_qpair(nvme_ch);
463 
464 	spdk_for_each_channel_continue(i, rc);
465 }
466 
467 static void
468 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
469 {
470 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
471 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
472 	int rc;
473 
474 	if (status) {
475 		rc = status;
476 		goto err;
477 	}
478 
479 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
480 	if (rc != 0) {
481 		goto err;
482 	}
483 
484 	/* Recreate all of the I/O queue pairs */
485 	spdk_for_each_channel(nvme_bdev_ctrlr,
486 			      _bdev_nvme_reset_create_qpair,
487 			      bio,
488 			      _bdev_nvme_reset_create_qpairs_done);
489 	return;
490 
491 err:
492 	if (bio) {
493 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
494 	}
495 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
496 }
497 
498 static void
499 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
500 {
501 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
502 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
503 	int rc;
504 
505 	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
506 	if (!rc) {
507 		nvme_ch->qpair = NULL;
508 	}
509 
510 	spdk_for_each_channel_continue(i, rc);
511 }
512 
513 static int
514 _bdev_nvme_reset_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
515 {
516 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
517 	if (nvme_bdev_ctrlr->destruct) {
518 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
519 		return -EBUSY;
520 	}
521 
522 	if (nvme_bdev_ctrlr->resetting) {
523 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
524 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
525 		return -EAGAIN;
526 	}
527 
528 	nvme_bdev_ctrlr->resetting = true;
529 
530 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
531 	return 0;
532 }
533 
534 static int
535 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
536 {
537 	int rc;
538 
539 	rc = _bdev_nvme_reset_start(nvme_bdev_ctrlr);
540 	if (rc == 0) {
541 		/* First, delete all NVMe I/O queue pairs. */
542 		spdk_for_each_channel(nvme_bdev_ctrlr,
543 				      _bdev_nvme_reset_destroy_qpair,
544 				      NULL,
545 				      _bdev_nvme_reset_ctrlr);
546 	}
547 
548 	return rc;
549 }
550 
551 static int
552 bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio)
553 {
554 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
555 	int rc;
556 
557 	rc = _bdev_nvme_reset_start(nvme_ch->ctrlr);
558 	if (rc == 0) {
559 		/* First, delete all NVMe I/O queue pairs. */
560 		spdk_for_each_channel(nvme_ch->ctrlr,
561 				      _bdev_nvme_reset_destroy_qpair,
562 				      bio,
563 				      _bdev_nvme_reset_ctrlr);
564 	} else if (rc == -EBUSY) {
565 		/* Don't bother resetting if the controller is in the process of being destructed. */
566 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
567 	} else if (rc == -EAGAIN) {
568 		/*
569 		 * Reset call is queued only if it is from the app framework. This is on purpose so that
570 		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
571 		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
572 		 */
573 		TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link);
574 	} else {
575 		return rc;
576 	}
577 
578 	return 0;
579 }
580 
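/*
 * Begin a failover: mark the currently connected trid as failed and, if an
 * alternate trid is available, switch the controller to it.  Unless the old
 * path is being removed, it is moved to the tail of the list so repeated
 * failovers round-robin through all configured paths.
 */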
581 static int
582 _bdev_nvme_failover_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
583 {
584 	struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
585 	int rc;
586 
587 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
588 	if (nvme_bdev_ctrlr->destruct) {
589 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
590 		/* Don't bother resetting if the controller is in the process of being destructed. */
591 		return -EBUSY;
592 	}
593 
594 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
595 	assert(curr_trid);
596 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
597 	next_trid = TAILQ_NEXT(curr_trid, link);
598 
599 	if (nvme_bdev_ctrlr->resetting) {
600 		if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) {
601 			rc = -EAGAIN;
602 		} else {
603 			rc = -EBUSY;
604 		}
605 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
606 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
607 		return rc;
608 	}
609 
610 	nvme_bdev_ctrlr->resetting = true;
611 	curr_trid->is_failed = true;
612 
613 	if (next_trid) {
614 		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
615 
616 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
617 			       curr_trid->trid.trsvcid,	next_trid->trid.traddr, next_trid->trid.trsvcid);
618 
619 		nvme_bdev_ctrlr->failover_in_progress = true;
620 		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
621 		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
622 		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
623 		assert(rc == 0);
624 		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link);
625 		if (!remove) {
626 			/** Shuffle the old trid to the end of the list and use the new one.
627 			 * Allows for round robin through multiple connections.
628 			 */
629 			TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link);
630 		} else {
631 			free(curr_trid);
632 		}
633 	}
634 
635 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
636 	return 0;
637 }
638 
639 static int
640 bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
641 {
642 	int rc;
643 
644 	rc = _bdev_nvme_failover_start(nvme_bdev_ctrlr, remove);
645 	if (rc == 0) {
646 		/* First, delete all NVMe I/O queue pairs. */
647 		spdk_for_each_channel(nvme_bdev_ctrlr,
648 				      _bdev_nvme_reset_destroy_qpair,
649 				      NULL,
650 				      _bdev_nvme_reset_ctrlr);
651 	} else if (rc != -EBUSY) {
652 		return rc;
653 	}
654 
655 	return 0;
656 }
657 
658 static int
659 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
660 		struct nvme_bdev_io *bio,
661 		uint64_t offset_blocks,
662 		uint64_t num_blocks);
663 
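/*
 * Completion callback for spdk_bdev_io_get_buf() on the read path: once the
 * bdev layer has provided a data buffer, look up the I/O path again and
 * submit the read.
 */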
664 static void
665 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
666 		     bool success)
667 {
668 	struct spdk_bdev *bdev = bdev_io->bdev;
669 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
670 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
671 	struct nvme_bdev_ns *nvme_ns;
672 	struct spdk_nvme_qpair *qpair;
673 	int ret;
674 
675 	if (!success) {
676 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
677 		return;
678 	}
679 
680 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
681 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
682 		return;
683 	}
684 
685 	ret = bdev_nvme_readv(nvme_ns->ns,
686 			      qpair,
687 			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
688 			      bdev_io->u.bdev.iovs,
689 			      bdev_io->u.bdev.iovcnt,
690 			      bdev_io->u.bdev.md_buf,
691 			      bdev_io->u.bdev.num_blocks,
692 			      bdev_io->u.bdev.offset_blocks,
693 			      bdev->dif_check_flags);
694 
695 	if (spdk_likely(ret == 0)) {
696 		return;
697 	} else if (ret == -ENOMEM) {
698 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
699 	} else {
700 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
701 	}
702 }
703 
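/*
 * Translate a generic bdev I/O into the corresponding NVMe command.  Returns a
 * negative errno on immediate failure; the caller maps -ENOMEM to
 * SPDK_BDEV_IO_STATUS_NOMEM so the bdev layer can retry the I/O later.
 */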
704 static int
705 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
706 {
707 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
708 	struct spdk_bdev *bdev = bdev_io->bdev;
709 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
710 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
711 	struct nvme_bdev_io *nbdev_io_to_abort;
712 	struct nvme_bdev_ns *nvme_ns;
713 	struct spdk_nvme_qpair *qpair;
714 
715 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
716 		return -1;
717 	}
718 
719 	switch (bdev_io->type) {
720 	case SPDK_BDEV_IO_TYPE_READ:
721 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
722 			return bdev_nvme_readv(nvme_ns->ns,
723 					       qpair,
724 					       nbdev_io,
725 					       bdev_io->u.bdev.iovs,
726 					       bdev_io->u.bdev.iovcnt,
727 					       bdev_io->u.bdev.md_buf,
728 					       bdev_io->u.bdev.num_blocks,
729 					       bdev_io->u.bdev.offset_blocks,
730 					       bdev->dif_check_flags);
731 		} else {
732 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
733 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
734 			return 0;
735 		}
736 
737 	case SPDK_BDEV_IO_TYPE_WRITE:
738 		return bdev_nvme_writev(nvme_ns->ns,
739 					qpair,
740 					nbdev_io,
741 					bdev_io->u.bdev.iovs,
742 					bdev_io->u.bdev.iovcnt,
743 					bdev_io->u.bdev.md_buf,
744 					bdev_io->u.bdev.num_blocks,
745 					bdev_io->u.bdev.offset_blocks,
746 					bdev->dif_check_flags);
747 
748 	case SPDK_BDEV_IO_TYPE_COMPARE:
749 		return bdev_nvme_comparev(nvme_ns->ns,
750 					  qpair,
751 					  nbdev_io,
752 					  bdev_io->u.bdev.iovs,
753 					  bdev_io->u.bdev.iovcnt,
754 					  bdev_io->u.bdev.md_buf,
755 					  bdev_io->u.bdev.num_blocks,
756 					  bdev_io->u.bdev.offset_blocks,
757 					  bdev->dif_check_flags);
758 
759 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
760 		return bdev_nvme_comparev_and_writev(nvme_ns->ns,
761 						     qpair,
762 						     nbdev_io,
763 						     bdev_io->u.bdev.iovs,
764 						     bdev_io->u.bdev.iovcnt,
765 						     bdev_io->u.bdev.fused_iovs,
766 						     bdev_io->u.bdev.fused_iovcnt,
767 						     bdev_io->u.bdev.md_buf,
768 						     bdev_io->u.bdev.num_blocks,
769 						     bdev_io->u.bdev.offset_blocks,
770 						     bdev->dif_check_flags);
771 
772 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
773 		return bdev_nvme_unmap(nvme_ns->ns,
774 				       qpair,
775 				       nbdev_io,
776 				       bdev_io->u.bdev.offset_blocks,
777 				       bdev_io->u.bdev.num_blocks);
778 
779 	case SPDK_BDEV_IO_TYPE_UNMAP:
780 		return bdev_nvme_unmap(nvme_ns->ns,
781 				       qpair,
782 				       nbdev_io,
783 				       bdev_io->u.bdev.offset_blocks,
784 				       bdev_io->u.bdev.num_blocks);
785 
786 	case SPDK_BDEV_IO_TYPE_RESET:
787 		return bdev_nvme_reset(nvme_ch, nbdev_io);
788 
789 	case SPDK_BDEV_IO_TYPE_FLUSH:
790 		return bdev_nvme_flush(nvme_ns->ns,
791 				       qpair,
792 				       nbdev_io,
793 				       bdev_io->u.bdev.offset_blocks,
794 				       bdev_io->u.bdev.num_blocks);
795 
796 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
797 		return bdev_nvme_admin_passthru(nvme_ch,
798 						nbdev_io,
799 						&bdev_io->u.nvme_passthru.cmd,
800 						bdev_io->u.nvme_passthru.buf,
801 						bdev_io->u.nvme_passthru.nbytes);
802 
803 	case SPDK_BDEV_IO_TYPE_NVME_IO:
804 		return bdev_nvme_io_passthru(nvme_ns->ns,
805 					     qpair,
806 					     nbdev_io,
807 					     &bdev_io->u.nvme_passthru.cmd,
808 					     bdev_io->u.nvme_passthru.buf,
809 					     bdev_io->u.nvme_passthru.nbytes);
810 
811 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
812 		return bdev_nvme_io_passthru_md(nvme_ns->ns,
813 						qpair,
814 						nbdev_io,
815 						&bdev_io->u.nvme_passthru.cmd,
816 						bdev_io->u.nvme_passthru.buf,
817 						bdev_io->u.nvme_passthru.nbytes,
818 						bdev_io->u.nvme_passthru.md_buf,
819 						bdev_io->u.nvme_passthru.md_len);
820 
821 	case SPDK_BDEV_IO_TYPE_ABORT:
822 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
823 		return bdev_nvme_abort(nvme_ch,
824 				       nbdev_io,
825 				       nbdev_io_to_abort);
826 
827 	default:
828 		return -EINVAL;
829 	}
830 	return 0;
831 }
832 
833 static void
834 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
835 {
836 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
837 
838 	if (spdk_unlikely(rc != 0)) {
839 		if (rc == -ENOMEM) {
840 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
841 		} else {
842 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
843 		}
844 	}
845 }
846 
847 static bool
848 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
849 {
850 	struct nvme_bdev *nbdev = ctx;
851 	struct nvme_bdev_ns *nvme_ns;
852 	struct spdk_nvme_ns *ns;
853 	struct spdk_nvme_ctrlr *ctrlr;
854 	const struct spdk_nvme_ctrlr_data *cdata;
855 
856 	nvme_ns = nvme_bdev_to_bdev_ns(nbdev);
857 	assert(nvme_ns != NULL);
858 	ns = nvme_ns->ns;
859 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
860 
861 	switch (io_type) {
862 	case SPDK_BDEV_IO_TYPE_READ:
863 	case SPDK_BDEV_IO_TYPE_WRITE:
864 	case SPDK_BDEV_IO_TYPE_RESET:
865 	case SPDK_BDEV_IO_TYPE_FLUSH:
866 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
867 	case SPDK_BDEV_IO_TYPE_NVME_IO:
868 	case SPDK_BDEV_IO_TYPE_ABORT:
869 		return true;
870 
871 	case SPDK_BDEV_IO_TYPE_COMPARE:
872 		return spdk_nvme_ns_supports_compare(ns);
873 
874 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
875 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
876 
877 	case SPDK_BDEV_IO_TYPE_UNMAP:
878 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
879 		return cdata->oncs.dsm;
880 
881 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
882 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
883 		/*
884 		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
885 		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
886 		 */
887 		if (cdata->oncs.dsm &&
888 		    spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) ==
889 		    SPDK_NVME_DEALLOC_READ_00) {
890 			return true;
891 		}
892 		/*
893 		 * The NVMe controller write_zeroes function is currently not used by our driver.
894 		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
895 		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
896 		 */
897 		return false;
898 
899 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
900 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
901 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
902 			return true;
903 		}
904 		return false;
905 
906 	default:
907 		return false;
908 	}
909 }
910 
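/*
 * I/O channel constructor for an nvme_bdev_ctrlr: create the OCSSD channel if
 * the controller supports Open Channel SSDs, take a reference on the poll
 * group channel, and allocate/connect the I/O qpair.
 */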
911 static int
912 bdev_nvme_create_cb(void *io_device, void *ctx_buf)
913 {
914 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
915 	struct nvme_io_channel *nvme_ch = ctx_buf;
916 	struct spdk_io_channel *pg_ch = NULL;
917 	int rc;
918 
919 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
920 		rc = bdev_ocssd_create_io_channel(nvme_ch);
921 		if (rc != 0) {
922 			return rc;
923 		}
924 	}
925 
926 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
927 	if (!pg_ch) {
928 		rc = -1;
929 		goto err_pg_ch;
930 	}
931 
932 	nvme_ch->group = spdk_io_channel_get_ctx(pg_ch);
933 
934 #ifdef SPDK_CONFIG_VTUNE
935 	nvme_ch->group->collect_spin_stat = true;
936 #else
937 	nvme_ch->group->collect_spin_stat = false;
938 #endif
939 
940 	TAILQ_INIT(&nvme_ch->pending_resets);
941 
942 	nvme_ch->ctrlr = nvme_bdev_ctrlr;
943 
944 	rc = bdev_nvme_create_qpair(nvme_ch);
945 	if (rc != 0) {
946 		goto err_qpair;
947 	}
948 
949 	return 0;
950 
951 err_qpair:
952 	spdk_put_io_channel(pg_ch);
953 err_pg_ch:
954 	if (nvme_ch->ocssd_ch) {
955 		bdev_ocssd_destroy_io_channel(nvme_ch);
956 	}
957 
958 	return rc;
959 }
960 
961 static void
962 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
963 {
964 	struct nvme_io_channel *nvme_ch = ctx_buf;
965 
966 	assert(nvme_ch->group != NULL);
967 
968 	if (nvme_ch->ocssd_ch != NULL) {
969 		bdev_ocssd_destroy_io_channel(nvme_ch);
970 	}
971 
972 	spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
973 
974 	spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group));
975 }
976 
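/*
 * Bridge from the NVMe library's accel function table to the SPDK accel
 * engine: offloads a CRC-32C calculation over an iovec to the poll group's
 * accel channel.  For -ENOMEM/-EINVAL, where the accel engine does not invoke
 * the callback, it is called directly so the caller always gets a completion.
 */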
977 static void
978 bdev_nvme_poll_group_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
979 		uint32_t iov_cnt, uint32_t seed,
980 		spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
981 {
982 	struct nvme_bdev_poll_group *group = ctx;
983 	int rc;
984 
985 	assert(group->accel_channel != NULL);
986 	assert(cb_fn != NULL);
987 
988 	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
989 	if (rc) {
990 		/* In these two error cases, spdk_accel_submit_crc32cv() does not call the user's cb_fn, so call it here. */
991 		if (rc == -ENOMEM || rc == -EINVAL) {
992 			cb_fn(cb_arg, rc);
993 		}
994 		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
995 	}
996 }
997 
998 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
999 	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
1000 	.submit_accel_crc32c	= bdev_nvme_poll_group_submit_accel_crc32c,
1001 };
1002 
1003 static int
1004 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
1005 {
1006 	struct nvme_bdev_poll_group *group = ctx_buf;
1007 
1008 	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
1009 	if (group->group == NULL) {
1010 		return -1;
1011 	}
1012 
1013 	group->accel_channel = spdk_accel_engine_get_io_channel();
1014 	if (!group->accel_channel) {
1015 		spdk_nvme_poll_group_destroy(group->group);
1016 		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
1017 			    group);
1018 		return -1;
1019 	}
1020 
1021 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
1022 
1023 	if (group->poller == NULL) {
1024 		spdk_put_io_channel(group->accel_channel);
1025 		spdk_nvme_poll_group_destroy(group->group);
1026 		return -1;
1027 	}
1028 
1029 	return 0;
1030 }
1031 
1032 static void
1033 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
1034 {
1035 	struct nvme_bdev_poll_group *group = ctx_buf;
1036 
1037 	if (group->accel_channel) {
1038 		spdk_put_io_channel(group->accel_channel);
1039 	}
1040 
1041 	spdk_poller_unregister(&group->poller);
1042 	if (spdk_nvme_poll_group_destroy(group->group)) {
1043 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
1044 		assert(false);
1045 	}
1046 }
1047 
1048 static struct spdk_io_channel *
1049 bdev_nvme_get_io_channel(void *ctx)
1050 {
1051 	struct nvme_bdev *nvme_bdev = ctx;
1052 
1053 	return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr);
1054 }
1055 
1056 static void *
1057 bdev_nvme_get_module_ctx(void *ctx)
1058 {
1059 	struct nvme_bdev *nvme_bdev = ctx;
1060 
1061 	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
1062 }
1063 
1064 static int
1065 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
1066 {
1067 	struct nvme_bdev *nvme_bdev = ctx;
1068 	struct nvme_bdev_ns *nvme_ns;
1069 	struct spdk_nvme_ns *ns;
1070 	struct spdk_nvme_ctrlr *ctrlr;
1071 	const struct spdk_nvme_ctrlr_data *cdata;
1072 	const struct spdk_nvme_transport_id *trid;
1073 	union spdk_nvme_vs_register vs;
1074 	union spdk_nvme_csts_register csts;
1075 	char buf[128];
1076 
1077 	nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev);
1078 	assert(nvme_ns != NULL);
1079 	ns = nvme_ns->ns;
1080 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1081 
1082 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1083 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
1084 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
1085 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1086 
1087 	spdk_json_write_named_object_begin(w, "nvme");
1088 
1089 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1090 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
1091 	}
1092 
1093 	spdk_json_write_named_object_begin(w, "trid");
1094 
1095 	nvme_bdev_dump_trid_json(trid, w);
1096 
1097 	spdk_json_write_object_end(w);
1098 
1099 #ifdef SPDK_CONFIG_NVME_CUSE
1100 	size_t cuse_name_size = 128;
1101 	char cuse_name[cuse_name_size];
1102 
1103 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
1104 					    cuse_name, &cuse_name_size);
1105 	if (rc == 0) {
1106 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
1107 	}
1108 #endif
1109 
1110 	spdk_json_write_named_object_begin(w, "ctrlr_data");
1111 
1112 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
1113 
1114 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
1115 	spdk_str_trim(buf);
1116 	spdk_json_write_named_string(w, "model_number", buf);
1117 
1118 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
1119 	spdk_str_trim(buf);
1120 	spdk_json_write_named_string(w, "serial_number", buf);
1121 
1122 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
1123 	spdk_str_trim(buf);
1124 	spdk_json_write_named_string(w, "firmware_revision", buf);
1125 
1126 	if (cdata->subnqn[0] != '\0') {
1127 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
1128 	}
1129 
1130 	spdk_json_write_named_object_begin(w, "oacs");
1131 
1132 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
1133 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
1134 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
1135 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
1136 
1137 	spdk_json_write_object_end(w);
1138 
1139 	spdk_json_write_object_end(w);
1140 
1141 	spdk_json_write_named_object_begin(w, "vs");
1142 
1143 	spdk_json_write_name(w, "nvme_version");
1144 	if (vs.bits.ter) {
1145 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
1146 	} else {
1147 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
1148 	}
1149 
1150 	spdk_json_write_object_end(w);
1151 
1152 	spdk_json_write_named_object_begin(w, "csts");
1153 
1154 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
1155 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
1156 
1157 	spdk_json_write_object_end(w);
1158 
1159 	spdk_json_write_named_object_begin(w, "ns_data");
1160 
1161 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
1162 
1163 	spdk_json_write_object_end(w);
1164 
1165 	if (cdata->oacs.security) {
1166 		spdk_json_write_named_object_begin(w, "security");
1167 
1168 		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);
1169 
1170 		spdk_json_write_object_end(w);
1171 	}
1172 
1173 	spdk_json_write_object_end(w);
1174 
1175 	return 0;
1176 }
1177 
1178 static void
1179 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1180 {
1181 	/* No config per bdev needed */
1182 }
1183 
1184 static uint64_t
1185 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
1186 {
1187 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
1188 	struct nvme_bdev_poll_group *group = nvme_ch->group;
1189 	uint64_t spin_time;
1190 
1191 	if (!group || !group->collect_spin_stat) {
1192 		return 0;
1193 	}
1194 
1195 	if (group->end_ticks != 0) {
1196 		group->spin_ticks += (group->end_ticks - group->start_ticks);
1197 		group->end_ticks = 0;
1198 	}
1199 
1200 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1201 	group->start_ticks = 0;
1202 	group->spin_ticks = 0;
1203 
1204 	return spin_time;
1205 }
1206 
1207 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1208 	.destruct		= bdev_nvme_destruct,
1209 	.submit_request		= bdev_nvme_submit_request,
1210 	.io_type_supported	= bdev_nvme_io_type_supported,
1211 	.get_io_channel		= bdev_nvme_get_io_channel,
1212 	.dump_info_json		= bdev_nvme_dump_info_json,
1213 	.write_config_json	= bdev_nvme_write_config_json,
1214 	.get_spin_time		= bdev_nvme_get_spin_time,
1215 	.get_module_ctx		= bdev_nvme_get_module_ctx,
1216 };
1217 
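/*
 * Fill in the spdk_bdev structure for a namespace (name, block size and count,
 * UUID, metadata/DIF configuration, atomic write unit) from the controller and
 * namespace identify data, then register it with the bdev layer.
 */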
1218 static int
1219 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
1220 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
1221 		 uint32_t prchk_flags, void *ctx)
1222 {
1223 	const struct spdk_uuid		*uuid;
1224 	const struct spdk_nvme_ctrlr_data *cdata;
1225 	const struct spdk_nvme_ns_data	*nsdata;
1226 	int				rc;
1227 
1228 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1229 
1230 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
1231 	if (!disk->name) {
1232 		return -ENOMEM;
1233 	}
1234 	disk->product_name = "NVMe disk";
1235 
1236 	disk->write_cache = 0;
1237 	if (cdata->vwc.present) {
1238 		/* Enable if the Volatile Write Cache exists */
1239 		disk->write_cache = 1;
1240 	}
1241 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1242 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1243 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1244 
1245 	uuid = spdk_nvme_ns_get_uuid(ns);
1246 	if (uuid != NULL) {
1247 		disk->uuid = *uuid;
1248 	}
1249 
1250 	nsdata = spdk_nvme_ns_get_data(ns);
1251 
1252 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
1253 	if (disk->md_len != 0) {
1254 		disk->md_interleave = nsdata->flbas.extended;
1255 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1256 		if (disk->dif_type != SPDK_DIF_DISABLE) {
1257 			disk->dif_is_head_of_md = nsdata->dps.md_start;
1258 			disk->dif_check_flags = prchk_flags;
1259 		}
1260 	}
1261 
1262 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
1263 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
1264 		disk->acwu = 0;
1265 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1266 		disk->acwu = nsdata->nacwu;
1267 	} else {
1268 		disk->acwu = cdata->acwu;
1269 	}
1270 
1271 	disk->ctxt = ctx;
1272 	disk->fn_table = &nvmelib_fn_table;
1273 	disk->module = &nvme_if;
1274 	rc = spdk_bdev_register(disk);
1275 	if (rc) {
1276 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
1277 		free(disk->name);
1278 		return rc;
1279 	}
1280 
1281 	return 0;
1282 }
1283 
1284 static int
1285 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns)
1286 {
1287 	struct nvme_bdev *bdev;
1288 	int rc;
1289 
1290 	bdev = calloc(1, sizeof(*bdev));
1291 	if (!bdev) {
1292 		SPDK_ERRLOG("bdev calloc() failed\n");
1293 		return -ENOMEM;
1294 	}
1295 
1296 	bdev->nvme_ns = nvme_ns;
1297 	bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL;
1298 
1299 	rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr,
1300 			      nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev);
1301 	if (rc != 0) {
1302 		SPDK_ERRLOG("Failed to create NVMe disk\n");
1303 		free(bdev);
1304 		return rc;
1305 	}
1306 
1307 	nvme_ns->ref++;
1308 	nvme_ns->bdev = bdev;
1309 
1310 	return 0;
1311 }
1312 
1313 static void
1314 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1315 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
1316 {
1317 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1318 	struct spdk_nvme_ns	*ns;
1319 	int			rc = 0;
1320 
1321 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1322 	if (!ns) {
1323 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
1324 		rc = -EINVAL;
1325 		goto done;
1326 	}
1327 
1328 	nvme_ns->ns = ns;
1329 	nvme_ns->ref = 1;
1330 
1331 	rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns);
1332 done:
1333 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1334 }
1335 
1336 static bool
1337 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1338 		 struct spdk_nvme_ctrlr_opts *opts)
1339 {
1340 	struct nvme_probe_skip_entry *entry;
1341 
1342 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1343 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1344 			return false;
1345 		}
1346 	}
1347 
1348 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1349 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1350 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1351 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1352 
1353 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1354 
1355 	return true;
1356 }
1357 
1358 static void
1359 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1360 {
1361 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx;
1362 
1363 	if (spdk_nvme_cpl_is_error(cpl)) {
1364 		SPDK_WARNLOG("Abort failed. Resetting controller.\n");
1365 		_bdev_nvme_reset(nvme_bdev_ctrlr);
1366 	}
1367 }
1368 
1369 static void
1370 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1371 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1372 {
1373 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg;
1374 	union spdk_nvme_csts_register csts;
1375 	int rc;
1376 
1377 	assert(nvme_bdev_ctrlr->ctrlr == ctrlr);
1378 
1379 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1380 
1381 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
1382 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
1383 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
1384 	 * completion recursively.
1385 	 */
1386 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
1387 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1388 		if (csts.bits.cfs) {
1389 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1390 			_bdev_nvme_reset(nvme_bdev_ctrlr);
1391 			return;
1392 		}
1393 	}
1394 
1395 	switch (g_opts.action_on_timeout) {
1396 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1397 		if (qpair) {
1398 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1399 						       nvme_abort_cpl, nvme_bdev_ctrlr);
1400 			if (rc == 0) {
1401 				return;
1402 			}
1403 
1404 			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
1405 		}
1406 
1407 	/* FALLTHROUGH */
1408 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1409 		_bdev_nvme_reset(nvme_bdev_ctrlr);
1410 		break;
1411 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1412 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1413 		break;
1414 	default:
1415 		SPDK_ERRLOG("An invalid timeout action value is found.\n");
1416 		break;
1417 	}
1418 }
1419 
1420 void
1421 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ns *nvme_ns)
1422 {
1423 	nvme_bdev_ns_detach(nvme_ns);
1424 }
1425 
1426 static void
1427 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns)
1428 {
1429 	struct nvme_bdev *bdev;
1430 
1431 	bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1432 	if (bdev != NULL) {
1433 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1434 	}
1435 
1436 	nvme_ns->populated = false;
1437 
1438 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
1439 }
1440 
1441 static void
1442 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns,
1443 			      struct nvme_async_probe_ctx *ctx)
1444 {
1445 	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
1446 }
1447 
1448 static void
1449 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns)
1450 {
1451 	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
1452 }
1453 
1454 void
1455 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1456 				   struct nvme_bdev_ns *nvme_ns, int rc)
1457 {
1458 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_ns->ctrlr;
1459 
1460 	assert(nvme_bdev_ctrlr != NULL);
1461 
1462 	if (rc == 0) {
1463 		nvme_ns->populated = true;
1464 		pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1465 		nvme_bdev_ctrlr->ref++;
1466 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1467 	} else {
1468 		memset(nvme_ns, 0, sizeof(*nvme_ns));
1469 	}
1470 
1471 	if (ctx) {
1472 		ctx->populates_in_progress--;
1473 		if (ctx->populates_in_progress == 0) {
1474 			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
1475 		}
1476 	}
1477 }
1478 
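/*
 * Scan all namespace IDs of the controller: resize bdevs whose namespace
 * capacity changed, populate namespaces that became active, and depopulate
 * namespaces that are no longer active.  ctx (if any) tracks the number of
 * populate operations still in flight so the probe completion callback fires
 * only once all of them finish.
 */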
1479 static void
1480 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1481 			       struct nvme_async_probe_ctx *ctx)
1482 {
1483 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1484 	struct nvme_bdev_ns	*nvme_ns;
1485 	struct spdk_nvme_ns	*ns;
1486 	struct nvme_bdev	*bdev;
1487 	uint32_t		i;
1488 	int			rc;
1489 	uint64_t		num_sectors;
1490 	bool			ns_is_active;
1491 
1492 	if (ctx) {
1493 		/* Initialize this count to 1 to handle the populate functions
1494 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1495 		 */
1496 		ctx->populates_in_progress = 1;
1497 	}
1498 
1499 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1500 		uint32_t	nsid = i + 1;
1501 
1502 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
1503 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1504 
1505 		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) {
1506 			/* NS is still there but attributes may have changed */
1507 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1508 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
1509 			bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1510 			assert(bdev != NULL);
1511 			if (bdev->disk.blockcnt != num_sectors) {
1512 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
1513 					       nsid,
1514 					       bdev->disk.name,
1515 					       bdev->disk.blockcnt,
1516 					       num_sectors);
1517 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1518 				if (rc != 0) {
1519 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1520 						    bdev->disk.name, rc);
1521 				}
1522 			}
1523 		}
1524 
1525 		if (!nvme_ns->populated && ns_is_active) {
1526 			nvme_ns->id = nsid;
1527 			nvme_ns->ctrlr = nvme_bdev_ctrlr;
1528 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1529 				nvme_ns->type = NVME_BDEV_NS_OCSSD;
1530 			} else {
1531 				nvme_ns->type = NVME_BDEV_NS_STANDARD;
1532 			}
1533 
1534 			nvme_ns->bdev = NULL;
1535 
1536 			if (ctx) {
1537 				ctx->populates_in_progress++;
1538 			}
1539 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx);
1540 		}
1541 
1542 		if (nvme_ns->populated && !ns_is_active) {
1543 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1544 		}
1545 	}
1546 
1547 	if (ctx) {
1548 		/* Decrement this count now that the loop is over to account
1549 		 * for the one we started with.  If the count is then 0, we
1550 		 * know any populate_namespace functions completed immediately,
1551 		 * so we'll kick the callback here.
1552 		 */
1553 		ctx->populates_in_progress--;
1554 		if (ctx->populates_in_progress == 0) {
1555 			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
1556 		}
1557 	}
1558 
1559 }
1560 
1561 static void
1562 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1563 {
1564 	uint32_t i;
1565 	struct nvme_bdev_ns *nvme_ns;
1566 
1567 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1568 		uint32_t nsid = i + 1;
1569 
1570 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1571 		if (nvme_ns->populated) {
1572 			assert(nvme_ns->id == nsid);
1573 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1574 		}
1575 	}
1576 }
1577 
1578 static void
1579 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1580 {
1581 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1582 	union spdk_nvme_async_event_completion	event;
1583 
1584 	if (spdk_nvme_cpl_is_error(cpl)) {
1585 		SPDK_WARNLOG("AER request execute failed\n");
1586 		return;
1587 	}
1588 
1589 	event.raw = cpl->cdw0;
1590 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1591 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1592 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1593 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1594 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1595 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1596 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1597 	}
1598 }
1599 
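/*
 * Create the nvme_bdev_ctrlr bookkeeping for an attached controller: allocate
 * the namespace array and the initial trid entry, register the io_device for
 * per-channel qpairs, start the admin queue poller, and register the timeout
 * (when configured), AER, and remove callbacks.
 */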
1600 static int
1601 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
1602 		       const char *name,
1603 		       const struct spdk_nvme_transport_id *trid,
1604 		       uint32_t prchk_flags,
1605 		       struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr)
1606 {
1607 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1608 	struct nvme_bdev_ctrlr_trid *trid_entry;
1609 	uint32_t i;
1610 	int rc;
1611 
1612 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1613 	if (nvme_bdev_ctrlr == NULL) {
1614 		SPDK_ERRLOG("Failed to allocate device struct\n");
1615 		return -ENOMEM;
1616 	}
1617 
1618 	rc = pthread_mutex_init(&nvme_bdev_ctrlr->mutex, NULL);
1619 	if (rc != 0) {
1620 		goto err_init_mutex;
1621 	}
1622 
1623 	TAILQ_INIT(&nvme_bdev_ctrlr->trids);
1624 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1625 	if (nvme_bdev_ctrlr->num_ns != 0) {
1626 		nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1627 		if (!nvme_bdev_ctrlr->namespaces) {
1628 			SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
1629 			rc = -ENOMEM;
1630 			goto err_alloc_namespaces;
1631 		}
1632 	}
1633 
1634 	trid_entry = calloc(1, sizeof(*trid_entry));
1635 	if (trid_entry == NULL) {
1636 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
1637 		rc = -ENOMEM;
1638 		goto err_alloc_trid;
1639 	}
1640 
1641 	trid_entry->trid = *trid;
1642 
1643 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1644 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1645 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1646 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1647 			rc = -ENOMEM;
1648 			goto err_alloc_namespace;
1649 		}
1650 	}
1651 
1652 	nvme_bdev_ctrlr->thread = spdk_get_thread();
1653 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1654 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1655 	nvme_bdev_ctrlr->ref = 1;
1656 	nvme_bdev_ctrlr->connected_trid = &trid_entry->trid;
1657 	nvme_bdev_ctrlr->name = strdup(name);
1658 	if (nvme_bdev_ctrlr->name == NULL) {
1659 		rc = -ENOMEM;
1660 		goto err_alloc_name;
1661 	}
1662 
1663 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1664 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1665 		if (spdk_unlikely(rc != 0)) {
1666 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1667 			goto err_init_ocssd;
1668 		}
1669 	}
1670 
1671 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1672 
1673 	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
1674 				sizeof(struct nvme_io_channel),
1675 				name);
1676 
1677 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr,
1678 					       g_opts.nvme_adminq_poll_period_us);
1679 
1680 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1681 
1682 	if (g_opts.timeout_us > 0) {
1683 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1684 				timeout_cb, nvme_bdev_ctrlr);
1685 	}
1686 
1687 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1688 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr);
1689 
1690 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1691 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1692 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1693 		if (nvme_bdev_ctrlr->opal_dev == NULL) {
1694 			SPDK_ERRLOG("Failed to initialize Opal\n");
1695 		}
1696 	}
1697 
1698 	TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link);
1699 
1700 	if (_nvme_bdev_ctrlr != NULL) {
1701 		*_nvme_bdev_ctrlr = nvme_bdev_ctrlr;
1702 	}
1703 	return 0;
1704 
1705 err_init_ocssd:
1706 	free(nvme_bdev_ctrlr->name);
1707 err_alloc_name:
1708 err_alloc_namespace:
1709 	for (; i > 0; i--) {
1710 		free(nvme_bdev_ctrlr->namespaces[i - 1]);
1711 	}
1712 	free(trid_entry);
1713 err_alloc_trid:
1714 	free(nvme_bdev_ctrlr->namespaces);
1715 err_alloc_namespaces:
1716 	pthread_mutex_destroy(&nvme_bdev_ctrlr->mutex);
1717 err_init_mutex:
1718 	free(nvme_bdev_ctrlr);
1719 	return rc;
1720 }
1721 
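/*
 * Attach callback used when probing controllers.  If a probe context with
 * configured names is supplied, the matching name and PI check flags are used;
 * otherwise (hotplug) an auto-generated "HotInNvmeN" name is assigned.
 */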
1722 static void
1723 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1724 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1725 {
1726 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1727 	struct nvme_probe_ctx *ctx = cb_ctx;
1728 	char *name = NULL;
1729 	uint32_t prchk_flags = 0;
1730 	size_t i;
1731 	int rc;
1732 
1733 	if (ctx) {
1734 		for (i = 0; i < ctx->count; i++) {
1735 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1736 				prchk_flags = ctx->prchk_flags[i];
1737 				name = strdup(ctx->names[i]);
1738 				break;
1739 			}
1740 		}
1741 	} else {
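		/* There is no probe context, so this attach came from the hotplug
		 * poller; generate a name for the hot-inserted device.
		 */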
1742 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1743 	}
1744 	if (!name) {
1745 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1746 		return;
1747 	}
1748 
1749 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
1750 
1751 	rc = nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, &nvme_bdev_ctrlr);
1752 	if (rc != 0) {
1753 		SPDK_ERRLOG("Failed to create new NVMe controller\n");
1754 		free(name);
1755 		return;
1756 	}
1757 
1758 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1759 
1760 	free(name);
1761 }
1762 
1763 static void
1764 _nvme_bdev_ctrlr_destruct(void *ctx)
1765 {
1766 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx;
1767 
1768 	nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr);
1769 	nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1770 }
1771 
1772 static int
1773 _bdev_nvme_delete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool hotplug)
1774 {
1775 	struct nvme_probe_skip_entry *entry;
1776 
1777 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1778 
1779 	/* The controller's destruction was already started */
1780 	if (nvme_bdev_ctrlr->destruct) {
1781 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1782 		return 0;
1783 	}
1784 
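	/* For an explicit (non-hotplug) delete of a PCIe controller, remember its
	 * trid so that hotplug probing can skip re-attaching the same device.
	 */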
1785 	if (!hotplug &&
1786 	    nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1787 		entry = calloc(1, sizeof(*entry));
1788 		if (!entry) {
1789 			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1790 			return -ENOMEM;
1791 		}
1792 		entry->trid = *nvme_bdev_ctrlr->connected_trid;
1793 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
1794 	}
1795 
1796 	nvme_bdev_ctrlr->destruct = true;
1797 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1798 
1799 	_nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1800 
1801 	return 0;
1802 }
1803 
1804 static void
1805 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
1806 {
1807 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx;
1808 
1809 	_bdev_nvme_delete(nvme_bdev_ctrlr, true);
1810 }
1811 
1812 static int
1813 bdev_nvme_hotplug_probe(void *arg)
1814 {
1815 	if (g_hotplug_probe_ctx == NULL) {
1816 		spdk_poller_unregister(&g_hotplug_probe_poller);
1817 		return SPDK_POLLER_IDLE;
1818 	}
1819 
1820 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
1821 		g_hotplug_probe_ctx = NULL;
1822 		spdk_poller_unregister(&g_hotplug_probe_poller);
1823 	}
1824 
1825 	return SPDK_POLLER_BUSY;
1826 }
1827 
1828 static int
1829 bdev_nvme_hotplug(void *arg)
1830 {
1831 	struct spdk_nvme_transport_id trid_pcie;
1832 
1833 	if (g_hotplug_probe_ctx) {
1834 		return SPDK_POLLER_BUSY;
1835 	}
1836 
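	/* Hotplug probing is only done for the PCIe transport. */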
1837 	memset(&trid_pcie, 0, sizeof(trid_pcie));
1838 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
1839 
1840 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
1841 			      hotplug_probe_cb, attach_cb, NULL);
1842 
1843 	if (g_hotplug_probe_ctx) {
1844 		assert(g_hotplug_probe_poller == NULL);
1845 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
1846 	}
1847 
1848 	return SPDK_POLLER_BUSY;
1849 }
1850 
1851 void
1852 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
1853 {
1854 	*opts = g_opts;
1855 }
1856 
1857 int
1858 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
1859 {
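	/* The global options are consumed at controller attach time, so refuse to
	 * change them once any controller already exists.
	 */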
1860 	if (g_bdev_nvme_init_thread != NULL) {
1861 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1862 			return -EPERM;
1863 		}
1864 	}
1865 
1866 	g_opts = *opts;
1867 
1868 	return 0;
1869 }
1870 
1871 struct set_nvme_hotplug_ctx {
1872 	uint64_t period_us;
1873 	bool enabled;
1874 	spdk_msg_fn fn;
1875 	void *fn_ctx;
1876 };
1877 
1878 static void
1879 set_nvme_hotplug_period_cb(void *_ctx)
1880 {
1881 	struct set_nvme_hotplug_ctx *ctx = _ctx;
1882 
1883 	spdk_poller_unregister(&g_hotplug_poller);
1884 	if (ctx->enabled) {
1885 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
1886 	}
1887 
1888 	g_nvme_hotplug_poll_period_us = ctx->period_us;
1889 	g_nvme_hotplug_enabled = ctx->enabled;
1890 	if (ctx->fn) {
1891 		ctx->fn(ctx->fn_ctx);
1892 	}
1893 
1894 	free(ctx);
1895 }
1896 
1897 int
1898 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
1899 {
1900 	struct set_nvme_hotplug_ctx *ctx;
1901 
1902 	if (enabled == true && !spdk_process_is_primary()) {
1903 		return -EPERM;
1904 	}
1905 
1906 	ctx = calloc(1, sizeof(*ctx));
1907 	if (ctx == NULL) {
1908 		return -ENOMEM;
1909 	}
1910 
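	/* A period of 0 selects the default poll period; cap it at the allowed maximum. */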
1911 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
1912 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
1913 	ctx->enabled = enabled;
1914 	ctx->fn = cb;
1915 	ctx->fn_ctx = cb_ctx;
1916 
1917 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
1918 	return 0;
1919 }
1920 
1921 static void
1922 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1923 {
1924 	if (ctx->cb_fn) {
1925 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1926 	}
1927 
1928 	ctx->namespaces_populated = true;
1929 	if (ctx->probe_done) {
1930 		/* The probe was already completed, so we need to free the context
1931 		 * here.  This can happen for cases like OCSSD, where we need to
1932 		 * send additional commands to the SSD after attach.
1933 		 */
1934 		free(ctx);
1935 	}
1936 }
1937 
1938 static void
1939 nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1940 				    struct nvme_async_probe_ctx *ctx)
1941 {
1942 	struct nvme_bdev_ns	*nvme_ns;
1943 	struct nvme_bdev	*nvme_bdev;
1944 	uint32_t		i, nsid;
1945 	size_t			j;
1946 
1947 	assert(nvme_bdev_ctrlr != NULL);
1948 
1949 	/*
1950 	 * Report the new bdevs that were created in this call.
1951 	 * There can be more than one bdev per NVMe controller.
1952 	 */
1953 	j = 0;
1954 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1955 		nsid = i + 1;
1956 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1957 		if (!nvme_ns->populated) {
1958 			continue;
1959 		}
1960 		assert(nvme_ns->id == nsid);
1961 		nvme_bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1962 		if (nvme_bdev == NULL) {
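			/* Only OCSSD namespaces are expected to lack a standard NVMe bdev here;
			 * they are exposed through the OCSSD module instead.
			 */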
1963 			assert(nvme_ns->type == NVME_BDEV_NS_OCSSD);
1964 			continue;
1965 		}
1966 		if (j < ctx->count) {
1967 			ctx->names[j] = nvme_bdev->disk.name;
1968 			j++;
1969 		} else {
1970 			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %zu. Unable to return all names of created bdevs\n",
1971 				    (size_t)ctx->count);
1972 			populate_namespaces_cb(ctx, 0, -ERANGE);
1973 			return;
1974 		}
1975 	}
1976 
1977 	populate_namespaces_cb(ctx, j, 0);
1978 }
1979 
1980 static bool
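/* Compare two namespaces by their NGUIDs; returns true if they differ. */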
1981 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
1982 {
1983 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
1984 
1985 	nsdata1 = spdk_nvme_ns_get_data(ns1);
1986 	nsdata2 = spdk_nvme_ns_get_data(ns2);
1987 
1988 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) != 0;
1989 }
1990 
1991 static int
1992 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_ctrlr *new_ctrlr,
1993 		   struct spdk_nvme_transport_id *trid)
1994 {
1995 	uint32_t			i, nsid;
1996 	struct nvme_bdev_ns		*nvme_ns;
1997 	struct spdk_nvme_ns		*new_ns;
1998 	struct nvme_bdev_ctrlr_trid	*new_trid, *tmp_trid;
1999 	int				rc = 0;
2000 
2001 	assert(nvme_bdev_ctrlr != NULL);
2002 
2003 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2004 		SPDK_ERRLOG("PCIe failover is not supported.\n");
2005 		return -ENOTSUP;
2006 	}
2007 
2008 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
2009 
2010 	/* Currently we only support failover to the same transport type. */
2011 	if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) {
2012 		rc = -EINVAL;
2013 		goto exit;
2014 	}
2015 
2016 	/* Currently we only support failover to the same NQN. */
2017 	if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
2018 		rc = -EINVAL;
2019 		goto exit;
2020 	}
2021 
2022 	/* Skip all the other checks if we've already registered this path. */
2023 	TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) {
2024 		if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) {
2025 			rc = -EEXIST;
2026 			goto exit;
2027 		}
2028 	}
2029 
2030 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) {
2031 		rc = -EINVAL;
2032 		goto exit;
2033 	}
2034 
2035 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
2036 		nsid = i + 1;
2037 
2038 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
2039 		if (!nvme_ns->populated) {
2040 			continue;
2041 		}
2042 
2043 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
2044 		assert(new_ns != NULL);
2045 
2046 		if (bdev_nvme_compare_ns(nvme_ns->ns, new_ns) != 0) {
2047 			rc = -EINVAL;
2048 			goto exit;
2049 		}
2050 	}
2051 
2052 	new_trid = calloc(1, sizeof(*new_trid));
2053 	if (new_trid == NULL) {
2054 		rc = -ENOMEM;
2055 		goto exit;
2056 	}
2057 	new_trid->trid = *trid;
2058 	new_trid->is_failed = false;
2059 
2060 	TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) {
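	/* Insert the new path before the first already-failed path so that healthy
	 * paths are tried first during failover.
	 */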
2061 		if (tmp_trid->is_failed) {
2062 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
2063 			goto exit;
2064 		}
2065 	}
2066 
2067 	TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link);
2068 
2069 exit:
2070 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2071 	return rc;
2072 }
2073 
2074 static void
2075 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2076 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2077 {
2078 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
2079 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
2080 	struct nvme_async_probe_ctx *ctx;
2081 	int rc;
2082 
2083 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2084 	ctx->ctrlr_attached = true;
2085 
2086 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
2087 	if (nvme_bdev_ctrlr) {
2088 		/* This is the case where a secondary path is added to an existing
2089 		 * nvme_bdev_ctrlr for failover.  After verifying that it can access the same
2090 		 * namespaces as the primary path, it is disconnected until failover occurs.
2091 		 */
2092 		rc = bdev_nvme_add_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid);
2093 
2094 		spdk_nvme_detach(ctrlr);
2095 		goto exit;
2096 	}
2097 
2098 	rc = nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags,
2099 				    &nvme_bdev_ctrlr);
2100 	if (rc) {
2101 		SPDK_ERRLOG("Failed to create new NVMe controller\n");
2102 		goto exit;
2103 	}
2104 
2105 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
2106 	return;
2107 
2108 exit:
2109 	populate_namespaces_cb(ctx, 0, rc);
2110 }
2111 
2112 static int
2113 bdev_nvme_async_poll(void *arg)
2114 {
2115 	struct nvme_async_probe_ctx	*ctx = arg;
2116 	int				rc;
2117 
2118 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
2119 	if (spdk_unlikely(rc != -EAGAIN)) {
2120 		ctx->probe_done = true;
2121 		spdk_poller_unregister(&ctx->poller);
2122 		if (!ctx->ctrlr_attached) {
2123 			/* The probe is done, but no controller was attached.
2124 			 * That means we had a failure, so report -EIO back to
2125 			 * the caller (usually the RPC). populate_namespaces_cb()
2126 			 * will take care of freeing the nvme_async_probe_ctx.
2127 			 */
2128 			populate_namespaces_cb(ctx, 0, -EIO);
2129 		} else if (ctx->namespaces_populated) {
2130 			/* The namespaces for the attached controller were all
2131 			 * populated and the response was already sent to the
2132 			 * caller (usually the RPC).  So free the context here.
2133 			 */
2134 			free(ctx);
2135 		}
2136 	}
2137 
2138 	return SPDK_POLLER_BUSY;
2139 }
2140 
2141 int
2142 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
2143 		 struct spdk_nvme_host_id *hostid,
2144 		 const char *base_name,
2145 		 const char **names,
2146 		 uint32_t count,
2147 		 const char *hostnqn,
2148 		 uint32_t prchk_flags,
2149 		 spdk_bdev_create_nvme_fn cb_fn,
2150 		 void *cb_ctx,
2151 		 struct spdk_nvme_ctrlr_opts *opts)
2152 {
2153 	struct nvme_probe_skip_entry	*entry, *tmp;
2154 	struct nvme_async_probe_ctx	*ctx;
2155 
2156 	/* TODO expand this check to include both the host and target TRIDs.
2157 	 * Only if both are the same should we fail.
2158 	 */
2159 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
2160 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
2161 		return -EEXIST;
2162 	}
2163 
2164 	ctx = calloc(1, sizeof(*ctx));
2165 	if (!ctx) {
2166 		return -ENOMEM;
2167 	}
2168 	ctx->base_name = base_name;
2169 	ctx->names = names;
2170 	ctx->count = count;
2171 	ctx->cb_fn = cb_fn;
2172 	ctx->cb_ctx = cb_ctx;
2173 	ctx->prchk_flags = prchk_flags;
2174 	ctx->trid = *trid;
2175 
2176 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
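		/* The user explicitly asked to attach this PCIe device, so drop any entry
		 * that would make hotplug probing skip it.
		 */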
2177 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
2178 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2179 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2180 				free(entry);
2181 				break;
2182 			}
2183 		}
2184 	}
2185 
2186 	if (opts) {
2187 		memcpy(&ctx->opts, opts, sizeof(*opts));
2188 	} else {
2189 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
2190 	}
2191 
2192 	ctx->opts.transport_retry_count = g_opts.retry_count;
2193 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
2194 
2195 	if (hostnqn) {
2196 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
2197 	}
2198 
2199 	if (hostid->hostaddr[0] != '\0') {
2200 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
2201 	}
2202 
2203 	if (hostid->hostsvcid[0] != '\0') {
2204 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
2205 	}
2206 
2207 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
2208 	if (ctx->probe_ctx == NULL) {
2209 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
2210 		free(ctx);
2211 		return -ENODEV;
2212 	}
2213 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
2214 
2215 	return 0;
2216 }
2217 
2218 int
2219 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid)
2220 {
2221 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
2222 	struct nvme_bdev_ctrlr_trid	*ctrlr_trid, *tmp_trid;
2223 
2224 	if (name == NULL) {
2225 		return -EINVAL;
2226 	}
2227 
2228 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
2229 	if (nvme_bdev_ctrlr == NULL) {
2230 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2231 		return -ENODEV;
2232 	}
2233 
2234 	/* case 1: remove the controller itself. */
2235 	if (trid == NULL) {
2236 		return _bdev_nvme_delete(nvme_bdev_ctrlr, false);
2237 	}
2238 
2239 	/* case 2: we are currently using the path to be removed. */
2240 	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
2241 		ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
2242 		assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid);
2243 		/* case 2A: the current path is the only path. */
2244 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
2245 			return _bdev_nvme_delete(nvme_bdev_ctrlr, false);
2246 		}
2247 
2248 		/* case 2B: there is an alternative path. */
2249 		return bdev_nvme_failover(nvme_bdev_ctrlr, true);
2250 	}
2251 	/* case 3: We are not using the specified path. */
2252 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) {
2253 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
2254 			TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
2255 			free(ctrlr_trid);
2256 			return 0;
2257 		}
2258 	}
2259 
2260 	/* case 3A: The address isn't even in the registered list. */
2261 	return -ENXIO;
2262 }
2263 
2264 static int
2265 bdev_nvme_library_init(void)
2266 {
2267 	g_bdev_nvme_init_thread = spdk_get_thread();
2268 
2269 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
2270 				bdev_nvme_poll_group_destroy_cb,
2271 				sizeof(struct nvme_bdev_poll_group),  "bdev_nvme_poll_groups");
2272 
2273 	return 0;
2274 }
2275 
2276 static void
2277 bdev_nvme_library_fini(void)
2278 {
2279 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
2280 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2281 
2282 	spdk_poller_unregister(&g_hotplug_poller);
2283 	free(g_hotplug_probe_ctx);
2284 	g_hotplug_probe_ctx = NULL;
2285 
2286 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2287 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2288 		free(entry);
2289 	}
2290 
2291 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2292 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
2293 		pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
2294 		if (nvme_bdev_ctrlr->destruct) {
2295 			/* This controller's destruction was already started
2296 			 * before the application started shutting down
2297 			 */
2298 			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2299 			continue;
2300 		}
2301 		nvme_bdev_ctrlr->destruct = true;
2302 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2303 
2304 		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct,
2305 				     nvme_bdev_ctrlr);
2306 	}
2307 
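	/* If controllers are still being destroyed, module finish completes later,
	 * after the last controller finishes its destruct path.
	 */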
2308 	g_bdev_nvme_module_finish = true;
2309 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2310 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2311 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
2312 		spdk_bdev_module_finish_done();
2313 		return;
2314 	}
2315 
2316 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2317 }
2318 
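/* When the controller reports a Protection Information error, re-run DIF/DIX
 * verification in software to locate and log the failing block.
 */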
2319 static void
2320 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
2321 {
2322 	struct spdk_bdev *bdev = bdev_io->bdev;
2323 	struct spdk_dif_ctx dif_ctx;
2324 	struct spdk_dif_error err_blk = {};
2325 	int rc;
2326 
2327 	rc = spdk_dif_ctx_init(&dif_ctx,
2328 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2329 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2330 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2331 	if (rc != 0) {
2332 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2333 		return;
2334 	}
2335 
2336 	if (bdev->md_interleave) {
2337 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2338 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2339 	} else {
2340 		struct iovec md_iov = {
2341 			.iov_base	= bdev_io->u.bdev.md_buf,
2342 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2343 		};
2344 
2345 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2346 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2347 	}
2348 
2349 	if (rc != 0) {
2350 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2351 			    err_blk.err_type, err_blk.err_offset);
2352 	} else {
2353 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2354 	}
2355 }
2356 
2357 static void
2358 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2359 {
2360 	struct nvme_bdev_io *bio = ref;
2361 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2362 
2363 	if (spdk_nvme_cpl_is_success(cpl)) {
2364 		/* Run PI verification for read data buffer. */
2365 		bdev_nvme_verify_pi_error(bdev_io);
2366 	}
2367 
2368 	/* Return original completion status */
2369 	spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
2370 					  bio->cpl.status.sc);
2371 }
2372 
2373 static void
2374 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2375 {
2376 	struct nvme_bdev_io *bio = ref;
2377 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2378 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2379 	struct nvme_io_channel *nvme_ch;
2380 	struct nvme_bdev_ns *nvme_ns;
2381 	struct spdk_nvme_qpair *qpair;
2382 	int ret;
2383 
2384 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2385 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2386 			    cpl->status.sct, cpl->status.sc);
2387 
2388 		/* Save completion status to use after verifying PI error. */
2389 		bio->cpl = *cpl;
2390 
2391 		nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2392 
2393 		if (spdk_likely(bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
2394 			/* Read without PI checking to verify PI error. */
2395 			ret = bdev_nvme_no_pi_readv(nvme_ns->ns,
2396 						    qpair,
2397 						    bio,
2398 						    bdev_io->u.bdev.iovs,
2399 						    bdev_io->u.bdev.iovcnt,
2400 						    bdev_io->u.bdev.md_buf,
2401 						    bdev_io->u.bdev.num_blocks,
2402 						    bdev_io->u.bdev.offset_blocks);
2403 			if (ret == 0) {
2404 				return;
2405 			}
2406 		}
2407 	}
2408 
2409 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2410 }
2411 
2412 static void
2413 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2414 {
2415 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2416 
2417 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2418 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2419 			    cpl->status.sct, cpl->status.sc);
2420 		/* Run PI verification for write data buffer if PI error is detected. */
2421 		bdev_nvme_verify_pi_error(bdev_io);
2422 	}
2423 
2424 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2425 }
2426 
2427 static void
2428 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2429 {
2430 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2431 
2432 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2433 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2434 			    cpl->status.sct, cpl->status.sc);
2435 		/* Run PI verification for compare data buffer if PI error is detected. */
2436 		bdev_nvme_verify_pi_error(bdev_io);
2437 	}
2438 
2439 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2440 }
2441 
2442 static void
2443 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2444 {
2445 	struct nvme_bdev_io *bio = ref;
2446 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2447 
2448 	/* Compare operation completion */
2449 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2450 		/* Save compare result for write callback */
2451 		bio->cpl = *cpl;
2452 		return;
2453 	}
2454 
2455 	/* Write operation completion */
2456 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2457 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2458 		 * complete the IO with the compare operation's status.
2459 		 */
2460 		if (!spdk_nvme_cpl_is_error(cpl)) {
2461 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2462 		}
2463 
2464 		spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2465 	} else {
2466 		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2467 	}
2468 }
2469 
2470 static void
2471 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2472 {
2473 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2474 
2475 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2476 }
2477 
2478 static void
2479 bdev_nvme_admin_passthru_completion(void *ctx)
2480 {
2481 	struct nvme_bdev_io *bio = ctx;
2482 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2483 
2484 	spdk_bdev_io_complete_nvme_status(bdev_io,
2485 					  bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2486 }
2487 
2488 static void
2489 bdev_nvme_abort_completion(void *ctx)
2490 {
2491 	struct nvme_bdev_io *bio = ctx;
2492 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2493 
2494 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
2495 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2496 	} else {
2497 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2498 	}
2499 }
2500 
2501 static void
2502 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
2503 {
2504 	struct nvme_bdev_io *bio = ref;
2505 
2506 	bio->cpl = *cpl;
2507 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2508 }
2509 
2510 static void
2511 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2512 {
2513 	struct nvme_bdev_io *bio = ref;
2514 
2515 	bio->cpl = *cpl;
2516 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2517 }
2518 
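/* SGL callbacks used with the spdk_nvme_ns_cmd_*v_with_md() APIs: reset_sgl
 * positions the iovec cursor at a byte offset into the payload, and next_sge
 * returns the current segment and advances the cursor.
 */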
2519 static void
2520 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2521 {
2522 	struct nvme_bdev_io *bio = ref;
2523 	struct iovec *iov;
2524 
2525 	bio->iov_offset = sgl_offset;
2526 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2527 		iov = &bio->iovs[bio->iovpos];
2528 		if (bio->iov_offset < iov->iov_len) {
2529 			break;
2530 		}
2531 
2532 		bio->iov_offset -= iov->iov_len;
2533 	}
2534 }
2535 
2536 static int
2537 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2538 {
2539 	struct nvme_bdev_io *bio = ref;
2540 	struct iovec *iov;
2541 
2542 	assert(bio->iovpos < bio->iovcnt);
2543 
2544 	iov = &bio->iovs[bio->iovpos];
2545 
2546 	*address = iov->iov_base;
2547 	*length = iov->iov_len;
2548 
2549 	if (bio->iov_offset) {
2550 		assert(bio->iov_offset <= iov->iov_len);
2551 		*address += bio->iov_offset;
2552 		*length -= bio->iov_offset;
2553 	}
2554 
2555 	bio->iov_offset += *length;
2556 	if (bio->iov_offset == iov->iov_len) {
2557 		bio->iovpos++;
2558 		bio->iov_offset = 0;
2559 	}
2560 
2561 	return 0;
2562 }
2563 
2564 static void
2565 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2566 {
2567 	struct nvme_bdev_io *bio = ref;
2568 	struct iovec *iov;
2569 
2570 	bio->fused_iov_offset = sgl_offset;
2571 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2572 		iov = &bio->fused_iovs[bio->fused_iovpos];
2573 		if (bio->fused_iov_offset < iov->iov_len) {
2574 			break;
2575 		}
2576 
2577 		bio->fused_iov_offset -= iov->iov_len;
2578 	}
2579 }
2580 
2581 static int
2582 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2583 {
2584 	struct nvme_bdev_io *bio = ref;
2585 	struct iovec *iov;
2586 
2587 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2588 
2589 	iov = &bio->fused_iovs[bio->fused_iovpos];
2590 
2591 	*address = iov->iov_base;
2592 	*length = iov->iov_len;
2593 
2594 	if (bio->fused_iov_offset) {
2595 		assert(bio->fused_iov_offset <= iov->iov_len);
2596 		*address += bio->fused_iov_offset;
2597 		*length -= bio->fused_iov_offset;
2598 	}
2599 
2600 	bio->fused_iov_offset += *length;
2601 	if (bio->fused_iov_offset == iov->iov_len) {
2602 		bio->fused_iovpos++;
2603 		bio->fused_iov_offset = 0;
2604 	}
2605 
2606 	return 0;
2607 }
2608 
2609 static int
2610 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2611 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2612 		      void *md, uint64_t lba_count, uint64_t lba)
2613 {
2614 	int rc;
2615 
2616 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
2617 		      lba_count, lba);
2618 
2619 	bio->iovs = iov;
2620 	bio->iovcnt = iovcnt;
2621 	bio->iovpos = 0;
2622 	bio->iov_offset = 0;
2623 
2624 	rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2625 					    bdev_nvme_no_pi_readv_done, bio, 0,
2626 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2627 					    md, 0, 0);
2628 
2629 	if (rc != 0 && rc != -ENOMEM) {
2630 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2631 	}
2632 	return rc;
2633 }
2634 
2635 static int
2636 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2637 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2638 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2639 {
2640 	int rc;
2641 
2642 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2643 		      lba_count, lba);
2644 
2645 	bio->iovs = iov;
2646 	bio->iovcnt = iovcnt;
2647 	bio->iovpos = 0;
2648 	bio->iov_offset = 0;
2649 
2650 	if (iovcnt == 1) {
2651 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
2652 						   lba_count,
2653 						   bdev_nvme_readv_done, bio,
2654 						   flags,
2655 						   0, 0);
2656 	} else {
2657 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2658 						    bdev_nvme_readv_done, bio, flags,
2659 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2660 						    md, 0, 0);
2661 	}
2662 
2663 	if (rc != 0 && rc != -ENOMEM) {
2664 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2665 	}
2666 	return rc;
2667 }
2668 
2669 static int
2670 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2671 		 struct nvme_bdev_io *bio,
2672 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2673 		 uint32_t flags)
2674 {
2675 	int rc;
2676 
2677 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2678 		      lba_count, lba);
2679 
2680 	bio->iovs = iov;
2681 	bio->iovcnt = iovcnt;
2682 	bio->iovpos = 0;
2683 	bio->iov_offset = 0;
2684 
2685 	if (iovcnt == 1) {
2686 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
2687 						    lba_count,
2688 						    bdev_nvme_writev_done, bio,
2689 						    flags,
2690 						    0, 0);
2691 	} else {
2692 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
2693 						     bdev_nvme_writev_done, bio, flags,
2694 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2695 						     md, 0, 0);
2696 	}
2697 
2698 	if (rc != 0 && rc != -ENOMEM) {
2699 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2700 	}
2701 	return rc;
2702 }
2703 
2704 static int
2705 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2706 		   struct nvme_bdev_io *bio,
2707 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2708 		   uint32_t flags)
2709 {
2710 	int rc;
2711 
2712 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2713 		      lba_count, lba);
2714 
2715 	bio->iovs = iov;
2716 	bio->iovcnt = iovcnt;
2717 	bio->iovpos = 0;
2718 	bio->iov_offset = 0;
2719 
2720 	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
2721 					       bdev_nvme_comparev_done, bio, flags,
2722 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2723 					       md, 0, 0);
2724 
2725 	if (rc != 0 && rc != -ENOMEM) {
2726 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
2727 	}
2728 	return rc;
2729 }
2730 
2731 static int
2732 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2733 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
2734 			      struct iovec *write_iov, int write_iovcnt,
2735 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2736 {
2737 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2738 	int rc;
2739 
2740 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2741 		      lba_count, lba);
2742 
2743 	bio->iovs = cmp_iov;
2744 	bio->iovcnt = cmp_iovcnt;
2745 	bio->iovpos = 0;
2746 	bio->iov_offset = 0;
2747 	bio->fused_iovs = write_iov;
2748 	bio->fused_iovcnt = write_iovcnt;
2749 	bio->fused_iovpos = 0;
2750 	bio->fused_iov_offset = 0;
2751 
2752 	if (bdev_io->num_retries == 0) {
2753 		bio->first_fused_submitted = false;
2754 	}
2755 
2756 	if (!bio->first_fused_submitted) {
2757 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2758 		memset(&bio->cpl, 0, sizeof(bio->cpl));
2759 
2760 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
2761 						       bdev_nvme_comparev_and_writev_done, bio, flags,
2762 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
2763 		if (rc == 0) {
2764 			bio->first_fused_submitted = true;
2765 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2766 		} else {
2767 			if (rc != -ENOMEM) {
2768 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
2769 			}
2770 			return rc;
2771 		}
2772 	}
2773 
2774 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
2775 
2776 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
2777 					     bdev_nvme_comparev_and_writev_done, bio, flags,
2778 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
2779 	if (rc != 0 && rc != -ENOMEM) {
2780 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
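		/* The compare half of the fused pair was already submitted and cannot be
		 * unwound here, so do not report the write submission error to the caller.
		 */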
2781 		rc = 0;
2782 	}
2783 
2784 	return rc;
2785 }
2786 
2787 static int
2788 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2789 		struct nvme_bdev_io *bio,
2790 		uint64_t offset_blocks,
2791 		uint64_t num_blocks)
2792 {
2793 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
2794 	struct spdk_nvme_dsm_range *range;
2795 	uint64_t offset, remaining;
2796 	uint64_t num_ranges_u64;
2797 	uint16_t num_ranges;
2798 	int rc;
2799 
2800 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
2801 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2802 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
2803 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
2804 		return -EINVAL;
2805 	}
2806 	num_ranges = (uint16_t)num_ranges_u64;
2807 
2808 	offset = offset_blocks;
2809 	remaining = num_blocks;
2810 	range = &dsm_ranges[0];
2811 
2812 	/* Fill max-size ranges until the remaining blocks fit into one range */
2813 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
2814 		range->attributes.raw = 0;
2815 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2816 		range->starting_lba = offset;
2817 
2818 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2819 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2820 		range++;
2821 	}
2822 
2823 	/* Final range describes the remaining blocks */
2824 	range->attributes.raw = 0;
2825 	range->length = remaining;
2826 	range->starting_lba = offset;
2827 
2828 	rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair,
2829 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
2830 			dsm_ranges, num_ranges,
2831 			bdev_nvme_queued_done, bio);
2832 
2833 	return rc;
2834 }
2835 
2836 static int
2837 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
2838 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2839 {
2840 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ch->ctrlr->ctrlr);
2841 
2842 	if (nbytes > max_xfer_size) {
2843 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2844 		return -EINVAL;
2845 	}
2846 
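	/* Admin completions are polled on the controller's thread, so remember the
	 * submitting thread in order to complete the bdev_io back on it.
	 */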
2847 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2848 
2849 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ch->ctrlr->ctrlr, cmd, buf,
2850 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
2851 }
2852 
2853 static int
2854 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2855 		      struct nvme_bdev_io *bio,
2856 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2857 {
2858 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
2859 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2860 
2861 	if (nbytes > max_xfer_size) {
2862 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2863 		return -EINVAL;
2864 	}
2865 
2866 	/*
2867 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2868 	 * so fill it out automatically.
2869 	 */
2870 	cmd->nsid = spdk_nvme_ns_get_id(ns);
2871 
2872 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
2873 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
2874 }
2875 
2876 static int
2877 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2878 			 struct nvme_bdev_io *bio,
2879 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
2880 {
2881 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
2882 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
2883 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2884 
2885 	if (nbytes > max_xfer_size) {
2886 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2887 		return -EINVAL;
2888 	}
2889 
2890 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
2891 		SPDK_ERRLOG("invalid meta data buffer size\n");
2892 		return -EINVAL;
2893 	}
2894 
2895 	/*
2896 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2897 	 * so fill it out automatically.
2898 	 */
2899 	cmd->nsid = spdk_nvme_ns_get_id(ns);
2900 
2901 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
2902 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
2903 }
2904 
2905 static void
2906 bdev_nvme_abort_admin_cmd(void *ctx)
2907 {
2908 	struct nvme_bdev_io *bio = ctx;
2909 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2910 	struct nvme_io_channel *nvme_ch;
2911 	struct nvme_bdev_io *bio_to_abort;
2912 	int rc;
2913 
2914 	nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2915 	bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
2916 
2917 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
2918 					   NULL,
2919 					   bio_to_abort,
2920 					   bdev_nvme_abort_done, bio);
2921 	if (rc == -ENOENT) {
2922 		/* No admin command to abort was found in the admin qpair.  Fail the abort:
2923 		 * bit 0 of cdw0 set to 1 indicates the command was not aborted.
2924 		 */
2925 		bio->cpl.cdw0 |= 1U;
2926 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
2927 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2928 
2929 		spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2930 	}
2931 }
2932 
2933 static int
2934 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
2935 		struct nvme_bdev_io *bio_to_abort)
2936 {
2937 	int rc;
2938 
2939 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2940 
2941 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
2942 					   nvme_ch->qpair,
2943 					   bio_to_abort,
2944 					   bdev_nvme_abort_done, bio);
2945 	if (rc == -ENOENT) {
2946 		/* If no matching command was found in the I/O qpair, the target command
2947 		 * may be an admin command.  Forward the abort to the controller's thread
2948 		 * so that only a single thread attempts to abort admin commands.
2949 		 */
2950 		spdk_thread_send_msg(nvme_ch->ctrlr->thread,
2951 				     bdev_nvme_abort_admin_cmd, bio);
2952 		rc = 0;
2953 	}
2954 
2955 	return rc;
2956 }
2957 
2958 static void
2959 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
2960 		struct nvme_bdev_ns *nvme_ns)
2961 {
2962 	/* nop */
2963 }
2964 
2965 static void
2966 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns)
2967 {
2968 	g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns);
2969 }
2970 
2971 static void
2972 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
2973 {
2974 	const char	*action;
2975 
2976 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
2977 		action = "reset";
2978 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
2979 		action = "abort";
2980 	} else {
2981 		action = "none";
2982 	}
2983 
2984 	spdk_json_write_object_begin(w);
2985 
2986 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
2987 
2988 	spdk_json_write_named_object_begin(w, "params");
2989 	spdk_json_write_named_string(w, "action_on_timeout", action);
2990 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
2991 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
2992 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
2993 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
2994 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
2995 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
2996 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
2997 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
2998 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
2999 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
3000 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
3001 	spdk_json_write_object_end(w);
3002 
3003 	spdk_json_write_object_end(w);
3004 }
3005 
3006 static void
3007 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w,
3008 			    struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
3009 {
3010 	struct spdk_nvme_transport_id	*trid;
3011 
3012 	trid = nvme_bdev_ctrlr->connected_trid;
3013 
3014 	spdk_json_write_object_begin(w);
3015 
3016 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
3017 
3018 	spdk_json_write_named_object_begin(w, "params");
3019 	spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
3020 	nvme_bdev_dump_trid_json(trid, w);
3021 	spdk_json_write_named_bool(w, "prchk_reftag",
3022 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
3023 	spdk_json_write_named_bool(w, "prchk_guard",
3024 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
3025 
3026 	spdk_json_write_object_end(w);
3027 
3028 	spdk_json_write_object_end(w);
3029 }
3030 
3031 static void
3032 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
3033 {
3034 	spdk_json_write_object_begin(w);
3035 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
3036 
3037 	spdk_json_write_named_object_begin(w, "params");
3038 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
3039 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
3040 	spdk_json_write_object_end(w);
3041 
3042 	spdk_json_write_object_end(w);
3043 }
3044 
3045 static int
3046 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
3047 {
3048 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
3049 	uint32_t		nsid;
3050 
3051 	bdev_nvme_opts_config_json(w);
3052 
3053 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3054 
3055 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
3056 		nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr);
3057 
3058 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
3059 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
3060 				continue;
3061 			}
3062 
3063 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
3064 		}
3065 	}
3066 
3067 	/* Dump as last parameter to give all NVMe bdevs chance to be constructed
3068 	 * before enabling hotplug poller.
3069 	 */
3070 	bdev_nvme_hotplug_config_json(w);
3071 
3072 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3073 	return 0;
3074 }
3075 
3076 struct spdk_nvme_ctrlr *
3077 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
3078 {
3079 	if (!bdev || bdev->module != &nvme_if) {
3080 		return NULL;
3081 	}
3082 
3083 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
3084 }
3085 
3086 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
3087