xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision 9239ed33f6d988b0ed361db64ccd27eacdd367e6)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/config.h"
40 #include "spdk/endian.h"
41 #include "spdk/bdev.h"
42 #include "spdk/json.h"
43 #include "spdk/nvme.h"
44 #include "spdk/nvme_ocssd.h"
45 #include "spdk/thread.h"
46 #include "spdk/string.h"
47 #include "spdk/util.h"
48 
49 #include "spdk/bdev_module.h"
50 #include "spdk/log.h"
51 
/* Initial value of g_opts.delay_cmd_submit. */
#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
/* Initial value of g_opts.keep_alive_timeout_ms. */
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
54 
55 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
56 
/*
 * Per-I/O driver context of the NVMe bdev module.  Its size is what
 * bdev_nvme_get_ctx_size() reports, and the submit paths obtain it by casting
 * spdk_bdev_io::driver_ctx.
 */
struct nvme_bdev_io {
	/** Array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/**
	 * Array of iovecs for the fused command.
	 * NOTE(review): presumably the write half of a compare-and-write - confirm.
	 */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current position in the fused_iovs array. */
	int fused_iovpos;

	/** Offset in current fused iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;
};
91 
/*
 * Context describing a set of controllers to probe.  Every per-controller
 * array has room for NVME_MAX_CONTROLLERS entries.
 * NOTE(review): presumably the arrays are parallel and `count` is the number
 * of valid entries - confirm against the code that fills this structure.
 */
struct nvme_probe_ctx {
	size_t count;
	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
	const char *names[NVME_MAX_CONTROLLERS];
	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
	const char *hostnqn;
};
100 
/* One element of g_skipped_nvme_ctrlrs: the transport ID of a controller to skip. */
struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id		trid;
	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);
108 
/*
 * Module-wide NVMe bdev options and their initial values.
 *
 * Uses visible in this file:
 *  - delay_cmd_submit is copied into the qpair options by bdev_nvme_create_qpair().
 *  - io_queue_requests is raised by bdev_nvme_create_qpair() to the larger of
 *    its current value and the controller's default, so it is not constant.
 *  - nvme_ioq_poll_period_us is the period passed when registering the
 *    per-poll-group I/O poller.
 */
static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
};
123 
/*
 * Hotplug poll period limits.
 * NOTE(review): units are presumably microseconds, judging by the `_us`
 * suffix of the variable initialised from the default - confirm.
 */
#define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL

/* File-scope hotplug and module-initialisation state. */
static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
static struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
134 
/*
 * Forward declarations of file-local (static) functions that are referenced
 * before their definitions.
 */
static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			   struct nvme_bdev_io *bio,
			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags);
static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				 struct nvme_bdev_io *bio,
				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			    struct nvme_bdev_io *bio,
			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags);
static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			      struct nvme_bdev_io *bio,
			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
		struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				 struct nvme_bdev_io *bio,
				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch,
			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
175 
/*
 * Namespace handler tables.  Each of the three tables below has the same
 * layout: slot 0 is NULL, slot 1 is the standard-namespace handler from this
 * file, slot 2 is the OCSSD handler.
 * NOTE(review): presumably indexed by the namespace type of an nvme_bdev_ns -
 * confirm at the call sites, and keep all three tables in the same order.
 */

/* Handler that populates (creates the bdev for) one namespace. */
typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);

static populate_namespace_fn g_populate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_populate_standard_namespace,
	bdev_ocssd_populate_namespace,
};

/* Handler that depopulates one namespace. */
typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns);
static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns);

static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_depopulate_standard_namespace,
	bdev_ocssd_depopulate_namespace,
};

/* Handler that writes the JSON configuration of one namespace. */
typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
		struct nvme_bdev_ns *nvme_ns);
static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
		struct nvme_bdev_ns *nvme_ns);

static config_json_namespace_fn g_config_json_namespace_fn[] = {
	NULL,
	nvme_ctrlr_config_json_standard_namespace,
	bdev_ocssd_namespace_config_json,
};
206 
207 struct spdk_nvme_qpair *
208 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
209 {
210 	struct nvme_io_channel *nvme_ch;
211 
212 	assert(ctrlr_io_ch != NULL);
213 
214 	nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
215 
216 	return nvme_ch->qpair;
217 }
218 
219 static int
220 bdev_nvme_get_ctx_size(void)
221 {
222 	return sizeof(struct nvme_bdev_io);
223 }
224 
/*
 * bdev module descriptor for the "nvme" module, registered below via
 * SPDK_BDEV_MODULE_REGISTER.  It wires up library init/fini, JSON config
 * output and the per-I/O context size.
 * NOTE(review): async_fini is set, so module_fini is presumably expected to
 * signal its completion asynchronously - confirm in bdev_nvme_library_fini().
 */
static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
235 
236 static void
237 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
238 {
239 	int rc;
240 
241 	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
242 	/*
243 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
244 	 * reconnect a qpair and we will stop getting a callback for this one.
245 	 */
246 	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
247 	if (rc != 0) {
248 		SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
249 	}
250 }
251 
252 static int
253 bdev_nvme_poll(void *arg)
254 {
255 	struct nvme_bdev_poll_group *group = arg;
256 	int64_t num_completions;
257 
258 	if (group->collect_spin_stat && group->start_ticks == 0) {
259 		group->start_ticks = spdk_get_ticks();
260 	}
261 
262 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
263 			  bdev_nvme_disconnected_qpair_cb);
264 	if (group->collect_spin_stat) {
265 		if (num_completions > 0) {
266 			if (group->end_ticks != 0) {
267 				group->spin_ticks += (group->end_ticks - group->start_ticks);
268 				group->end_ticks = 0;
269 			}
270 			group->start_ticks = 0;
271 		} else {
272 			group->end_ticks = spdk_get_ticks();
273 		}
274 	}
275 
276 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
277 }
278 
279 static int
280 bdev_nvme_poll_adminq(void *arg)
281 {
282 	int32_t rc;
283 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
284 
285 	assert(nvme_bdev_ctrlr != NULL);
286 
287 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr);
288 	if (rc < 0) {
289 		bdev_nvme_failover(nvme_bdev_ctrlr, false);
290 	}
291 
292 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
293 }
294 
295 static int
296 bdev_nvme_destruct(void *ctx)
297 {
298 	struct nvme_bdev *nvme_disk = ctx;
299 	struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns;
300 
301 	nvme_ns->bdev = NULL;
302 
303 	nvme_bdev_ns_detach(nvme_ns);
304 
305 	free(nvme_disk->disk.name);
306 	free(nvme_disk);
307 
308 	return 0;
309 }
310 
311 static int
312 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
313 		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
314 {
315 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
316 
317 	return 0;
318 }
319 
320 static int
321 bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch)
322 {
323 	struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr;
324 	struct spdk_nvme_io_qpair_opts opts;
325 	int rc;
326 
327 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
328 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
329 	opts.create_only = true;
330 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
331 	g_opts.io_queue_requests = opts.io_queue_requests;
332 
333 	nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
334 	if (nvme_ch->qpair == NULL) {
335 		return -1;
336 	}
337 
338 	assert(nvme_ch->group != NULL);
339 
340 	rc = spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair);
341 	if (rc != 0) {
342 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
343 		goto err;
344 	}
345 
346 	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, nvme_ch->qpair);
347 	if (rc != 0) {
348 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
349 		goto err;
350 	}
351 
352 	return 0;
353 
354 err:
355 	spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
356 
357 	return rc;
358 }
359 
360 static void
361 _bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
362 {
363 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
364 
365 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
366 	if (nvme_bdev_ctrlr->destruct_after_reset) {
367 		assert(nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct);
368 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
369 
370 		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_do_destruct,
371 				     nvme_bdev_ctrlr);
372 	} else {
373 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
374 	}
375 }
376 
377 static void
378 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
379 {
380 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
381 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
382 	struct spdk_bdev_io *bdev_io;
383 	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
384 
385 	/* A NULL ctx means success. */
386 	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
387 		status = SPDK_BDEV_IO_STATUS_FAILED;
388 	}
389 
390 	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
391 		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
392 		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
393 		spdk_bdev_io_complete(bdev_io, status);
394 	}
395 
396 	spdk_for_each_channel_continue(i, 0);
397 }
398 
399 static void
400 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
401 {
402 	/* we are using the for_each_channel cb_arg like a return code here. */
403 	/* If it's zero, we succeeded, otherwise, the reset failed. */
404 	void *cb_arg = NULL;
405 	struct nvme_bdev_ctrlr_trid *curr_trid;
406 
407 	if (rc) {
408 		cb_arg = (void *)0x1;
409 		SPDK_ERRLOG("Resetting controller failed.\n");
410 	} else {
411 		SPDK_NOTICELOG("Resetting controller successful.\n");
412 	}
413 
414 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
415 	nvme_bdev_ctrlr->resetting = false;
416 	nvme_bdev_ctrlr->failover_in_progress = false;
417 
418 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
419 	assert(curr_trid != NULL);
420 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
421 
422 	curr_trid->is_failed = cb_arg != NULL ? true : false;
423 
424 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
425 		/* Destruct ctrlr after clearing pending resets. */
426 		nvme_bdev_ctrlr->destruct_after_reset = true;
427 	}
428 
429 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
430 
431 	/* Make sure we clear any pending resets before returning. */
432 	spdk_for_each_channel(nvme_bdev_ctrlr,
433 			      _bdev_nvme_complete_pending_resets,
434 			      cb_arg,
435 			      _bdev_nvme_check_pending_destruct);
436 }
437 
438 static void
439 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
440 {
441 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
442 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
443 	int rc = SPDK_BDEV_IO_STATUS_SUCCESS;
444 
445 	if (status) {
446 		rc = SPDK_BDEV_IO_STATUS_FAILED;
447 	}
448 	if (bio) {
449 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), rc);
450 	}
451 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
452 }
453 
/*
 * Per-channel step of a reset: re-create this channel's I/O qpair and pass
 * the result on to the channel iterator.
 */
static void
_bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
{
	struct nvme_io_channel *nvme_ch =
		spdk_io_channel_get_ctx(spdk_io_channel_iter_get_channel(i));

	spdk_for_each_channel_continue(i, bdev_nvme_create_qpair(nvme_ch));
}
465 
466 static void
467 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
468 {
469 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
470 	struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
471 	int rc;
472 
473 	if (status) {
474 		rc = status;
475 		goto err;
476 	}
477 
478 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
479 	if (rc != 0) {
480 		goto err;
481 	}
482 
483 	/* Recreate all of the I/O queue pairs */
484 	spdk_for_each_channel(nvme_bdev_ctrlr,
485 			      _bdev_nvme_reset_create_qpair,
486 			      bio,
487 			      _bdev_nvme_reset_create_qpairs_done);
488 	return;
489 
490 err:
491 	if (bio) {
492 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
493 	}
494 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
495 }
496 
497 static void
498 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
499 {
500 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
501 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
502 	int rc;
503 
504 	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
505 	if (!rc) {
506 		nvme_ch->qpair = NULL;
507 	}
508 
509 	spdk_for_each_channel_continue(i, rc);
510 }
511 
512 static int
513 _bdev_nvme_reset_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
514 {
515 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
516 	if (nvme_bdev_ctrlr->destruct) {
517 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
518 		return -EBUSY;
519 	}
520 
521 	if (nvme_bdev_ctrlr->resetting) {
522 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
523 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
524 		return -EAGAIN;
525 	}
526 
527 	nvme_bdev_ctrlr->resetting = true;
528 
529 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
530 	return 0;
531 }
532 
533 static int
534 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
535 {
536 	int rc;
537 
538 	rc = _bdev_nvme_reset_start(nvme_bdev_ctrlr);
539 	if (rc == 0) {
540 		/* First, delete all NVMe I/O queue pairs. */
541 		spdk_for_each_channel(nvme_bdev_ctrlr,
542 				      _bdev_nvme_reset_destroy_qpair,
543 				      NULL,
544 				      _bdev_nvme_reset_ctrlr);
545 	}
546 
547 	return rc;
548 }
549 
550 static int
551 bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio)
552 {
553 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
554 	int rc;
555 
556 	rc = _bdev_nvme_reset_start(nvme_ch->ctrlr);
557 	if (rc == 0) {
558 		/* First, delete all NVMe I/O queue pairs. */
559 		spdk_for_each_channel(nvme_ch->ctrlr,
560 				      _bdev_nvme_reset_destroy_qpair,
561 				      bio,
562 				      _bdev_nvme_reset_ctrlr);
563 	} else if (rc == -EBUSY) {
564 		/* Don't bother resetting if the controller is in the process of being destructed. */
565 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
566 	} else if (rc == -EAGAIN) {
567 		/*
568 		 * Reset call is queued only if it is from the app framework. This is on purpose so that
569 		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
570 		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
571 		 */
572 		TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link);
573 	} else {
574 		return rc;
575 	}
576 
577 	return 0;
578 }
579 
/*
 * Try to claim the controller for a failover/reset and, if an alternate path
 * exists, switch the controller to it.
 *
 * \param nvme_bdev_ctrlr controller to fail over.
 * \param remove when a switch to the next trid happens: true frees the old
 *        trid entry, false moves it to the tail of the trid list.
 *
 * \return 0 when the claim succeeded (the caller then runs the reset
 *         sequence); -EBUSY when the controller is being destructed, or when
 *         a reset is already running and either no alternate trid exists or
 *         a failover is already in progress; -EAGAIN when a reset is already
 *         running, an alternate trid exists and no failover is in progress.
 *
 * All state is examined and modified under the controller mutex.
 */
static int
_bdev_nvme_failover_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
{
	struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
	int rc;

	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
	if (nvme_bdev_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		/* Don't bother resetting if the controller is in the process of being destructed. */
		return -EBUSY;
	}

	/* The head of the trid list is expected to be the currently connected path. */
	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
	assert(curr_trid);
	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
	next_trid = TAILQ_NEXT(curr_trid, link);

	if (nvme_bdev_ctrlr->resetting) {
		if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) {
			rc = -EAGAIN;
		} else {
			rc = -EBUSY;
		}
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return rc;
	}

	nvme_bdev_ctrlr->resetting = true;
	curr_trid->is_failed = true;

	/* With no alternate path this degenerates into a plain reset of the current path. */
	if (next_trid) {
		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
			       curr_trid->trid.trsvcid,	next_trid->trid.traddr, next_trid->trid.trsvcid);

		nvme_bdev_ctrlr->failover_in_progress = true;
		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
		/* NOTE(review): the result is only checked by assert(), i.e. ignored when asserts are compiled out. */
		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
		assert(rc == 0);
		/* Unlinking the old head makes next_trid the new head of the list. */
		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link);
		if (!remove) {
			/** Shuffle the old trid to the end of the list and use the new one.
			 * Allows for round robin through multiple connections.
			 */
			TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link);
		} else {
			free(curr_trid);
		}
	}

	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
	return 0;
}
637 
638 static int
639 bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
640 {
641 	int rc;
642 
643 	rc = _bdev_nvme_failover_start(nvme_bdev_ctrlr, remove);
644 	if (rc == 0) {
645 		/* First, delete all NVMe I/O queue pairs. */
646 		spdk_for_each_channel(nvme_bdev_ctrlr,
647 				      _bdev_nvme_reset_destroy_qpair,
648 				      NULL,
649 				      _bdev_nvme_reset_ctrlr);
650 	} else if (rc != -EBUSY) {
651 		return rc;
652 	}
653 
654 	return 0;
655 }
656 
/* Forward declaration; used by _bdev_nvme_submit_request() for both UNMAP and WRITE_ZEROES requests. */
static int
bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		uint64_t offset_blocks,
		uint64_t num_blocks);
662 
663 static void
664 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
665 		     bool success)
666 {
667 	struct spdk_bdev *bdev = bdev_io->bdev;
668 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
669 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
670 	struct nvme_bdev_ns *nvme_ns;
671 	struct spdk_nvme_qpair *qpair;
672 	int ret;
673 
674 	if (!success) {
675 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
676 		return;
677 	}
678 
679 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
680 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
681 		return;
682 	}
683 
684 	ret = bdev_nvme_readv(nvme_ns->ns,
685 			      qpair,
686 			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
687 			      bdev_io->u.bdev.iovs,
688 			      bdev_io->u.bdev.iovcnt,
689 			      bdev_io->u.bdev.md_buf,
690 			      bdev_io->u.bdev.num_blocks,
691 			      bdev_io->u.bdev.offset_blocks,
692 			      bdev->dif_check_flags);
693 
694 	if (spdk_likely(ret == 0)) {
695 		return;
696 	} else if (ret == -ENOMEM) {
697 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
698 	} else {
699 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
700 	}
701 }
702 
703 static int
704 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
705 {
706 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
707 	struct spdk_bdev *bdev = bdev_io->bdev;
708 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
709 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
710 	struct nvme_bdev_io *nbdev_io_to_abort;
711 	struct nvme_bdev_ns *nvme_ns;
712 	struct spdk_nvme_qpair *qpair;
713 
714 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
715 		return -1;
716 	}
717 
718 	switch (bdev_io->type) {
719 	case SPDK_BDEV_IO_TYPE_READ:
720 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
721 			return bdev_nvme_readv(nvme_ns->ns,
722 					       qpair,
723 					       nbdev_io,
724 					       bdev_io->u.bdev.iovs,
725 					       bdev_io->u.bdev.iovcnt,
726 					       bdev_io->u.bdev.md_buf,
727 					       bdev_io->u.bdev.num_blocks,
728 					       bdev_io->u.bdev.offset_blocks,
729 					       bdev->dif_check_flags);
730 		} else {
731 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
732 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
733 			return 0;
734 		}
735 
736 	case SPDK_BDEV_IO_TYPE_WRITE:
737 		return bdev_nvme_writev(nvme_ns->ns,
738 					qpair,
739 					nbdev_io,
740 					bdev_io->u.bdev.iovs,
741 					bdev_io->u.bdev.iovcnt,
742 					bdev_io->u.bdev.md_buf,
743 					bdev_io->u.bdev.num_blocks,
744 					bdev_io->u.bdev.offset_blocks,
745 					bdev->dif_check_flags);
746 
747 	case SPDK_BDEV_IO_TYPE_COMPARE:
748 		return bdev_nvme_comparev(nvme_ns->ns,
749 					  qpair,
750 					  nbdev_io,
751 					  bdev_io->u.bdev.iovs,
752 					  bdev_io->u.bdev.iovcnt,
753 					  bdev_io->u.bdev.md_buf,
754 					  bdev_io->u.bdev.num_blocks,
755 					  bdev_io->u.bdev.offset_blocks,
756 					  bdev->dif_check_flags);
757 
758 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
759 		return bdev_nvme_comparev_and_writev(nvme_ns->ns,
760 						     qpair,
761 						     nbdev_io,
762 						     bdev_io->u.bdev.iovs,
763 						     bdev_io->u.bdev.iovcnt,
764 						     bdev_io->u.bdev.fused_iovs,
765 						     bdev_io->u.bdev.fused_iovcnt,
766 						     bdev_io->u.bdev.md_buf,
767 						     bdev_io->u.bdev.num_blocks,
768 						     bdev_io->u.bdev.offset_blocks,
769 						     bdev->dif_check_flags);
770 
771 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
772 		return bdev_nvme_unmap(nvme_ns->ns,
773 				       qpair,
774 				       nbdev_io,
775 				       bdev_io->u.bdev.offset_blocks,
776 				       bdev_io->u.bdev.num_blocks);
777 
778 	case SPDK_BDEV_IO_TYPE_UNMAP:
779 		return bdev_nvme_unmap(nvme_ns->ns,
780 				       qpair,
781 				       nbdev_io,
782 				       bdev_io->u.bdev.offset_blocks,
783 				       bdev_io->u.bdev.num_blocks);
784 
785 	case SPDK_BDEV_IO_TYPE_RESET:
786 		return bdev_nvme_reset(nvme_ch, nbdev_io);
787 
788 	case SPDK_BDEV_IO_TYPE_FLUSH:
789 		return bdev_nvme_flush(nvme_ns->ns,
790 				       qpair,
791 				       nbdev_io,
792 				       bdev_io->u.bdev.offset_blocks,
793 				       bdev_io->u.bdev.num_blocks);
794 
795 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
796 		return bdev_nvme_admin_passthru(nvme_ch,
797 						nbdev_io,
798 						&bdev_io->u.nvme_passthru.cmd,
799 						bdev_io->u.nvme_passthru.buf,
800 						bdev_io->u.nvme_passthru.nbytes);
801 
802 	case SPDK_BDEV_IO_TYPE_NVME_IO:
803 		return bdev_nvme_io_passthru(nvme_ns->ns,
804 					     qpair,
805 					     nbdev_io,
806 					     &bdev_io->u.nvme_passthru.cmd,
807 					     bdev_io->u.nvme_passthru.buf,
808 					     bdev_io->u.nvme_passthru.nbytes);
809 
810 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
811 		return bdev_nvme_io_passthru_md(nvme_ns->ns,
812 						qpair,
813 						nbdev_io,
814 						&bdev_io->u.nvme_passthru.cmd,
815 						bdev_io->u.nvme_passthru.buf,
816 						bdev_io->u.nvme_passthru.nbytes,
817 						bdev_io->u.nvme_passthru.md_buf,
818 						bdev_io->u.nvme_passthru.md_len);
819 
820 	case SPDK_BDEV_IO_TYPE_ABORT:
821 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
822 		return bdev_nvme_abort(nvme_ch,
823 				       nbdev_io,
824 				       nbdev_io_to_abort);
825 
826 	default:
827 		return -EINVAL;
828 	}
829 	return 0;
830 }
831 
832 static void
833 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
834 {
835 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
836 
837 	if (spdk_unlikely(rc != 0)) {
838 		if (rc == -ENOMEM) {
839 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
840 		} else {
841 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
842 		}
843 	}
844 }
845 
846 static bool
847 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
848 {
849 	struct nvme_bdev *nbdev = ctx;
850 	struct nvme_bdev_ns *nvme_ns;
851 	struct spdk_nvme_ns *ns;
852 	struct spdk_nvme_ctrlr *ctrlr;
853 	const struct spdk_nvme_ctrlr_data *cdata;
854 
855 	nvme_ns = nvme_bdev_to_bdev_ns(nbdev);
856 	assert(nvme_ns != NULL);
857 	ns = nvme_ns->ns;
858 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
859 
860 	switch (io_type) {
861 	case SPDK_BDEV_IO_TYPE_READ:
862 	case SPDK_BDEV_IO_TYPE_WRITE:
863 	case SPDK_BDEV_IO_TYPE_RESET:
864 	case SPDK_BDEV_IO_TYPE_FLUSH:
865 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
866 	case SPDK_BDEV_IO_TYPE_NVME_IO:
867 	case SPDK_BDEV_IO_TYPE_ABORT:
868 		return true;
869 
870 	case SPDK_BDEV_IO_TYPE_COMPARE:
871 		return spdk_nvme_ns_supports_compare(ns);
872 
873 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
874 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
875 
876 	case SPDK_BDEV_IO_TYPE_UNMAP:
877 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
878 		return cdata->oncs.dsm;
879 
880 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
881 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
882 		/*
883 		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
884 		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
885 		 */
886 		if (cdata->oncs.dsm &&
887 		    spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) ==
888 		    SPDK_NVME_DEALLOC_READ_00) {
889 			return true;
890 		}
891 		/*
892 		 * The NVMe controller write_zeroes function is currently not used by our driver.
893 		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
894 		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
895 		 */
896 		return false;
897 
898 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
899 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
900 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
901 			return true;
902 		}
903 		return false;
904 
905 	default:
906 		return false;
907 	}
908 }
909 
910 static int
911 bdev_nvme_create_cb(void *io_device, void *ctx_buf)
912 {
913 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
914 	struct nvme_io_channel *nvme_ch = ctx_buf;
915 	struct spdk_io_channel *pg_ch = NULL;
916 	int rc;
917 
918 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
919 		rc = bdev_ocssd_create_io_channel(nvme_ch);
920 		if (rc != 0) {
921 			return rc;
922 		}
923 	}
924 
925 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
926 	if (!pg_ch) {
927 		rc = -1;
928 		goto err_pg_ch;
929 	}
930 
931 	nvme_ch->group = spdk_io_channel_get_ctx(pg_ch);
932 
933 #ifdef SPDK_CONFIG_VTUNE
934 	nvme_ch->group->collect_spin_stat = true;
935 #else
936 	nvme_ch->group->collect_spin_stat = false;
937 #endif
938 
939 	TAILQ_INIT(&nvme_ch->pending_resets);
940 
941 	nvme_ch->ctrlr = nvme_bdev_ctrlr;
942 
943 	rc = bdev_nvme_create_qpair(nvme_ch);
944 	if (rc != 0) {
945 		goto err_qpair;
946 	}
947 
948 	return 0;
949 
950 err_qpair:
951 	spdk_put_io_channel(pg_ch);
952 err_pg_ch:
953 	if (nvme_ch->ocssd_ch) {
954 		bdev_ocssd_destroy_io_channel(nvme_ch);
955 	}
956 
957 	return rc;
958 }
959 
960 static void
961 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
962 {
963 	struct nvme_io_channel *nvme_ch = ctx_buf;
964 
965 	assert(nvme_ch->group != NULL);
966 
967 	if (nvme_ch->ocssd_ch != NULL) {
968 		bdev_ocssd_destroy_io_channel(nvme_ch);
969 	}
970 
971 	spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
972 
973 	spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group));
974 }
975 
976 static int
977 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
978 {
979 	struct nvme_bdev_poll_group *group = ctx_buf;
980 
981 	group->group = spdk_nvme_poll_group_create(group, NULL);
982 	if (group->group == NULL) {
983 		return -1;
984 	}
985 
986 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
987 
988 	if (group->poller == NULL) {
989 		spdk_nvme_poll_group_destroy(group->group);
990 		return -1;
991 	}
992 
993 	return 0;
994 }
995 
996 static void
997 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
998 {
999 	struct nvme_bdev_poll_group *group = ctx_buf;
1000 
1001 	spdk_poller_unregister(&group->poller);
1002 	if (spdk_nvme_poll_group_destroy(group->group)) {
1003 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.");
1004 		assert(false);
1005 	}
1006 }
1007 
1008 static struct spdk_io_channel *
1009 bdev_nvme_get_io_channel(void *ctx)
1010 {
1011 	struct nvme_bdev *nvme_bdev = ctx;
1012 
1013 	return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr);
1014 }
1015 
1016 static void *
1017 bdev_nvme_get_module_ctx(void *ctx)
1018 {
1019 	struct nvme_bdev *nvme_bdev = ctx;
1020 
1021 	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
1022 }
1023 
1024 static int
1025 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
1026 {
1027 	struct nvme_bdev *nvme_bdev = ctx;
1028 	struct nvme_bdev_ns *nvme_ns;
1029 	struct spdk_nvme_ns *ns;
1030 	struct spdk_nvme_ctrlr *ctrlr;
1031 	const struct spdk_nvme_ctrlr_data *cdata;
1032 	const struct spdk_nvme_transport_id *trid;
1033 	union spdk_nvme_vs_register vs;
1034 	union spdk_nvme_csts_register csts;
1035 	char buf[128];
1036 
1037 	nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev);
1038 	assert(nvme_ns != NULL);
1039 	ns = nvme_ns->ns;
1040 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1041 
1042 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1043 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
1044 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
1045 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1046 
1047 	spdk_json_write_named_object_begin(w, "nvme");
1048 
1049 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1050 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
1051 	}
1052 
1053 	spdk_json_write_named_object_begin(w, "trid");
1054 
1055 	nvme_bdev_dump_trid_json(trid, w);
1056 
1057 	spdk_json_write_object_end(w);
1058 
1059 #ifdef SPDK_CONFIG_NVME_CUSE
1060 	size_t cuse_name_size = 128;
1061 	char cuse_name[cuse_name_size];
1062 
1063 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
1064 					    cuse_name, &cuse_name_size);
1065 	if (rc == 0) {
1066 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
1067 	}
1068 #endif
1069 
1070 	spdk_json_write_named_object_begin(w, "ctrlr_data");
1071 
1072 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
1073 
1074 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
1075 	spdk_str_trim(buf);
1076 	spdk_json_write_named_string(w, "model_number", buf);
1077 
1078 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
1079 	spdk_str_trim(buf);
1080 	spdk_json_write_named_string(w, "serial_number", buf);
1081 
1082 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
1083 	spdk_str_trim(buf);
1084 	spdk_json_write_named_string(w, "firmware_revision", buf);
1085 
1086 	if (cdata->subnqn[0] != '\0') {
1087 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
1088 	}
1089 
1090 	spdk_json_write_named_object_begin(w, "oacs");
1091 
1092 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
1093 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
1094 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
1095 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
1096 
1097 	spdk_json_write_object_end(w);
1098 
1099 	spdk_json_write_object_end(w);
1100 
1101 	spdk_json_write_named_object_begin(w, "vs");
1102 
1103 	spdk_json_write_name(w, "nvme_version");
1104 	if (vs.bits.ter) {
1105 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
1106 	} else {
1107 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
1108 	}
1109 
1110 	spdk_json_write_object_end(w);
1111 
1112 	spdk_json_write_named_object_begin(w, "csts");
1113 
1114 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
1115 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
1116 
1117 	spdk_json_write_object_end(w);
1118 
1119 	spdk_json_write_named_object_begin(w, "ns_data");
1120 
1121 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
1122 
1123 	spdk_json_write_object_end(w);
1124 
1125 	if (cdata->oacs.security) {
1126 		spdk_json_write_named_object_begin(w, "security");
1127 
1128 		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);
1129 
1130 		spdk_json_write_object_end(w);
1131 	}
1132 
1133 	spdk_json_write_object_end(w);
1134 
1135 	return 0;
1136 }
1137 
/*
 * bdev fn_table hook for per-bdev configuration output.  Intentionally
 * empty: NVMe bdevs have no per-bdev configuration to write.
 */
static void
bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* No config per bdev needed */
}
1143 
1144 static uint64_t
1145 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
1146 {
1147 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
1148 	struct nvme_bdev_poll_group *group = nvme_ch->group;
1149 	uint64_t spin_time;
1150 
1151 	if (!group || !group->collect_spin_stat) {
1152 		return 0;
1153 	}
1154 
1155 	if (group->end_ticks != 0) {
1156 		group->spin_ticks += (group->end_ticks - group->start_ticks);
1157 		group->end_ticks = 0;
1158 	}
1159 
1160 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1161 	group->start_ticks = 0;
1162 	group->spin_ticks = 0;
1163 
1164 	return spin_time;
1165 }
1166 
1167 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1168 	.destruct		= bdev_nvme_destruct,
1169 	.submit_request		= bdev_nvme_submit_request,
1170 	.io_type_supported	= bdev_nvme_io_type_supported,
1171 	.get_io_channel		= bdev_nvme_get_io_channel,
1172 	.dump_info_json		= bdev_nvme_dump_info_json,
1173 	.write_config_json	= bdev_nvme_write_config_json,
1174 	.get_spin_time		= bdev_nvme_get_spin_time,
1175 	.get_module_ctx		= bdev_nvme_get_module_ctx,
1176 };
1177 
1178 static int
1179 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
1180 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
1181 		 uint32_t prchk_flags, void *ctx)
1182 {
1183 	const struct spdk_uuid		*uuid;
1184 	const struct spdk_nvme_ctrlr_data *cdata;
1185 	const struct spdk_nvme_ns_data	*nsdata;
1186 	int				rc;
1187 
1188 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1189 
1190 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
1191 	if (!disk->name) {
1192 		return -ENOMEM;
1193 	}
1194 	disk->product_name = "NVMe disk";
1195 
1196 	disk->write_cache = 0;
1197 	if (cdata->vwc.present) {
1198 		/* Enable if the Volatile Write Cache exists */
1199 		disk->write_cache = 1;
1200 	}
1201 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1202 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1203 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1204 
1205 	uuid = spdk_nvme_ns_get_uuid(ns);
1206 	if (uuid != NULL) {
1207 		disk->uuid = *uuid;
1208 	}
1209 
1210 	nsdata = spdk_nvme_ns_get_data(ns);
1211 
1212 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
1213 	if (disk->md_len != 0) {
1214 		disk->md_interleave = nsdata->flbas.extended;
1215 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1216 		if (disk->dif_type != SPDK_DIF_DISABLE) {
1217 			disk->dif_is_head_of_md = nsdata->dps.md_start;
1218 			disk->dif_check_flags = prchk_flags;
1219 		}
1220 	}
1221 
1222 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
1223 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
1224 		disk->acwu = 0;
1225 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1226 		disk->acwu = nsdata->nacwu;
1227 	} else {
1228 		disk->acwu = cdata->acwu;
1229 	}
1230 
1231 	disk->ctxt = ctx;
1232 	disk->fn_table = &nvmelib_fn_table;
1233 	disk->module = &nvme_if;
1234 	rc = spdk_bdev_register(disk);
1235 	if (rc) {
1236 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
1237 		free(disk->name);
1238 		return rc;
1239 	}
1240 
1241 	return 0;
1242 }
1243 
1244 static int
1245 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns)
1246 {
1247 	struct nvme_bdev *bdev;
1248 	int rc;
1249 
1250 	bdev = calloc(1, sizeof(*bdev));
1251 	if (!bdev) {
1252 		SPDK_ERRLOG("bdev calloc() failed\n");
1253 		return -ENOMEM;
1254 	}
1255 
1256 	bdev->nvme_ns = nvme_ns;
1257 	bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL;
1258 
1259 	rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr,
1260 			      nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev);
1261 	if (rc != 0) {
1262 		SPDK_ERRLOG("Failed to create NVMe disk\n");
1263 		free(bdev);
1264 		return rc;
1265 	}
1266 
1267 	nvme_ns->ref++;
1268 	nvme_ns->bdev = bdev;
1269 
1270 	return 0;
1271 }
1272 
1273 static void
1274 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1275 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
1276 {
1277 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1278 	struct spdk_nvme_ns	*ns;
1279 	int			rc = 0;
1280 
1281 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1282 	if (!ns) {
1283 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
1284 		rc = -EINVAL;
1285 		goto done;
1286 	}
1287 
1288 	nvme_ns->ns = ns;
1289 	nvme_ns->ref = 1;
1290 
1291 	rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns);
1292 done:
1293 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1294 }
1295 
1296 static bool
1297 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1298 		 struct spdk_nvme_ctrlr_opts *opts)
1299 {
1300 	struct nvme_probe_skip_entry *entry;
1301 
1302 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1303 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1304 			return false;
1305 		}
1306 	}
1307 
1308 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1309 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1310 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1311 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1312 
1313 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1314 
1315 	return true;
1316 }
1317 
/*
 * Completion callback for an abort command issued from timeout_cb().
 * If the abort itself failed, escalate to a controller reset.
 */
static void
nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev_ctrlr *bdev_ctrlr = ctx;

	if (!spdk_nvme_cpl_is_error(cpl)) {
		return;
	}

	SPDK_WARNLOG("Abort failed. Resetting controller.\n");
	_bdev_nvme_reset(bdev_ctrlr);
}
1328 
1329 static void
1330 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1331 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1332 {
1333 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg;
1334 	union spdk_nvme_csts_register csts;
1335 	int rc;
1336 
1337 	assert(nvme_bdev_ctrlr->ctrlr == ctrlr);
1338 
1339 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1340 
1341 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
1342 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
1343 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
1344 	 * completion recursively.
1345 	 */
1346 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
1347 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1348 		if (csts.bits.cfs) {
1349 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1350 			_bdev_nvme_reset(nvme_bdev_ctrlr);
1351 			return;
1352 		}
1353 	}
1354 
1355 	switch (g_opts.action_on_timeout) {
1356 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1357 		if (qpair) {
1358 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1359 						       nvme_abort_cpl, nvme_bdev_ctrlr);
1360 			if (rc == 0) {
1361 				return;
1362 			}
1363 
1364 			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
1365 		}
1366 
1367 	/* FALLTHROUGH */
1368 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1369 		_bdev_nvme_reset(nvme_bdev_ctrlr);
1370 		break;
1371 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1372 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1373 		break;
1374 	default:
1375 		SPDK_ERRLOG("An invalid timeout action value is found.\n");
1376 		break;
1377 	}
1378 }
1379 
/*
 * Called once a namespace has been depopulated; detaches the namespace via
 * nvme_bdev_ns_detach().
 */
void
nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ns *nvme_ns)
{
	nvme_bdev_ns_detach(nvme_ns);
}
1385 
1386 static void
1387 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns)
1388 {
1389 	struct nvme_bdev *bdev;
1390 
1391 	bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1392 	if (bdev != NULL) {
1393 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1394 	}
1395 
1396 	nvme_ns->populated = false;
1397 
1398 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
1399 }
1400 
1401 static void
1402 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns,
1403 			      struct nvme_async_probe_ctx *ctx)
1404 {
1405 	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
1406 }
1407 
1408 static void
1409 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns)
1410 {
1411 	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
1412 }
1413 
1414 void
1415 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1416 				   struct nvme_bdev_ns *nvme_ns, int rc)
1417 {
1418 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_ns->ctrlr;
1419 
1420 	assert(nvme_bdev_ctrlr != NULL);
1421 
1422 	if (rc == 0) {
1423 		nvme_ns->populated = true;
1424 		pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1425 		nvme_bdev_ctrlr->ref++;
1426 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1427 	} else {
1428 		memset(nvme_ns, 0, sizeof(*nvme_ns));
1429 	}
1430 
1431 	if (ctx) {
1432 		ctx->populates_in_progress--;
1433 		if (ctx->populates_in_progress == 0) {
1434 			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
1435 		}
1436 	}
1437 }
1438 
1439 static void
1440 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1441 			       struct nvme_async_probe_ctx *ctx)
1442 {
1443 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1444 	struct nvme_bdev_ns	*nvme_ns;
1445 	struct spdk_nvme_ns	*ns;
1446 	struct nvme_bdev	*bdev;
1447 	uint32_t		i;
1448 	int			rc;
1449 	uint64_t		num_sectors;
1450 	bool			ns_is_active;
1451 
1452 	if (ctx) {
1453 		/* Initialize this count to 1 to handle the populate functions
1454 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1455 		 */
1456 		ctx->populates_in_progress = 1;
1457 	}
1458 
1459 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1460 		uint32_t	nsid = i + 1;
1461 
1462 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
1463 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1464 
1465 		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) {
1466 			/* NS is still there but attributes may have changed */
1467 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1468 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
1469 			bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1470 			assert(bdev != NULL);
1471 			if (bdev->disk.blockcnt != num_sectors) {
1472 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
1473 					       nsid,
1474 					       bdev->disk.name,
1475 					       bdev->disk.blockcnt,
1476 					       num_sectors);
1477 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1478 				if (rc != 0) {
1479 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1480 						    bdev->disk.name, rc);
1481 				}
1482 			}
1483 		}
1484 
1485 		if (!nvme_ns->populated && ns_is_active) {
1486 			nvme_ns->id = nsid;
1487 			nvme_ns->ctrlr = nvme_bdev_ctrlr;
1488 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1489 				nvme_ns->type = NVME_BDEV_NS_OCSSD;
1490 			} else {
1491 				nvme_ns->type = NVME_BDEV_NS_STANDARD;
1492 			}
1493 
1494 			nvme_ns->bdev = NULL;
1495 
1496 			if (ctx) {
1497 				ctx->populates_in_progress++;
1498 			}
1499 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx);
1500 		}
1501 
1502 		if (nvme_ns->populated && !ns_is_active) {
1503 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1504 		}
1505 	}
1506 
1507 	if (ctx) {
1508 		/* Decrement this count now that the loop is over to account
1509 		 * for the one we started with.  If the count is then 0, we
1510 		 * know any populate_namespace functions completed immediately,
1511 		 * so we'll kick the callback here.
1512 		 */
1513 		ctx->populates_in_progress--;
1514 		if (ctx->populates_in_progress == 0) {
1515 			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
1516 		}
1517 	}
1518 
1519 }
1520 
1521 static void
1522 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1523 {
1524 	uint32_t i;
1525 	struct nvme_bdev_ns *nvme_ns;
1526 
1527 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1528 		uint32_t nsid = i + 1;
1529 
1530 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1531 		if (nvme_ns->populated) {
1532 			assert(nvme_ns->id == nsid);
1533 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1534 		}
1535 	}
1536 }
1537 
1538 static void
1539 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1540 {
1541 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1542 	union spdk_nvme_async_event_completion	event;
1543 
1544 	if (spdk_nvme_cpl_is_error(cpl)) {
1545 		SPDK_WARNLOG("AER request execute failed");
1546 		return;
1547 	}
1548 
1549 	event.raw = cpl->cdw0;
1550 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1551 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1552 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1553 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1554 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1555 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1556 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1557 	}
1558 }
1559 
1560 static int
1561 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
1562 		       const char *name,
1563 		       const struct spdk_nvme_transport_id *trid,
1564 		       uint32_t prchk_flags,
1565 		       struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr)
1566 {
1567 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1568 	struct nvme_bdev_ctrlr_trid *trid_entry;
1569 	uint32_t i;
1570 	int rc;
1571 
1572 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1573 	if (nvme_bdev_ctrlr == NULL) {
1574 		SPDK_ERRLOG("Failed to allocate device struct\n");
1575 		return -ENOMEM;
1576 	}
1577 
1578 	rc = pthread_mutex_init(&nvme_bdev_ctrlr->mutex, NULL);
1579 	if (rc != 0) {
1580 		goto err_init_mutex;
1581 	}
1582 
1583 	TAILQ_INIT(&nvme_bdev_ctrlr->trids);
1584 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1585 	if (nvme_bdev_ctrlr->num_ns != 0) {
1586 		nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1587 		if (!nvme_bdev_ctrlr->namespaces) {
1588 			SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
1589 			rc = -ENOMEM;
1590 			goto err_alloc_namespaces;
1591 		}
1592 	}
1593 
1594 	trid_entry = calloc(1, sizeof(*trid_entry));
1595 	if (trid_entry == NULL) {
1596 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
1597 		rc = -ENOMEM;
1598 		goto err_alloc_trid;
1599 	}
1600 
1601 	trid_entry->trid = *trid;
1602 
1603 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1604 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1605 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1606 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1607 			rc = -ENOMEM;
1608 			goto err_alloc_namespace;
1609 		}
1610 	}
1611 
1612 	nvme_bdev_ctrlr->thread = spdk_get_thread();
1613 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1614 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1615 	nvme_bdev_ctrlr->ref = 1;
1616 	nvme_bdev_ctrlr->connected_trid = &trid_entry->trid;
1617 	nvme_bdev_ctrlr->name = strdup(name);
1618 	if (nvme_bdev_ctrlr->name == NULL) {
1619 		rc = -ENOMEM;
1620 		goto err_alloc_name;
1621 	}
1622 
1623 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1624 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1625 		if (spdk_unlikely(rc != 0)) {
1626 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1627 			goto err_init_ocssd;
1628 		}
1629 	}
1630 
1631 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1632 
1633 	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
1634 				sizeof(struct nvme_io_channel),
1635 				name);
1636 
1637 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr,
1638 					       g_opts.nvme_adminq_poll_period_us);
1639 
1640 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1641 
1642 	if (g_opts.timeout_us > 0) {
1643 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1644 				timeout_cb, nvme_bdev_ctrlr);
1645 	}
1646 
1647 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1648 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr);
1649 
1650 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1651 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1652 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1653 		if (nvme_bdev_ctrlr->opal_dev == NULL) {
1654 			SPDK_ERRLOG("Failed to initialize Opal\n");
1655 		}
1656 	}
1657 
1658 	TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link);
1659 
1660 	if (_nvme_bdev_ctrlr != NULL) {
1661 		*_nvme_bdev_ctrlr = nvme_bdev_ctrlr;
1662 	}
1663 	return 0;
1664 
1665 err_init_ocssd:
1666 	free(nvme_bdev_ctrlr->name);
1667 err_alloc_name:
1668 err_alloc_namespace:
1669 	for (; i > 0; i--) {
1670 		free(nvme_bdev_ctrlr->namespaces[i - 1]);
1671 	}
1672 	free(trid_entry);
1673 err_alloc_trid:
1674 	free(nvme_bdev_ctrlr->namespaces);
1675 err_alloc_namespaces:
1676 	pthread_mutex_destroy(&nvme_bdev_ctrlr->mutex);
1677 err_init_mutex:
1678 	free(nvme_bdev_ctrlr);
1679 	return rc;
1680 }
1681 
1682 static void
1683 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1684 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1685 {
1686 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1687 	struct nvme_probe_ctx *ctx = cb_ctx;
1688 	char *name = NULL;
1689 	uint32_t prchk_flags = 0;
1690 	size_t i;
1691 	int rc;
1692 
1693 	if (ctx) {
1694 		for (i = 0; i < ctx->count; i++) {
1695 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1696 				prchk_flags = ctx->prchk_flags[i];
1697 				name = strdup(ctx->names[i]);
1698 				break;
1699 			}
1700 		}
1701 	} else {
1702 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1703 	}
1704 	if (!name) {
1705 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1706 		return;
1707 	}
1708 
1709 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
1710 
1711 	rc = nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, &nvme_bdev_ctrlr);
1712 	if (rc != 0) {
1713 		SPDK_ERRLOG("Failed to create new NVMe controller\n");
1714 		free(name);
1715 		return;
1716 	}
1717 
1718 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1719 
1720 	free(name);
1721 }
1722 
/*
 * Tear down a controller: depopulate all of its namespaces, then hand the
 * controller to nvme_bdev_ctrlr_destruct().
 */
static void
_nvme_bdev_ctrlr_destruct(void *ctx)
{
	struct nvme_bdev_ctrlr *doomed = ctx;

	nvme_ctrlr_depopulate_namespaces(doomed);
	nvme_bdev_ctrlr_destruct(doomed);
}
1731 
1732 static int
1733 _bdev_nvme_delete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool hotplug)
1734 {
1735 	struct nvme_probe_skip_entry *entry;
1736 
1737 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1738 
1739 	/* The controller's destruction was already started */
1740 	if (nvme_bdev_ctrlr->destruct) {
1741 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1742 		return 0;
1743 	}
1744 
1745 	if (!hotplug &&
1746 	    nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1747 		entry = calloc(1, sizeof(*entry));
1748 		if (!entry) {
1749 			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1750 			return -ENOMEM;
1751 		}
1752 		entry->trid = *nvme_bdev_ctrlr->connected_trid;
1753 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
1754 	}
1755 
1756 	nvme_bdev_ctrlr->destruct = true;
1757 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1758 
1759 	_nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1760 
1761 	return 0;
1762 }
1763 
/*
 * Controller removal callback: delete the controller through the hotplug
 * path of _bdev_nvme_delete().  The return value is not checked.
 */
static void
remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_bdev_ctrlr *removed = cb_ctx;

	_bdev_nvme_delete(removed, true);
}
1771 
1772 static int
1773 bdev_nvme_hotplug_probe(void *arg)
1774 {
1775 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
1776 		g_hotplug_probe_ctx = NULL;
1777 		spdk_poller_unregister(&g_hotplug_probe_poller);
1778 	}
1779 
1780 	return SPDK_POLLER_BUSY;
1781 }
1782 
1783 static int
1784 bdev_nvme_hotplug(void *arg)
1785 {
1786 	struct spdk_nvme_transport_id trid_pcie;
1787 
1788 	if (g_hotplug_probe_ctx) {
1789 		return SPDK_POLLER_BUSY;
1790 	}
1791 
1792 	memset(&trid_pcie, 0, sizeof(trid_pcie));
1793 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
1794 
1795 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
1796 			      hotplug_probe_cb, attach_cb, NULL);
1797 
1798 	if (g_hotplug_probe_ctx) {
1799 		assert(g_hotplug_probe_poller == NULL);
1800 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
1801 	}
1802 
1803 	return SPDK_POLLER_BUSY;
1804 }
1805 
1806 void
1807 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
1808 {
1809 	*opts = g_opts;
1810 }
1811 
1812 int
1813 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
1814 {
1815 	if (g_bdev_nvme_init_thread != NULL) {
1816 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1817 			return -EPERM;
1818 		}
1819 	}
1820 
1821 	g_opts = *opts;
1822 
1823 	return 0;
1824 }
1825 
1826 struct set_nvme_hotplug_ctx {
1827 	uint64_t period_us;
1828 	bool enabled;
1829 	spdk_msg_fn fn;
1830 	void *fn_ctx;
1831 };
1832 
1833 static void
1834 set_nvme_hotplug_period_cb(void *_ctx)
1835 {
1836 	struct set_nvme_hotplug_ctx *ctx = _ctx;
1837 
1838 	spdk_poller_unregister(&g_hotplug_poller);
1839 	if (ctx->enabled) {
1840 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
1841 	}
1842 
1843 	g_nvme_hotplug_poll_period_us = ctx->period_us;
1844 	g_nvme_hotplug_enabled = ctx->enabled;
1845 	if (ctx->fn) {
1846 		ctx->fn(ctx->fn_ctx);
1847 	}
1848 
1849 	free(ctx);
1850 }
1851 
1852 int
1853 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
1854 {
1855 	struct set_nvme_hotplug_ctx *ctx;
1856 
1857 	if (enabled == true && !spdk_process_is_primary()) {
1858 		return -EPERM;
1859 	}
1860 
1861 	ctx = calloc(1, sizeof(*ctx));
1862 	if (ctx == NULL) {
1863 		return -ENOMEM;
1864 	}
1865 
1866 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
1867 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
1868 	ctx->enabled = enabled;
1869 	ctx->fn = cb;
1870 	ctx->fn_ctx = cb_ctx;
1871 
1872 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
1873 	return 0;
1874 }
1875 
1876 static void
1877 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1878 {
1879 	if (ctx->cb_fn) {
1880 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1881 	}
1882 
1883 	ctx->namespaces_populated = true;
1884 	if (ctx->probe_done) {
1885 		/* The probe was already completed, so we need to free the context
1886 		 * here.  This can happen for cases like OCSSD, where we need to
1887 		 * send additional commands to the SSD after attach.
1888 		 */
1889 		free(ctx);
1890 	}
1891 }
1892 
1893 static void
1894 nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1895 				    struct nvme_async_probe_ctx *ctx)
1896 {
1897 	struct nvme_bdev_ns	*nvme_ns;
1898 	struct nvme_bdev	*nvme_bdev;
1899 	uint32_t		i, nsid;
1900 	size_t			j;
1901 
1902 	assert(nvme_bdev_ctrlr != NULL);
1903 
1904 	/*
1905 	 * Report the new bdevs that were created in this call.
1906 	 * There can be more than one bdev per NVMe controller.
1907 	 */
1908 	j = 0;
1909 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1910 		nsid = i + 1;
1911 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1912 		if (!nvme_ns->populated) {
1913 			continue;
1914 		}
1915 		assert(nvme_ns->id == nsid);
1916 		nvme_bdev = nvme_bdev_ns_to_bdev(nvme_ns);
1917 		if (nvme_bdev == NULL) {
1918 			assert(nvme_ns->type == NVME_BDEV_NS_OCSSD);
1919 			continue;
1920 		}
1921 		if (j < ctx->count) {
1922 			ctx->names[j] = nvme_bdev->disk.name;
1923 			j++;
1924 		} else {
1925 			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n",
1926 				    ctx->count);
1927 			populate_namespaces_cb(ctx, 0, -ERANGE);
1928 			return;
1929 		}
1930 	}
1931 
1932 	populate_namespaces_cb(ctx, j, 0);
1933 }
1934 
1935 static bool
1936 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
1937 {
1938 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
1939 
1940 	nsdata1 = spdk_nvme_ns_get_data(ns1);
1941 	nsdata2 = spdk_nvme_ns_get_data(ns2);
1942 
1943 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid));
1944 }
1945 
1946 static int
1947 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_ctrlr *new_ctrlr,
1948 		   struct spdk_nvme_transport_id *trid)
1949 {
1950 	uint32_t			i, nsid;
1951 	struct nvme_bdev_ns		*nvme_ns;
1952 	struct spdk_nvme_ns		*new_ns;
1953 	struct nvme_bdev_ctrlr_trid	*new_trid, *tmp_trid;
1954 	int				rc = 0;
1955 
1956 	assert(nvme_bdev_ctrlr != NULL);
1957 
1958 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1959 		SPDK_ERRLOG("PCIe failover is not supported.\n");
1960 		return -ENOTSUP;
1961 	}
1962 
1963 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1964 
1965 	/* Currently we only support failover to the same transport type. */
1966 	if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) {
1967 		rc = -EINVAL;
1968 		goto exit;
1969 	}
1970 
1971 	/* Currently we only support failover to the same NQN. */
1972 	if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
1973 		rc = -EINVAL;
1974 		goto exit;
1975 	}
1976 
1977 	/* Skip all the other checks if we've already registered this path. */
1978 	TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) {
1979 		if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) {
1980 			rc = -EEXIST;
1981 			goto exit;
1982 		}
1983 	}
1984 
1985 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) {
1986 		rc = -EINVAL;
1987 		goto exit;
1988 	}
1989 
1990 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1991 		nsid = i + 1;
1992 
1993 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
1994 		if (!nvme_ns->populated) {
1995 			continue;
1996 		}
1997 
1998 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
1999 		assert(new_ns != NULL);
2000 
2001 		if (bdev_nvme_compare_ns(nvme_ns->ns, new_ns) != 0) {
2002 			rc = -EINVAL;
2003 			goto exit;
2004 		}
2005 	}
2006 
2007 	new_trid = calloc(1, sizeof(*new_trid));
2008 	if (new_trid == NULL) {
2009 		rc = -ENOMEM;
2010 		goto exit;
2011 	}
2012 	new_trid->trid = *trid;
2013 	new_trid->is_failed = false;
2014 
2015 	TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) {
2016 		if (tmp_trid->is_failed) {
2017 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
2018 			goto exit;
2019 		}
2020 	}
2021 
2022 	TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link);
2023 
2024 exit:
2025 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2026 	return rc;
2027 }
2028 
2029 static void
2030 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2031 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2032 {
2033 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
2034 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
2035 	struct nvme_async_probe_ctx *ctx;
2036 	int rc;
2037 
2038 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2039 	ctx->ctrlr_attached = true;
2040 
2041 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
2042 	if (nvme_bdev_ctrlr) {
2043 		/* This is the case that a secondary path is added to an existing
2044 		 * nvme_bdev_ctrlr for failover. After checking if it can access the same
2045 		 * namespaces as the primary path, it is disconnected until failover occurs.
2046 		 */
2047 		rc = bdev_nvme_add_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid);
2048 
2049 		spdk_nvme_detach(ctrlr);
2050 		goto exit;
2051 	}
2052 
2053 	rc = nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags,
2054 				    &nvme_bdev_ctrlr);
2055 	if (rc) {
2056 		SPDK_ERRLOG("Failed to create new device\n");
2057 		goto exit;
2058 	}
2059 
2060 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
2061 	return;
2062 
2063 exit:
2064 	populate_namespaces_cb(ctx, 0, rc);
2065 }
2066 
2067 static int
2068 bdev_nvme_async_poll(void *arg)
2069 {
2070 	struct nvme_async_probe_ctx	*ctx = arg;
2071 	int				rc;
2072 
2073 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
2074 	if (spdk_unlikely(rc != -EAGAIN)) {
2075 		ctx->probe_done = true;
2076 		spdk_poller_unregister(&ctx->poller);
2077 		if (!ctx->ctrlr_attached) {
2078 			/* The probe is done, but no controller was attached.
2079 			 * That means we had a failure, so report -EIO back to
2080 			 * the caller (usually the RPC). populate_namespaces_cb()
2081 			 * will take care of freeing the nvme_async_probe_ctx.
2082 			 */
2083 			populate_namespaces_cb(ctx, 0, -EIO);
2084 		} else if (ctx->namespaces_populated) {
2085 			/* The namespaces for the attached controller were all
2086 			 * populated and the response was already sent to the
2087 			 * caller (usually the RPC).  So free the context here.
2088 			 */
2089 			free(ctx);
2090 		}
2091 	}
2092 
2093 	return SPDK_POLLER_BUSY;
2094 }
2095 
2096 int
2097 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
2098 		 struct spdk_nvme_host_id *hostid,
2099 		 const char *base_name,
2100 		 const char **names,
2101 		 uint32_t count,
2102 		 const char *hostnqn,
2103 		 uint32_t prchk_flags,
2104 		 spdk_bdev_create_nvme_fn cb_fn,
2105 		 void *cb_ctx,
2106 		 struct spdk_nvme_ctrlr_opts *opts)
2107 {
2108 	struct nvme_probe_skip_entry	*entry, *tmp;
2109 	struct nvme_async_probe_ctx	*ctx;
2110 
2111 	/* TODO expand this check to include both the host and target TRIDs.
2112 	 * Only if both are the same should we fail.
2113 	 */
2114 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
2115 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
2116 		return -EEXIST;
2117 	}
2118 
2119 	ctx = calloc(1, sizeof(*ctx));
2120 	if (!ctx) {
2121 		return -ENOMEM;
2122 	}
2123 	ctx->base_name = base_name;
2124 	ctx->names = names;
2125 	ctx->count = count;
2126 	ctx->cb_fn = cb_fn;
2127 	ctx->cb_ctx = cb_ctx;
2128 	ctx->prchk_flags = prchk_flags;
2129 	ctx->trid = *trid;
2130 
2131 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2132 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
2133 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2134 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2135 				free(entry);
2136 				break;
2137 			}
2138 		}
2139 	}
2140 
2141 	if (opts) {
2142 		memcpy(&ctx->opts, opts, sizeof(*opts));
2143 	} else {
2144 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
2145 	}
2146 
2147 	ctx->opts.transport_retry_count = g_opts.retry_count;
2148 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
2149 
2150 	if (hostnqn) {
2151 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
2152 	}
2153 
2154 	if (hostid->hostaddr[0] != '\0') {
2155 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
2156 	}
2157 
2158 	if (hostid->hostsvcid[0] != '\0') {
2159 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
2160 	}
2161 
2162 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
2163 	if (ctx->probe_ctx == NULL) {
2164 		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
2165 		free(ctx);
2166 		return -ENODEV;
2167 	}
2168 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
2169 
2170 	return 0;
2171 }
2172 
2173 int
2174 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid)
2175 {
2176 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
2177 	struct nvme_bdev_ctrlr_trid	*ctrlr_trid, *tmp_trid;
2178 
2179 	if (name == NULL) {
2180 		return -EINVAL;
2181 	}
2182 
2183 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
2184 	if (nvme_bdev_ctrlr == NULL) {
2185 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2186 		return -ENODEV;
2187 	}
2188 
2189 	/* case 1: remove the controller itself. */
2190 	if (trid == NULL) {
2191 		return _bdev_nvme_delete(nvme_bdev_ctrlr, false);
2192 	}
2193 
2194 	/* case 2: we are currently using the path to be removed. */
2195 	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
2196 		ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
2197 		assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid);
2198 		/* case 2A: the current path is the only path. */
2199 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
2200 			return _bdev_nvme_delete(nvme_bdev_ctrlr, false);
2201 		}
2202 
2203 		/* case 1B: there is an alternative path. */
2204 		return bdev_nvme_failover(nvme_bdev_ctrlr, true);
2205 	}
2206 	/* case 3: We are not using the specified path. */
2207 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) {
2208 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
2209 			TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
2210 			free(ctrlr_trid);
2211 			return 0;
2212 		}
2213 	}
2214 
2215 	/* case 3A: The address isn't even in the registered list. */
2216 	return -ENXIO;
2217 }
2218 
2219 static int
2220 bdev_nvme_library_init(void)
2221 {
2222 	g_bdev_nvme_init_thread = spdk_get_thread();
2223 
2224 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
2225 				bdev_nvme_poll_group_destroy_cb,
2226 				sizeof(struct nvme_bdev_poll_group),  "bdev_nvme_poll_groups");
2227 
2228 	return 0;
2229 }
2230 
2231 static void
2232 bdev_nvme_library_fini(void)
2233 {
2234 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
2235 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2236 
2237 	spdk_poller_unregister(&g_hotplug_poller);
2238 	free(g_hotplug_probe_ctx);
2239 
2240 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2241 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2242 		free(entry);
2243 	}
2244 
2245 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2246 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
2247 		pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
2248 		if (nvme_bdev_ctrlr->destruct) {
2249 			/* This controller's destruction was already started
2250 			 * before the application started shutting down
2251 			 */
2252 			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2253 			continue;
2254 		}
2255 		nvme_bdev_ctrlr->destruct = true;
2256 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2257 
2258 		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct,
2259 				     nvme_bdev_ctrlr);
2260 	}
2261 
2262 	g_bdev_nvme_module_finish = true;
2263 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2264 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2265 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
2266 		spdk_bdev_module_finish_done();
2267 		return;
2268 	}
2269 
2270 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2271 }
2272 
2273 static void
2274 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
2275 {
2276 	struct spdk_bdev *bdev = bdev_io->bdev;
2277 	struct spdk_dif_ctx dif_ctx;
2278 	struct spdk_dif_error err_blk = {};
2279 	int rc;
2280 
2281 	rc = spdk_dif_ctx_init(&dif_ctx,
2282 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2283 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2284 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2285 	if (rc != 0) {
2286 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2287 		return;
2288 	}
2289 
2290 	if (bdev->md_interleave) {
2291 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2292 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2293 	} else {
2294 		struct iovec md_iov = {
2295 			.iov_base	= bdev_io->u.bdev.md_buf,
2296 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2297 		};
2298 
2299 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2300 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2301 	}
2302 
2303 	if (rc != 0) {
2304 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2305 			    err_blk.err_type, err_blk.err_offset);
2306 	} else {
2307 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2308 	}
2309 }
2310 
2311 static void
2312 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2313 {
2314 	struct nvme_bdev_io *bio = ref;
2315 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2316 
2317 	if (spdk_nvme_cpl_is_success(cpl)) {
2318 		/* Run PI verification for read data buffer. */
2319 		bdev_nvme_verify_pi_error(bdev_io);
2320 	}
2321 
2322 	/* Return original completion status */
2323 	spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
2324 					  bio->cpl.status.sc);
2325 }
2326 
2327 static void
2328 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2329 {
2330 	struct nvme_bdev_io *bio = ref;
2331 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2332 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2333 	struct nvme_io_channel *nvme_ch;
2334 	struct nvme_bdev_ns *nvme_ns;
2335 	struct spdk_nvme_qpair *qpair;
2336 	int ret;
2337 
2338 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2339 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2340 			    cpl->status.sct, cpl->status.sc);
2341 
2342 		/* Save completion status to use after verifying PI error. */
2343 		bio->cpl = *cpl;
2344 
2345 		nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2346 
2347 		if (spdk_likely(bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
2348 			/* Read without PI checking to verify PI error. */
2349 			ret = bdev_nvme_no_pi_readv(nvme_ns->ns,
2350 						    qpair,
2351 						    bio,
2352 						    bdev_io->u.bdev.iovs,
2353 						    bdev_io->u.bdev.iovcnt,
2354 						    bdev_io->u.bdev.md_buf,
2355 						    bdev_io->u.bdev.num_blocks,
2356 						    bdev_io->u.bdev.offset_blocks);
2357 			if (ret == 0) {
2358 				return;
2359 			}
2360 		}
2361 	}
2362 
2363 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2364 }
2365 
2366 static void
2367 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2368 {
2369 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2370 
2371 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2372 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2373 			    cpl->status.sct, cpl->status.sc);
2374 		/* Run PI verification for write data buffer if PI error is detected. */
2375 		bdev_nvme_verify_pi_error(bdev_io);
2376 	}
2377 
2378 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2379 }
2380 
2381 static void
2382 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2383 {
2384 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2385 
2386 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2387 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2388 			    cpl->status.sct, cpl->status.sc);
2389 		/* Run PI verification for compare data buffer if PI error is detected. */
2390 		bdev_nvme_verify_pi_error(bdev_io);
2391 	}
2392 
2393 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2394 }
2395 
2396 static void
2397 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2398 {
2399 	struct nvme_bdev_io *bio = ref;
2400 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2401 
2402 	/* Compare operation completion */
2403 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2404 		/* Save compare result for write callback */
2405 		bio->cpl = *cpl;
2406 		return;
2407 	}
2408 
2409 	/* Write operation completion */
2410 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2411 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2412 		 * complete the IO with the compare operation's status.
2413 		 */
2414 		if (!spdk_nvme_cpl_is_error(cpl)) {
2415 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2416 		}
2417 
2418 		spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2419 	} else {
2420 		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2421 	}
2422 }
2423 
2424 static void
2425 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2426 {
2427 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2428 
2429 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2430 }
2431 
2432 static void
2433 bdev_nvme_admin_passthru_completion(void *ctx)
2434 {
2435 	struct nvme_bdev_io *bio = ctx;
2436 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2437 
2438 	spdk_bdev_io_complete_nvme_status(bdev_io,
2439 					  bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2440 }
2441 
2442 static void
2443 bdev_nvme_abort_completion(void *ctx)
2444 {
2445 	struct nvme_bdev_io *bio = ctx;
2446 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2447 
2448 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
2449 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2450 	} else {
2451 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2452 	}
2453 }
2454 
2455 static void
2456 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
2457 {
2458 	struct nvme_bdev_io *bio = ref;
2459 
2460 	bio->cpl = *cpl;
2461 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2462 }
2463 
2464 static void
2465 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2466 {
2467 	struct nvme_bdev_io *bio = ref;
2468 
2469 	bio->cpl = *cpl;
2470 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2471 }
2472 
2473 static void
2474 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2475 {
2476 	struct nvme_bdev_io *bio = ref;
2477 	struct iovec *iov;
2478 
2479 	bio->iov_offset = sgl_offset;
2480 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2481 		iov = &bio->iovs[bio->iovpos];
2482 		if (bio->iov_offset < iov->iov_len) {
2483 			break;
2484 		}
2485 
2486 		bio->iov_offset -= iov->iov_len;
2487 	}
2488 }
2489 
2490 static int
2491 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2492 {
2493 	struct nvme_bdev_io *bio = ref;
2494 	struct iovec *iov;
2495 
2496 	assert(bio->iovpos < bio->iovcnt);
2497 
2498 	iov = &bio->iovs[bio->iovpos];
2499 
2500 	*address = iov->iov_base;
2501 	*length = iov->iov_len;
2502 
2503 	if (bio->iov_offset) {
2504 		assert(bio->iov_offset <= iov->iov_len);
2505 		*address += bio->iov_offset;
2506 		*length -= bio->iov_offset;
2507 	}
2508 
2509 	bio->iov_offset += *length;
2510 	if (bio->iov_offset == iov->iov_len) {
2511 		bio->iovpos++;
2512 		bio->iov_offset = 0;
2513 	}
2514 
2515 	return 0;
2516 }
2517 
2518 static void
2519 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2520 {
2521 	struct nvme_bdev_io *bio = ref;
2522 	struct iovec *iov;
2523 
2524 	bio->fused_iov_offset = sgl_offset;
2525 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2526 		iov = &bio->fused_iovs[bio->fused_iovpos];
2527 		if (bio->fused_iov_offset < iov->iov_len) {
2528 			break;
2529 		}
2530 
2531 		bio->fused_iov_offset -= iov->iov_len;
2532 	}
2533 }
2534 
2535 static int
2536 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2537 {
2538 	struct nvme_bdev_io *bio = ref;
2539 	struct iovec *iov;
2540 
2541 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2542 
2543 	iov = &bio->fused_iovs[bio->fused_iovpos];
2544 
2545 	*address = iov->iov_base;
2546 	*length = iov->iov_len;
2547 
2548 	if (bio->fused_iov_offset) {
2549 		assert(bio->fused_iov_offset <= iov->iov_len);
2550 		*address += bio->fused_iov_offset;
2551 		*length -= bio->fused_iov_offset;
2552 	}
2553 
2554 	bio->fused_iov_offset += *length;
2555 	if (bio->fused_iov_offset == iov->iov_len) {
2556 		bio->fused_iovpos++;
2557 		bio->fused_iov_offset = 0;
2558 	}
2559 
2560 	return 0;
2561 }
2562 
2563 static int
2564 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2565 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2566 		      void *md, uint64_t lba_count, uint64_t lba)
2567 {
2568 	int rc;
2569 
2570 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
2571 		      lba_count, lba);
2572 
2573 	bio->iovs = iov;
2574 	bio->iovcnt = iovcnt;
2575 	bio->iovpos = 0;
2576 	bio->iov_offset = 0;
2577 
2578 	rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2579 					    bdev_nvme_no_pi_readv_done, bio, 0,
2580 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2581 					    md, 0, 0);
2582 
2583 	if (rc != 0 && rc != -ENOMEM) {
2584 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2585 	}
2586 	return rc;
2587 }
2588 
2589 static int
2590 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2591 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2592 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2593 {
2594 	int rc;
2595 
2596 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2597 		      lba_count, lba);
2598 
2599 	bio->iovs = iov;
2600 	bio->iovcnt = iovcnt;
2601 	bio->iovpos = 0;
2602 	bio->iov_offset = 0;
2603 
2604 	if (iovcnt == 1) {
2605 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
2606 						   lba_count,
2607 						   bdev_nvme_readv_done, bio,
2608 						   flags,
2609 						   0, 0);
2610 	} else {
2611 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2612 						    bdev_nvme_readv_done, bio, flags,
2613 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2614 						    md, 0, 0);
2615 	}
2616 
2617 	if (rc != 0 && rc != -ENOMEM) {
2618 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2619 	}
2620 	return rc;
2621 }
2622 
2623 static int
2624 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2625 		 struct nvme_bdev_io *bio,
2626 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2627 		 uint32_t flags)
2628 {
2629 	int rc;
2630 
2631 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2632 		      lba_count, lba);
2633 
2634 	bio->iovs = iov;
2635 	bio->iovcnt = iovcnt;
2636 	bio->iovpos = 0;
2637 	bio->iov_offset = 0;
2638 
2639 	if (iovcnt == 1) {
2640 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
2641 						    lba_count,
2642 						    bdev_nvme_writev_done, bio,
2643 						    flags,
2644 						    0, 0);
2645 	} else {
2646 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
2647 						     bdev_nvme_writev_done, bio, flags,
2648 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2649 						     md, 0, 0);
2650 	}
2651 
2652 	if (rc != 0 && rc != -ENOMEM) {
2653 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2654 	}
2655 	return rc;
2656 }
2657 
2658 static int
2659 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2660 		   struct nvme_bdev_io *bio,
2661 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2662 		   uint32_t flags)
2663 {
2664 	int rc;
2665 
2666 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2667 		      lba_count, lba);
2668 
2669 	bio->iovs = iov;
2670 	bio->iovcnt = iovcnt;
2671 	bio->iovpos = 0;
2672 	bio->iov_offset = 0;
2673 
2674 	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
2675 					       bdev_nvme_comparev_done, bio, flags,
2676 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2677 					       md, 0, 0);
2678 
2679 	if (rc != 0 && rc != -ENOMEM) {
2680 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
2681 	}
2682 	return rc;
2683 }
2684 
2685 static int
2686 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2687 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
2688 			      struct iovec *write_iov, int write_iovcnt,
2689 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2690 {
2691 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2692 	int rc;
2693 
2694 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2695 		      lba_count, lba);
2696 
2697 	bio->iovs = cmp_iov;
2698 	bio->iovcnt = cmp_iovcnt;
2699 	bio->iovpos = 0;
2700 	bio->iov_offset = 0;
2701 	bio->fused_iovs = write_iov;
2702 	bio->fused_iovcnt = write_iovcnt;
2703 	bio->fused_iovpos = 0;
2704 	bio->fused_iov_offset = 0;
2705 
2706 	if (bdev_io->num_retries == 0) {
2707 		bio->first_fused_submitted = false;
2708 	}
2709 
2710 	if (!bio->first_fused_submitted) {
2711 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2712 		memset(&bio->cpl, 0, sizeof(bio->cpl));
2713 
2714 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
2715 						       bdev_nvme_comparev_and_writev_done, bio, flags,
2716 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
2717 		if (rc == 0) {
2718 			bio->first_fused_submitted = true;
2719 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
2720 		} else {
2721 			if (rc != -ENOMEM) {
2722 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
2723 			}
2724 			return rc;
2725 		}
2726 	}
2727 
2728 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
2729 
2730 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
2731 					     bdev_nvme_comparev_and_writev_done, bio, flags,
2732 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
2733 	if (rc != 0 && rc != -ENOMEM) {
2734 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
2735 		rc = 0;
2736 	}
2737 
2738 	return rc;
2739 }
2740 
2741 static int
2742 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2743 		struct nvme_bdev_io *bio,
2744 		uint64_t offset_blocks,
2745 		uint64_t num_blocks)
2746 {
2747 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
2748 	struct spdk_nvme_dsm_range *range;
2749 	uint64_t offset, remaining;
2750 	uint64_t num_ranges_u64;
2751 	uint16_t num_ranges;
2752 	int rc;
2753 
2754 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
2755 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2756 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
2757 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
2758 		return -EINVAL;
2759 	}
2760 	num_ranges = (uint16_t)num_ranges_u64;
2761 
2762 	offset = offset_blocks;
2763 	remaining = num_blocks;
2764 	range = &dsm_ranges[0];
2765 
2766 	/* Fill max-size ranges until the remaining blocks fit into one range */
2767 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
2768 		range->attributes.raw = 0;
2769 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2770 		range->starting_lba = offset;
2771 
2772 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2773 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
2774 		range++;
2775 	}
2776 
2777 	/* Final range describes the remaining blocks */
2778 	range->attributes.raw = 0;
2779 	range->length = remaining;
2780 	range->starting_lba = offset;
2781 
2782 	rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair,
2783 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
2784 			dsm_ranges, num_ranges,
2785 			bdev_nvme_queued_done, bio);
2786 
2787 	return rc;
2788 }
2789 
2790 static int
2791 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
2792 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2793 {
2794 	uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ch->ctrlr->ctrlr);
2795 
2796 	if (nbytes > max_xfer_size) {
2797 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2798 		return -EINVAL;
2799 	}
2800 
2801 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2802 
2803 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ch->ctrlr->ctrlr, cmd, buf,
2804 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
2805 }
2806 
2807 static int
2808 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2809 		      struct nvme_bdev_io *bio,
2810 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
2811 {
2812 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
2813 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2814 
2815 	if (nbytes > max_xfer_size) {
2816 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2817 		return -EINVAL;
2818 	}
2819 
2820 	/*
2821 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2822 	 * so fill it out automatically.
2823 	 */
2824 	cmd->nsid = spdk_nvme_ns_get_id(ns);
2825 
2826 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
2827 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
2828 }
2829 
2830 static int
2831 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2832 			 struct nvme_bdev_io *bio,
2833 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
2834 {
2835 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
2836 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
2837 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
2838 
2839 	if (nbytes > max_xfer_size) {
2840 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
2841 		return -EINVAL;
2842 	}
2843 
2844 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
2845 		SPDK_ERRLOG("invalid meta data buffer size\n");
2846 		return -EINVAL;
2847 	}
2848 
2849 	/*
2850 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
2851 	 * so fill it out automatically.
2852 	 */
2853 	cmd->nsid = spdk_nvme_ns_get_id(ns);
2854 
2855 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
2856 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
2857 }
2858 
2859 static void
2860 bdev_nvme_abort_admin_cmd(void *ctx)
2861 {
2862 	struct nvme_bdev_io *bio = ctx;
2863 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2864 	struct nvme_io_channel *nvme_ch;
2865 	struct nvme_bdev_io *bio_to_abort;
2866 	int rc;
2867 
2868 	nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2869 	bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
2870 
2871 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
2872 					   NULL,
2873 					   bio_to_abort,
2874 					   bdev_nvme_abort_done, bio);
2875 	if (rc == -ENOENT) {
2876 		/* If no admin command was found in admin qpair, complete the abort
2877 		 * request with failure.
2878 		 */
2879 		bio->cpl.cdw0 |= 1U;
2880 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
2881 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2882 
2883 		spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2884 	}
2885 }
2886 
2887 static int
2888 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
2889 		struct nvme_bdev_io *bio_to_abort)
2890 {
2891 	int rc;
2892 
2893 	bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch));
2894 
2895 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
2896 					   nvme_ch->qpair,
2897 					   bio_to_abort,
2898 					   bdev_nvme_abort_done, bio);
2899 	if (rc == -ENOENT) {
2900 		/* If no command was found in I/O qpair, the target command may be
2901 		 * admin command. Only a single thread tries aborting admin command
2902 		 * to clean I/O flow.
2903 		 */
2904 		spdk_thread_send_msg(nvme_ch->ctrlr->thread,
2905 				     bdev_nvme_abort_admin_cmd, bio);
2906 		rc = 0;
2907 	}
2908 
2909 	return rc;
2910 }
2911 
/*
 * Per-namespace config JSON writer for standard namespaces.
 * Intentionally writes nothing.
 */
static void
nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
		struct nvme_bdev_ns *nvme_ns)
{
	/* Nothing to emit. */
}
2918 
2919 static void
2920 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns)
2921 {
2922 	g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns);
2923 }
2924 
2925 static void
2926 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
2927 {
2928 	const char	*action;
2929 
2930 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
2931 		action = "reset";
2932 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
2933 		action = "abort";
2934 	} else {
2935 		action = "none";
2936 	}
2937 
2938 	spdk_json_write_object_begin(w);
2939 
2940 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
2941 
2942 	spdk_json_write_named_object_begin(w, "params");
2943 	spdk_json_write_named_string(w, "action_on_timeout", action);
2944 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
2945 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
2946 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
2947 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
2948 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
2949 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
2950 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
2951 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
2952 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
2953 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
2954 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
2955 	spdk_json_write_object_end(w);
2956 
2957 	spdk_json_write_object_end(w);
2958 }
2959 
2960 static void
2961 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w,
2962 			    struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
2963 {
2964 	struct spdk_nvme_transport_id	*trid;
2965 
2966 	trid = nvme_bdev_ctrlr->connected_trid;
2967 
2968 	spdk_json_write_object_begin(w);
2969 
2970 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
2971 
2972 	spdk_json_write_named_object_begin(w, "params");
2973 	spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
2974 	nvme_bdev_dump_trid_json(trid, w);
2975 	spdk_json_write_named_bool(w, "prchk_reftag",
2976 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
2977 	spdk_json_write_named_bool(w, "prchk_guard",
2978 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
2979 
2980 	spdk_json_write_object_end(w);
2981 
2982 	spdk_json_write_object_end(w);
2983 }
2984 
2985 static void
2986 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
2987 {
2988 	spdk_json_write_object_begin(w);
2989 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
2990 
2991 	spdk_json_write_named_object_begin(w, "params");
2992 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
2993 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
2994 	spdk_json_write_object_end(w);
2995 
2996 	spdk_json_write_object_end(w);
2997 }
2998 
2999 static int
3000 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
3001 {
3002 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
3003 	uint32_t		nsid;
3004 
3005 	bdev_nvme_opts_config_json(w);
3006 
3007 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3008 
3009 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
3010 		nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr);
3011 
3012 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
3013 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
3014 				continue;
3015 			}
3016 
3017 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
3018 		}
3019 	}
3020 
3021 	/* Dump as last parameter to give all NVMe bdevs chance to be constructed
3022 	 * before enabling hotplug poller.
3023 	 */
3024 	bdev_nvme_hotplug_config_json(w);
3025 
3026 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3027 	return 0;
3028 }
3029 
3030 struct spdk_nvme_ctrlr *
3031 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
3032 {
3033 	if (!bdev || bdev->module != &nvme_if) {
3034 		return NULL;
3035 	}
3036 
3037 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
3038 }
3039 
3040 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
3041