xref: /spdk/module/bdev/nvme/bdev_nvme.c (revision df902b1d2e0abbbdeb84c0972bad34d250227e26)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "bdev_nvme.h"
37 #include "bdev_ocssd.h"
38 
39 #include "spdk/accel_engine.h"
40 #include "spdk/config.h"
41 #include "spdk/endian.h"
42 #include "spdk/bdev.h"
43 #include "spdk/json.h"
44 #include "spdk/nvme.h"
45 #include "spdk/nvme_ocssd.h"
46 #include "spdk/nvme_zns.h"
47 #include "spdk/thread.h"
48 #include "spdk/string.h"
49 #include "spdk/util.h"
50 
51 #include "spdk/bdev_module.h"
52 #include "spdk/log.h"
53 
54 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
55 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)
56 
57 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
58 
59 struct nvme_bdev_io {
60 	/** array of iovecs to transfer. */
61 	struct iovec *iovs;
62 
63 	/** Number of iovecs in iovs array. */
64 	int iovcnt;
65 
66 	/** Current iovec position. */
67 	int iovpos;
68 
69 	/** Offset in current iovec. */
70 	uint32_t iov_offset;
71 
72 	/** Array of iovecs to transfer for the fused command. */
73 	struct iovec *fused_iovs;
74 
75 	/** Number of iovecs in fused_iovs array. */
76 	int fused_iovcnt;
77 
78 	/** Current iovec position. */
79 	int fused_iovpos;
80 
81 	/** Offset in current iovec. */
82 	uint32_t fused_iov_offset;
83 
84 	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
85 	struct spdk_nvme_cpl cpl;
86 
87 	/** Originating thread */
88 	struct spdk_thread *orig_thread;
89 
90 	/** Tracks whether the first of the fused commands has been submitted */
91 	bool first_fused_submitted;
92 
93 	/** Temporary pointer to zone report buffer */
94 	struct spdk_nvme_zns_zone_report *zone_report_buf;
95 
96 	/** Number of zones that have been copied to the spdk_bdev_zone_info structs so far */
97 	uint64_t handled_zones;
98 };
99 
100 struct nvme_probe_ctx {
101 	size_t count;
102 	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
103 	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
104 	const char *names[NVME_MAX_CONTROLLERS];
105 	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
106 	const char *hostnqn;
107 };
108 
109 struct nvme_probe_skip_entry {
110 	struct spdk_nvme_transport_id		trid;
111 	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
112 };
113 /* All controllers deleted by users via RPC are skipped by the hotplug monitor. */
114 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
115 			g_skipped_nvme_ctrlrs);
116 
117 static struct spdk_bdev_nvme_opts g_opts = {
118 	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
119 	.timeout_us = 0,
120 	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
121 	.retry_count = 4,
122 	.arbitration_burst = 0,
123 	.low_priority_weight = 0,
124 	.medium_priority_weight = 0,
125 	.high_priority_weight = 0,
126 	.nvme_adminq_poll_period_us = 10000ULL,
127 	.nvme_ioq_poll_period_us = 0,
128 	.io_queue_requests = 0,
129 	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
130 };
131 
132 #define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
133 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL
134 
135 static int g_hot_insert_nvme_controller_index = 0;
136 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
137 static bool g_nvme_hotplug_enabled = false;
138 static struct spdk_thread *g_bdev_nvme_init_thread;
139 static struct spdk_poller *g_hotplug_poller;
140 static struct spdk_poller *g_hotplug_probe_poller;
141 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
142 
143 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
144 		struct nvme_async_probe_ctx *ctx);
145 static void nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
146 		struct nvme_async_probe_ctx *ctx);
147 static int bdev_nvme_library_init(void);
148 static void bdev_nvme_library_fini(void);
149 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
150 			   struct nvme_bdev_io *bio,
151 			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
152 			   uint32_t flags);
153 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
154 				 struct nvme_bdev_io *bio,
155 				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
156 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
157 			    struct nvme_bdev_io *bio,
158 			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
159 			    uint32_t flags);
160 static int bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
161 				  struct nvme_bdev_io *bio,
162 				  struct iovec *iov, int iovcnt, void *md, uint64_t lba_count,
163 				  uint64_t zslba, uint32_t flags);
164 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
165 			      struct nvme_bdev_io *bio,
166 			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
167 			      uint32_t flags);
168 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
169 		struct spdk_nvme_qpair *qpair,
170 		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
171 		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
172 		uint32_t flags);
173 static int bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
174 				   struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
175 				   struct spdk_bdev_zone_info *info);
176 static int bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
177 				     struct nvme_bdev_io *bio, uint64_t zone_id,
178 				     enum spdk_bdev_zone_action action);
179 static int bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch,
180 				    struct nvme_bdev_io *bio,
181 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
182 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
183 				 struct nvme_bdev_io *bio,
184 				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
185 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
186 				    struct nvme_bdev_io *bio,
187 				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
188 static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch,
189 			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
190 static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio);
191 static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove);
192 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
193 
194 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
195 				      struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
196 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
197 		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
198 
199 static populate_namespace_fn g_populate_namespace_fn[] = {
200 	NULL,
201 	nvme_ctrlr_populate_standard_namespace,
202 	bdev_ocssd_populate_namespace,
203 };
204 
205 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns);
206 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns);
207 
208 static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
209 	NULL,
210 	nvme_ctrlr_depopulate_standard_namespace,
211 	bdev_ocssd_depopulate_namespace,
212 };
213 
214 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
215 		struct nvme_bdev_ns *nvme_ns);
216 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
217 		struct nvme_bdev_ns *nvme_ns);
218 
219 static config_json_namespace_fn g_config_json_namespace_fn[] = {
220 	NULL,
221 	nvme_ctrlr_config_json_standard_namespace,
222 	bdev_ocssd_namespace_config_json,
223 };
224 
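/* Return the raw NVMe I/O qpair backing the given controller I/O channel. */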
225 struct spdk_nvme_qpair *
226 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
227 {
228 	struct nvme_io_channel *nvme_ch;
229 
230 	assert(ctrlr_io_ch != NULL);
231 
232 	nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
233 
234 	return nvme_ch->qpair;
235 }
236 
237 static int
238 bdev_nvme_get_ctx_size(void)
239 {
240 	return sizeof(struct nvme_bdev_io);
241 }
242 
243 static struct spdk_bdev_module nvme_if = {
244 	.name = "nvme",
245 	.async_fini = true,
246 	.module_init = bdev_nvme_library_init,
247 	.module_fini = bdev_nvme_library_fini,
248 	.config_json = bdev_nvme_config_json,
249 	.get_ctx_size = bdev_nvme_get_ctx_size,
250 
251 };
252 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
253 
254 static void
255 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
256 {
257 	int rc;
258 
259 	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
260 	/*
261 	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
262 	 * reconnect a qpair and we will stop getting a callback for this one.
263 	 */
264 	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
265 	if (rc != 0) {
266 		SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
267 	}
268 }
269 
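/*
 * Poller for the I/O qpairs in a poll group. Processes completions and, when spin-stat
 * collection is enabled (VTune builds), accounts busy versus idle ticks.
 */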
270 static int
271 bdev_nvme_poll(void *arg)
272 {
273 	struct nvme_bdev_poll_group *group = arg;
274 	int64_t num_completions;
275 
276 	if (group->collect_spin_stat && group->start_ticks == 0) {
277 		group->start_ticks = spdk_get_ticks();
278 	}
279 
280 	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
281 			  bdev_nvme_disconnected_qpair_cb);
282 	if (group->collect_spin_stat) {
283 		if (num_completions > 0) {
284 			if (group->end_ticks != 0) {
285 				group->spin_ticks += (group->end_ticks - group->start_ticks);
286 				group->end_ticks = 0;
287 			}
288 			group->start_ticks = 0;
289 		} else {
290 			group->end_ticks = spdk_get_ticks();
291 		}
292 	}
293 
294 	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
295 }
296 
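/*
 * Periodic poller for the controller's admin queue. A negative return from completion
 * processing triggers a failover/reset of the controller.
 */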
297 static int
298 bdev_nvme_poll_adminq(void *arg)
299 {
300 	int32_t rc;
301 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
302 
303 	assert(nvme_bdev_ctrlr != NULL);
304 
305 	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr);
306 	if (rc < 0) {
307 		bdev_nvme_failover(nvme_bdev_ctrlr, false);
308 	}
309 
310 	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
311 }
312 
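/*
 * bdev destruct callback. Detach the bdev from its namespace; if the namespace has already
 * been depopulated, also drop the reference on the controller, then free the disk resources.
 */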
313 static int
314 bdev_nvme_destruct(void *ctx)
315 {
316 	struct nvme_bdev *nvme_disk = ctx;
317 	struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns;
318 
319 	pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
320 
321 	nvme_ns->bdev = NULL;
322 
323 	if (!nvme_ns->populated) {
324 		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
325 
326 		nvme_bdev_ctrlr_destruct(nvme_ns->ctrlr);
327 	} else {
328 		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
329 	}
330 
331 	free(nvme_disk->disk.name);
332 	free(nvme_disk);
333 
334 	return 0;
335 }
336 
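/* Note: flush is completed immediately here without issuing an NVMe Flush command. */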
337 static int
338 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
339 		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
340 {
341 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
342 
343 	return 0;
344 }
345 
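/*
 * Allocate an I/O qpair for this channel and add it to the channel's poll group before
 * connecting it, so completions are processed by the group's poller.
 */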
346 static int
347 bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch)
348 {
349 	struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr;
350 	struct spdk_nvme_io_qpair_opts opts;
351 	struct spdk_nvme_qpair *qpair;
352 	int rc;
353 
354 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
355 	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
356 	opts.create_only = true;
357 	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
358 	g_opts.io_queue_requests = opts.io_queue_requests;
359 
360 	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
361 	if (qpair == NULL) {
362 		return -1;
363 	}
364 
365 	assert(nvme_ch->group != NULL);
366 
367 	rc = spdk_nvme_poll_group_add(nvme_ch->group->group, qpair);
368 	if (rc != 0) {
369 		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
370 		goto err;
371 	}
372 
373 	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
374 	if (rc != 0) {
375 		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
376 		goto err;
377 	}
378 
379 	nvme_ch->qpair = qpair;
380 
381 	return 0;
382 
383 err:
384 	spdk_nvme_ctrlr_free_io_qpair(qpair);
385 
386 	return rc;
387 }
388 
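/* Free the channel's I/O qpair, if any. The pointer is cleared only when the free succeeds. */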
389 static int
390 bdev_nvme_destroy_qpair(struct nvme_io_channel *nvme_ch)
391 {
392 	int rc;
393 
394 	if (nvme_ch->qpair == NULL) {
395 		return 0;
396 	}
397 
398 	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
399 	if (!rc) {
400 		nvme_ch->qpair = NULL;
401 	}
402 	return rc;
403 }
404 
405 static void
406 _bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
407 {
408 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i);
409 
410 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
411 	if (nvme_bdev_ctrlr->destruct_after_reset) {
412 		assert(nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct);
413 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
414 
415 		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_unregister,
416 				     nvme_bdev_ctrlr);
417 	} else {
418 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
419 	}
420 }
421 
422 static void
423 _bdev_nvme_complete_pending_resets(struct nvme_io_channel *nvme_ch,
424 				   enum spdk_bdev_io_status status)
425 {
426 	struct spdk_bdev_io *bdev_io;
427 
428 	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
429 		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
430 		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
431 		spdk_bdev_io_complete(bdev_io, status);
432 	}
433 }
434 
435 static void
436 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
437 {
438 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
439 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
440 
441 	_bdev_nvme_complete_pending_resets(nvme_ch, SPDK_BDEV_IO_STATUS_SUCCESS);
442 
443 	spdk_for_each_channel_continue(i, 0);
444 }
445 
446 static void
447 bdev_nvme_abort_pending_resets(struct spdk_io_channel_iter *i)
448 {
449 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
450 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
451 
452 	_bdev_nvme_complete_pending_resets(nvme_ch, SPDK_BDEV_IO_STATUS_FAILED);
453 
454 	spdk_for_each_channel_continue(i, 0);
455 }
456 
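/*
 * Finish a reset sequence: record the outcome on the current trid, complete the originating
 * reset bdev_io (if any), flush or fail the queued resets on every channel, and let
 * _bdev_nvme_check_pending_destruct() handle a destruct that was deferred during the reset.
 */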
457 static void
458 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
459 {
460 	struct nvme_bdev_ctrlr_trid *curr_trid;
461 	struct nvme_bdev_io *bio = nvme_bdev_ctrlr->reset_bio;
462 	enum spdk_bdev_io_status io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
463 
464 	nvme_bdev_ctrlr->reset_bio = NULL;
465 
466 	if (rc) {
467 		SPDK_ERRLOG("Resetting controller failed.\n");
468 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
469 	} else {
470 		SPDK_NOTICELOG("Controller reset was successful.\n");
471 	}
472 
473 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
474 	nvme_bdev_ctrlr->resetting = false;
475 	nvme_bdev_ctrlr->failover_in_progress = false;
476 
477 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
478 	assert(curr_trid != NULL);
479 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
480 
481 	curr_trid->is_failed = (rc != 0);
482 
483 	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
484 		/* Destruct ctrlr after clearing pending resets. */
485 		nvme_bdev_ctrlr->destruct_after_reset = true;
486 	}
487 
488 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
489 
490 	if (bio) {
491 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
492 	}
493 
494 	/* Make sure we clear any pending resets before returning. */
495 	spdk_for_each_channel(nvme_bdev_ctrlr,
496 			      rc == 0 ? bdev_nvme_complete_pending_resets :
497 			      bdev_nvme_abort_pending_resets,
498 			      nvme_bdev_ctrlr,
499 			      _bdev_nvme_check_pending_destruct);
500 }
501 
502 static void
503 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
504 {
505 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i);
506 
507 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
508 }
509 
510 static void
511 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
512 {
513 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
514 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
515 	int rc;
516 
517 	rc = bdev_nvme_create_qpair(nvme_ch);
518 
519 	spdk_for_each_channel_continue(i, rc);
520 }
521 
522 static void
523 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
524 {
525 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i);
526 	int rc;
527 
528 	if (status) {
529 		rc = status;
530 		goto err;
531 	}
532 
533 	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
534 	if (rc != 0) {
535 		goto err;
536 	}
537 
538 	/* Recreate all of the I/O queue pairs */
539 	spdk_for_each_channel(nvme_bdev_ctrlr,
540 			      _bdev_nvme_reset_create_qpair,
541 			      nvme_bdev_ctrlr,
542 			      _bdev_nvme_reset_create_qpairs_done);
543 	return;
544 
545 err:
546 	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
547 }
548 
549 static void
550 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
551 {
552 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
553 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
554 	int rc;
555 
556 	rc = bdev_nvme_destroy_qpair(nvme_ch);
557 
558 	spdk_for_each_channel_continue(i, rc);
559 }
560 
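/*
 * Start a controller reset: mark the controller as resetting and destroy every I/O qpair
 * across all channels; _bdev_nvme_reset_ctrlr() then resets the controller and recreates
 * the qpairs.
 */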
561 static int
562 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
563 {
564 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
565 	if (nvme_bdev_ctrlr->destruct) {
566 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
567 		return -EBUSY;
568 	}
569 
570 	if (nvme_bdev_ctrlr->resetting) {
571 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
572 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
573 		return -EAGAIN;
574 	}
575 
576 	nvme_bdev_ctrlr->resetting = true;
577 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
578 
579 	/* First, delete all NVMe I/O queue pairs. */
580 	spdk_for_each_channel(nvme_bdev_ctrlr,
581 			      _bdev_nvme_reset_destroy_qpair,
582 			      nvme_bdev_ctrlr,
583 			      _bdev_nvme_reset_ctrlr);
584 
585 	return 0;
586 }
587 
588 static int
589 bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio)
590 {
591 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
592 	int rc;
593 
594 	rc = _bdev_nvme_reset(nvme_ch->ctrlr);
595 	if (rc == 0) {
596 		assert(nvme_ch->ctrlr->reset_bio == NULL);
597 		nvme_ch->ctrlr->reset_bio = bio;
598 	} else if (rc == -EBUSY) {
599 		/* Don't bother resetting if the controller is in the process of being destructed. */
600 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
601 	} else if (rc == -EAGAIN) {
602 		/*
603 		 * A reset is queued here only when it was requested by the app framework. This is deliberate,
604 		 * so that we don't interfere with the app framework's reset strategy, i.e. we defer to the
605 		 * upper layer. If it is already in the middle of a reset, we won't schedule another one.
606 		 */
607 		TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link);
608 	} else {
609 		return rc;
610 	}
611 
612 	return 0;
613 }
614 
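/*
 * Begin a failover: mark the current trid as failed and, if an alternate trid exists, fail the
 * controller, switch connected_trid to the next path, and either recycle the old trid entry to
 * the tail of the list (round robin) or free it when 'remove' is set.
 */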
615 static int
616 _bdev_nvme_failover_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
617 {
618 	struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
619 	int rc;
620 
621 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
622 	if (nvme_bdev_ctrlr->destruct) {
623 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
624 		/* Don't bother resetting if the controller is in the process of being destructed. */
625 		return -EBUSY;
626 	}
627 
628 	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
629 	assert(curr_trid);
630 	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
631 	next_trid = TAILQ_NEXT(curr_trid, link);
632 
633 	if (nvme_bdev_ctrlr->resetting) {
634 		if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) {
635 			rc = -EAGAIN;
636 		} else {
637 			rc = -EBUSY;
638 		}
639 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
640 		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
641 		return rc;
642 	}
643 
644 	nvme_bdev_ctrlr->resetting = true;
645 	curr_trid->is_failed = true;
646 
647 	if (next_trid) {
648 		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
649 
650 		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
651 			       curr_trid->trid.trsvcid,	next_trid->trid.traddr, next_trid->trid.trsvcid);
652 
653 		nvme_bdev_ctrlr->failover_in_progress = true;
654 		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
655 		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
656 		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
657 		assert(rc == 0);
658 		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link);
659 		if (!remove) {
660 			/** Shuffle the old trid to the end of the list and use the new one.
661 			 * Allows for round robin through multiple connections.
662 			 */
663 			TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link);
664 		} else {
665 			free(curr_trid);
666 		}
667 	}
668 
669 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
670 	return 0;
671 }
672 
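/*
 * Fail over to the next trid (if one exists) and rebuild the I/O qpairs using the same channel
 * iteration as a reset. -EBUSY from the start phase is not treated as an error.
 */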
673 static int
674 bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
675 {
676 	int rc;
677 
678 	rc = _bdev_nvme_failover_start(nvme_bdev_ctrlr, remove);
679 	if (rc == 0) {
680 		/* First, delete all NVMe I/O queue pairs. */
681 		spdk_for_each_channel(nvme_bdev_ctrlr,
682 				      _bdev_nvme_reset_destroy_qpair,
683 				      nvme_bdev_ctrlr,
684 				      _bdev_nvme_reset_ctrlr);
685 	} else if (rc != -EBUSY) {
686 		return rc;
687 	}
688 
689 	return 0;
690 }
691 
692 static int
693 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
694 		struct nvme_bdev_io *bio,
695 		uint64_t offset_blocks,
696 		uint64_t num_blocks);
697 
698 static void
699 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
700 		     bool success)
701 {
702 	struct spdk_bdev *bdev = bdev_io->bdev;
703 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
704 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
705 	struct nvme_bdev_ns *nvme_ns;
706 	struct spdk_nvme_qpair *qpair;
707 	int ret;
708 
709 	if (!success) {
710 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
711 		return;
712 	}
713 
714 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
715 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
716 		return;
717 	}
718 
719 	ret = bdev_nvme_readv(nvme_ns->ns,
720 			      qpair,
721 			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
722 			      bdev_io->u.bdev.iovs,
723 			      bdev_io->u.bdev.iovcnt,
724 			      bdev_io->u.bdev.md_buf,
725 			      bdev_io->u.bdev.num_blocks,
726 			      bdev_io->u.bdev.offset_blocks,
727 			      bdev->dif_check_flags);
728 
729 	if (spdk_likely(ret == 0)) {
730 		return;
731 	} else if (ret == -ENOMEM) {
732 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
733 	} else {
734 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
735 	}
736 }
737 
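/*
 * Dispatch a bdev I/O to the matching NVMe helper based on its type. Returns 0 when the
 * command was submitted, or a negative value on failure; -ENOMEM is translated into an
 * ENOMEM bdev status by the caller.
 */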
738 static int
739 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
740 {
741 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
742 	struct spdk_bdev *bdev = bdev_io->bdev;
743 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
744 	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
745 	struct nvme_bdev_io *nbdev_io_to_abort;
746 	struct nvme_bdev_ns *nvme_ns;
747 	struct spdk_nvme_qpair *qpair;
748 
749 	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
750 		return -1;
751 	}
752 
753 	switch (bdev_io->type) {
754 	case SPDK_BDEV_IO_TYPE_READ:
755 		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
756 			return bdev_nvme_readv(nvme_ns->ns,
757 					       qpair,
758 					       nbdev_io,
759 					       bdev_io->u.bdev.iovs,
760 					       bdev_io->u.bdev.iovcnt,
761 					       bdev_io->u.bdev.md_buf,
762 					       bdev_io->u.bdev.num_blocks,
763 					       bdev_io->u.bdev.offset_blocks,
764 					       bdev->dif_check_flags);
765 		} else {
766 			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
767 					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
768 			return 0;
769 		}
770 
771 	case SPDK_BDEV_IO_TYPE_WRITE:
772 		return bdev_nvme_writev(nvme_ns->ns,
773 					qpair,
774 					nbdev_io,
775 					bdev_io->u.bdev.iovs,
776 					bdev_io->u.bdev.iovcnt,
777 					bdev_io->u.bdev.md_buf,
778 					bdev_io->u.bdev.num_blocks,
779 					bdev_io->u.bdev.offset_blocks,
780 					bdev->dif_check_flags);
781 
782 	case SPDK_BDEV_IO_TYPE_COMPARE:
783 		return bdev_nvme_comparev(nvme_ns->ns,
784 					  qpair,
785 					  nbdev_io,
786 					  bdev_io->u.bdev.iovs,
787 					  bdev_io->u.bdev.iovcnt,
788 					  bdev_io->u.bdev.md_buf,
789 					  bdev_io->u.bdev.num_blocks,
790 					  bdev_io->u.bdev.offset_blocks,
791 					  bdev->dif_check_flags);
792 
793 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
794 		return bdev_nvme_comparev_and_writev(nvme_ns->ns,
795 						     qpair,
796 						     nbdev_io,
797 						     bdev_io->u.bdev.iovs,
798 						     bdev_io->u.bdev.iovcnt,
799 						     bdev_io->u.bdev.fused_iovs,
800 						     bdev_io->u.bdev.fused_iovcnt,
801 						     bdev_io->u.bdev.md_buf,
802 						     bdev_io->u.bdev.num_blocks,
803 						     bdev_io->u.bdev.offset_blocks,
804 						     bdev->dif_check_flags);
805 
806 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
807 		return bdev_nvme_unmap(nvme_ns->ns,
808 				       qpair,
809 				       nbdev_io,
810 				       bdev_io->u.bdev.offset_blocks,
811 				       bdev_io->u.bdev.num_blocks);
812 
813 	case SPDK_BDEV_IO_TYPE_UNMAP:
814 		return bdev_nvme_unmap(nvme_ns->ns,
815 				       qpair,
816 				       nbdev_io,
817 				       bdev_io->u.bdev.offset_blocks,
818 				       bdev_io->u.bdev.num_blocks);
819 
820 	case SPDK_BDEV_IO_TYPE_RESET:
821 		return bdev_nvme_reset(nvme_ch, nbdev_io);
822 
823 	case SPDK_BDEV_IO_TYPE_FLUSH:
824 		return bdev_nvme_flush(nvme_ns->ns,
825 				       qpair,
826 				       nbdev_io,
827 				       bdev_io->u.bdev.offset_blocks,
828 				       bdev_io->u.bdev.num_blocks);
829 
830 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
831 		return bdev_nvme_zone_appendv(nvme_ns->ns,
832 					      qpair,
833 					      nbdev_io,
834 					      bdev_io->u.bdev.iovs,
835 					      bdev_io->u.bdev.iovcnt,
836 					      bdev_io->u.bdev.md_buf,
837 					      bdev_io->u.bdev.num_blocks,
838 					      bdev_io->u.bdev.offset_blocks,
839 					      bdev->dif_check_flags);
840 
841 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
842 		return bdev_nvme_get_zone_info(nvme_ns->ns,
843 					       qpair,
844 					       nbdev_io,
845 					       bdev_io->u.zone_mgmt.zone_id,
846 					       bdev_io->u.zone_mgmt.num_zones,
847 					       bdev_io->u.zone_mgmt.buf);
848 
849 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
850 		return bdev_nvme_zone_management(nvme_ns->ns,
851 						 qpair,
852 						 nbdev_io,
853 						 bdev_io->u.zone_mgmt.zone_id,
854 						 bdev_io->u.zone_mgmt.zone_action);
855 
856 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
857 		return bdev_nvme_admin_passthru(nvme_ch,
858 						nbdev_io,
859 						&bdev_io->u.nvme_passthru.cmd,
860 						bdev_io->u.nvme_passthru.buf,
861 						bdev_io->u.nvme_passthru.nbytes);
862 
863 	case SPDK_BDEV_IO_TYPE_NVME_IO:
864 		return bdev_nvme_io_passthru(nvme_ns->ns,
865 					     qpair,
866 					     nbdev_io,
867 					     &bdev_io->u.nvme_passthru.cmd,
868 					     bdev_io->u.nvme_passthru.buf,
869 					     bdev_io->u.nvme_passthru.nbytes);
870 
871 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
872 		return bdev_nvme_io_passthru_md(nvme_ns->ns,
873 						qpair,
874 						nbdev_io,
875 						&bdev_io->u.nvme_passthru.cmd,
876 						bdev_io->u.nvme_passthru.buf,
877 						bdev_io->u.nvme_passthru.nbytes,
878 						bdev_io->u.nvme_passthru.md_buf,
879 						bdev_io->u.nvme_passthru.md_len);
880 
881 	case SPDK_BDEV_IO_TYPE_ABORT:
882 		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
883 		return bdev_nvme_abort(nvme_ch,
884 				       nbdev_io,
885 				       nbdev_io_to_abort);
886 
887 	default:
888 		return -EINVAL;
889 	}
890 	return 0;
891 }
892 
893 static void
894 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
895 {
896 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
897 
898 	if (spdk_unlikely(rc != 0)) {
899 		if (rc == -ENOMEM) {
900 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
901 		} else {
902 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
903 		}
904 	}
905 }
906 
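/*
 * Report which bdev I/O types this namespace can service, based on the namespace CSI,
 * the controller's ONCS capabilities, and the controller flags.
 */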
907 static bool
908 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
909 {
910 	struct nvme_bdev *nbdev = ctx;
911 	struct nvme_bdev_ns *nvme_ns;
912 	struct spdk_nvme_ns *ns;
913 	struct spdk_nvme_ctrlr *ctrlr;
914 	const struct spdk_nvme_ctrlr_data *cdata;
915 
916 	nvme_ns = nvme_bdev_to_bdev_ns(nbdev);
917 	assert(nvme_ns != NULL);
918 	ns = nvme_ns->ns;
919 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
920 
921 	switch (io_type) {
922 	case SPDK_BDEV_IO_TYPE_READ:
923 	case SPDK_BDEV_IO_TYPE_WRITE:
924 	case SPDK_BDEV_IO_TYPE_RESET:
925 	case SPDK_BDEV_IO_TYPE_FLUSH:
926 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
927 	case SPDK_BDEV_IO_TYPE_NVME_IO:
928 	case SPDK_BDEV_IO_TYPE_ABORT:
929 		return true;
930 
931 	case SPDK_BDEV_IO_TYPE_COMPARE:
932 		return spdk_nvme_ns_supports_compare(ns);
933 
934 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
935 		return spdk_nvme_ns_get_md_size(ns) ? true : false;
936 
937 	case SPDK_BDEV_IO_TYPE_UNMAP:
938 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
939 		return cdata->oncs.dsm;
940 
941 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
942 		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
943 		/*
944 		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
945 		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
946 		 */
947 		if (cdata->oncs.dsm &&
948 		    spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) ==
949 		    SPDK_NVME_DEALLOC_READ_00) {
950 			return true;
951 		}
952 		/*
953 		 * The NVMe controller write_zeroes function is currently not used by our driver.
954 		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
955 		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
956 		 */
957 		return false;
958 
959 	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
960 		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
961 		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
962 			return true;
963 		}
964 		return false;
965 
966 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
967 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
968 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
969 
970 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
971 		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
972 		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
973 
974 	default:
975 		return false;
976 	}
977 }
978 
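/*
 * Per-controller I/O channel constructor: set up the optional OCSSD channel, join the
 * per-thread poll group, and create a connected I/O qpair for this controller.
 */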
979 static int
980 bdev_nvme_create_cb(void *io_device, void *ctx_buf)
981 {
982 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
983 	struct nvme_io_channel *nvme_ch = ctx_buf;
984 	struct spdk_io_channel *pg_ch = NULL;
985 	int rc;
986 
987 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
988 		rc = bdev_ocssd_create_io_channel(nvme_ch);
989 		if (rc != 0) {
990 			return rc;
991 		}
992 	}
993 
994 	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
995 	if (!pg_ch) {
996 		rc = -1;
997 		goto err_pg_ch;
998 	}
999 
1000 	nvme_ch->group = spdk_io_channel_get_ctx(pg_ch);
1001 
1002 #ifdef SPDK_CONFIG_VTUNE
1003 	nvme_ch->group->collect_spin_stat = true;
1004 #else
1005 	nvme_ch->group->collect_spin_stat = false;
1006 #endif
1007 
1008 	TAILQ_INIT(&nvme_ch->pending_resets);
1009 
1010 	nvme_ch->ctrlr = nvme_bdev_ctrlr;
1011 
1012 	rc = bdev_nvme_create_qpair(nvme_ch);
1013 	if (rc != 0) {
1014 		goto err_qpair;
1015 	}
1016 
1017 	return 0;
1018 
1019 err_qpair:
1020 	spdk_put_io_channel(pg_ch);
1021 err_pg_ch:
1022 	if (nvme_ch->ocssd_ch) {
1023 		bdev_ocssd_destroy_io_channel(nvme_ch);
1024 	}
1025 
1026 	return rc;
1027 }
1028 
1029 static void
1030 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
1031 {
1032 	struct nvme_io_channel *nvme_ch = ctx_buf;
1033 
1034 	assert(nvme_ch->group != NULL);
1035 
1036 	if (nvme_ch->ocssd_ch != NULL) {
1037 		bdev_ocssd_destroy_io_channel(nvme_ch);
1038 	}
1039 
1040 	bdev_nvme_destroy_qpair(nvme_ch);
1041 
1042 	spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group));
1043 }
1044 
1045 static void
1046 bdev_nvme_poll_group_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
1047 		uint32_t iov_cnt, uint32_t seed,
1048 		spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
1049 {
1050 	struct nvme_bdev_poll_group *group = ctx;
1051 	int rc;
1052 
1053 	assert(group->accel_channel != NULL);
1054 	assert(cb_fn != NULL);
1055 
1056 	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
1057 	if (rc) {
1058 		/* For -ENOMEM and -EINVAL, spdk_accel_submit_crc32cv() does not invoke the user's cb_fn, so call it here. */
1059 		if (rc == -ENOMEM || rc == -EINVAL) {
1060 			cb_fn(cb_arg, rc);
1061 		}
1062 		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
1063 	}
1064 }
1065 
1066 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
1067 	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
1068 	.submit_accel_crc32c	= bdev_nvme_poll_group_submit_accel_crc32c,
1069 };
1070 
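/*
 * Poll group channel constructor: create the NVMe poll group, get an accel-engine channel
 * for offloaded CRC32C, and register the completion poller.
 */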
1071 static int
1072 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
1073 {
1074 	struct nvme_bdev_poll_group *group = ctx_buf;
1075 
1076 	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
1077 	if (group->group == NULL) {
1078 		return -1;
1079 	}
1080 
1081 	group->accel_channel = spdk_accel_engine_get_io_channel();
1082 	if (!group->accel_channel) {
1083 		spdk_nvme_poll_group_destroy(group->group);
1084 		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
1085 			    group);
1086 		return -1;
1087 	}
1088 
1089 	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
1090 
1091 	if (group->poller == NULL) {
1092 		spdk_put_io_channel(group->accel_channel);
1093 		spdk_nvme_poll_group_destroy(group->group);
1094 		return -1;
1095 	}
1096 
1097 	return 0;
1098 }
1099 
1100 static void
1101 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
1102 {
1103 	struct nvme_bdev_poll_group *group = ctx_buf;
1104 
1105 	if (group->accel_channel) {
1106 		spdk_put_io_channel(group->accel_channel);
1107 	}
1108 
1109 	spdk_poller_unregister(&group->poller);
1110 	if (spdk_nvme_poll_group_destroy(group->group)) {
1111 		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
1112 		assert(false);
1113 	}
1114 }
1115 
1116 static struct spdk_io_channel *
1117 bdev_nvme_get_io_channel(void *ctx)
1118 {
1119 	struct nvme_bdev *nvme_bdev = ctx;
1120 
1121 	return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr);
1122 }
1123 
1124 static void *
1125 bdev_nvme_get_module_ctx(void *ctx)
1126 {
1127 	struct nvme_bdev *nvme_bdev = ctx;
1128 
1129 	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
1130 }
1131 
1132 static int
1133 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
1134 {
1135 	struct nvme_bdev *nvme_bdev = ctx;
1136 	struct nvme_bdev_ns *nvme_ns;
1137 	struct spdk_nvme_ns *ns;
1138 	struct spdk_nvme_ctrlr *ctrlr;
1139 	const struct spdk_nvme_ctrlr_data *cdata;
1140 	const struct spdk_nvme_transport_id *trid;
1141 	union spdk_nvme_vs_register vs;
1142 	union spdk_nvme_csts_register csts;
1143 	char buf[128];
1144 
1145 	nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev);
1146 	assert(nvme_ns != NULL);
1147 	ns = nvme_ns->ns;
1148 	ctrlr = spdk_nvme_ns_get_ctrlr(ns);
1149 
1150 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1151 	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
1152 	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
1153 	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1154 
1155 	spdk_json_write_named_object_begin(w, "nvme");
1156 
1157 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1158 		spdk_json_write_named_string(w, "pci_address", trid->traddr);
1159 	}
1160 
1161 	spdk_json_write_named_object_begin(w, "trid");
1162 
1163 	nvme_bdev_dump_trid_json(trid, w);
1164 
1165 	spdk_json_write_object_end(w);
1166 
1167 #ifdef SPDK_CONFIG_NVME_CUSE
1168 	size_t cuse_name_size = 128;
1169 	char cuse_name[cuse_name_size];
1170 
1171 	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
1172 					    cuse_name, &cuse_name_size);
1173 	if (rc == 0) {
1174 		spdk_json_write_named_string(w, "cuse_device", cuse_name);
1175 	}
1176 #endif
1177 
1178 	spdk_json_write_named_object_begin(w, "ctrlr_data");
1179 
1180 	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
1181 
1182 	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
1183 	spdk_str_trim(buf);
1184 	spdk_json_write_named_string(w, "model_number", buf);
1185 
1186 	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
1187 	spdk_str_trim(buf);
1188 	spdk_json_write_named_string(w, "serial_number", buf);
1189 
1190 	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
1191 	spdk_str_trim(buf);
1192 	spdk_json_write_named_string(w, "firmware_revision", buf);
1193 
1194 	if (cdata->subnqn[0] != '\0') {
1195 		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
1196 	}
1197 
1198 	spdk_json_write_named_object_begin(w, "oacs");
1199 
1200 	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
1201 	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
1202 	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
1203 	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
1204 
1205 	spdk_json_write_object_end(w);
1206 
1207 	spdk_json_write_object_end(w);
1208 
1209 	spdk_json_write_named_object_begin(w, "vs");
1210 
1211 	spdk_json_write_name(w, "nvme_version");
1212 	if (vs.bits.ter) {
1213 		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
1214 	} else {
1215 		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
1216 	}
1217 
1218 	spdk_json_write_object_end(w);
1219 
1220 	spdk_json_write_named_object_begin(w, "csts");
1221 
1222 	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
1223 	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
1224 
1225 	spdk_json_write_object_end(w);
1226 
1227 	spdk_json_write_named_object_begin(w, "ns_data");
1228 
1229 	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
1230 
1231 	spdk_json_write_object_end(w);
1232 
1233 	if (cdata->oacs.security) {
1234 		spdk_json_write_named_object_begin(w, "security");
1235 
1236 		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);
1237 
1238 		spdk_json_write_object_end(w);
1239 	}
1240 
1241 	spdk_json_write_object_end(w);
1242 
1243 	return 0;
1244 }
1245 
1246 static void
1247 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1248 {
1249 	/* No config per bdev needed */
1250 }
1251 
1252 static uint64_t
1253 bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
1254 {
1255 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
1256 	struct nvme_bdev_poll_group *group = nvme_ch->group;
1257 	uint64_t spin_time;
1258 
1259 	if (!group || !group->collect_spin_stat) {
1260 		return 0;
1261 	}
1262 
1263 	if (group->end_ticks != 0) {
1264 		group->spin_ticks += (group->end_ticks - group->start_ticks);
1265 		group->end_ticks = 0;
1266 	}
1267 
1268 	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
1269 	group->start_ticks = 0;
1270 	group->spin_ticks = 0;
1271 
1272 	return spin_time;
1273 }
1274 
1275 static const struct spdk_bdev_fn_table nvmelib_fn_table = {
1276 	.destruct		= bdev_nvme_destruct,
1277 	.submit_request		= bdev_nvme_submit_request,
1278 	.io_type_supported	= bdev_nvme_io_type_supported,
1279 	.get_io_channel		= bdev_nvme_get_io_channel,
1280 	.dump_info_json		= bdev_nvme_dump_info_json,
1281 	.write_config_json	= bdev_nvme_write_config_json,
1282 	.get_spin_time		= bdev_nvme_get_spin_time,
1283 	.get_module_ctx		= bdev_nvme_get_module_ctx,
1284 };
1285 
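/*
 * Fill in and register an spdk_bdev for a namespace: product name and zoned parameters
 * according to the CSI, geometry, UUID, metadata/DIF settings, and atomic write unit.
 */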
1286 static int
1287 nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
1288 		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
1289 		 uint32_t prchk_flags, void *ctx)
1290 {
1291 	const struct spdk_uuid		*uuid;
1292 	const struct spdk_nvme_ctrlr_data *cdata;
1293 	const struct spdk_nvme_ns_data	*nsdata;
1294 	int				rc;
1295 	enum spdk_nvme_csi		csi;
1296 
1297 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
1298 	csi = spdk_nvme_ns_get_csi(ns);
1299 
1300 	switch (csi) {
1301 	case SPDK_NVME_CSI_NVM:
1302 		disk->product_name = "NVMe disk";
1303 		break;
1304 	case SPDK_NVME_CSI_ZNS:
1305 		disk->product_name = "NVMe ZNS disk";
1306 		disk->zoned = true;
1307 		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
1308 		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
1309 					     spdk_nvme_ns_get_extended_sector_size(ns);
1310 		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
1311 		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
1312 		break;
1313 	default:
1314 		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
1315 		return -ENOTSUP;
1316 	}
1317 
1318 	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
1319 	if (!disk->name) {
1320 		return -ENOMEM;
1321 	}
1322 
1323 	disk->write_cache = 0;
1324 	if (cdata->vwc.present) {
1325 		/* Enable if the Volatile Write Cache exists */
1326 		disk->write_cache = 1;
1327 	}
1328 	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
1329 	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
1330 	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
1331 
1332 	uuid = spdk_nvme_ns_get_uuid(ns);
1333 	if (uuid != NULL) {
1334 		disk->uuid = *uuid;
1335 	}
1336 
1337 	nsdata = spdk_nvme_ns_get_data(ns);
1338 
1339 	disk->md_len = spdk_nvme_ns_get_md_size(ns);
1340 	if (disk->md_len != 0) {
1341 		disk->md_interleave = nsdata->flbas.extended;
1342 		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
1343 		if (disk->dif_type != SPDK_DIF_DISABLE) {
1344 			disk->dif_is_head_of_md = nsdata->dps.md_start;
1345 			disk->dif_check_flags = prchk_flags;
1346 		}
1347 	}
1348 
1349 	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
1350 	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
1351 		disk->acwu = 0;
1352 	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
1353 		disk->acwu = nsdata->nacwu;
1354 	} else {
1355 		disk->acwu = cdata->acwu;
1356 	}
1357 
1358 	disk->ctxt = ctx;
1359 	disk->fn_table = &nvmelib_fn_table;
1360 	disk->module = &nvme_if;
1361 	rc = spdk_bdev_register(disk);
1362 	if (rc) {
1363 		SPDK_ERRLOG("spdk_bdev_register() failed\n");
1364 		free(disk->name);
1365 		return rc;
1366 	}
1367 
1368 	return 0;
1369 }
1370 
1371 static int
1372 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns)
1373 {
1374 	struct nvme_bdev *bdev;
1375 	int rc;
1376 
1377 	bdev = calloc(1, sizeof(*bdev));
1378 	if (!bdev) {
1379 		SPDK_ERRLOG("bdev calloc() failed\n");
1380 		return -ENOMEM;
1381 	}
1382 
1383 	bdev->nvme_ns = nvme_ns;
1384 	bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL;
1385 
1386 	rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr,
1387 			      nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev);
1388 	if (rc != 0) {
1389 		SPDK_ERRLOG("Failed to create NVMe disk\n");
1390 		free(bdev);
1391 		return rc;
1392 	}
1393 
1394 	nvme_ns->bdev = bdev;
1395 
1396 	return 0;
1397 }
1398 
1399 static void
1400 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1401 				       struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
1402 {
1403 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1404 	struct spdk_nvme_ns	*ns;
1405 	int			rc = 0;
1406 
1407 	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
1408 	if (!ns) {
1409 		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
1410 		rc = -EINVAL;
1411 		goto done;
1412 	}
1413 
1414 	nvme_ns->ns = ns;
1415 	nvme_ns->populated = true;
1416 
1417 	rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns);
1418 done:
1419 	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
1420 }
1421 
1422 static bool
1423 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1424 		 struct spdk_nvme_ctrlr_opts *opts)
1425 {
1426 	struct nvme_probe_skip_entry *entry;
1427 
1428 	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
1429 		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
1430 			return false;
1431 		}
1432 	}
1433 
1434 	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
1435 	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
1436 	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
1437 	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
1438 
1439 	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
1440 
1441 	return true;
1442 }
1443 
1444 static void
1445 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
1446 {
1447 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx;
1448 
1449 	if (spdk_nvme_cpl_is_error(cpl)) {
1450 		SPDK_WARNLOG("Abort failed. Resetting controller.\n");
1451 		_bdev_nvme_reset(nvme_bdev_ctrlr);
1452 	}
1453 }
1454 
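/*
 * Command timeout handler: reset immediately if Controller Fatal Status is detected, otherwise
 * abort the command or reset the controller according to g_opts.action_on_timeout.
 */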
1455 static void
1456 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
1457 	   struct spdk_nvme_qpair *qpair, uint16_t cid)
1458 {
1459 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg;
1460 	union spdk_nvme_csts_register csts;
1461 	int rc;
1462 
1463 	assert(nvme_bdev_ctrlr->ctrlr == ctrlr);
1464 
1465 	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
1466 
1467 	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
1468 	 * queue.  (Note: qpair == NULL when there's an admin cmd timeout.)  Otherwise we
1469 	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
1470 	 * completion recursively.
1471 	 */
1472 	if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
1473 		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
1474 		if (csts.bits.cfs) {
1475 			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
1476 			_bdev_nvme_reset(nvme_bdev_ctrlr);
1477 			return;
1478 		}
1479 	}
1480 
1481 	switch (g_opts.action_on_timeout) {
1482 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
1483 		if (qpair) {
1484 			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
1485 						       nvme_abort_cpl, nvme_bdev_ctrlr);
1486 			if (rc == 0) {
1487 				return;
1488 			}
1489 
1490 			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
1491 		}
1492 
1493 	/* FALLTHROUGH */
1494 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
1495 		_bdev_nvme_reset(nvme_bdev_ctrlr);
1496 		break;
1497 	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
1498 		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
1499 		break;
1500 	default:
1501 		SPDK_ERRLOG("An invalid timeout action value was found.\n");
1502 		break;
1503 	}
1504 }
1505 
1506 static void
1507 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns)
1508 {
1509 	struct nvme_bdev *bdev;
1510 
1511 	bdev = nvme_ns->bdev;
1512 	if (bdev != NULL) {
1513 		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
1514 	}
1515 
1516 	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
1517 }
1518 
1519 static void
1520 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns,
1521 			      struct nvme_async_probe_ctx *ctx)
1522 {
1523 	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
1524 }
1525 
1526 static void
1527 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns)
1528 {
1529 	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
1530 }
1531 
1532 void
1533 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
1534 				   struct nvme_bdev_ns *nvme_ns, int rc)
1535 {
1536 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_ns->ctrlr;
1537 
1538 	assert(nvme_bdev_ctrlr != NULL);
1539 
1540 	if (rc == 0) {
1541 		pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1542 		nvme_bdev_ctrlr->ref++;
1543 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1544 	} else {
1545 		memset(nvme_ns, 0, sizeof(*nvme_ns));
1546 	}
1547 
1548 	if (ctx) {
1549 		ctx->populates_in_progress--;
1550 		if (ctx->populates_in_progress == 0) {
1551 			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
1552 		}
1553 	}
1554 }
1555 
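/*
 * Walk every namespace ID on the controller: resize bdevs whose capacity changed, populate
 * namespaces that became active, and depopulate namespaces that went away.
 */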
1556 static void
1557 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
1558 			       struct nvme_async_probe_ctx *ctx)
1559 {
1560 	struct spdk_nvme_ctrlr	*ctrlr = nvme_bdev_ctrlr->ctrlr;
1561 	struct nvme_bdev_ns	*nvme_ns;
1562 	struct spdk_nvme_ns	*ns;
1563 	struct nvme_bdev	*bdev;
1564 	uint32_t		i;
1565 	int			rc;
1566 	uint64_t		num_sectors;
1567 	bool			ns_is_active;
1568 
1569 	if (ctx) {
1570 		/* Initialize this count to 1 to handle the populate functions
1571 		 * calling nvme_ctrlr_populate_namespace_done() immediately.
1572 		 */
1573 		ctx->populates_in_progress = 1;
1574 	}
1575 
1576 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1577 		uint32_t	nsid = i + 1;
1578 
1579 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
1580 		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
1581 
1582 		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) {
1583 			/* NS is still there but attributes may have changed */
1584 			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1585 			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
1586 			bdev = nvme_ns->bdev;
1587 			assert(bdev != NULL);
1588 			if (bdev->disk.blockcnt != num_sectors) {
1589 				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
1590 					       nsid,
1591 					       bdev->disk.name,
1592 					       bdev->disk.blockcnt,
1593 					       num_sectors);
1594 				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
1595 				if (rc != 0) {
1596 					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
1597 						    bdev->disk.name, rc);
1598 				}
1599 			}
1600 		}
1601 
1602 		if (!nvme_ns->populated && ns_is_active) {
1603 			nvme_ns->id = nsid;
1604 			nvme_ns->ctrlr = nvme_bdev_ctrlr;
1605 			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
1606 				nvme_ns->type = NVME_BDEV_NS_OCSSD;
1607 			} else {
1608 				nvme_ns->type = NVME_BDEV_NS_STANDARD;
1609 			}
1610 
1611 			nvme_ns->bdev = NULL;
1612 
1613 			if (ctx) {
1614 				ctx->populates_in_progress++;
1615 			}
1616 			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx);
1617 		}
1618 
1619 		if (nvme_ns->populated && !ns_is_active) {
1620 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1621 		}
1622 	}
1623 
1624 	if (ctx) {
1625 		/* Decrement this count now that the loop is over to account
1626 		 * for the one we started with.  If the count is then 0, we
1627 		 * know any populate_namespace functions completed immediately,
1628 		 * so we'll kick the callback here.
1629 		 */
1630 		ctx->populates_in_progress--;
1631 		if (ctx->populates_in_progress == 0) {
1632 			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
1633 		}
1634 	}
1635 
1636 }
1637 
1638 static void
1639 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
1640 {
1641 	uint32_t i;
1642 	struct nvme_bdev_ns *nvme_ns;
1643 
1644 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1645 		uint32_t nsid = i + 1;
1646 
1647 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
1648 		if (nvme_ns->populated) {
1649 			assert(nvme_ns->id == nsid);
1650 			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
1651 		}
1652 	}
1653 }
1654 
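/*
 * AER handler: repopulate namespaces on a namespace-attribute-changed notice and forward
 * OCSSD chunk notifications to the OCSSD layer.
 */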
1655 static void
1656 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
1657 {
1658 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr		= arg;
1659 	union spdk_nvme_async_event_completion	event;
1660 
1661 	if (spdk_nvme_cpl_is_error(cpl)) {
1662 		SPDK_WARNLOG("AER request execution failed\n");
1663 		return;
1664 	}
1665 
1666 	event.raw = cpl->cdw0;
1667 	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
1668 	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
1669 		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
1670 	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
1671 		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
1672 		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1673 		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
1674 	}
1675 }
1676 
1677 static void
1678 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
1679 {
1680 	if (ctx->cb_fn) {
1681 		ctx->cb_fn(ctx->cb_ctx, count, rc);
1682 	}
1683 
1684 	ctx->namespaces_populated = true;
1685 	if (ctx->probe_done) {
1686 		/* The probe was already completed, so we need to free the context
1687 		 * here.  This can happen for cases like OCSSD, where we need to
1688 		 * send additional commands to the SSD after attach.
1689 		 */
1690 		free(ctx);
1691 	}
1692 }
1693 
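/*
 * Allocate and initialize an nvme_bdev_ctrlr for a newly attached controller: set up the
 * namespace array and initial trid, register it as an io_device, start the admin queue poller,
 * and hook up the timeout, AER, and remove callbacks (plus Opal when security send/receive
 * is supported).
 */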
1694 static int
1695 _nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
1696 			const char *name,
1697 			const struct spdk_nvme_transport_id *trid,
1698 			uint32_t prchk_flags,
1699 			struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr)
1700 {
1701 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
1702 	struct nvme_bdev_ctrlr_trid *trid_entry;
1703 	uint32_t i;
1704 	int rc;
1705 
1706 	nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
1707 	if (nvme_bdev_ctrlr == NULL) {
1708 		SPDK_ERRLOG("Failed to allocate device struct\n");
1709 		return -ENOMEM;
1710 	}
1711 
1712 	rc = pthread_mutex_init(&nvme_bdev_ctrlr->mutex, NULL);
1713 	if (rc != 0) {
1714 		goto err_init_mutex;
1715 	}
1716 
1717 	TAILQ_INIT(&nvme_bdev_ctrlr->trids);
1718 	nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
1719 	if (nvme_bdev_ctrlr->num_ns != 0) {
1720 		nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
1721 		if (!nvme_bdev_ctrlr->namespaces) {
1722 			SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
1723 			rc = -ENOMEM;
1724 			goto err_alloc_namespaces;
1725 		}
1726 	}
1727 
1728 	trid_entry = calloc(1, sizeof(*trid_entry));
1729 	if (trid_entry == NULL) {
1730 		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
1731 		rc = -ENOMEM;
1732 		goto err_alloc_trid;
1733 	}
1734 
1735 	trid_entry->trid = *trid;
1736 
1737 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
1738 		nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
1739 		if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
1740 			SPDK_ERRLOG("Failed to allocate block namespace struct\n");
1741 			rc = -ENOMEM;
1742 			goto err_alloc_namespace;
1743 		}
1744 	}
1745 
1746 	nvme_bdev_ctrlr->thread = spdk_get_thread();
1747 	nvme_bdev_ctrlr->adminq_timer_poller = NULL;
1748 	nvme_bdev_ctrlr->ctrlr = ctrlr;
1749 	nvme_bdev_ctrlr->ref = 1;
1750 	nvme_bdev_ctrlr->connected_trid = &trid_entry->trid;
1751 	nvme_bdev_ctrlr->name = strdup(name);
1752 	if (nvme_bdev_ctrlr->name == NULL) {
1753 		rc = -ENOMEM;
1754 		goto err_alloc_name;
1755 	}
1756 
1757 	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
1758 		rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
1759 		if (spdk_unlikely(rc != 0)) {
1760 			SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
1761 			goto err_init_ocssd;
1762 		}
1763 	}
1764 
1765 	nvme_bdev_ctrlr->prchk_flags = prchk_flags;
1766 
1767 	spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
1768 				sizeof(struct nvme_io_channel),
1769 				name);
1770 
1771 	nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr,
1772 					       g_opts.nvme_adminq_poll_period_us);
1773 
1774 	TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
1775 
1776 	if (g_opts.timeout_us > 0) {
1777 		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
1778 				timeout_cb, nvme_bdev_ctrlr);
1779 	}
1780 
1781 	spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
1782 	spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr);
1783 
1784 	if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
1785 	    SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
1786 		nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
1787 		if (nvme_bdev_ctrlr->opal_dev == NULL) {
1788 			SPDK_ERRLOG("Failed to initialize Opal\n");
1789 		}
1790 	}
1791 
1792 	TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link);
1793 
1794 	if (_nvme_bdev_ctrlr != NULL) {
1795 		*_nvme_bdev_ctrlr = nvme_bdev_ctrlr;
1796 	}
1797 	return 0;
1798 
1799 err_init_ocssd:
1800 	free(nvme_bdev_ctrlr->name);
1801 err_alloc_name:
1802 err_alloc_namespace:
1803 	for (; i > 0; i--) {
1804 		free(nvme_bdev_ctrlr->namespaces[i - 1]);
1805 	}
1806 	free(trid_entry);
1807 err_alloc_trid:
1808 	free(nvme_bdev_ctrlr->namespaces);
1809 err_alloc_namespaces:
1810 	pthread_mutex_destroy(&nvme_bdev_ctrlr->mutex);
1811 err_init_mutex:
1812 	free(nvme_bdev_ctrlr);
1813 	return rc;
1814 }
1815 
1816 static void
1817 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
1818 		       const char *name,
1819 		       const struct spdk_nvme_transport_id *trid,
1820 		       uint32_t prchk_flags,
1821 		       struct nvme_async_probe_ctx *ctx)
1822 {
1823 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL;
1824 	int rc;
1825 
1826 	rc = _nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, &nvme_bdev_ctrlr);
1827 	if (rc != 0) {
1828 		SPDK_ERRLOG("Failed to create new NVMe controller\n");
1829 		goto err;
1830 	}
1831 
1832 	nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
1833 	return;
1834 
1835 err:
1836 	if (ctx != NULL) {
1837 		populate_namespaces_cb(ctx, 0, rc);
1838 	}
1839 }
1840 
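/* Probe attach callback. Controllers listed in the probe context use the
 * configured name and PI check flags; hot-inserted controllers get an
 * auto-generated "HotInNvme<N>" name.
 */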
1841 static void
1842 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1843 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1844 {
1845 	struct nvme_probe_ctx *ctx = cb_ctx;
1846 	char *name = NULL;
1847 	uint32_t prchk_flags = 0;
1848 	size_t i;
1849 
1850 	if (ctx) {
1851 		for (i = 0; i < ctx->count; i++) {
1852 			if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
1853 				prchk_flags = ctx->prchk_flags[i];
1854 				name = strdup(ctx->names[i]);
1855 				break;
1856 			}
1857 		}
1858 	} else {
1859 		name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
1860 	}
1861 	if (!name) {
1862 		SPDK_ERRLOG("Failed to assign name to NVMe device\n");
1863 		return;
1864 	}
1865 
1866 	SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
1867 
1868 	nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL);
1869 
1870 	free(name);
1871 }
1872 
1873 static void
1874 _nvme_bdev_ctrlr_destruct(void *ctx)
1875 {
1876 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx;
1877 
1878 	nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr);
1879 	nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1880 }
1881 
1882 static int
1883 _bdev_nvme_delete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool hotplug)
1884 {
1885 	struct nvme_probe_skip_entry *entry;
1886 
1887 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
1888 
1889 	/* The controller's destruction was already started */
1890 	if (nvme_bdev_ctrlr->destruct) {
1891 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1892 		return 0;
1893 	}
1894 
1895 	if (!hotplug &&
1896 	    nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
1897 		entry = calloc(1, sizeof(*entry));
1898 		if (!entry) {
1899 			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1900 			return -ENOMEM;
1901 		}
1902 		entry->trid = *nvme_bdev_ctrlr->connected_trid;
1903 		TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
1904 	}
1905 
1906 	nvme_bdev_ctrlr->destruct = true;
1907 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
1908 
1909 	_nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
1910 
1911 	return 0;
1912 }
1913 
1914 static void
1915 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
1916 {
1917 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx;
1918 
1919 	_bdev_nvme_delete(nvme_bdev_ctrlr, true);
1920 }
1921 
1922 static int
1923 bdev_nvme_hotplug_probe(void *arg)
1924 {
1925 	if (g_hotplug_probe_ctx == NULL) {
1926 		spdk_poller_unregister(&g_hotplug_probe_poller);
1927 		return SPDK_POLLER_IDLE;
1928 	}
1929 
1930 	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
1931 		g_hotplug_probe_ctx = NULL;
1932 		spdk_poller_unregister(&g_hotplug_probe_poller);
1933 	}
1934 
1935 	return SPDK_POLLER_BUSY;
1936 }
1937 
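/* Hotplug poller: starts an asynchronous PCIe probe (if one is not already in
 * progress) so newly inserted controllers are attached via attach_cb.
 */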
1938 static int
1939 bdev_nvme_hotplug(void *arg)
1940 {
1941 	struct spdk_nvme_transport_id trid_pcie;
1942 
1943 	if (g_hotplug_probe_ctx) {
1944 		return SPDK_POLLER_BUSY;
1945 	}
1946 
1947 	memset(&trid_pcie, 0, sizeof(trid_pcie));
1948 	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
1949 
1950 	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
1951 			      hotplug_probe_cb, attach_cb, NULL);
1952 
1953 	if (g_hotplug_probe_ctx) {
1954 		assert(g_hotplug_probe_poller == NULL);
1955 		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
1956 	}
1957 
1958 	return SPDK_POLLER_BUSY;
1959 }
1960 
1961 void
1962 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
1963 {
1964 	*opts = g_opts;
1965 }
1966 
1967 int
1968 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
1969 {
1970 	if (g_bdev_nvme_init_thread != NULL) {
1971 		if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
1972 			return -EPERM;
1973 		}
1974 	}
1975 
1976 	g_opts = *opts;
1977 
1978 	return 0;
1979 }
1980 
1981 struct set_nvme_hotplug_ctx {
1982 	uint64_t period_us;
1983 	bool enabled;
1984 	spdk_msg_fn fn;
1985 	void *fn_ctx;
1986 };
1987 
1988 static void
1989 set_nvme_hotplug_period_cb(void *_ctx)
1990 {
1991 	struct set_nvme_hotplug_ctx *ctx = _ctx;
1992 
1993 	spdk_poller_unregister(&g_hotplug_poller);
1994 	if (ctx->enabled) {
1995 		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
1996 	}
1997 
1998 	g_nvme_hotplug_poll_period_us = ctx->period_us;
1999 	g_nvme_hotplug_enabled = ctx->enabled;
2000 	if (ctx->fn) {
2001 		ctx->fn(ctx->fn_ctx);
2002 	}
2003 
2004 	free(ctx);
2005 }
2006 
2007 int
2008 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
2009 {
2010 	struct set_nvme_hotplug_ctx *ctx;
2011 
2012 	if (enabled == true && !spdk_process_is_primary()) {
2013 		return -EPERM;
2014 	}
2015 
2016 	ctx = calloc(1, sizeof(*ctx));
2017 	if (ctx == NULL) {
2018 		return -ENOMEM;
2019 	}
2020 
2021 	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
2022 	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
2023 	ctx->enabled = enabled;
2024 	ctx->fn = cb;
2025 	ctx->fn_ctx = cb_ctx;
2026 
2027 	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
2028 	return 0;
2029 }
2030 
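/* Called once all namespaces of a newly attached controller have been
 * populated; reports the names of the created bdevs back to the caller.
 */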
2031 static void
2032 nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2033 				    struct nvme_async_probe_ctx *ctx)
2034 {
2035 	struct nvme_bdev_ns	*nvme_ns;
2036 	struct nvme_bdev	*nvme_bdev;
2037 	uint32_t		i, nsid;
2038 	size_t			j;
2039 
2040 	assert(nvme_bdev_ctrlr != NULL);
2041 
2042 	/*
2043 	 * Report the new bdevs that were created in this call.
2044 	 * There can be more than one bdev per NVMe controller.
2045 	 */
2046 	j = 0;
2047 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
2048 		nsid = i + 1;
2049 		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
2050 		if (!nvme_ns->populated) {
2051 			continue;
2052 		}
2053 		assert(nvme_ns->id == nsid);
2054 		nvme_bdev = nvme_ns->bdev;
2055 		if (nvme_bdev == NULL) {
2056 			assert(nvme_ns->type == NVME_BDEV_NS_OCSSD);
2057 			continue;
2058 		}
2059 		if (j < ctx->count) {
2060 			ctx->names[j] = nvme_bdev->disk.name;
2061 			j++;
2062 		} else {
2063 			SPDK_ERRLOG("The names array can hold at most %u entries; unable to return all names of created bdevs\n",
2064 				    ctx->count);
2065 			populate_namespaces_cb(ctx, 0, -ERANGE);
2066 			return;
2067 		}
2068 	}
2069 
2070 	populate_namespaces_cb(ctx, j, 0);
2071 }
2072 
2073 static int
2074 bdev_nvme_compare_trids(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2075 			struct spdk_nvme_ctrlr *new_ctrlr,
2076 			struct spdk_nvme_transport_id *trid)
2077 {
2078 	struct nvme_bdev_ctrlr_trid *tmp_trid;
2079 
2080 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2081 		SPDK_ERRLOG("PCIe failover is not supported.\n");
2082 		return -ENOTSUP;
2083 	}
2084 
2085 	/* Currently we only support failover to the same transport type. */
2086 	if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) {
2087 		return -EINVAL;
2088 	}
2089 
2090 	/* Currently we only support failover to the same NQN. */
2091 	if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
2092 		return -EINVAL;
2093 	}
2094 
2095 	/* Skip all the other checks if we've already registered this path. */
2096 	TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) {
2097 		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
2098 			return -EEXIST;
2099 		}
2100 	}
2101 
2102 	return 0;
2103 }
2104 
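/* Returns true if the two namespaces have different NGUIDs. */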
2105 static bool
2106 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
2107 {
2108 	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
2109 
2110 	nsdata1 = spdk_nvme_ns_get_data(ns1);
2111 	nsdata2 = spdk_nvme_ns_get_data(ns2);
2112 
2113 	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) != 0;
2114 }
2115 
2116 static int
2117 bdev_nvme_compare_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2118 			     struct spdk_nvme_ctrlr *new_ctrlr)
2119 {
2120 	uint32_t i, nsid;
2121 	struct nvme_bdev_ns *nvme_ns;
2122 	struct spdk_nvme_ns *new_ns;
2123 
2124 	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) {
2125 		return -EINVAL;
2126 	}
2127 
2128 	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
2129 		nsid = i + 1;
2130 
2131 		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
2132 		if (!nvme_ns->populated) {
2133 			continue;
2134 		}
2135 
2136 		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
2137 		assert(new_ns != NULL);
2138 
2139 		if (bdev_nvme_compare_ns(nvme_ns->ns, new_ns) != 0) {
2140 			return -EINVAL;
2141 		}
2142 	}
2143 
2144 	return 0;
2145 }
2146 
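/* Insert the new path before the first failed path, keeping failed paths at
 * the tail of the list so they are tried last on failover.
 */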
2147 static int
2148 _bdev_nvme_add_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2149 			      struct spdk_nvme_transport_id *trid)
2150 {
2151 	struct nvme_bdev_ctrlr_trid *new_trid, *tmp_trid;
2152 
2153 	new_trid = calloc(1, sizeof(*new_trid));
2154 	if (new_trid == NULL) {
2155 		return -ENOMEM;
2156 	}
2157 	new_trid->trid = *trid;
2158 	new_trid->is_failed = false;
2159 
2160 	TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) {
2161 		if (tmp_trid->is_failed) {
2162 			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
2163 			return 0;
2164 		}
2165 	}
2166 
2167 	TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link);
2168 	return 0;
2169 }
2170 
2171 /* This handles the case where a secondary path is added to an existing
2172  * nvme_bdev_ctrlr for failover. After verifying that it can access the same
2173  * namespaces as the primary path, the new path is disconnected until failover occurs.
2174  */
2175 static void
2176 bdev_nvme_add_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2177 			     struct spdk_nvme_ctrlr *new_ctrlr,
2178 			     struct spdk_nvme_transport_id *trid,
2179 			     struct nvme_async_probe_ctx *ctx)
2180 {
2181 	int rc;
2182 
2183 	assert(nvme_bdev_ctrlr != NULL);
2184 
2185 	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
2186 
2187 	rc = bdev_nvme_compare_trids(nvme_bdev_ctrlr, new_ctrlr, trid);
2188 	if (rc != 0) {
2189 		goto exit;
2190 	}
2191 
2192 	rc = bdev_nvme_compare_namespaces(nvme_bdev_ctrlr, new_ctrlr);
2193 	if (rc != 0) {
2194 		goto exit;
2195 	}
2196 
2197 	rc = _bdev_nvme_add_secondary_trid(nvme_bdev_ctrlr, trid);
2198 
2199 exit:
2200 	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2201 
2202 	spdk_nvme_detach(new_ctrlr);
2203 
2204 	if (ctx != NULL) {
2205 		populate_namespaces_cb(ctx, 0, rc);
2206 	}
2207 }
2208 
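/* Attach callback for bdev_nvme_create(). If a controller with the requested
 * name already exists, the new connection is registered as a secondary failover
 * path; otherwise a new nvme_bdev_ctrlr is created.
 */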
2209 static void
2210 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2211 		  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2212 {
2213 	struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
2214 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
2215 	struct nvme_async_probe_ctx *ctx;
2216 
2217 	ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
2218 	ctx->ctrlr_attached = true;
2219 
2220 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name);
2221 	if (nvme_bdev_ctrlr) {
2222 		bdev_nvme_add_secondary_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid, ctx);
2223 		return;
2224 	}
2225 
2226 	nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx);
2227 }
2228 
2229 static int
2230 bdev_nvme_async_poll(void *arg)
2231 {
2232 	struct nvme_async_probe_ctx	*ctx = arg;
2233 	int				rc;
2234 
2235 	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
2236 	if (spdk_unlikely(rc != -EAGAIN)) {
2237 		ctx->probe_done = true;
2238 		spdk_poller_unregister(&ctx->poller);
2239 		if (!ctx->ctrlr_attached) {
2240 			/* The probe is done, but no controller was attached.
2241 			 * That means we had a failure, so report -EIO back to
2242 			 * the caller (usually the RPC). populate_namespaces_cb()
2243 			 * will take care of freeing the nvme_async_probe_ctx.
2244 			 */
2245 			populate_namespaces_cb(ctx, 0, -EIO);
2246 		} else if (ctx->namespaces_populated) {
2247 			/* The namespaces for the attached controller were all
2248 			 * populated and the response was already sent to the
2249 			 * caller (usually the RPC).  So free the context here.
2250 			 */
2251 			free(ctx);
2252 		}
2253 	}
2254 
2255 	return SPDK_POLLER_BUSY;
2256 }
2257 
2258 int
2259 bdev_nvme_create(struct spdk_nvme_transport_id *trid,
2260 		 struct spdk_nvme_host_id *hostid,
2261 		 const char *base_name,
2262 		 const char **names,
2263 		 uint32_t count,
2264 		 const char *hostnqn,
2265 		 uint32_t prchk_flags,
2266 		 spdk_bdev_create_nvme_fn cb_fn,
2267 		 void *cb_ctx,
2268 		 struct spdk_nvme_ctrlr_opts *opts)
2269 {
2270 	struct nvme_probe_skip_entry	*entry, *tmp;
2271 	struct nvme_async_probe_ctx	*ctx;
2272 
2273 	/* TODO expand this check to include both the host and target TRIDs.
2274 	 * Only if both are the same should we fail.
2275 	 */
2276 	if (nvme_bdev_ctrlr_get(trid) != NULL) {
2277 		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
2278 		return -EEXIST;
2279 	}
2280 
2281 	ctx = calloc(1, sizeof(*ctx));
2282 	if (!ctx) {
2283 		return -ENOMEM;
2284 	}
2285 	ctx->base_name = base_name;
2286 	ctx->names = names;
2287 	ctx->count = count;
2288 	ctx->cb_fn = cb_fn;
2289 	ctx->cb_ctx = cb_ctx;
2290 	ctx->prchk_flags = prchk_flags;
2291 	ctx->trid = *trid;
2292 
2293 	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2294 		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
2295 			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
2296 				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2297 				free(entry);
2298 				break;
2299 			}
2300 		}
2301 	}
2302 
2303 	if (opts) {
2304 		memcpy(&ctx->opts, opts, sizeof(*opts));
2305 	} else {
2306 		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
2307 	}
2308 
2309 	ctx->opts.transport_retry_count = g_opts.retry_count;
2310 	ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
2311 
2312 	if (hostnqn) {
2313 		snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
2314 	}
2315 
2316 	if (hostid->hostaddr[0] != '\0') {
2317 		snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
2318 	}
2319 
2320 	if (hostid->hostsvcid[0] != '\0') {
2321 		snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
2322 	}
2323 
2324 	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
2325 	if (ctx->probe_ctx == NULL) {
2326 		SPDK_ERRLOG("No controller was found with the provided trid (traddr: %s)\n", trid->traddr);
2327 		free(ctx);
2328 		return -ENODEV;
2329 	}
2330 	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
2331 
2332 	return 0;
2333 }
2334 
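/* Remove an inactive path. Returns -EBUSY if the path is currently connected
 * and -ENXIO if it is not registered for this controller.
 */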
2335 static int
2336 bdev_nvme_delete_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
2337 				const struct spdk_nvme_transport_id *trid)
2338 {
2339 	struct nvme_bdev_ctrlr_trid	*ctrlr_trid, *tmp_trid;
2340 
2341 	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
2342 		return -EBUSY;
2343 	}
2344 
2345 	TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) {
2346 		if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) {
2347 			TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link);
2348 			free(ctrlr_trid);
2349 			return 0;
2350 		}
2351 	}
2352 
2353 	return -ENXIO;
2354 }
2355 
2356 int
2357 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid)
2358 {
2359 	struct nvme_bdev_ctrlr		*nvme_bdev_ctrlr;
2360 	struct nvme_bdev_ctrlr_trid	*ctrlr_trid;
2361 
2362 	if (name == NULL) {
2363 		return -EINVAL;
2364 	}
2365 
2366 	nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
2367 	if (nvme_bdev_ctrlr == NULL) {
2368 		SPDK_ERRLOG("Failed to find NVMe controller\n");
2369 		return -ENODEV;
2370 	}
2371 
2372 	/* case 1: remove the controller itself. */
2373 	if (trid == NULL) {
2374 		return _bdev_nvme_delete(nvme_bdev_ctrlr, false);
2375 	}
2376 
2377 	/* case 2: we are currently using the path to be removed. */
2378 	if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) {
2379 		ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
2380 		assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid);
2381 		/* case 2A: the current path is the only path. */
2382 		if (!TAILQ_NEXT(ctrlr_trid, link)) {
2383 			return _bdev_nvme_delete(nvme_bdev_ctrlr, false);
2384 		}
2385 
2386 		/* case 2B: there is an alternative path. */
2387 		return bdev_nvme_failover(nvme_bdev_ctrlr, true);
2388 	}
2389 
2390 	/* case 3: We are not using the specified path. */
2391 	return bdev_nvme_delete_secondary_trid(nvme_bdev_ctrlr, trid);
2392 }
2393 
2394 static int
2395 bdev_nvme_library_init(void)
2396 {
2397 	g_bdev_nvme_init_thread = spdk_get_thread();
2398 
2399 	spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
2400 				bdev_nvme_poll_group_destroy_cb,
2401 				sizeof(struct nvme_bdev_poll_group),  "bdev_nvme_poll_groups");
2402 
2403 	return 0;
2404 }
2405 
2406 static void
2407 bdev_nvme_library_fini(void)
2408 {
2409 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
2410 	struct nvme_probe_skip_entry *entry, *entry_tmp;
2411 
2412 	spdk_poller_unregister(&g_hotplug_poller);
2413 	free(g_hotplug_probe_ctx);
2414 	g_hotplug_probe_ctx = NULL;
2415 
2416 	TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
2417 		TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
2418 		free(entry);
2419 	}
2420 
2421 	pthread_mutex_lock(&g_bdev_nvme_mutex);
2422 	TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
2423 		pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
2424 		if (nvme_bdev_ctrlr->destruct) {
2425 			/* This controller's destruction was already started
2426 			 * before the application started shutting down
2427 			 */
2428 			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2429 			continue;
2430 		}
2431 		nvme_bdev_ctrlr->destruct = true;
2432 		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
2433 
2434 		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct,
2435 				     nvme_bdev_ctrlr);
2436 	}
2437 
2438 	g_bdev_nvme_module_finish = true;
2439 	if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
2440 		pthread_mutex_unlock(&g_bdev_nvme_mutex);
2441 		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
2442 		spdk_bdev_module_finish_done();
2443 		return;
2444 	}
2445 
2446 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
2447 }
2448 
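/* Re-check protection information in software after the controller reported a
 * PI error, to locate and log the failing block.
 */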
2449 static void
2450 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
2451 {
2452 	struct spdk_bdev *bdev = bdev_io->bdev;
2453 	struct spdk_dif_ctx dif_ctx;
2454 	struct spdk_dif_error err_blk = {};
2455 	int rc;
2456 
2457 	rc = spdk_dif_ctx_init(&dif_ctx,
2458 			       bdev->blocklen, bdev->md_len, bdev->md_interleave,
2459 			       bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
2460 			       bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
2461 	if (rc != 0) {
2462 		SPDK_ERRLOG("Initialization of DIF context failed\n");
2463 		return;
2464 	}
2465 
2466 	if (bdev->md_interleave) {
2467 		rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2468 				     bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2469 	} else {
2470 		struct iovec md_iov = {
2471 			.iov_base	= bdev_io->u.bdev.md_buf,
2472 			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
2473 		};
2474 
2475 		rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
2476 				     &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
2477 	}
2478 
2479 	if (rc != 0) {
2480 		SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
2481 			    err_blk.err_type, err_blk.err_offset);
2482 	} else {
2483 		SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
2484 	}
2485 }
2486 
2487 static void
2488 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2489 {
2490 	struct nvme_bdev_io *bio = ref;
2491 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2492 
2493 	if (spdk_nvme_cpl_is_success(cpl)) {
2494 		/* Run PI verification for read data buffer. */
2495 		bdev_nvme_verify_pi_error(bdev_io);
2496 	}
2497 
2498 	/* Return original completion status */
2499 	spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
2500 					  bio->cpl.status.sc);
2501 }
2502 
2503 static void
2504 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2505 {
2506 	struct nvme_bdev_io *bio = ref;
2507 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2508 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2509 	struct nvme_io_channel *nvme_ch;
2510 	struct nvme_bdev_ns *nvme_ns;
2511 	struct spdk_nvme_qpair *qpair;
2512 	int ret;
2513 
2514 	if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
2515 		SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
2516 			    cpl->status.sct, cpl->status.sc);
2517 
2518 		/* Save completion status to use after verifying PI error. */
2519 		bio->cpl = *cpl;
2520 
2521 		nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
2522 
2523 		if (spdk_likely(bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
2524 			/* Read without PI checking to verify PI error. */
2525 			ret = bdev_nvme_no_pi_readv(nvme_ns->ns,
2526 						    qpair,
2527 						    bio,
2528 						    bdev_io->u.bdev.iovs,
2529 						    bdev_io->u.bdev.iovcnt,
2530 						    bdev_io->u.bdev.md_buf,
2531 						    bdev_io->u.bdev.num_blocks,
2532 						    bdev_io->u.bdev.offset_blocks);
2533 			if (ret == 0) {
2534 				return;
2535 			}
2536 		}
2537 	}
2538 
2539 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2540 }
2541 
2542 static void
2543 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2544 {
2545 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2546 
2547 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2548 		SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
2549 			    cpl->status.sct, cpl->status.sc);
2550 		/* Run PI verification for write data buffer if PI error is detected. */
2551 		bdev_nvme_verify_pi_error(bdev_io);
2552 	}
2553 
2554 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2555 }
2556 
2557 static void
2558 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
2559 {
2560 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2561 
2562 	/* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
2563 	 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
2564 	 */
2565 	bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
2566 
2567 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2568 		SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
2569 			    cpl->status.sct, cpl->status.sc);
2570 		/* Run PI verification for zone append data buffer if PI error is detected. */
2571 		bdev_nvme_verify_pi_error(bdev_io);
2572 	}
2573 
2574 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2575 }
2576 
2577 static void
2578 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2579 {
2580 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2581 
2582 	if (spdk_nvme_cpl_is_pi_error(cpl)) {
2583 		SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
2584 			    cpl->status.sct, cpl->status.sc);
2585 		/* Run PI verification for compare data buffer if PI error is detected. */
2586 		bdev_nvme_verify_pi_error(bdev_io);
2587 	}
2588 
2589 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2590 }
2591 
2592 static void
2593 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
2594 {
2595 	struct nvme_bdev_io *bio = ref;
2596 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2597 
2598 	/* Compare operation completion */
2599 	if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
2600 		/* Save compare result for write callback */
2601 		bio->cpl = *cpl;
2602 		return;
2603 	}
2604 
2605 	/* Write operation completion */
2606 	if (spdk_nvme_cpl_is_error(&bio->cpl)) {
2607 		/* If bio->cpl is already an error, it means the compare operation failed.  In that case,
2608 		 * complete the IO with the compare operation's status.
2609 		 */
2610 		if (!spdk_nvme_cpl_is_error(cpl)) {
2611 			SPDK_ERRLOG("Unexpected write success after compare failure.\n");
2612 		}
2613 
2614 		spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2615 	} else {
2616 		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2617 	}
2618 }
2619 
2620 static void
2621 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
2622 {
2623 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2624 
2625 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2626 }
2627 
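/* Translate an NVMe ZNS zone descriptor into the generic bdev zone info. */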
2628 static int
2629 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
2630 {
2631 	switch (desc->zs) {
2632 	case SPDK_NVME_ZONE_STATE_EMPTY:
2633 		info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
2634 		break;
2635 	case SPDK_NVME_ZONE_STATE_IOPEN:
2636 		info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
2637 		break;
2638 	case SPDK_NVME_ZONE_STATE_EOPEN:
2639 		info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
2640 		break;
2641 	case SPDK_NVME_ZONE_STATE_CLOSED:
2642 		info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
2643 		break;
2644 	case SPDK_NVME_ZONE_STATE_RONLY:
2645 		info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
2646 		break;
2647 	case SPDK_NVME_ZONE_STATE_FULL:
2648 		info->state = SPDK_BDEV_ZONE_STATE_FULL;
2649 		break;
2650 	case SPDK_NVME_ZONE_STATE_OFFLINE:
2651 		info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
2652 		break;
2653 	default:
2654 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
2655 		return -EIO;
2656 	}
2657 
2658 	info->zone_id = desc->zslba;
2659 	info->write_pointer = desc->wp;
2660 	info->capacity = desc->zcap;
2661 
2662 	return 0;
2663 }
2664 
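/* Completion callback for Report Zones. Copies the returned descriptors into
 * the caller's zone info array and issues follow-up Report Zones commands until
 * the requested number of zones has been handled.
 */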
2665 static void
2666 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
2667 {
2668 	struct nvme_bdev_io *bio = ref;
2669 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2670 	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
2671 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
2672 	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
2673 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
2674 	uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
2675 	struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
2676 	enum spdk_bdev_io_status status;
2677 	uint64_t max_zones_per_buf, i;
2678 	uint32_t zone_report_bufsize;
2679 	struct nvme_bdev_ns *nvme_ns;
2680 	struct spdk_nvme_qpair *qpair;
2681 	int ret;
2682 
2683 	if (spdk_nvme_cpl_is_error(cpl)) {
2684 		goto out_complete_io_nvme_cpl;
2685 	}
2686 
2687 	if (!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair)) {
2688 		status = SPDK_BDEV_IO_STATUS_FAILED;
2689 		goto out_complete_io_status;
2690 	}
2691 
2692 	zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(nvme_ns->ns);
2693 	max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
2694 			    sizeof(bio->zone_report_buf->descs[0]);
2695 
2696 	if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
2697 		status = SPDK_BDEV_IO_STATUS_FAILED;
2698 		goto out_complete_io_status;
2699 	}
2700 
2701 	if (!bio->zone_report_buf->nr_zones) {
2702 		status = SPDK_BDEV_IO_STATUS_FAILED;
2703 		goto out_complete_io_status;
2704 	}
2705 
2706 	for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
2707 		ret = fill_zone_from_report(&info[bio->handled_zones],
2708 					    &bio->zone_report_buf->descs[i]);
2709 		if (ret) {
2710 			status = SPDK_BDEV_IO_STATUS_FAILED;
2711 			goto out_complete_io_status;
2712 		}
2713 		bio->handled_zones++;
2714 	}
2715 
2716 	if (bio->handled_zones < zones_to_copy) {
2717 		uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(nvme_ns->ns);
2718 		uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
2719 
2720 		memset(bio->zone_report_buf, 0, zone_report_bufsize);
2721 		ret = spdk_nvme_zns_report_zones(nvme_ns->ns, qpair,
2722 						 bio->zone_report_buf, zone_report_bufsize,
2723 						 slba, SPDK_NVME_ZRA_LIST_ALL, true,
2724 						 bdev_nvme_get_zone_info_done, bio);
2725 		if (!ret) {
2726 			return;
2727 		} else if (ret == -ENOMEM) {
2728 			status = SPDK_BDEV_IO_STATUS_NOMEM;
2729 			goto out_complete_io_status;
2730 		} else {
2731 			status = SPDK_BDEV_IO_STATUS_FAILED;
2732 			goto out_complete_io_status;
2733 		}
2734 	}
2735 
2736 out_complete_io_nvme_cpl:
2737 	free(bio->zone_report_buf);
2738 	bio->zone_report_buf = NULL;
2739 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2740 	return;
2741 
2742 out_complete_io_status:
2743 	free(bio->zone_report_buf);
2744 	bio->zone_report_buf = NULL;
2745 	spdk_bdev_io_complete(bdev_io, status);
2746 }
2747 
2748 static void
2749 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
2750 {
2751 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
2752 
2753 	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
2754 }
2755 
2756 static void
2757 bdev_nvme_admin_passthru_completion(void *ctx)
2758 {
2759 	struct nvme_bdev_io *bio = ctx;
2760 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2761 
2762 	spdk_bdev_io_complete_nvme_status(bdev_io,
2763 					  bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
2764 }
2765 
2766 static void
2767 bdev_nvme_abort_completion(void *ctx)
2768 {
2769 	struct nvme_bdev_io *bio = ctx;
2770 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
2771 
2772 	if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
2773 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2774 	} else {
2775 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2776 	}
2777 }
2778 
2779 static void
2780 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
2781 {
2782 	struct nvme_bdev_io *bio = ref;
2783 
2784 	bio->cpl = *cpl;
2785 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
2786 }
2787 
2788 static void
2789 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
2790 {
2791 	struct nvme_bdev_io *bio = ref;
2792 
2793 	bio->cpl = *cpl;
2794 	spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
2795 }
2796 
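/* reset_sgl/next_sge callbacks used by the NVMe driver to walk the bdev iovec
 * list when building scattered payloads.
 */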
2797 static void
2798 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
2799 {
2800 	struct nvme_bdev_io *bio = ref;
2801 	struct iovec *iov;
2802 
2803 	bio->iov_offset = sgl_offset;
2804 	for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
2805 		iov = &bio->iovs[bio->iovpos];
2806 		if (bio->iov_offset < iov->iov_len) {
2807 			break;
2808 		}
2809 
2810 		bio->iov_offset -= iov->iov_len;
2811 	}
2812 }
2813 
2814 static int
2815 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
2816 {
2817 	struct nvme_bdev_io *bio = ref;
2818 	struct iovec *iov;
2819 
2820 	assert(bio->iovpos < bio->iovcnt);
2821 
2822 	iov = &bio->iovs[bio->iovpos];
2823 
2824 	*address = iov->iov_base;
2825 	*length = iov->iov_len;
2826 
2827 	if (bio->iov_offset) {
2828 		assert(bio->iov_offset <= iov->iov_len);
2829 		*address += bio->iov_offset;
2830 		*length -= bio->iov_offset;
2831 	}
2832 
2833 	bio->iov_offset += *length;
2834 	if (bio->iov_offset == iov->iov_len) {
2835 		bio->iovpos++;
2836 		bio->iov_offset = 0;
2837 	}
2838 
2839 	return 0;
2840 }
2841 
2842 static void
2843 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
2844 {
2845 	struct nvme_bdev_io *bio = ref;
2846 	struct iovec *iov;
2847 
2848 	bio->fused_iov_offset = sgl_offset;
2849 	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
2850 		iov = &bio->fused_iovs[bio->fused_iovpos];
2851 		if (bio->fused_iov_offset < iov->iov_len) {
2852 			break;
2853 		}
2854 
2855 		bio->fused_iov_offset -= iov->iov_len;
2856 	}
2857 }
2858 
2859 static int
2860 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
2861 {
2862 	struct nvme_bdev_io *bio = ref;
2863 	struct iovec *iov;
2864 
2865 	assert(bio->fused_iovpos < bio->fused_iovcnt);
2866 
2867 	iov = &bio->fused_iovs[bio->fused_iovpos];
2868 
2869 	*address = iov->iov_base;
2870 	*length = iov->iov_len;
2871 
2872 	if (bio->fused_iov_offset) {
2873 		assert(bio->fused_iov_offset <= iov->iov_len);
2874 		*address += bio->fused_iov_offset;
2875 		*length -= bio->fused_iov_offset;
2876 	}
2877 
2878 	bio->fused_iov_offset += *length;
2879 	if (bio->fused_iov_offset == iov->iov_len) {
2880 		bio->fused_iovpos++;
2881 		bio->fused_iov_offset = 0;
2882 	}
2883 
2884 	return 0;
2885 }
2886 
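/* Re-read the blocks without PI checking so that a reported PI error can be
 * verified in software (see bdev_nvme_readv_done()).
 */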
2887 static int
2888 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2889 		      struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2890 		      void *md, uint64_t lba_count, uint64_t lba)
2891 {
2892 	int rc;
2893 
2894 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
2895 		      lba_count, lba);
2896 
2897 	bio->iovs = iov;
2898 	bio->iovcnt = iovcnt;
2899 	bio->iovpos = 0;
2900 	bio->iov_offset = 0;
2901 
2902 	rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2903 					    bdev_nvme_no_pi_readv_done, bio, 0,
2904 					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2905 					    md, 0, 0);
2906 
2907 	if (rc != 0 && rc != -ENOMEM) {
2908 		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
2909 	}
2910 	return rc;
2911 }
2912 
2913 static int
2914 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2915 		struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
2916 		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
2917 {
2918 	int rc;
2919 
2920 	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2921 		      lba_count, lba);
2922 
2923 	bio->iovs = iov;
2924 	bio->iovcnt = iovcnt;
2925 	bio->iovpos = 0;
2926 	bio->iov_offset = 0;
2927 
2928 	if (iovcnt == 1) {
2929 		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
2930 						   lba_count,
2931 						   bdev_nvme_readv_done, bio,
2932 						   flags,
2933 						   0, 0);
2934 	} else {
2935 		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
2936 						    bdev_nvme_readv_done, bio, flags,
2937 						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2938 						    md, 0, 0);
2939 	}
2940 
2941 	if (rc != 0 && rc != -ENOMEM) {
2942 		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
2943 	}
2944 	return rc;
2945 }
2946 
2947 static int
2948 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2949 		 struct nvme_bdev_io *bio,
2950 		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
2951 		 uint32_t flags)
2952 {
2953 	int rc;
2954 
2955 	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
2956 		      lba_count, lba);
2957 
2958 	bio->iovs = iov;
2959 	bio->iovcnt = iovcnt;
2960 	bio->iovpos = 0;
2961 	bio->iov_offset = 0;
2962 
2963 	if (iovcnt == 1) {
2964 		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
2965 						    lba_count,
2966 						    bdev_nvme_writev_done, bio,
2967 						    flags,
2968 						    0, 0);
2969 	} else {
2970 		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
2971 						     bdev_nvme_writev_done, bio, flags,
2972 						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
2973 						     md, 0, 0);
2974 	}
2975 
2976 	if (rc != 0 && rc != -ENOMEM) {
2977 		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
2978 	}
2979 	return rc;
2980 }
2981 
2982 static int
2983 bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
2984 		       struct nvme_bdev_io *bio,
2985 		       struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t zslba,
2986 		       uint32_t flags)
2987 {
2988 	int rc;
2989 
2990 	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
2991 		      lba_count, zslba);
2992 
2993 	bio->iovs = iov;
2994 	bio->iovcnt = iovcnt;
2995 	bio->iovpos = 0;
2996 	bio->iov_offset = 0;
2997 
2998 	if (iovcnt == 1) {
2999 		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
3000 						       lba_count,
3001 						       bdev_nvme_zone_appendv_done, bio,
3002 						       flags,
3003 						       0, 0);
3004 	} else {
3005 		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
3006 							bdev_nvme_zone_appendv_done, bio, flags,
3007 							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3008 							md, 0, 0);
3009 	}
3010 
3011 	if (rc != 0 && rc != -ENOMEM) {
3012 		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
3013 	}
3014 	return rc;
3015 }
3016 
3017 static int
3018 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3019 		   struct nvme_bdev_io *bio,
3020 		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
3021 		   uint32_t flags)
3022 {
3023 	int rc;
3024 
3025 	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3026 		      lba_count, lba);
3027 
3028 	bio->iovs = iov;
3029 	bio->iovcnt = iovcnt;
3030 	bio->iovpos = 0;
3031 	bio->iov_offset = 0;
3032 
3033 	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
3034 					       bdev_nvme_comparev_done, bio, flags,
3035 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
3036 					       md, 0, 0);
3037 
3038 	if (rc != 0 && rc != -ENOMEM) {
3039 		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
3040 	}
3041 	return rc;
3042 }
3043 
3044 static int
3045 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3046 			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
3047 			      struct iovec *write_iov, int write_iovcnt,
3048 			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
3049 {
3050 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3051 	int rc;
3052 
3053 	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
3054 		      lba_count, lba);
3055 
3056 	bio->iovs = cmp_iov;
3057 	bio->iovcnt = cmp_iovcnt;
3058 	bio->iovpos = 0;
3059 	bio->iov_offset = 0;
3060 	bio->fused_iovs = write_iov;
3061 	bio->fused_iovcnt = write_iovcnt;
3062 	bio->fused_iovpos = 0;
3063 	bio->fused_iov_offset = 0;
3064 
3065 	if (bdev_io->num_retries == 0) {
3066 		bio->first_fused_submitted = false;
3067 	}
3068 
3069 	if (!bio->first_fused_submitted) {
3070 		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
3071 		memset(&bio->cpl, 0, sizeof(bio->cpl));
3072 
3073 		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
3074 						       bdev_nvme_comparev_and_writev_done, bio, flags,
3075 						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
3076 		if (rc == 0) {
3077 			bio->first_fused_submitted = true;
3078 			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
3079 		} else {
3080 			if (rc != -ENOMEM) {
3081 				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
3082 			}
3083 			return rc;
3084 		}
3085 	}
3086 
3087 	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
3088 
3089 	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
3090 					     bdev_nvme_comparev_and_writev_done, bio, flags,
3091 					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
3092 	if (rc != 0 && rc != -ENOMEM) {
3093 		SPDK_ERRLOG("write failed: rc = %d\n", rc);
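		/* The preceding compare command was already submitted, so do not
		 * propagate the error for the second (write) command to the caller.
		 */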
3094 		rc = 0;
3095 	}
3096 
3097 	return rc;
3098 }
3099 
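/* Build a Dataset Management (deallocate) command covering the requested block
 * range, splitting it into ranges of at most
 * SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks each.
 */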
3100 static int
3101 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3102 		struct nvme_bdev_io *bio,
3103 		uint64_t offset_blocks,
3104 		uint64_t num_blocks)
3105 {
3106 	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
3107 	struct spdk_nvme_dsm_range *range;
3108 	uint64_t offset, remaining;
3109 	uint64_t num_ranges_u64;
3110 	uint16_t num_ranges;
3111 	int rc;
3112 
3113 	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
3114 			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3115 	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
3116 		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
3117 		return -EINVAL;
3118 	}
3119 	num_ranges = (uint16_t)num_ranges_u64;
3120 
3121 	offset = offset_blocks;
3122 	remaining = num_blocks;
3123 	range = &dsm_ranges[0];
3124 
3125 	/* Fill max-size ranges until the remaining blocks fit into one range */
3126 	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
3127 		range->attributes.raw = 0;
3128 		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3129 		range->starting_lba = offset;
3130 
3131 		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3132 		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
3133 		range++;
3134 	}
3135 
3136 	/* Final range describes the remaining blocks */
3137 	range->attributes.raw = 0;
3138 	range->length = remaining;
3139 	range->starting_lba = offset;
3140 
3141 	rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair,
3142 			SPDK_NVME_DSM_ATTR_DEALLOCATE,
3143 			dsm_ranges, num_ranges,
3144 			bdev_nvme_queued_done, bio);
3145 
3146 	return rc;
3147 }
3148 
3149 static int
3150 bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3151 			struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
3152 			struct spdk_bdev_zone_info *info)
3153 {
3154 	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
3155 	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
3156 	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
3157 
3158 	if (zone_id % zone_size != 0) {
3159 		return -EINVAL;
3160 	}
3161 
3162 	if (num_zones > total_zones || !num_zones) {
3163 		return -EINVAL;
3164 	}
3165 
3166 	assert(!bio->zone_report_buf);
3167 	bio->zone_report_buf = calloc(1, zone_report_bufsize);
3168 	if (!bio->zone_report_buf) {
3169 		return -ENOMEM;
3170 	}
3171 
3172 	bio->handled_zones = 0;
3173 
3174 	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
3175 					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
3176 					  bdev_nvme_get_zone_info_done, bio);
3177 }
3178 
3179 static int
3180 bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3181 			  struct nvme_bdev_io *bio, uint64_t zone_id,
3182 			  enum spdk_bdev_zone_action action)
3183 {
3184 	switch (action) {
3185 	case SPDK_BDEV_ZONE_CLOSE:
3186 		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
3187 						bdev_nvme_zone_management_done, bio);
3188 	case SPDK_BDEV_ZONE_FINISH:
3189 		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
3190 						 bdev_nvme_zone_management_done, bio);
3191 	case SPDK_BDEV_ZONE_OPEN:
3192 		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
3193 					       bdev_nvme_zone_management_done, bio);
3194 	case SPDK_BDEV_ZONE_RESET:
3195 		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
3196 						bdev_nvme_zone_management_done, bio);
3197 	case SPDK_BDEV_ZONE_OFFLINE:
3198 		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
3199 						  bdev_nvme_zone_management_done, bio);
3200 	default:
3201 		return -EINVAL;
3202 	}
3203 }
3204 
3205 static int
3206 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
3207 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
3208 {
3209 	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
3210 	uint32_t max_xfer_size;
3211 
3212 	if (!bdev_nvme_find_admin_path(nvme_ch, &nvme_bdev_ctrlr)) {
3213 		return -EINVAL;
3214 	}
3215 
3216 	max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_bdev_ctrlr->ctrlr);
3217 
3218 	if (nbytes > max_xfer_size) {
3219 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3220 		return -EINVAL;
3221 	}
3222 
3223 	bio->orig_thread = spdk_get_thread();
3224 
3225 	return spdk_nvme_ctrlr_cmd_admin_raw(nvme_bdev_ctrlr->ctrlr, cmd, buf,
3226 					     (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
3227 }
3228 
3229 static int
3230 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3231 		      struct nvme_bdev_io *bio,
3232 		      struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
3233 {
3234 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
3235 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3236 
3237 	if (nbytes > max_xfer_size) {
3238 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3239 		return -EINVAL;
3240 	}
3241 
3242 	/*
3243 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
3244 	 * so fill it out automatically.
3245 	 */
3246 	cmd->nsid = spdk_nvme_ns_get_id(ns);
3247 
3248 	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
3249 					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
3250 }
3251 
3252 static int
3253 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
3254 			 struct nvme_bdev_io *bio,
3255 			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
3256 {
3257 	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
3258 	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
3259 	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3260 
3261 	if (nbytes > max_xfer_size) {
3262 		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
3263 		return -EINVAL;
3264 	}
3265 
3266 	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
3267 		SPDK_ERRLOG("invalid metadata buffer size\n");
3268 		return -EINVAL;
3269 	}
3270 
3271 	/*
3272 	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
3273 	 * so fill it out automatically.
3274 	 */
3275 	cmd->nsid = spdk_nvme_ns_get_id(ns);
3276 
3277 	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
3278 			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
3279 }
3280 
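/* Runs on the controller's thread: try to abort the target command on the
 * admin qpair. If it is not found there either, synthesize a failed-abort
 * completion so the abort request is completed with failure.
 */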
3281 static void
3282 bdev_nvme_abort_admin_cmd(void *ctx)
3283 {
3284 	struct nvme_bdev_io *bio = ctx;
3285 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3286 	struct nvme_io_channel *nvme_ch;
3287 	struct nvme_bdev_io *bio_to_abort;
3288 	int rc;
3289 
3290 	nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
3291 	bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
3292 
3293 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
3294 					   NULL,
3295 					   bio_to_abort,
3296 					   bdev_nvme_abort_done, bio);
3297 	if (rc == -ENOENT) {
3298 		/* If no admin command was found in admin qpair, complete the abort
3299 		 * request with failure.
3300 		 */
3301 		bio->cpl.cdw0 |= 1U;
3302 		bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
3303 		bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
3304 
3305 		spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
3306 	}
3307 }
3308 
3309 static int
3310 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio,
3311 		struct nvme_bdev_io *bio_to_abort)
3312 {
3313 	int rc;
3314 
3315 	bio->orig_thread = spdk_get_thread();
3316 
3317 	rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr,
3318 					   nvme_ch->qpair,
3319 					   bio_to_abort,
3320 					   bdev_nvme_abort_done, bio);
3321 	if (rc == -ENOENT) {
3322 		/* If no matching command was found in the I/O qpair, the target command may
3323 		 * be an admin command. Only a single thread (the controller's thread) attempts
3324 		 * to abort admin commands, to keep the abort flow simple.
3325 		 */
3326 		spdk_thread_send_msg(nvme_ch->ctrlr->thread,
3327 				     bdev_nvme_abort_admin_cmd, bio);
3328 		rc = 0;
3329 	}
3330 
3331 	return rc;
3332 }
3333 
3334 static void
3335 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
3336 		struct nvme_bdev_ns *nvme_ns)
3337 {
3338 	/* nop */
3339 }
3340 
3341 static void
3342 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns)
3343 {
3344 	g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns);
3345 }
3346 
3347 static void
3348 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
3349 {
3350 	const char	*action;
3351 
3352 	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
3353 		action = "reset";
3354 	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
3355 		action = "abort";
3356 	} else {
3357 		action = "none";
3358 	}
3359 
3360 	spdk_json_write_object_begin(w);
3361 
3362 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
3363 
3364 	spdk_json_write_named_object_begin(w, "params");
3365 	spdk_json_write_named_string(w, "action_on_timeout", action);
3366 	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
3367 	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
3368 	spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
3369 	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
3370 	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
3371 	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
3372 	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
3373 	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
3374 	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
3375 	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
3376 	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
3377 	spdk_json_write_object_end(w);
3378 
3379 	spdk_json_write_object_end(w);
3380 }
3381 
3382 static void
3383 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w,
3384 			    struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
3385 {
3386 	struct spdk_nvme_transport_id	*trid;
3387 
3388 	trid = nvme_bdev_ctrlr->connected_trid;
3389 
3390 	spdk_json_write_object_begin(w);
3391 
3392 	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
3393 
3394 	spdk_json_write_named_object_begin(w, "params");
3395 	spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
3396 	nvme_bdev_dump_trid_json(trid, w);
3397 	spdk_json_write_named_bool(w, "prchk_reftag",
3398 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
3399 	spdk_json_write_named_bool(w, "prchk_guard",
3400 				   (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
3401 
3402 	spdk_json_write_object_end(w);
3403 
3404 	spdk_json_write_object_end(w);
3405 }
3406 
3407 static void
3408 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
3409 {
3410 	spdk_json_write_object_begin(w);
3411 	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
3412 
3413 	spdk_json_write_named_object_begin(w, "params");
3414 	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
3415 	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
3416 	spdk_json_write_object_end(w);
3417 
3418 	spdk_json_write_object_end(w);
3419 }
3420 
3421 static int
3422 bdev_nvme_config_json(struct spdk_json_write_ctx *w)
3423 {
3424 	struct nvme_bdev_ctrlr	*nvme_bdev_ctrlr;
3425 	uint32_t		nsid;
3426 
3427 	bdev_nvme_opts_config_json(w);
3428 
3429 	pthread_mutex_lock(&g_bdev_nvme_mutex);
3430 
3431 	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
3432 		nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr);
3433 
3434 		for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
3435 			if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
3436 				continue;
3437 			}
3438 
3439 			nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
3440 		}
3441 	}
3442 
3443 	/* Dump this last to give all NVMe bdevs a chance to be constructed
3444 	 * before the hotplug poller is enabled.
3445 	 */
3446 	bdev_nvme_hotplug_config_json(w);
3447 
3448 	pthread_mutex_unlock(&g_bdev_nvme_mutex);
3449 	return 0;
3450 }
3451 
3452 struct spdk_nvme_ctrlr *
3453 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
3454 {
3455 	if (!bdev || bdev->module != &nvme_if) {
3456 		return NULL;
3457 	}
3458 
3459 	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
3460 }
3461 
3462 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
3463