xref: /spdk/lib/nvmf/vfio_user.c (revision 6c8dc25e138a6c0343094592c07c24c413a934d6)
1 /*-
2  *   BSD LICENSE
3  *   Copyright (c) Intel Corporation. All rights reserved.
4  *   Copyright (c) 2019, Nutanix Inc. All rights reserved.
5  *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * NVMe over vfio-user transport
36  */
37 
38 #include <vfio-user/libvfio-user.h>
39 #include <vfio-user/pci_defs.h>
40 
41 #include "spdk/barrier.h"
42 #include "spdk/stdinc.h"
43 #include "spdk/assert.h"
44 #include "spdk/thread.h"
45 #include "spdk/nvmf_transport.h"
46 #include "spdk/sock.h"
47 #include "spdk/string.h"
48 #include "spdk/util.h"
49 #include "spdk/log.h"
50 
51 #include "transport.h"
52 
53 #include "nvmf_internal.h"
54 
55 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
56 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
57 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
58 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE
59 
60 #define NVME_DOORBELLS_OFFSET	0x1000
61 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000
62 
63 /*
64  * The NVMe driver reads 4096 bytes, which is the size of the extended PCI
65  * configuration space available on PCI-X 2.0 and PCI Express buses.
66  */
67 #define NVME_REG_CFG_SIZE       0x1000
68 #define NVME_REG_BAR0_SIZE      (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE)
69 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8)
70 #define NVME_IRQ_MSIX_NUM	NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR
71 /* MSIX Table Size */
72 #define NVME_BAR4_SIZE		SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000)
73 /* MSIX Pending Bit Array Size */
74 #define NVME_BAR5_SIZE		SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000)
75 
76 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4)
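/*
 * Illustrative arithmetic with the values above: NVMF_VFIO_USER_DOORBELLS_SIZE
 * is 0x1000 and each queue pair consumes 8 bytes of doorbells (4 for the SQ
 * tail, 4 for the CQ head), so NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR is
 * 0x1000 / 8 = 512, NVME_BAR4_SIZE is SPDK_ALIGN_CEIL(512 * 16, 0x1000) = 0x2000,
 * NVME_BAR5_SIZE is SPDK_ALIGN_CEIL(512 / 8, 0x1000) = 0x1000, and the default
 * limit is 512 / 4 = 128 queue pairs per controller.
 */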
77 
78 struct nvmf_vfio_user_req;
79 
80 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);
81 
82 /* 1 more iovec for the PRP2 list itself */
83 #define NVMF_VFIO_USER_MAX_IOVECS	(NVMF_REQ_MAX_BUFFERS + 1)
84 
85 enum nvmf_vfio_user_req_state {
86 	VFIO_USER_REQUEST_STATE_FREE = 0,
87 	VFIO_USER_REQUEST_STATE_EXECUTING,
88 };
89 
90 /* NVMe device state representation */
91 struct nvme_migr_sq_state {
92 	uint16_t	sqid;
93 	uint16_t	cqid;
94 	uint32_t	head;
95 	uint32_t	size;
96 	uint32_t	reserved;
97 	uint64_t	dma_addr;
98 };
99 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size");
100 
101 struct nvme_migr_cq_state {
102 	uint16_t	cqid;
103 	uint16_t	phase;
104 	uint32_t	tail;
105 	uint32_t	size;
106 	uint32_t	iv;
107 	uint32_t	ien;
108 	uint32_t	reserved;
109 	uint64_t	dma_addr;
110 };
111 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size");
112 
113 #define VFIO_USER_NVME_MIGR_MAGIC	0xAFEDBC23
114 
115 /* The device state lives in the VFIO MIGRATION BAR(9) region; keep the device state page aligned.
116  *
117  * The NVMe device migration region is laid out as below:
118  * -------------------------------------------------------------------------
119  * | nvme_migr_device_state | private controller data | queue pairs | BARs |
120  * -------------------------------------------------------------------------
121  *
122  * Keep nvme_migr_device_state at a fixed 0x1000 length; newly added fields
123  * can use the reserved space at the end of the data structure.
124  */
125 struct nvme_migr_device_state {
126 	/* Magic value to validate migration data */
127 	uint32_t	magic;
128 	/* Version to check that the data is the same on source and destination */
129 	uint32_t	version;
130 
131 	/* The library uses this field to know how many fields in this
132 	 * structure are valid, starting at the beginning of this data
133 	 * structure.  Fields added in the future use the `unused` memory
134 	 * space.
135 	 */
136 	uint32_t	opts_size;
137 	uint32_t	reserved0;
138 
139 	/* BARs information */
140 	uint64_t	bar_offset[VFU_PCI_DEV_NUM_REGIONS];
141 	uint64_t	bar_len[VFU_PCI_DEV_NUM_REGIONS];
142 
143 	/* Queue pair start offset, relative to the beginning of this
144 	 * data structure.
145 	 */
146 	uint64_t	qp_offset;
147 	uint64_t	qp_len;
148 
149 	/* Controller data structure */
150 	uint32_t	num_io_queues;
151 	uint32_t	reserved1;
152 
153 	uint16_t	reserved2[3];
154 	uint16_t	nr_aers;
155 	uint16_t	aer_cids[NVMF_MIGR_MAX_PENDING_AERS];
156 
157 	/* Controller private data offset and length, if present, relative to
158 	 * the beginning of this data structure.
159 	 */
160 	uint64_t	private_data_offset;
161 	uint64_t	private_data_len;
162 
163 	/* Reserved memory space for newly added fields; this
164 	 * field is always at the end of this data structure.
165 	 */
166 	uint8_t		unused[3356];
167 };
168 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_device_state) == 0x1000, "Incorrect size");
169 
170 struct vfio_user_nvme_migr_qp {
171 	struct nvme_migr_sq_state	sq;
172 	struct nvme_migr_cq_state	cq;
173 };
174 
175 /* NVMe state definition used temporarily to load/restore from/to NVMe migration BAR region */
176 struct vfio_user_nvme_migr_state {
177 	struct nvme_migr_device_state	ctrlr_data;
178 	struct nvmf_ctrlr_migr_data	private_data;
179 	struct vfio_user_nvme_migr_qp	qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
180 	uint8_t				bar0[NVME_REG_BAR0_SIZE];
181 	uint8_t				cfg[NVME_REG_CFG_SIZE];
182 };
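/*
 * A plausible reading of the layout above, stated as an assumption (the
 * save/restore code itself is not shown here): nvme_migr_device_state sits at
 * the start of the migration region, and bar_offset[]/bar_len[],
 * qp_offset/qp_len and private_data_offset/private_data_len record where the
 * remaining sections live, relative to the beginning of that structure.
 * Because readers only trust the first `opts_size` bytes, fields appended
 * later into `unused` are simply ignored by readers that predate them.
 */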
183 
184 struct nvmf_vfio_user_req  {
185 	struct spdk_nvmf_request		req;
186 	struct spdk_nvme_cpl			rsp;
187 	struct spdk_nvme_cmd			cmd;
188 
189 	enum nvmf_vfio_user_req_state		state;
190 	nvmf_vfio_user_req_cb_fn		cb_fn;
191 	void					*cb_arg;
192 
193 	/* old CC before prop_set_cc fabric command */
194 	union spdk_nvme_cc_register		cc;
195 
196 	TAILQ_ENTRY(nvmf_vfio_user_req)		link;
197 
198 	struct iovec				iov[NVMF_VFIO_USER_MAX_IOVECS];
199 	uint8_t					iovcnt;
200 
201 	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
202 	uint8_t					sg[];
203 };
204 
205 /*
206  * Mapping of an NVMe queue.
207  *
208  * This holds the information tracking a local process mapping of an NVMe queue
209  * shared by the client.
210  */
211 struct nvme_q_mapping {
212 	/* iov of local process mapping. */
213 	struct iovec iov;
214 	/* Stored sg, needed for unmap. */
215 	dma_sg_t *sg;
216 	/* Client PRP of queue. */
217 	uint64_t prp1;
218 };
219 
220 enum nvmf_vfio_user_sq_state {
221 	VFIO_USER_SQ_UNUSED = 0,
222 	VFIO_USER_SQ_CREATED,
223 	VFIO_USER_SQ_DELETED,
224 	VFIO_USER_SQ_ACTIVE,
225 	VFIO_USER_SQ_INACTIVE
226 };
227 
228 enum nvmf_vfio_user_cq_state {
229 	VFIO_USER_CQ_UNUSED = 0,
230 	VFIO_USER_CQ_CREATED,
231 	VFIO_USER_CQ_DELETED,
232 };
233 
234 enum nvmf_vfio_user_ctrlr_state {
235 	VFIO_USER_CTRLR_CREATING = 0,
236 	VFIO_USER_CTRLR_RUNNING,
237 	/* Quiesce requested by libvfio-user */
238 	VFIO_USER_CTRLR_PAUSING,
239 	/* The NVMf subsystem is paused; it's safe to do a PCI reset, memory register,
240 	 * memory unregister, and vfio migration state transition in this state.
241 	 */
242 	VFIO_USER_CTRLR_PAUSED,
243 	/*
244 	 * Implies that the NVMf subsystem is paused. The device will be unquiesced
245 	 * (PCI reset, memory register and unregister, or the controller in the
246 	 * destination VM has been restored), and an NVMf subsystem resume has been requested.
247 	 */
248 	VFIO_USER_CTRLR_RESUMING,
249 	/*
250 	 * Implies that the NVMf subsystem is paused. Both the controller in the
251 	 * source VM and the one in the destination VM are in this state during live migration.
252 	 */
253 	VFIO_USER_CTRLR_MIGRATING
254 };
255 
256 /* Migration region used to record the NVMe device state */
257 struct vfio_user_migration_region {
258 	uint64_t last_data_offset;
259 	uint64_t pending_bytes;
260 };
261 
262 struct nvmf_vfio_user_sq {
263 	struct spdk_nvmf_qpair			qpair;
264 	struct spdk_nvmf_transport_poll_group	*group;
265 	struct nvmf_vfio_user_ctrlr		*ctrlr;
266 
267 	uint32_t				qid;
268 	/* Number of entries in queue. */
269 	uint32_t				size;
270 	struct nvme_q_mapping			mapping;
271 	enum nvmf_vfio_user_sq_state		sq_state;
272 
273 	uint32_t				head;
274 
275 	/* multiple SQs can be mapped to the same CQ */
276 	uint16_t				cqid;
277 
278 	/* handle_queue_connect_rsp() is used both for the CREATE IO SQ response
279 	 * and for the SQ re-connect response in the destination VM. In the former
280 	 * case we post an NVMe completion to the VM; we do not set this flag when
281 	 * re-connecting SQs in the destination VM.
282 	 */
283 	bool					post_create_io_sq_completion;
284 	/* Copy of the Create IO SQ command; this field is used together with the
285 	 * `post_create_io_sq_completion` flag.
286 	 */
287 	struct spdk_nvme_cmd			create_io_sq_cmd;
288 
289 	/* Currently unallocated reqs. */
290 	TAILQ_HEAD(, nvmf_vfio_user_req)	free_reqs;
291 	/* Poll group entry */
292 	TAILQ_ENTRY(nvmf_vfio_user_sq)		link;
293 	/* Connected SQ entry */
294 	TAILQ_ENTRY(nvmf_vfio_user_sq)		tailq;
295 };
296 
297 struct nvmf_vfio_user_cq {
298 	struct spdk_nvmf_transport_poll_group	*group;
299 	struct spdk_thread			*thread;
300 	uint32_t				cq_ref;
301 
302 	uint32_t				qid;
303 	/* Number of entries in queue. */
304 	uint32_t				size;
305 	struct nvme_q_mapping			mapping;
306 	enum nvmf_vfio_user_cq_state		cq_state;
307 
308 	uint32_t				tail;
309 	bool					phase;
310 
311 	uint16_t				iv;
312 	bool					ien;
313 };
314 
315 struct nvmf_vfio_user_poll_group {
316 	struct spdk_nvmf_transport_poll_group	group;
317 	TAILQ_ENTRY(nvmf_vfio_user_poll_group)	link;
318 	TAILQ_HEAD(, nvmf_vfio_user_sq)		sqs;
319 };
320 
321 struct nvmf_vfio_user_ctrlr {
322 	struct nvmf_vfio_user_endpoint		*endpoint;
323 	struct nvmf_vfio_user_transport		*transport;
324 
325 	/* Connected SQs list */
326 	TAILQ_HEAD(, nvmf_vfio_user_sq)		connected_sqs;
327 	enum nvmf_vfio_user_ctrlr_state		state;
328 
329 	struct vfio_user_migration_region	migr_reg;
330 	/* Controller is in source VM when doing live migration */
331 	bool					in_source_vm;
332 
333 	struct spdk_thread			*thread;
334 	struct spdk_poller			*vfu_ctx_poller;
335 
336 	bool					queued_quiesce;
337 
338 	bool					reset_shn;
339 
340 	uint16_t				cntlid;
341 	struct spdk_nvmf_ctrlr			*ctrlr;
342 
343 	struct nvmf_vfio_user_sq		*sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
344 	struct nvmf_vfio_user_cq		*cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
345 
346 	TAILQ_ENTRY(nvmf_vfio_user_ctrlr)	link;
347 
348 	volatile uint32_t			*doorbells;
349 
350 	/* internal CSTS.CFS register for vfio-user fatal errors */
351 	uint32_t				cfs : 1;
352 };
353 
354 struct nvmf_vfio_user_endpoint {
355 	vfu_ctx_t				*vfu_ctx;
356 	struct msixcap				*msix;
357 	vfu_pci_config_space_t			*pci_config_space;
358 	int					devmem_fd;
359 	volatile uint32_t			*doorbells;
360 
361 	int					migr_fd;
362 	void					*migr_data;
363 
364 	struct spdk_nvme_transport_id		trid;
365 	const struct spdk_nvmf_subsystem	*subsystem;
366 
367 	struct nvmf_vfio_user_ctrlr		*ctrlr;
368 	pthread_mutex_t				lock;
369 
370 	bool					need_async_destroy;
371 
372 	TAILQ_ENTRY(nvmf_vfio_user_endpoint)	link;
373 };
374 
375 struct nvmf_vfio_user_transport_opts {
376 	bool					disable_mappable_bar0;
377 };
378 
379 struct nvmf_vfio_user_transport {
380 	struct spdk_nvmf_transport		transport;
381 	struct nvmf_vfio_user_transport_opts    transport_opts;
382 	struct spdk_poller			*accept_poller;
383 	pthread_mutex_t				lock;
384 	TAILQ_HEAD(, nvmf_vfio_user_endpoint)	endpoints;
385 
386 	pthread_mutex_t				pg_lock;
387 	TAILQ_HEAD(, nvmf_vfio_user_poll_group)	poll_groups;
388 	struct nvmf_vfio_user_poll_group	*next_pg;
389 };
390 
391 /*
392  * function prototypes
393  */
394 static int
395 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);
396 
397 static struct nvmf_vfio_user_req *
398 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq);
399 
400 /*
401  * Local process virtual address of a queue.
402  */
403 static inline void *
404 q_addr(struct nvme_q_mapping *mapping)
405 {
406 	return mapping->iov.iov_base;
407 }
408 
409 static inline int
410 queue_index(uint16_t qid, bool is_cq)
411 {
412 	return (qid * 2) + is_cq;
413 }
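/*
 * For illustration only (this helper is an assumption, not part of the
 * transport): with DSTRD fixed at 0 each doorbell register is 4 bytes wide,
 * so queue pair `qid` owns doorbells[2 * qid] (SQ tail) and
 * doorbells[2 * qid + 1] (CQ head), and the corresponding BAR0 byte offset
 * follows directly from queue_index().
 */
static inline size_t
doorbell_bar0_offset(uint16_t qid, bool is_cq)
{
	return NVME_DOORBELLS_OFFSET + queue_index(qid, is_cq) * sizeof(uint32_t);
}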
414 
415 static inline volatile uint32_t *
416 sq_headp(struct nvmf_vfio_user_sq *sq)
417 {
418 	assert(sq != NULL);
419 	return &sq->head;
420 }
421 
422 static inline volatile uint32_t *
423 sq_dbl_tailp(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq)
424 {
425 	assert(ctrlr != NULL);
426 	assert(sq != NULL);
427 	return &ctrlr->doorbells[queue_index(sq->qid, false)];
428 }
429 
430 static inline volatile uint32_t *
431 cq_dbl_headp(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq)
432 {
433 	assert(ctrlr != NULL);
434 	assert(cq != NULL);
435 	return &ctrlr->doorbells[queue_index(cq->qid, true)];
436 }
437 
438 static inline volatile uint32_t *
439 cq_tailp(struct nvmf_vfio_user_cq *cq)
440 {
441 	assert(cq != NULL);
442 	return &cq->tail;
443 }
444 
445 static inline void
446 sq_head_advance(struct nvmf_vfio_user_sq *sq)
447 {
448 	assert(sq != NULL);
449 
450 	assert(*sq_headp(sq) < sq->size);
451 	(*sq_headp(sq))++;
452 
453 	if (spdk_unlikely(*sq_headp(sq) == sq->size)) {
454 		*sq_headp(sq) = 0;
455 	}
456 }
457 
458 static inline void
459 cq_tail_advance(struct nvmf_vfio_user_cq *cq)
460 {
461 	assert(cq != NULL);
462 
463 	assert(*cq_tailp(cq) < cq->size);
464 	(*cq_tailp(cq))++;
465 
466 	if (spdk_unlikely(*cq_tailp(cq) == cq->size)) {
467 		*cq_tailp(cq) = 0;
468 		cq->phase = !cq->phase;
469 	}
470 }
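/*
 * For illustration: with cq->size == 4 the tail visits slots 0, 1, 2, 3 and
 * then wraps back to 0, at which point `phase` flips. The host compares the
 * phase bit of each CQE against the phase it expects in order to tell newly
 * posted entries from stale ones left over from the previous pass around the
 * ring.
 */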
471 
472 static inline bool
473 cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq)
474 {
475 	uint32_t qindex;
476 
477 	assert(ctrlr != NULL);
478 	assert(cq != NULL);
479 
480 	qindex = *cq_tailp(cq) + 1;
481 	if (spdk_unlikely(qindex == cq->size)) {
482 		qindex = 0;
483 	}
484 
485 	return qindex == *cq_dbl_headp(ctrlr, cq);
486 }
487 
488 
489 /* TODO: wrapper to data structure */
490 static inline size_t
491 vfio_user_migr_data_len(void)
492 {
493 	size_t len = 0;
494 
495 	len = NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * (sizeof(struct nvme_migr_sq_state) + sizeof(
496 				struct nvme_migr_cq_state));
497 	len += sizeof(struct nvme_migr_device_state);
498 	len += sizeof(struct nvmf_ctrlr_migr_data);
499 	len += NVME_REG_BAR0_SIZE;
500 	len += NVME_REG_CFG_SIZE;
501 	/* BAR4 */
502 	len += NVME_BAR4_SIZE;
503 	/* BAR5 */
504 	len += NVME_BAR5_SIZE;
505 
506 	return SPDK_ALIGN_CEIL(len, PAGE_SIZE);
507 }
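/*
 * Illustrative arithmetic, using the macro values above and leaving out
 * sizeof(struct nvmf_ctrlr_migr_data), which is defined elsewhere:
 * 512 queue pairs * (0x18 + 0x20) bytes = 0x7000 for the queue states, plus
 * 0x1000 (device state) + 0x2000 (BAR0) + 0x1000 (CFG) + 0x2000 (BAR4) +
 * 0x1000 (BAR5) = 0xE000, with the private controller data added on top and
 * the sum rounded up to a PAGE_SIZE boundary.
 */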
508 
509 static int
510 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
511 		  uint32_t max_iovcnt, uint32_t len, size_t mps,
512 		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
513 {
514 	uint64_t prp1, prp2;
515 	void *vva;
516 	uint32_t i;
517 	uint32_t residue_len, nents;
518 	uint64_t *prp_list;
519 	uint32_t iovcnt;
520 
521 	assert(max_iovcnt > 0);
522 
523 	prp1 = cmd->dptr.prp.prp1;
524 	prp2 = cmd->dptr.prp.prp2;
525 
526 	/* PRP1 may start at an unaligned page address */
527 	residue_len = mps - (prp1 % mps);
528 	residue_len = spdk_min(len, residue_len);
529 
530 	vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE);
531 	if (spdk_unlikely(vva == NULL)) {
532 		SPDK_ERRLOG("GPA to VVA failed\n");
533 		return -EINVAL;
534 	}
535 	len -= residue_len;
536 	if (len && max_iovcnt < 2) {
537 		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
538 		return -ERANGE;
539 	}
540 	iovs[0].iov_base = vva;
541 	iovs[0].iov_len = residue_len;
542 
543 	if (len) {
544 		if (spdk_unlikely(prp2 == 0)) {
545 			SPDK_ERRLOG("no PRP2, %d remaining\n", len);
546 			return -EINVAL;
547 		}
548 
549 		if (len <= mps) {
550 			/* 2 PRP used */
551 			iovcnt = 2;
552 			vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE);
553 			if (spdk_unlikely(vva == NULL)) {
554 				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n",
555 					    prp2, len);
556 				return -EINVAL;
557 			}
558 			iovs[1].iov_base = vva;
559 			iovs[1].iov_len = len;
560 		} else {
561 			/* PRP list used */
562 			nents = (len + mps - 1) / mps;
563 			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
564 				SPDK_ERRLOG("Too many page entries\n");
565 				return -ERANGE;
566 			}
567 
568 			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ);
569 			if (spdk_unlikely(vva == NULL)) {
570 				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
571 					    prp2, nents);
572 				return -EINVAL;
573 			}
574 			prp_list = vva;
575 			i = 0;
576 			while (len != 0) {
577 				residue_len = spdk_min(len, mps);
578 				vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE);
579 				if (spdk_unlikely(vva == NULL)) {
580 					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
581 						    prp_list[i], residue_len);
582 					return -EINVAL;
583 				}
584 				iovs[i + 1].iov_base = vva;
585 				iovs[i + 1].iov_len = residue_len;
586 				len -= residue_len;
587 				i++;
588 			}
589 			iovcnt = i + 1;
590 		}
591 	} else {
592 		/* 1 PRP used */
593 		iovcnt = 1;
594 	}
595 
596 	assert(iovcnt <= max_iovcnt);
597 	return iovcnt;
598 }
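/*
 * Worked example for the PRP walk above: a 10240-byte transfer with
 * mps = 4096 and prp1 = 0x100200 first maps the 3584-byte remainder of the
 * page containing prp1 into iovs[0]. The 6656 bytes left exceed one page, so
 * prp2 is treated as a PRP list with nents = (6656 + 4095) / 4096 = 2
 * entries, which map into iovs[1] (4096 bytes) and iovs[2] (2560 bytes), and
 * the function returns iovcnt = 3.
 */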
599 
600 static int
601 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
602 		       struct iovec *iovs, uint32_t max_iovcnt,
603 		       void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
604 {
605 	uint32_t i;
606 	void *vva;
607 
608 	if (spdk_unlikely(max_iovcnt < num_sgls)) {
609 		return -ERANGE;
610 	}
611 
612 	for (i = 0; i < num_sgls; i++) {
613 		if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) {
614 			SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type);
615 			return -EINVAL;
616 		}
617 		vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE);
618 		if (spdk_unlikely(vva == NULL)) {
619 			SPDK_ERRLOG("GPA to VVA failed\n");
620 			return -EINVAL;
621 		}
622 		iovs[i].iov_base = vva;
623 		iovs[i].iov_len = sgls[i].unkeyed.length;
624 	}
625 
626 	return num_sgls;
627 }
628 
629 static int
630 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
631 		  uint32_t len, size_t mps,
632 		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
633 {
634 	struct spdk_nvme_sgl_descriptor *sgl, *last_sgl;
635 	uint32_t num_sgls, seg_len;
636 	void *vva;
637 	int ret;
638 	uint32_t total_iovcnt = 0;
639 
640 	/* SGL cases */
641 	sgl = &cmd->dptr.sgl1;
642 
643 	/* only one SGL segment */
644 	if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
645 		assert(max_iovcnt > 0);
646 		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE);
647 		if (spdk_unlikely(vva == NULL)) {
648 			SPDK_ERRLOG("GPA to VVA failed\n");
649 			return -EINVAL;
650 		}
651 		iovs[0].iov_base = vva;
652 		iovs[0].iov_len = sgl->unkeyed.length;
653 		assert(sgl->unkeyed.length == len);
654 
655 		return 1;
656 	}
657 
658 	for (;;) {
659 		if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) &&
660 				  (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) {
661 			SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type);
662 			return -EINVAL;
663 		}
664 
665 		seg_len = sgl->unkeyed.length;
666 		if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) {
667 			SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len);
668 			return -EINVAL;
669 		}
670 
671 		num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor);
672 		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ);
673 		if (spdk_unlikely(vva == NULL)) {
674 			SPDK_ERRLOG("GPA to VVA failed\n");
675 			return -EINVAL;
676 		}
677 
678 		/* sgl points to the first segment */
679 		sgl = (struct spdk_nvme_sgl_descriptor *)vva;
680 		last_sgl = &sgl[num_sgls - 1];
681 
682 		/* if the last descriptor is a data block, we are done after mapping this segment */
683 		if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
684 			/* map whole sgl list */
685 			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt],
686 						     max_iovcnt - total_iovcnt, gpa_to_vva);
687 			if (spdk_unlikely(ret < 0)) {
688 				return ret;
689 			}
690 			total_iovcnt += ret;
691 
692 			return total_iovcnt;
693 		}
694 
695 		if (num_sgls > 1) {
696 			/* map the whole segment, excluding last_sgl */
697 			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt],
698 						     max_iovcnt - total_iovcnt, gpa_to_vva);
699 			if (spdk_unlikely(ret < 0)) {
700 				return ret;
701 			}
702 			total_iovcnt += ret;
703 		}
704 
705 		/* follow the last descriptor to the next segment */
706 		sgl = last_sgl;
707 	}
708 
709 	return 0;
710 }
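/*
 * Sketch of the walk above: when sgl1 is a (LAST_)SEGMENT descriptor, its
 * address is mapped read-only and interpreted as an array of descriptors. If
 * the last descriptor of that segment is a DATA_BLOCK, the whole segment is
 * data and is mapped via nvme_cmd_map_sgls_data(); otherwise everything but
 * the last descriptor is mapped as data and the last descriptor is followed
 * as the next segment, accumulating iovecs into total_iovcnt along the way.
 */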
711 
712 static int
713 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
714 	     uint32_t len, size_t mps,
715 	     void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
716 {
717 	if (cmd->psdt == SPDK_NVME_PSDT_PRP) {
718 		return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
719 	}
720 
721 	return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
722 }
723 
724 static char *
725 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint)
726 {
727 	return endpoint->trid.traddr;
728 }
729 
730 static char *
731 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr)
732 {
733 	if (!ctrlr || !ctrlr->endpoint) {
734 		return "Null Ctrlr";
735 	}
736 
737 	return endpoint_id(ctrlr->endpoint);
738 }
739 
740 static void
741 fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
742 {
743 	assert(ctrlr != NULL);
744 
745 	if (ctrlr->cfs == 0) {
746 		SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(ctrlr));
747 	}
748 
749 	ctrlr->cfs = 1U;
750 }
751 
752 static inline bool
753 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
754 {
755 	assert(vu_ctrlr != NULL);
756 	assert(vu_ctrlr->endpoint != NULL);
757 
758 	vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space;
759 
760 	return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe);
761 }
762 
763 static void
764 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint)
765 {
766 	SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint));
767 
768 	if (endpoint->doorbells) {
769 		munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
770 	}
771 
772 	if (endpoint->devmem_fd > 0) {
773 		close(endpoint->devmem_fd);
774 	}
775 
776 	if (endpoint->migr_data) {
777 		munmap(endpoint->migr_data, vfio_user_migr_data_len());
778 	}
779 
780 	if (endpoint->migr_fd > 0) {
781 		close(endpoint->migr_fd);
782 	}
783 
784 	if (endpoint->vfu_ctx) {
785 		vfu_destroy_ctx(endpoint->vfu_ctx);
786 	}
787 
788 	pthread_mutex_destroy(&endpoint->lock);
789 	free(endpoint);
790 }
791 
792 /* called when process exits */
793 static int
794 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport,
795 		       spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
796 {
797 	struct nvmf_vfio_user_transport *vu_transport;
798 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
799 
800 	SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n");
801 
802 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
803 					transport);
804 
805 	spdk_poller_unregister(&vu_transport->accept_poller);
806 	pthread_mutex_destroy(&vu_transport->lock);
807 	pthread_mutex_destroy(&vu_transport->pg_lock);
808 
809 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
810 		TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
811 		nvmf_vfio_user_destroy_endpoint(endpoint);
812 	}
813 
814 	free(vu_transport);
815 
816 	if (cb_fn) {
817 		cb_fn(cb_arg);
818 	}
819 
820 	return 0;
821 }
822 
823 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = {
824 	{
825 		"disable_mappable_bar0",
826 		offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0),
827 		spdk_json_decode_bool, true
828 	},
829 };
830 
831 static int
832 nvmf_vfio_user_accept(void *ctx);
833 
834 static struct spdk_nvmf_transport *
835 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts)
836 {
837 	struct nvmf_vfio_user_transport *vu_transport;
838 	int err;
839 
840 	if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) {
841 		SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n",
842 			    opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR);
843 		return NULL;
844 	}
845 
846 	vu_transport = calloc(1, sizeof(*vu_transport));
847 	if (vu_transport == NULL) {
848 		SPDK_ERRLOG("Transport alloc fail: %m\n");
849 		return NULL;
850 	}
851 
852 	err = pthread_mutex_init(&vu_transport->lock, NULL);
853 	if (err != 0) {
854 		SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
855 		goto err;
856 	}
857 	TAILQ_INIT(&vu_transport->endpoints);
858 
859 	err = pthread_mutex_init(&vu_transport->pg_lock, NULL);
860 	if (err != 0) {
861 		pthread_mutex_destroy(&vu_transport->lock);
862 		SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
863 		goto err;
864 	}
865 	TAILQ_INIT(&vu_transport->poll_groups);
866 
867 	if (opts->transport_specific != NULL &&
868 	    spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder,
869 					    SPDK_COUNTOF(vfio_user_transport_opts_decoder),
870 					    vu_transport)) {
871 		SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n");
872 		goto cleanup;
873 	}
874 
875 	vu_transport->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, &vu_transport->transport,
876 				      opts->acceptor_poll_rate);
877 	if (!vu_transport->accept_poller) {
878 		goto cleanup;
879 	}
880 
881 	SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n",
882 		      vu_transport->transport_opts.disable_mappable_bar0);
883 
884 	return &vu_transport->transport;
885 
886 cleanup:
887 	pthread_mutex_destroy(&vu_transport->lock);
888 	pthread_mutex_destroy(&vu_transport->pg_lock);
889 err:
890 	free(vu_transport);
891 	return NULL;
892 }
893 
894 static uint32_t
895 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr)
896 {
897 	assert(vu_ctrlr != NULL);
898 	assert(vu_ctrlr->ctrlr != NULL);
899 
900 	return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1;
901 }
902 
903 static void *
904 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov, int prot)
905 {
906 	int ret;
907 
908 	assert(ctx != NULL);
909 	assert(sg != NULL);
910 	assert(iov != NULL);
911 
912 	ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot);
913 	if (ret < 0) {
914 		return NULL;
915 	}
916 
917 	ret = vfu_map_sg(ctx, sg, iov, 1, 0);
918 	if (ret != 0) {
919 		return NULL;
920 	}
921 
922 	assert(iov->iov_base != NULL);
923 	return iov->iov_base;
924 }
925 
926 static int
927 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping,
928       uint32_t q_size, bool is_cq, bool unmap)
929 {
930 	uint64_t len;
931 	void *ret;
932 
933 	assert(q_size);
934 	assert(q_addr(mapping) == NULL);
935 
936 	if (is_cq) {
937 		len = q_size * sizeof(struct spdk_nvme_cpl);
938 	} else {
939 		len = q_size * sizeof(struct spdk_nvme_cmd);
940 	}
941 
942 	ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len,
943 		      mapping->sg, &mapping->iov,
944 		      is_cq ? PROT_READ | PROT_WRITE : PROT_READ);
945 	if (ret == NULL) {
946 		return -EFAULT;
947 	}
948 
949 	if (unmap) {
950 		memset(q_addr(mapping), 0, len);
951 	}
952 
953 	return 0;
954 }
955 
956 static inline void
957 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping)
958 {
959 	if (q_addr(mapping) != NULL) {
960 		vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, mapping->sg,
961 			     &mapping->iov, 1);
962 		mapping->iov.iov_base = NULL;
963 	}
964 }
965 
966 static int
967 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr)
968 {
969 	struct nvmf_vfio_user_sq *sq;
970 	const struct spdk_nvmf_registers *regs;
971 	int ret;
972 
973 	assert(ctrlr != NULL);
974 
975 	sq = ctrlr->sqs[0];
976 
977 	assert(sq != NULL);
978 	assert(q_addr(&sq->mapping) == NULL);
979 	/* XXX ctrlr->asq == 0 is a valid memory address */
980 
981 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
982 	sq->qid = 0;
983 	sq->size = regs->aqa.bits.asqs + 1;
984 	sq->mapping.prp1 = regs->asq;
985 	*sq_headp(sq) = 0;
986 	sq->cqid = 0;
987 
988 	ret = map_q(ctrlr, &sq->mapping, sq->size, false, true);
989 	if (ret) {
990 		return ret;
991 	}
992 
993 	*sq_dbl_tailp(ctrlr, sq) = 0;
994 
995 	return 0;
996 }
997 
998 static int
999 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr)
1000 {
1001 	struct nvmf_vfio_user_cq *cq;
1002 	const struct spdk_nvmf_registers *regs;
1003 	int ret;
1004 
1005 	assert(ctrlr != NULL);
1006 
1007 	cq = ctrlr->cqs[0];
1008 
1009 	assert(cq != NULL);
1010 
1011 	assert(q_addr(&cq->mapping) == NULL);
1012 
1013 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
1014 	assert(regs != NULL);
1015 	cq->qid = 0;
1016 	cq->size = regs->aqa.bits.acqs + 1;
1017 	cq->mapping.prp1 = regs->acq;
1018 	*cq_tailp(cq) = 0;
1019 	cq->ien = true;
1020 	cq->phase = true;
1021 
1022 	ret = map_q(ctrlr, &cq->mapping, cq->size, true, true);
1023 	if (ret) {
1024 		return ret;
1025 	}
1026 
1027 	*cq_dbl_headp(ctrlr, cq) = 0;
1028 
1029 	return 0;
1030 }
1031 
1032 static inline dma_sg_t *
1033 vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt)
1034 {
1035 	return (dma_sg_t *)(vu_req->sg + iovcnt * dma_sg_size());
1036 }
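/*
 * Layout note: vu_req->sg is a flexible array holding
 * NVMF_VFIO_USER_MAX_IOVECS dma_sg_t entries back to back. dma_sg_t is an
 * opaque libvfio-user type whose size is only known at run time via
 * dma_sg_size(), so the indexing here is done in bytes; _map_one() below
 * passes the entry matching the iovec it is about to fill.
 */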
1037 
1038 static void *
1039 _map_one(void *prv, uint64_t addr, uint64_t len, int prot)
1040 {
1041 	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
1042 	struct spdk_nvmf_qpair *qpair;
1043 	struct nvmf_vfio_user_req *vu_req;
1044 	struct nvmf_vfio_user_sq *sq;
1045 	void *ret;
1046 
1047 	assert(req != NULL);
1048 	qpair = req->qpair;
1049 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
1050 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
1051 
1052 	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
1053 	ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len,
1054 		      vu_req_to_sg_t(vu_req, vu_req->iovcnt),
1055 		      &vu_req->iov[vu_req->iovcnt], prot);
1056 	if (spdk_likely(ret != NULL)) {
1057 		vu_req->iovcnt++;
1058 	}
1059 	return ret;
1060 }
1061 
1062 static int
1063 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
1064 		  struct iovec *iov, uint32_t length)
1065 {
1066 	/* Map the PRP list from guest physical memory to
1067 	 * local virtual memory addresses.
1068 	 */
1069 	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
1070 			    length, 4096, _map_one);
1071 }
1072 
1073 static int
1074 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
1075 	       struct nvmf_vfio_user_sq *sq);
1076 
1077 /*
1078  * Posts a CQE in the completion queue.
1079  *
1080  * @ctrlr: the vfio-user controller
1081  * @cq: the completion queue
1082  * @cdw0: cdw0 as reported by NVMf
1083  * @sqid: submission queue ID
1084  * @cid: command identifier in NVMe command
1085  * @sc: the NVMe CQE status code
1086  * @sct: the NVMe CQE status code type
1087  */
1088 static int
1089 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq,
1090 		uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct)
1091 {
1092 	struct spdk_nvme_cpl *cpl;
1093 	const struct spdk_nvmf_registers *regs;
1094 	int err;
1095 
1096 	assert(ctrlr != NULL);
1097 
1098 	if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) {
1099 		return 0;
1100 	}
1101 
1102 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
1103 	if (regs->csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
1104 		SPDK_DEBUGLOG(nvmf_vfio,
1105 			      "%s: ignore completion SQ%d cid=%d status=%#x\n",
1106 			      ctrlr_id(ctrlr), sqid, cid, sc);
1107 		return 0;
1108 	}
1109 
1110 	if (cq_is_full(ctrlr, cq)) {
1111 		SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
1112 			    ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq),
1113 			    *cq_dbl_headp(ctrlr, cq));
1114 		return -1;
1115 	}
1116 
1117 	cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq);
1118 
1119 	assert(ctrlr->sqs[sqid] != NULL);
1120 	SPDK_DEBUGLOG(nvmf_vfio,
1121 		      "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
1122 		      ctrlr_id(ctrlr), sqid, cid, sc, *sq_headp(ctrlr->sqs[sqid]),
1123 		      *cq_tailp(cq));
1124 
1125 	cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]);
1126 	cpl->sqid = sqid;
1127 	cpl->cid = cid;
1128 	cpl->cdw0 = cdw0;
1129 	cpl->status.dnr = 0x0;
1130 	cpl->status.m = 0x0;
1131 	cpl->status.sct = sct;
1132 	cpl->status.sc = sc;
1133 	cpl->status.p = cq->phase;
1134 
1135 	/* Ensure the Completion Queue Entry is visible. */
1136 	spdk_wmb();
1137 	cq_tail_advance(cq);
1138 
1139 	/*
1140 	 * This function now executes in SPDK thread context; we
1141 	 * might be triggering interrupts from vfio-user thread context, so
1142 	 * check for race conditions.
1143 	 */
1144 	if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) {
1145 		err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
1146 		if (err != 0) {
1147 			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
1148 				    ctrlr_id(ctrlr));
1149 			return err;
1150 		}
1151 	}
1152 
1153 	return 0;
1154 }
1155 
1156 static bool
1157 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq)
1158 {
1159 	assert(vu_ctrlr != NULL);
1160 
1161 	if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) {
1162 		return false;
1163 	}
1164 
1165 	if (is_cq) {
1166 		if (vu_ctrlr->cqs[qid] == NULL) {
1167 			return false;
1168 		}
1169 
1170 		return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED &&
1171 			vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED);
1172 	}
1173 
1174 	if (vu_ctrlr->sqs[qid] == NULL) {
1175 		return false;
1176 	}
1177 
1178 	return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED &&
1179 		vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED);
1180 }
1181 
1182 static void
1183 free_sq_reqs(struct nvmf_vfio_user_sq *sq)
1184 {
1185 	while (!TAILQ_EMPTY(&sq->free_reqs)) {
1186 		struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs);
1187 		TAILQ_REMOVE(&sq->free_reqs, vu_req, link);
1188 		free(vu_req);
1189 	}
1190 }
1191 
1192 /* Deletes an SQ. If this SQ is the last user of the associated CQ
1193  * and the controller is being shut down or reset, then the CQ is
1194  * also deleted.
1195  */
1196 static void
1197 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq)
1198 {
1199 	struct nvmf_vfio_user_cq *cq;
1200 	uint16_t cqid;
1201 
1202 	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete SQ%d=%p done\n", ctrlr_id(vu_ctrlr),
1203 		      sq->qid, sq);
1204 
1205 	/* Free SQ resources */
1206 	unmap_q(vu_ctrlr, &sq->mapping);
1207 
1208 	free_sq_reqs(sq);
1209 
1210 	sq->size = 0;
1211 
1212 	sq->sq_state = VFIO_USER_SQ_DELETED;
1213 
1214 	/* Controller RESET and SHUTDOWN are special cases:
1215 	 * the VM may not send DELETE IO SQ/CQ commands, so the NVMf
1216 	 * library will disconnect the IO queue pairs.
1217 	 */
1218 	if (vu_ctrlr->reset_shn) {
1219 		cqid = sq->cqid;
1220 		cq = vu_ctrlr->cqs[cqid];
1221 
1222 		SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete CQ%d=%p\n", ctrlr_id(vu_ctrlr),
1223 			      cq->qid, cq);
1224 
1225 		if (cq->cq_ref) {
1226 			cq->cq_ref--;
1227 		}
1228 		if (cq->cq_ref == 0) {
1229 			unmap_q(vu_ctrlr, &cq->mapping);
1230 			cq->size = 0;
1231 			cq->cq_state = VFIO_USER_CQ_DELETED;
1232 			cq->group = NULL;
1233 		}
1234 	}
1235 }
1236 
1237 static void
1238 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid)
1239 {
1240 	struct nvmf_vfio_user_sq *sq;
1241 	struct nvmf_vfio_user_cq *cq;
1242 
1243 	if (ctrlr == NULL) {
1244 		return;
1245 	}
1246 
1247 	sq = ctrlr->sqs[qid];
1248 	if (sq) {
1249 		SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid);
1250 		unmap_q(ctrlr, &sq->mapping);
1251 
1252 		free_sq_reqs(sq);
1253 
1254 		free(sq->mapping.sg);
1255 		free(sq);
1256 		ctrlr->sqs[qid] = NULL;
1257 	}
1258 
1259 	cq = ctrlr->cqs[qid];
1260 	if (cq) {
1261 		SPDK_DEBUGLOG(nvmf_vfio, "%s: Free CQ %u\n", ctrlr_id(ctrlr), qid);
1262 		unmap_q(ctrlr, &cq->mapping);
1263 		free(cq->mapping.sg);
1264 		free(cq);
1265 		ctrlr->cqs[qid] = NULL;
1266 	}
1267 }
1268 
1269 static int
1270 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport,
1271 	const uint16_t id)
1272 {
1273 	struct nvmf_vfio_user_sq *sq;
1274 
1275 	assert(ctrlr != NULL);
1276 	assert(transport != NULL);
1277 	assert(ctrlr->sqs[id] == NULL);
1278 
1279 	sq = calloc(1, sizeof(*sq));
1280 	if (sq == NULL) {
1281 		return -ENOMEM;
1282 	}
1283 	sq->mapping.sg = calloc(1, dma_sg_size());
1284 	if (sq->mapping.sg == NULL) {
1285 		free(sq);
1286 		return -ENOMEM;
1287 	}
1288 
1289 	sq->qid = id;
1290 	sq->qpair.qid = id;
1291 	sq->qpair.transport = transport;
1292 	sq->ctrlr = ctrlr;
1293 	ctrlr->sqs[id] = sq;
1294 
1295 	TAILQ_INIT(&sq->free_reqs);
1296 
1297 	return 0;
1298 }
1299 
1300 static int
1301 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id)
1302 {
1303 	struct nvmf_vfio_user_cq *cq;
1304 
1305 	assert(vu_ctrlr != NULL);
1306 	assert(vu_ctrlr->cqs[id] == NULL);
1307 
1308 	cq = calloc(1, sizeof(*cq));
1309 	if (cq == NULL) {
1310 		return -ENOMEM;
1311 	}
1312 	cq->mapping.sg = calloc(1, dma_sg_size());
1313 	if (cq->mapping.sg == NULL) {
1314 		free(cq);
1315 		return -ENOMEM;
1316 	}
1317 
1318 	cq->qid = id;
1319 	vu_ctrlr->cqs[id] = cq;
1320 
1321 	return 0;
1322 }
1323 
1324 static int
1325 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq)
1326 {
1327 	struct nvmf_vfio_user_req *vu_req, *tmp;
1328 	size_t req_size;
1329 	uint32_t i;
1330 
1331 	req_size = sizeof(struct nvmf_vfio_user_req) +
1332 		   (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS);
1333 
1334 	for (i = 0; i < sq->size; i++) {
1335 		struct spdk_nvmf_request *req;
1336 
1337 		vu_req = calloc(1, req_size);
1338 		if (vu_req == NULL) {
1339 			goto err;
1340 		}
1341 
1342 		req = &vu_req->req;
1343 		req->qpair = &sq->qpair;
1344 		req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
1345 		req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;
1346 
1347 		TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link);
1348 	}
1349 
1350 	return 0;
1351 
1352 err:
1353 	TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) {
1354 		free(vu_req);
1355 	}
1356 	return -ENOMEM;
1357 }
1358 
1359 static uint16_t
1360 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr,
1361 		    struct spdk_nvme_cmd *cmd, uint16_t *sct)
1362 {
1363 	struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport;
1364 	struct nvmf_vfio_user_sq *sq;
1365 	uint32_t qsize;
1366 	uint16_t cqid;
1367 	uint16_t qid;
1368 	int err;
1369 
1370 	qid = cmd->cdw10_bits.create_io_q.qid;
1371 	cqid = cmd->cdw11_bits.create_io_sq.cqid;
1372 	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
1373 
1374 	if (ctrlr->sqs[qid] == NULL) {
1375 		err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid);
1376 		if (err != 0) {
1377 			*sct = SPDK_NVME_SCT_GENERIC;
1378 			return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1379 		}
1380 	}
1381 
1382 	if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) {
1383 		SPDK_ERRLOG("%s: invalid CQID %u\n", ctrlr_id(ctrlr), cqid);
1384 		*sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1385 		return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1386 	}
1387 
1388 	/* CQ must be created before SQ. */
1389 	if (!io_q_exists(ctrlr, cqid, true)) {
1390 		SPDK_ERRLOG("%s: CQ%u does not exist\n", ctrlr_id(ctrlr), cqid);
1391 		*sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1392 		return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
1393 	}
1394 
1395 	if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
1396 		SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
1397 		*sct = SPDK_NVME_SCT_GENERIC;
1398 		return SPDK_NVME_SC_INVALID_FIELD;
1399 	}
1400 
1401 	sq = ctrlr->sqs[qid];
1402 	sq->size = qsize;
1403 
1404 	SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
1405 		      qid, cqid);
1406 
1407 	sq->mapping.prp1 = cmd->dptr.prp.prp1;
1408 
1409 	err = map_q(ctrlr, &sq->mapping, sq->size, false, true);
1410 	if (err) {
1411 		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
1412 		*sct = SPDK_NVME_SCT_GENERIC;
1413 		return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1414 	}
1415 
1416 	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped SQ%d IOVA=%#lx vaddr=%p\n",
1417 		      ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1,
1418 		      q_addr(&sq->mapping));
1419 
1420 	err = alloc_sq_reqs(ctrlr, sq);
1421 	if (err < 0) {
1422 		SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr));
1423 		*sct = SPDK_NVME_SCT_GENERIC;
1424 		return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1425 	}
1426 
1427 	sq->cqid = cqid;
1428 	ctrlr->cqs[sq->cqid]->cq_ref++;
1429 	sq->sq_state = VFIO_USER_SQ_CREATED;
1430 	*sq_headp(sq) = 0;
1431 	*sq_dbl_tailp(ctrlr, sq) = 0;
1432 
1433 	/*
1434 	 * Create our new I/O qpair. This asynchronously invokes, on a suitable
1435 	 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will
1436 	 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics
1437 	 * connect command. This command is then eventually completed via
1438 	 * handle_queue_connect_rsp().
1439 	 */
1440 	sq->create_io_sq_cmd = *cmd;
1441 	sq->post_create_io_sq_completion = true;
1442 
1443 	spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt,
1444 				&sq->qpair);
1445 
1446 	*sct = SPDK_NVME_SCT_GENERIC;
1447 	return SPDK_NVME_SC_SUCCESS;
1448 }
1449 
1450 static uint16_t
1451 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr,
1452 		    struct spdk_nvme_cmd *cmd, uint16_t *sct)
1453 {
1454 	struct nvmf_vfio_user_cq *cq;
1455 	uint32_t qsize;
1456 	uint16_t qid;
1457 	int err;
1458 
1459 	qid = cmd->cdw10_bits.create_io_q.qid;
1460 	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
1461 
1462 	if (ctrlr->cqs[qid] == NULL) {
1463 		err = init_cq(ctrlr, qid);
1464 		if (err != 0) {
1465 			*sct = SPDK_NVME_SCT_GENERIC;
1466 			return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1467 		}
1468 	}
1469 
1470 	if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
1471 		SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr));
1472 		*sct = SPDK_NVME_SCT_GENERIC;
1473 		return SPDK_NVME_SC_INVALID_FIELD;
1474 	}
1475 
1476 	if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) {
1477 		SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr));
1478 		*sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1479 		return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR;
1480 	}
1481 
1482 	cq = ctrlr->cqs[qid];
1483 	cq->size = qsize;
1484 
1485 	cq->mapping.prp1 = cmd->dptr.prp.prp1;
1486 
1487 	err = map_q(ctrlr, &cq->mapping, cq->size, true, true);
1488 	if (err) {
1489 		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
1490 		*sct = SPDK_NVME_SCT_GENERIC;
1491 		return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1492 	}
1493 
1494 	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped CQ%d IOVA=%#lx vaddr=%p\n",
1495 		      ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1,
1496 		      q_addr(&cq->mapping));
1497 
1498 	cq->ien = cmd->cdw11_bits.create_io_cq.ien;
1499 	cq->iv = cmd->cdw11_bits.create_io_cq.iv;
1500 	cq->phase = true;
1501 	cq->cq_state = VFIO_USER_CQ_CREATED;
1502 
1503 	*cq_tailp(cq) = 0;
1504 	*cq_dbl_headp(ctrlr, cq) = 0;
1505 
1506 	*sct = SPDK_NVME_SCT_GENERIC;
1507 	return SPDK_NVME_SC_SUCCESS;
1508 }
1509 
1510 /*
1511  * Creates a completion or submission I/O queue. Returns 0 on success, -errno
1512  * on error.
1513  */
1514 static int
1515 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
1516 		   struct spdk_nvme_cmd *cmd, const bool is_cq)
1517 {
1518 	struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport;
1519 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
1520 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
1521 	uint32_t qsize;
1522 	uint16_t qid;
1523 
1524 	assert(ctrlr != NULL);
1525 	assert(cmd != NULL);
1526 
1527 	qid = cmd->cdw10_bits.create_io_q.qid;
1528 	if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) {
1529 		SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr),
1530 			    qid, vu_transport->transport.opts.max_qpairs_per_ctrlr);
1531 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1532 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1533 		goto out;
1534 	}
1535 
1536 	if (io_q_exists(ctrlr, qid, is_cq)) {
1537 		SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr),
1538 			    is_cq ? 'C' : 'S', qid);
1539 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1540 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1541 		goto out;
1542 	}
1543 
1544 	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
1545 	if (qsize == 1 || qsize > max_queue_size(ctrlr)) {
1546 		SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize);
1547 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1548 		sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE;
1549 		goto out;
1550 	}
1551 
1552 	if (is_cq) {
1553 		sc = handle_create_io_cq(ctrlr, cmd, &sct);
1554 	} else {
1555 		sc = handle_create_io_sq(ctrlr, cmd, &sct);
1556 
1557 		if (sct == SPDK_NVME_SCT_GENERIC &&
1558 		    sc == SPDK_NVME_SC_SUCCESS) {
1559 			/* Completion posted asynchronously. */
1560 			return 0;
1561 		}
1562 	}
1563 
1564 out:
1565 	return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct);
1566 }
1567 
1568 /* For the admin DELETE IO SUBMISSION QUEUE command, the NVMf library will
1569  * disconnect and free the queue pair, so save the command in a context.
1570  */
1571 struct vfio_user_delete_sq_ctx {
1572 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
1573 	struct spdk_nvme_cmd delete_io_sq_cmd;
1574 };
1575 
1576 static void
1577 vfio_user_qpair_delete_cb(void *cb_arg)
1578 {
1579 	struct vfio_user_delete_sq_ctx *ctx = cb_arg;
1580 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr;
1581 
1582 	post_completion(vu_ctrlr, vu_ctrlr->cqs[0], 0, 0, ctx->delete_io_sq_cmd.cid,
1583 			SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
1584 	free(ctx);
1585 }
1586 
1587 /*
1588  * Deletes a completion or submission I/O queue.
1589  */
1590 static int
1591 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
1592 		struct spdk_nvme_cmd *cmd, const bool is_cq)
1593 {
1594 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
1595 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
1596 	struct nvmf_vfio_user_sq *sq;
1597 	struct nvmf_vfio_user_cq *cq;
1598 	struct vfio_user_delete_sq_ctx *ctx;
1599 
1600 	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n",
1601 		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
1602 		      cmd->cdw10_bits.delete_io_q.qid);
1603 
1604 	if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) {
1605 		SPDK_ERRLOG("%s: I/O %cQ%d does not exist\n", ctrlr_id(ctrlr),
1606 			    is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid);
1607 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1608 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1609 		goto out;
1610 	}
1611 
1612 	if (is_cq) {
1613 		cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid];
1614 		if (cq->cq_ref) {
1615 			SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr));
1616 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1617 			sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION;
1618 			goto out;
1619 		}
1620 
1621 		unmap_q(ctrlr, &cq->mapping);
1622 		cq->size = 0;
1623 		cq->cq_state = VFIO_USER_CQ_DELETED;
1624 		cq->group = NULL;
1625 	} else {
1626 		ctx = calloc(1, sizeof(*ctx));
1627 		if (!ctx) {
1628 			sct = SPDK_NVME_SCT_GENERIC;
1629 			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1630 			goto out;
1631 		}
1632 		ctx->vu_ctrlr = ctrlr;
1633 		ctx->delete_io_sq_cmd = *cmd;
1634 
1635 		sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid];
1636 		sq->sq_state = VFIO_USER_SQ_DELETED;
1637 		assert(ctrlr->cqs[sq->cqid]->cq_ref);
1638 		ctrlr->cqs[sq->cqid]->cq_ref--;
1639 
1640 		spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx);
1641 		return 0;
1642 	}
1643 
1644 out:
1645 	return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct);
1646 }
1647 
1648 /*
1649  * Returns 0 on success and -errno on error.
1650  */
1651 static int
1652 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd)
1653 {
1654 	assert(ctrlr != NULL);
1655 	assert(cmd != NULL);
1656 
1657 	if (cmd->fuse != 0) {
1658 		/* Fused admin commands are not supported. */
1659 		return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid,
1660 				       SPDK_NVME_SC_INVALID_FIELD,
1661 				       SPDK_NVME_SCT_GENERIC);
1662 	}
1663 
1664 	switch (cmd->opc) {
1665 	case SPDK_NVME_OPC_CREATE_IO_CQ:
1666 	case SPDK_NVME_OPC_CREATE_IO_SQ:
1667 		return handle_create_io_q(ctrlr, cmd,
1668 					  cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ);
1669 	case SPDK_NVME_OPC_DELETE_IO_SQ:
1670 	case SPDK_NVME_OPC_DELETE_IO_CQ:
1671 		return handle_del_io_q(ctrlr, cmd,
1672 				       cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ);
1673 	default:
1674 		return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]);
1675 	}
1676 }
1677 
1678 static int
1679 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg)
1680 {
1681 	struct nvmf_vfio_user_sq *sq = cb_arg;
1682 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr;
1683 	uint16_t sqid, cqid;
1684 
1685 	assert(sq != NULL);
1686 	assert(vu_req != NULL);
1687 	assert(vu_ctrlr != NULL);
1688 
1689 	if (spdk_likely(vu_req->iovcnt)) {
1690 		vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx,
1691 			     vu_req_to_sg_t(vu_req, 0),
1692 			     vu_req->iov, vu_req->iovcnt);
1693 	}
1694 	sqid = sq->qid;
1695 	cqid = sq->cqid;
1696 
1697 	return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid],
1698 			       vu_req->req.rsp->nvme_cpl.cdw0,
1699 			       sqid,
1700 			       vu_req->req.cmd->nvme_cmd.cid,
1701 			       vu_req->req.rsp->nvme_cpl.status.sc,
1702 			       vu_req->req.rsp->nvme_cpl.status.sct);
1703 }
1704 
1705 static int
1706 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq,
1707 	    struct spdk_nvme_cmd *cmd)
1708 {
1709 	assert(sq != NULL);
1710 	if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
1711 		return consume_admin_cmd(ctrlr, cmd);
1712 	}
1713 
1714 	return handle_cmd_req(ctrlr, cmd, sq);
1715 }
1716 
1717 /* Returns the number of commands processed, or a negative value on error. */
1718 static int
1719 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail,
1720 		     struct nvmf_vfio_user_sq *sq)
1721 {
1722 	struct spdk_nvme_cmd *queue;
1723 	int count = 0;
1724 
1725 	assert(ctrlr != NULL);
1726 	assert(sq != NULL);
1727 
1728 	queue = q_addr(&sq->mapping);
1729 	while (*sq_headp(sq) != new_tail) {
1730 		int err;
1731 		struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)];
1732 
1733 		count++;
1734 
1735 		/*
1736 		 * SQHD must contain the new head pointer, so we must increase
1737 		 * it before we generate a completion.
1738 		 */
1739 		sq_head_advance(sq);
1740 
1741 		err = consume_cmd(ctrlr, sq, cmd);
1742 		if (err != 0) {
1743 			return err;
1744 		}
1745 	}
1746 
1747 	return count;
1748 }
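/*
 * For illustration: with a 32-entry SQ, head == 30 and a doorbell write of
 * new_tail == 2, the loop consumes the commands in slots 30, 31, 0 and 1
 * (sq_head_advance() wraps the head at sq->size) and returns count == 4. A
 * non-zero return from consume_cmd() aborts the walk and is returned as-is.
 */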
1749 
1750 static int
1751 enable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1752 {
1753 	int err;
1754 
1755 	assert(ctrlr != NULL);
1756 
1757 	err = acq_setup(ctrlr);
1758 	if (err != 0) {
1759 		return err;
1760 	}
1761 
1762 	err = asq_setup(ctrlr);
1763 	if (err != 0) {
1764 		return err;
1765 	}
1766 
1767 	return 0;
1768 }
1769 
1770 static void
1771 disable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1772 {
1773 	assert(ctrlr->sqs[0] != NULL);
1774 	assert(ctrlr->cqs[0] != NULL);
1775 
1776 	unmap_q(ctrlr, &ctrlr->sqs[0]->mapping);
1777 	unmap_q(ctrlr, &ctrlr->cqs[0]->mapping);
1778 
1779 	ctrlr->sqs[0]->size = 0;
1780 	*sq_headp(ctrlr->sqs[0]) = 0;
1781 	ctrlr->cqs[0]->size = 0;
1782 	*cq_dbl_headp(ctrlr, ctrlr->cqs[0]) = 0;
1783 }
1784 
1785 static void
1786 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
1787 {
1788 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1789 	struct nvmf_vfio_user_ctrlr *ctrlr;
1790 	struct nvmf_vfio_user_sq *sq;
1791 	struct nvmf_vfio_user_cq *cq;
1792 	void *map_start, *map_end;
1793 	int ret;
1794 
1795 	/*
1796 	 * We're not interested in any DMA regions that aren't mappable (we don't
1797 	 * support clients that don't share their memory).
1798 	 */
1799 	if (!info->vaddr) {
1800 		return;
1801 	}
1802 
1803 	map_start = info->mapping.iov_base;
1804 	map_end = info->mapping.iov_base + info->mapping.iov_len;
1805 
1806 	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
1807 	    (info->mapping.iov_len & MASK_2MB)) {
1808 		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n",
1809 			      info->vaddr, map_start, map_end);
1810 		return;
1811 	}
1812 
1813 	assert(endpoint != NULL);
1814 	if (endpoint->ctrlr == NULL) {
1815 		return;
1816 	}
1817 	ctrlr = endpoint->ctrlr;
1818 
1819 	SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint),
1820 		      map_start, map_end);
1821 
1822 	/* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering
1823 	 * with VFIO, so here we also check the protection bits before registering.
1824 	 */
1825 	if (info->prot == (PROT_WRITE | PROT_READ)) {
1826 		ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len);
1827 		if (ret) {
1828 			SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n",
1829 				    map_start, map_end, ret);
1830 		}
1831 	}
1832 
1833 	pthread_mutex_lock(&endpoint->lock);
1834 	TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) {
1835 		if (sq->sq_state != VFIO_USER_SQ_INACTIVE) {
1836 			continue;
1837 		}
1838 
1839 		cq = ctrlr->cqs[sq->cqid];
1840 
1841 		/* For the shared CQ case, use q_addr() to avoid mapping the CQ multiple times */
1842 		if (cq->size && q_addr(&cq->mapping) == NULL) {
1843 			ret = map_q(ctrlr, &cq->mapping, cq->size, true, false);
1844 			if (ret) {
1845 				SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n",
1846 					      cq->qid, cq->mapping.prp1,
1847 					      cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl));
1848 				continue;
1849 			}
1850 		}
1851 
1852 		if (sq->size) {
1853 			ret = map_q(ctrlr, &sq->mapping, sq->size, false, false);
1854 			if (ret) {
1855 				SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n",
1856 					      sq->qid, sq->mapping.prp1,
1857 					      sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd));
1858 				continue;
1859 			}
1860 		}
1861 		sq->sq_state = VFIO_USER_SQ_ACTIVE;
1862 		SPDK_DEBUGLOG(nvmf_vfio, "Remap SQ %u successfully\n", sq->qid);
1863 	}
1864 	pthread_mutex_unlock(&endpoint->lock);
1865 }
1866 
1867 static void
1868 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
1869 {
1870 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1871 	struct nvmf_vfio_user_sq *sq;
1872 	struct nvmf_vfio_user_cq *cq;
1873 	void *map_start, *map_end;
1874 	int ret = 0;
1875 
1876 	if (!info->vaddr) {
1877 		return;
1878 	}
1879 
1880 	map_start = info->mapping.iov_base;
1881 	map_end = info->mapping.iov_base + info->mapping.iov_len;
1882 
1883 	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
1884 	    (info->mapping.iov_len & MASK_2MB)) {
1885 		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n",
1886 			      info->vaddr, map_start, map_end);
1887 		return;
1888 	}
1889 
1890 	assert(endpoint != NULL);
1891 	SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint),
1892 		      map_start, map_end);
1893 
1894 	if (endpoint->ctrlr != NULL) {
1895 		struct nvmf_vfio_user_ctrlr *ctrlr;
1896 		ctrlr = endpoint->ctrlr;
1897 
1898 		pthread_mutex_lock(&endpoint->lock);
1899 		TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) {
1900 			if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) {
1901 				unmap_q(ctrlr, &sq->mapping);
1902 				sq->sq_state = VFIO_USER_SQ_INACTIVE;
1903 			}
1904 
1905 			cq = ctrlr->cqs[sq->cqid];
1906 			if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) {
1907 				unmap_q(ctrlr, &cq->mapping);
1908 			}
1909 		}
1910 		pthread_mutex_unlock(&endpoint->lock);
1911 	}
1912 
1913 	if (info->prot == (PROT_WRITE | PROT_READ)) {
1914 		ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len);
1915 		if (ret) {
1916 			SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n",
1917 				    map_start, map_end, ret);
1918 		}
1919 	}
1920 }
1921 
1922 static int
1923 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
1924 {
1925 	struct nvmf_vfio_user_sq *sq = cb_arg;
1926 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
1927 	int ret;
1928 
1929 	assert(sq != NULL);
1930 	assert(req != NULL);
1931 
1932 	if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) {
1933 		assert(sq->ctrlr != NULL);
1934 		assert(req != NULL);
1935 
1936 		memcpy(req->req.data,
1937 		       &req->req.rsp->prop_get_rsp.value.u64,
1938 		       req->req.length);
1939 	} else {
1940 		assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET);
1941 		assert(sq->ctrlr != NULL);
1942 		vu_ctrlr = sq->ctrlr;
1943 
1944 		if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) {
1945 			union spdk_nvme_cc_register cc, diff;
1946 
1947 			cc.raw = req->req.cmd->prop_set_cmd.value.u64;
1948 			diff.raw = cc.raw ^ req->cc.raw;
1949 
1950 			if (diff.bits.en) {
1951 				if (cc.bits.en) {
1952 					SPDK_DEBUGLOG(nvmf_vfio, "%s: MAP Admin queue\n", ctrlr_id(vu_ctrlr));
1953 					ret = enable_admin_queue(vu_ctrlr);
1954 					if (ret) {
1955 						SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(vu_ctrlr));
1956 						return ret;
1957 					}
1958 					sq->sq_state = VFIO_USER_SQ_ACTIVE;
1959 					vu_ctrlr->reset_shn = false;
1960 				} else {
1961 					vu_ctrlr->reset_shn = true;
1962 				}
1963 			}
1964 
1965 			if (diff.bits.shn) {
1966 				if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) {
1967 					vu_ctrlr->reset_shn = true;
1968 				}
1969 			}
1970 
1971 			if (vu_ctrlr->reset_shn) {
1972 				SPDK_DEBUGLOG(nvmf_vfio,
1973 					      "%s: UNMAP Admin queue\n",
1974 					      ctrlr_id(vu_ctrlr));
1975 				sq->sq_state = VFIO_USER_SQ_INACTIVE;
1976 				disable_admin_queue(vu_ctrlr);
1977 				/* For PCIe controller reset or shutdown, we will drop all AER responses */
1978 				nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr);
1979 			}
1980 		}
1981 	}
1982 
1983 	return 0;
1984 }
1985 
1986 /*
1987  * Handles a write at offset 0x1000 or more; this is the non-mapped path when a
1988  * doorbell is written via access_bar0_fn().
1989  *
1990  * DSTRD is set to fixed value 0 for NVMf.
1991  *
1992  */
1993 static int
1994 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf,
1995 		  const size_t count, loff_t pos, const bool is_write)
1996 {
1997 	assert(ctrlr != NULL);
1998 	assert(buf != NULL);
1999 
2000 	if (count != sizeof(uint32_t)) {
2001 		SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n",
2002 			    ctrlr_id(ctrlr), count);
2003 		errno = EINVAL;
2004 		return -1;
2005 	}
2006 
2007 	pos -= NVME_DOORBELLS_OFFSET;
2008 
2009 	/* pos must be dword aligned */
2010 	if ((pos & 0x3) != 0) {
2011 		SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos);
2012 		errno = EINVAL;
2013 		return -1;
2014 	}
2015 
2016 	/* convert byte offset to array index */
2017 	pos >>= 2;
2018 
2019 	if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) {
2020 		SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos);
2021 		errno = EINVAL;
2022 		return -1;
2023 	}
2024 
2025 	if (is_write) {
2026 		ctrlr->doorbells[pos] = *buf;
2027 		spdk_wmb();
2028 	} else {
2029 		spdk_rmb();
2030 		*buf = ctrlr->doorbells[pos];
2031 	}
2032 	return 0;
2033 }
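
/*
 * Illustrative sketch (not compiled): with DSTRD fixed to 0, the doorbell
 * array is a sequence of 32-bit registers starting at BAR0 offset 0x1000,
 * alternating SQ tail and CQ head per queue ID. The hypothetical helper below
 * only documents how the dword index computed by handle_dbl_access() maps back
 * to a queue; it is not used by this transport.
 */
#if 0
static void
doorbell_offset_to_queue(loff_t bar0_offset, uint16_t *qid, bool *is_cq_head)
{
	size_t idx = (bar0_offset - NVME_DOORBELLS_OFFSET) >> 2;

	*qid = idx / 2;		/* each queue pair consumes two dwords */
	*is_cq_head = idx & 1;	/* even index: SQ tail, odd index: CQ head */
}
#endif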
2034 
2035 static ssize_t
2036 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr,
2037 			  char *buf, size_t count, loff_t pos,
2038 			  bool is_write)
2039 {
2040 	struct nvmf_vfio_user_req *req;
2041 	const struct spdk_nvmf_registers *regs;
2042 
2043 	/* Construct a Fabric Property Get/Set command and send it */
2044 	req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]);
2045 	if (req == NULL) {
2046 		errno = ENOBUFS;
2047 		return -1;
2048 	}
2049 	regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr);
2050 	req->cc.raw = regs->cc.raw;
2051 
2052 	req->cb_fn = nvmf_vfio_user_prop_req_rsp;
2053 	req->cb_arg = vu_ctrlr->sqs[0];
2054 	req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC;
2055 	req->req.cmd->prop_set_cmd.cid = 0;
2056 	req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1;
2057 	req->req.cmd->prop_set_cmd.ofst = pos;
2058 	if (is_write) {
2059 		req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET;
2060 		if (req->req.cmd->prop_set_cmd.attrib.size) {
2061 			req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf;
2062 		} else {
2063 			req->req.cmd->prop_set_cmd.value.u32.high = 0;
2064 			req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf;
2065 		}
2066 	} else {
2067 		req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET;
2068 	}
2069 	req->req.length = count;
2070 	req->req.data = buf;
2071 
2072 	spdk_nvmf_request_exec_fabrics(&req->req);
2073 
2074 	return count;
2075 }
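
/*
 * Worked example (informational): the fabrics Property Get/Set ATTRIB.SIZE
 * field encodes a 4-byte access as 0 and an 8-byte access as 1, which is what
 * (count / 4) - 1 above produces. A guest read of the 8-byte CAP register at
 * offset 0x0 therefore becomes a Property Get with ofst=0x0 and attrib.size=1,
 * while a 4-byte write of CC at offset 0x14 becomes a Property Set with
 * ofst=0x14 and attrib.size=0.
 */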
2076 
2077 static ssize_t
2078 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos,
2079 	       bool is_write)
2080 {
2081 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2082 	struct nvmf_vfio_user_ctrlr *ctrlr;
2083 	int ret;
2084 
2085 	ctrlr = endpoint->ctrlr;
2086 	if (endpoint->need_async_destroy || !ctrlr) {
2087 		errno = EIO;
2088 		return -1;
2089 	}
2090 
2091 	SPDK_DEBUGLOG(nvmf_vfio,
2092 		      "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n",
2093 		      endpoint_id(endpoint), is_write ? "write" : "read",
2094 		      ctrlr, count, pos);
2095 
2096 	if (pos >= NVME_DOORBELLS_OFFSET) {
2097 		/*
2098 		 * The fact that the doorbells can be memory mapped doesn't mean
2099 		 * that the client (VFIO in QEMU) is obliged to memory map them,
2100 		 * it might still elect to access them via regular read/write;
2101 		 * we might also have had disable_mappable_bar0 set.
2102 		 */
2103 		ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count,
2104 					pos, is_write);
2105 		if (ret == 0) {
2106 			return count;
2107 		}
2108 		return ret;
2109 	}
2110 
2111 	return vfio_user_property_access(ctrlr, buf, count, pos, is_write);
2112 }
2113 
2114 static ssize_t
2115 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset,
2116 		  bool is_write)
2117 {
2118 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2119 
2120 	if (is_write) {
2121 		SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n",
2122 			    endpoint_id(endpoint), offset, offset + count);
2123 		errno = EINVAL;
2124 		return -1;
2125 	}
2126 
2127 	if (offset + count > NVME_REG_CFG_SIZE) {
2128 		SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n",
2129 			    endpoint_id(endpoint), offset, count,
2130 			    NVME_REG_CFG_SIZE);
2131 		errno = ERANGE;
2132 		return -1;
2133 	}
2134 
2135 	memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count);
2136 
2137 	return count;
2138 }
2139 
2140 static void
2141 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg)
2142 {
2143 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2144 
2145 	if (level >= LOG_DEBUG) {
2146 		SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
2147 	} else if (level >= LOG_INFO) {
2148 		SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
2149 	} else if (level >= LOG_NOTICE) {
2150 		SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg);
2151 	} else if (level >= LOG_WARNING) {
2152 		SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg);
2153 	} else {
2154 		SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg);
2155 	}
2156 }
2157 
2158 static int
2159 vfio_user_get_log_level(void)
2160 {
2161 	int level;
2162 
2163 	if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) {
2164 		return LOG_DEBUG;
2165 	}
2166 
2167 	level = spdk_log_to_syslog_level(spdk_log_get_level());
2168 	if (level < 0) {
2169 		return LOG_ERR;
2170 	}
2171 
2172 	return level;
2173 }
2174 
2175 static void
2176 init_pci_config_space(vfu_pci_config_space_t *p)
2177 {
2178 	/* MLBAR */
2179 	p->hdr.bars[0].raw = 0x0;
2180 	/* MUBAR */
2181 	p->hdr.bars[1].raw = 0x0;
2182 
2183 	/* vendor specific, let's set them to zero for now */
2184 	p->hdr.bars[3].raw = 0x0;
2185 	p->hdr.bars[4].raw = 0x0;
2186 	p->hdr.bars[5].raw = 0x0;
2187 
2188 	/* enable INTx */
2189 	p->hdr.intr.ipin = 0x1;
2190 }
2191 
2192 static void
2193 vfio_user_dev_migr_resume_done(struct spdk_nvmf_subsystem *subsystem,
2194 			       void *cb_arg, int status)
2195 {
2196 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg;
2197 
2198 	SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", ctrlr_id(vu_ctrlr), status);
2199 
2200 	vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2201 }
2202 
2203 static void
2204 vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
2205 			   void *cb_arg, int status);
2206 
2207 static void
2208 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem,
2209 			       void *cb_arg, int status)
2210 {
2211 	struct nvmf_vfio_user_endpoint *endpoint = cb_arg;
2212 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
2213 	int ret;
2214 
2215 	SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status);
2216 
2217 	if (!vu_ctrlr) {
2218 		return;
2219 	}
2220 	vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2221 
2222 	/* Basically, once we call `vfu_device_quiesced` the device is unquiesced from
2223 	 * libvfio-user's perspective, so from the moment `vfio_user_dev_quiesce_done` returns
2224 	 * libvfio-user might quiesce the device again. However, because resuming the NVMf
2225 	 * subsystem is an asynchronous operation, that quiesce might arrive _before_ the
2226 	 * subsystem has actually been resumed, so in the callback of `spdk_nvmf_subsystem_resume`
2227 	 * we need to check whether a quiesce was requested.
2228 	 */
2229 	if (vu_ctrlr->queued_quiesce) {
2230 		SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, pause again\n", ctrlr_id(vu_ctrlr));
2231 		vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
2232 		ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0,
2233 						vfio_user_dev_quiesce_done, vu_ctrlr);
2234 		if (ret < 0) {
2235 			vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2236 			SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret);
2237 		}
2238 	}
2239 }
2240 
2241 static void
2242 vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
2243 			   void *cb_arg, int status)
2244 {
2245 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg;
2246 	struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
2247 	int ret;
2248 
2249 	SPDK_DEBUGLOG(nvmf_vfio, "%s paused done with status %d\n", ctrlr_id(vu_ctrlr), status);
2250 
2251 	assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING);
2252 	vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
2253 	vfu_device_quiesced(endpoint->vfu_ctx, status);
2254 	vu_ctrlr->queued_quiesce = false;
2255 
2256 	/* `vfu_device_quiesced` can change the migration state,
2257 	 * so we need to re-check `vu_ctrlr->state`.
2258 	 */
2259 	if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) {
2260 		SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr));
2261 		return;
2262 	}
2263 
2264 	SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr));
2265 	vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
2266 	ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
2267 					 vfio_user_endpoint_resume_done, endpoint);
2268 	if (ret < 0) {
2269 		vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
2270 		SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret);
2271 	}
2272 }
2273 
2274 static int
2275 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx)
2276 {
2277 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2278 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
2279 	int ret;
2280 
2281 	if (!vu_ctrlr) {
2282 		return 0;
2283 	}
2284 
2285 	/* The NVMf library will destroy the controller when there are
2286 	 * no connected queue pairs left.
2287 	 */
2288 	if (!nvmf_subsystem_get_ctrlr((struct spdk_nvmf_subsystem *)endpoint->subsystem,
2289 				      vu_ctrlr->cntlid)) {
2290 		return 0;
2291 	}
2292 
2293 	SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr));
2294 
2295 	/* There is no race condition here as device quiesce callback
2296 	 * and nvmf_prop_set_cc() are running in the same thread context.
2297 	 */
2298 	if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) {
2299 		return 0;
2300 	} else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) {
2301 		return 0;
2302 	} else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) {
2303 		return 0;
2304 	}
2305 
2306 	switch (vu_ctrlr->state) {
2307 	case VFIO_USER_CTRLR_PAUSED:
2308 	case VFIO_USER_CTRLR_MIGRATING:
2309 		return 0;
2310 	case VFIO_USER_CTRLR_RUNNING:
2311 		vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
2312 		ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0,
2313 						vfio_user_dev_quiesce_done, vu_ctrlr);
2314 		if (ret < 0) {
2315 			vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2316 			SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret);
2317 			return 0;
2318 		}
2319 		break;
2320 	case VFIO_USER_CTRLR_RESUMING:
2321 		vu_ctrlr->queued_quiesce = true;
2322 		SPDK_DEBUGLOG(nvmf_vfio, "%s is busy, queueing quiesce request, current state %u\n", ctrlr_id(vu_ctrlr),
2323 			      vu_ctrlr->state);
2324 		break;
2325 	default:
2326 		assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING);
2327 		break;
2328 	}
2329 
2330 	errno = EBUSY;
2331 	return -1;
2332 }
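
/*
 * Summary of the controller pause/resume state machine driven by the
 * quiesce/resume callbacks above (documentation only, no new behavior):
 *
 *   RUNNING --quiesce_cb--------> PAUSING --quiesce_done--> PAUSED
 *   PAUSED  --subsystem_resume--> RESUMING --resume_done--> RUNNING
 *
 * A quiesce request that arrives while the controller is RESUMING is recorded
 * in queued_quiesce and replayed from vfio_user_endpoint_resume_done(), while
 * MIGRATING short-circuits the automatic resume in vfio_user_dev_quiesce_done().
 */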
2333 
2334 static void
2335 vfio_user_ctrlr_dump_migr_data(const char *name, struct vfio_user_nvme_migr_state *migr_data)
2336 {
2337 	struct spdk_nvme_registers *regs;
2338 	struct nvme_migr_sq_state *sq;
2339 	struct nvme_migr_cq_state *cq;
2340 	uint32_t *doorbell_base;
2341 	uint32_t i;
2342 
2343 	SPDK_NOTICELOG("Dump %s\n", name);
2344 
2345 	regs = (struct spdk_nvme_registers *)migr_data->bar0;
2346 	doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl;
2347 
2348 	SPDK_NOTICELOG("Registers\n");
2349 	SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw);
2350 	SPDK_NOTICELOG("CAP  0x%"PRIx64"\n", regs->cap.raw);
2351 	SPDK_NOTICELOG("VS   0x%x\n", regs->vs.raw);
2352 	SPDK_NOTICELOG("CC   0x%x\n", regs->cc.raw);
2353 	SPDK_NOTICELOG("AQA  0x%x\n", regs->aqa.raw);
2354 	SPDK_NOTICELOG("ASQ  0x%"PRIx64"\n", regs->asq);
2355 	SPDK_NOTICELOG("ACQ  0x%"PRIx64"\n", regs->acq);
2356 
2357 	SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_data.num_io_queues);
2358 	for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
2359 		sq = &migr_data->qps[i].sq;
2360 		cq = &migr_data->qps[i].cq;
2361 
2362 		if (sq->size) {
2363 			SPDK_NOTICELOG("SQID %u, SQ DOORBELL %u\n", sq->sqid, doorbell_base[i * 2]);
2364 			SPDK_NOTICELOG("SQ SQID %u, CQID %u, HEAD %u, SIZE %u, DMA ADDR 0x%"PRIx64"\n",
2365 				       sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr);
2366 		}
2367 
2368 		if (cq->size) {
2369 			SPDK_NOTICELOG("CQID %u, CQ DOORBELL %u\n", cq->cqid, doorbell_base[i * 2 + 1]);
2370 			SPDK_NOTICELOG("CQ CQID %u, PHASE %u, TAIL %u, SIZE %u, IV %u, IEN %u, DMA ADDR 0x%"PRIx64"\n",
2371 				       cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr);
2372 		}
2373 	}
2374 
2375 	SPDK_NOTICELOG("%s Dump Done\n", name);
2376 }
2377 
2378 /* Read region 9 content and restore it to migration data structures */
2379 static int
2380 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint,
2381 			      struct vfio_user_nvme_migr_state *migr_state)
2382 {
2383 	void *data_ptr = endpoint->migr_data;
2384 
2385 	/* Load nvme_migr_device_state first */
2386 	memcpy(&migr_state->ctrlr_data, data_ptr, sizeof(struct nvme_migr_device_state));
2387 	/* TODO: version check */
2388 	if (migr_state->ctrlr_data.magic != VFIO_USER_NVME_MIGR_MAGIC) {
2389 		SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_data.magic);
2390 		return -EINVAL;
2391 	}
2392 
2393 	/* Load private controller data */
2394 	data_ptr = endpoint->migr_data + migr_state->ctrlr_data.private_data_offset;
2395 	memcpy(&migr_state->private_data, data_ptr, migr_state->ctrlr_data.private_data_len);
2396 
2397 	/* Load queue pairs */
2398 	data_ptr = endpoint->migr_data + migr_state->ctrlr_data.qp_offset;
2399 	memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_data.qp_len);
2400 
2401 	/* Load BAR0 */
2402 	data_ptr = endpoint->migr_data + migr_state->ctrlr_data.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX];
2403 	memcpy(&migr_state->bar0, data_ptr, migr_state->ctrlr_data.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]);
2404 
2405 	/* Load CFG */
2406 	data_ptr = endpoint->migr_data + migr_state->ctrlr_data.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX];
2407 	memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_data.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]);
2408 
2409 	return 0;
2410 }
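
/*
 * Illustrative sketch (not compiled): the migration data region is a flat
 * buffer laid out by vfio_user_migr_ctrlr_save_data() and parsed back above as
 *
 *   [nvme_migr_device_state][private ctrlr data][queue pair states][BAR0][CFG]
 *
 * with each section located via the offset/length fields stored in the leading
 * nvme_migr_device_state header. The hypothetical helper below only documents
 * the minimum size implied by that layout for a given number of queue pairs.
 */
#if 0
static size_t
migr_data_min_len(uint32_t nr_qps)
{
	return sizeof(struct nvme_migr_device_state) +
	       sizeof(struct nvmf_ctrlr_migr_data) +
	       nr_qps * (sizeof(struct nvme_migr_sq_state) +
			 sizeof(struct nvme_migr_cq_state)) +
	       NVME_REG_BAR0_SIZE + NVME_REG_CFG_SIZE;
}
#endif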
2411 
2412 
2413 static void
2414 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
2415 {
2416 	struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr;
2417 	struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
2418 	struct nvmf_vfio_user_sq *sq;
2419 	struct nvmf_vfio_user_cq *cq;
2420 	struct vfio_user_nvme_migr_state migr_state = {};
2421 	uint64_t data_offset;
2422 	void *data_ptr;
2423 	int num_aers;
2424 	struct spdk_nvme_registers *regs;
2425 	uint32_t *doorbell_base;
2426 	uint32_t i = 0;
2427 	uint16_t sqid, cqid;
2428 
2429 	/* Save all data to vfio_user_nvme_migr_state first, then copy it
2430 	 * to the device migration region at the end.
2431 	 */
2432 
2433 	/* save magic number */
2434 	migr_state.ctrlr_data.magic = VFIO_USER_NVME_MIGR_MAGIC;
2435 
2436 	/* save controller data */
2437 	num_aers = nvmf_ctrlr_save_aers(ctrlr, migr_state.ctrlr_data.aer_cids,
2438 					256);
2439 	assert(num_aers >= 0);
2440 	migr_state.ctrlr_data.nr_aers = num_aers;
2441 
2442 	/* save controller private data */
2443 	nvmf_ctrlr_save_migr_data(ctrlr, (struct nvmf_ctrlr_migr_data *)&migr_state.private_data);
2444 
2445 	/* save connected queue pairs */
2446 	TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) {
2447 		/* save sq */
2448 		sqid = sq->qid;
2449 		migr_state.qps[sqid].sq.sqid = sq->qid;
2450 		migr_state.qps[sqid].sq.cqid = sq->cqid;
2451 		migr_state.qps[sqid].sq.head = *sq_headp(sq);
2452 		migr_state.qps[sqid].sq.size = sq->size;
2453 		migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1;
2454 
2455 		/* save cq; in the shared CQ case the same CQ may be saved multiple times */
2456 		cqid = sq->cqid;
2457 		cq = vu_ctrlr->cqs[cqid];
2458 		migr_state.qps[cqid].cq.cqid = cqid;
2459 		migr_state.qps[cqid].cq.tail = *cq_tailp(cq);
2460 		migr_state.qps[cqid].cq.ien = cq->ien;
2461 		migr_state.qps[cqid].cq.iv = cq->iv;
2462 		migr_state.qps[cqid].cq.size = cq->size;
2463 		migr_state.qps[cqid].cq.phase = cq->phase;
2464 		migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1;
2465 		i++;
2466 	}
2467 
2468 	assert(i > 0);
2469 	migr_state.ctrlr_data.num_io_queues = i - 1;
2470 
2471 	regs = (struct spdk_nvme_registers *)&migr_state.bar0;
2472 	/* Save mandatory registers to bar0 */
2473 	regs->cap.raw = ctrlr->vcprop.cap.raw;
2474 	regs->vs.raw = ctrlr->vcprop.vs.raw;
2475 	regs->cc.raw = ctrlr->vcprop.cc.raw;
2476 	regs->aqa.raw = ctrlr->vcprop.aqa.raw;
2477 	regs->asq = ctrlr->vcprop.asq;
2478 	regs->acq = ctrlr->vcprop.acq;
2479 	/* Save doorbells */
2480 	doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl;
2481 	memcpy(doorbell_base, (void *)vu_ctrlr->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
2482 
2483 	/* Save PCI configuration space */
2484 	memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE);
2485 
2486 	/* Save all data to device migration region */
2487 	data_ptr = endpoint->migr_data;
2488 
2489 	/* Copy private controller data */
2490 	data_offset = sizeof(struct nvme_migr_device_state);
2491 	data_ptr += data_offset;
2492 	migr_state.ctrlr_data.private_data_offset = data_offset;
2493 	migr_state.ctrlr_data.private_data_len = sizeof(struct nvmf_ctrlr_migr_data);
2494 	memcpy(data_ptr, &migr_state.private_data, sizeof(struct nvmf_ctrlr_migr_data));
2495 
2496 	/* Copy queue pairs */
2497 	data_offset += sizeof(struct nvmf_ctrlr_migr_data);
2498 	data_ptr += sizeof(struct nvmf_ctrlr_migr_data);
2499 	migr_state.ctrlr_data.qp_offset = data_offset;
2500 	migr_state.ctrlr_data.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof(
2501 			struct nvme_migr_cq_state));
2502 	memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_data.qp_len);
2503 
2504 	/* Copy BAR0 */
2505 	data_offset += migr_state.ctrlr_data.qp_len;
2506 	data_ptr += migr_state.ctrlr_data.qp_len;
2507 	migr_state.ctrlr_data.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset;
2508 	migr_state.ctrlr_data.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVME_REG_BAR0_SIZE;
2509 	memcpy(data_ptr, &migr_state.bar0, NVME_REG_BAR0_SIZE);
2510 
2511 	/* Copy CFG */
2512 	data_offset += NVME_REG_BAR0_SIZE;
2513 	data_ptr += NVME_REG_BAR0_SIZE;
2514 	migr_state.ctrlr_data.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset;
2515 	migr_state.ctrlr_data.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE;
2516 	memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE);
2517 
2518 	/* Finally, copy the device state header to the start of the region */
2519 	memcpy(endpoint->migr_data, &migr_state.ctrlr_data, sizeof(struct nvme_migr_device_state));
2520 
2521 	if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) {
2522 		vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state);
2523 	}
2524 }
2525 
2526 static int
2527 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr,
2528 				   struct vfio_user_nvme_migr_state *migr_state)
2529 {
2530 	uint32_t i, qsize = 0;
2531 	uint16_t sqid, cqid;
2532 	struct vfio_user_nvme_migr_qp migr_qp;
2533 	void *addr;
2534 	int ret;
2535 
2536 	if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) {
2537 		vfio_user_ctrlr_dump_migr_data("RESUME", migr_state);
2538 	}
2539 
2540 	/* restore connected queue pairs */
2541 	for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
2542 		migr_qp =  migr_state->qps[i];
2543 
2544 		qsize = migr_qp.sq.size;
2545 		if (qsize) {
2546 			struct nvmf_vfio_user_sq *sq;
2547 
2548 			sqid = migr_qp.sq.sqid;
2549 			if (sqid != i) {
2550 				SPDK_ERRLOG("Expected sqid %u but got %u\n", i, sqid);
2551 				return -EINVAL;
2552 			}
2553 
2554 			/* allocate sq if necessary */
2555 			if (vu_ctrlr->sqs[sqid] == NULL) {
2556 				ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid);
2557 				if (ret) {
2558 					SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid);
2559 					return -EFAULT;
2560 				}
2561 			}
2562 
2563 			sq = vu_ctrlr->sqs[sqid];
2564 
2565 			sq->size = qsize;
2566 
2567 			ret = alloc_sq_reqs(vu_ctrlr, sq);
2568 			if (ret) {
2569 				SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid);
2570 				return -EFAULT;
2571 			}
2572 
2573 			/* restore sq */
2574 			sq->cqid = migr_qp.sq.cqid;
2575 			*sq_headp(sq) = migr_qp.sq.head;
2576 			sq->mapping.prp1 = migr_qp.sq.dma_addr;
2577 			addr = map_one(vu_ctrlr->endpoint->vfu_ctx,
2578 				       sq->mapping.prp1, sq->size * 64,
2579 				       sq->mapping.sg, &sq->mapping.iov,
2580 				       PROT_READ);
2581 			if (addr == NULL) {
2582 				SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n",
2583 					    sqid, sq->mapping.prp1, sq->size);
2584 				return -EFAULT;
2585 			}
2586 		}
2587 
2588 		qsize = migr_qp.cq.size;
2589 		if (qsize) {
2590 			struct nvmf_vfio_user_cq *cq;
2591 
2592 			/* restore cq */
2593 			cqid = migr_qp.sq.cqid;
2594 			assert(cqid == i);
2595 
2596 			/* allocate cq if necessary */
2597 			if (vu_ctrlr->cqs[cqid] == NULL) {
2598 				ret = init_cq(vu_ctrlr, cqid);
2599 				if (ret) {
2600 					SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid);
2601 					return -EFAULT;
2602 				}
2603 			}
2604 
2605 			cq = vu_ctrlr->cqs[cqid];
2606 
2607 			cq->size = qsize;
2608 
2609 			*cq_tailp(cq) = migr_qp.cq.tail;
2610 			cq->mapping.prp1 = migr_qp.cq.dma_addr;
2611 			cq->ien = migr_qp.cq.ien;
2612 			cq->iv = migr_qp.cq.iv;
2613 			cq->phase = migr_qp.cq.phase;
2614 			addr = map_one(vu_ctrlr->endpoint->vfu_ctx,
2615 				       cq->mapping.prp1, cq->size * 16,
2616 				       cq->mapping.sg, &cq->mapping.iov,
2617 				       PROT_READ | PROT_WRITE);
2618 			if (addr == NULL) {
2619 				SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n",
2620 					    cqid, cq->mapping.prp1, cq->size);
2621 				return -EFAULT;
2622 			}
2623 		}
2624 	}
2625 
2626 	return 0;
2627 }
2628 
2629 static int
2630 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
2631 {
2632 	struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
2633 	struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr;
2634 	uint32_t *doorbell_base;
2635 	struct vfio_user_nvme_migr_state migr_state = {};
2636 	struct spdk_nvme_registers *regs;
2637 	struct spdk_nvme_cmd cmd;
2638 	uint16_t i;
2639 	int rc = 0;
2640 
2641 	assert(endpoint->migr_data != NULL);
2642 	assert(ctrlr != NULL);
2643 	rc = vfio_user_migr_stream_to_data(endpoint, &migr_state);
2644 	if (rc) {
2645 		return rc;
2646 	}
2647 
2648 	rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state);
2649 	if (rc) {
2650 		return rc;
2651 	}
2652 
2653 	/* restore PCI configuration space */
2654 	memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE);
2655 
2656 	regs = (struct spdk_nvme_registers *)&migr_state.bar0;
2657 	doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl;
2658 	/* restore doorbells from saved registers */
2659 	memcpy((void *)vu_ctrlr->doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE);
2660 
2661 	/* restore controller registers after ADMIN queue connection */
2662 	ctrlr->vcprop.cap.raw = regs->cap.raw;
2663 	ctrlr->vcprop.vs.raw = regs->vs.raw;
2664 	ctrlr->vcprop.cc.raw = regs->cc.raw;
2665 	ctrlr->vcprop.aqa.raw = regs->aqa.raw;
2666 	ctrlr->vcprop.asq = regs->asq;
2667 	ctrlr->vcprop.acq = regs->acq;
2668 
2669 	/* restore controller private data */
2670 	rc = nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.private_data);
2671 	if (rc) {
2672 		return rc;
2673 	}
2674 
2675 	/* resubmit pending AERs */
2676 	for (i = 0; i < migr_state.ctrlr_data.nr_aers; i++) {
2677 		SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr),
2678 			      migr_state.ctrlr_data.aer_cids[i]);
2679 		memset(&cmd, 0, sizeof(cmd));
2680 		cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST;
2681 		cmd.cid = migr_state.ctrlr_data.aer_cids[i];
2682 		rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]);
2683 		if (rc) {
2684 			break;
2685 		}
2686 	}
2687 
2688 	return rc;
2689 }
2690 
2691 static void
2692 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
2693 {
2694 	uint32_t i;
2695 	struct nvmf_vfio_user_sq *sq;
2696 
2697 	for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
2698 		sq = vu_ctrlr->sqs[i];
2699 		if (!sq || !sq->size) {
2700 			continue;
2701 		}
2702 
2703 		if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
2704 			/* ADMIN queue pair is always in the poll group, just enable it */
2705 			sq->sq_state = VFIO_USER_SQ_ACTIVE;
2706 		} else {
2707 			spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair);
2708 		}
2709 	}
2710 }
2711 
2712 static int
2713 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
2714 {
2715 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2716 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
2717 	struct nvmf_vfio_user_sq *sq;
2718 	int ret = 0;
2719 
2720 	SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint),
2721 		      vu_ctrlr->state, state);
2722 
2723 	switch (state) {
2724 	case VFU_MIGR_STATE_STOP_AND_COPY:
2725 		vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
2726 		vfio_user_migr_ctrlr_save_data(vu_ctrlr);
2727 		break;
2728 	case VFU_MIGR_STATE_STOP:
2729 		vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
2730 		break;
2731 	case VFU_MIGR_STATE_PRE_COPY:
2732 		assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED);
2733 		vu_ctrlr->migr_reg.pending_bytes = vfio_user_migr_data_len();
2734 		vu_ctrlr->migr_reg.last_data_offset = 0;
2735 		vu_ctrlr->in_source_vm = true;
2736 		break;
2737 	case VFU_MIGR_STATE_RESUME:
2738 		/*
2739 		 * The destination ADMIN queue pair is connected when starting the VM,
2740 		 * but it isn't enabled in the destination VM yet, so the poll group
2741 		 * will do nothing with the ADMIN queue pair for now.
2742 		 */
2743 		if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
2744 			break;
2745 		}
2746 
2747 		assert(!vu_ctrlr->in_source_vm);
2748 		vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
2749 
2750 		sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs);
2751 		assert(sq != NULL);
2752 		assert(sq->qpair.qid == 0);
2753 		sq->sq_state = VFIO_USER_SQ_INACTIVE;
2754 
2755 		/* Free the ADMIN SQ resources first; they will be reallocated
2756 		 * based on the queue size received from the source VM.
2757 		 */
2758 		free_sq_reqs(sq);
2759 		sq->size = 0;
2760 		break;
2761 	case VFU_MIGR_STATE_RUNNING:
2762 		if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) {
2763 			break;
2764 		}
2765 
2766 		if (!vu_ctrlr->in_source_vm) {
2767 			/* Restore destination VM from BAR9 */
2768 			ret = vfio_user_migr_ctrlr_restore(vu_ctrlr);
2769 			if (ret) {
2770 				break;
2771 			}
2772 			vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr);
2773 			vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2774 		} else {
2775 			/* Rollback source VM */
2776 			vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
2777 			ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
2778 							 vfio_user_dev_migr_resume_done, vu_ctrlr);
2779 			if (ret < 0) {
2780 				/* TODO: fail controller with CFS bit set */
2781 				vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
2782 				SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret);
2783 				break;
2784 			}
2785 		}
2786 		break;
2787 
2788 	default:
2789 		return -EINVAL;
2790 	}
2791 
2792 	return ret;
2793 }
2794 
2795 static uint64_t
2796 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx)
2797 {
2798 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2799 	struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr;
2800 	struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg;
2801 
2802 	SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u, pending bytes 0x%"PRIx64"\n", endpoint_id(endpoint),
2803 		      ctrlr->state, migr_reg->pending_bytes);
2804 
2805 	return migr_reg->pending_bytes;
2806 }
2807 
2808 static int
2809 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size)
2810 {
2811 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2812 	struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr;
2813 	struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg;
2814 
2815 	if (migr_reg->last_data_offset == vfio_user_migr_data_len()) {
2816 		*offset = vfio_user_migr_data_len();
2817 		if (size) {
2818 			*size = 0;
2819 		}
2820 		migr_reg->pending_bytes = 0;
2821 	} else {
2822 		*offset = 0;
2823 		if (size) {
2824 			*size = vfio_user_migr_data_len();
2825 			if (ctrlr->state == VFIO_USER_CTRLR_MIGRATING) {
2826 				vfio_user_migr_ctrlr_save_data(ctrlr);
2827 				migr_reg->last_data_offset = vfio_user_migr_data_len();
2828 			}
2829 		}
2830 	}
2831 
2832 	SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state);
2833 
2834 	return 0;
2835 }
2836 
2837 static ssize_t
2838 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset)
2839 {
2840 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2841 	struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr;
2842 	struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg;
2843 
2844 	memcpy(buf, endpoint->migr_data, count);
2845 	migr_reg->pending_bytes = 0;
2846 
2847 	return 0;
2848 }
2849 
2850 static ssize_t
2851 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset)
2852 {
2853 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2854 
2855 	memcpy(endpoint->migr_data, buf, count);
2856 
2857 	return 0;
2858 }
2859 
2860 static int
2861 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx, uint64_t count)
2862 {
2863 	SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count);
2864 
2865 	return 0;
2866 }
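
/*
 * Summary of the migration flow implemented by the callbacks above
 * (documentation only): on the source, libvfio-user repeatedly queries
 * get_pending_bytes(), calls prepare_data() (which snapshots the controller
 * into the migration region while MIGRATING) and then read_data(); on the
 * destination, write_data() fills the region, and the RESUME/RUNNING
 * transitions in vfio_user_migration_device_state_transition() restore the
 * controller from it.
 */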
2867 
2868 static int
2869 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport,
2870 			struct nvmf_vfio_user_endpoint *endpoint)
2871 {
2872 	int ret;
2873 	ssize_t cap_offset;
2874 	vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx;
2875 	struct iovec migr_sparse_mmap = {};
2876 
2877 	struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 };
2878 	struct pxcap pxcap = {
2879 		.hdr.id = PCI_CAP_ID_EXP,
2880 		.pxcaps.ver = 0x2,
2881 		.pxdcap = {.rer = 0x1, .flrc = 0x1},
2882 		.pxdcap2.ctds = 0x1
2883 	};
2884 
2885 	struct msixcap msixcap = {
2886 		.hdr.id = PCI_CAP_ID_MSIX,
2887 		.mxc.ts = NVME_IRQ_MSIX_NUM - 1,
2888 		.mtab = {.tbir = 0x4, .to = 0x0},
2889 		.mpba = {.pbir = 0x5, .pbao = 0x0}
2890 	};
2891 
2892 	struct iovec sparse_mmap[] = {
2893 		{
2894 			.iov_base = (void *)NVME_DOORBELLS_OFFSET,
2895 			.iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE,
2896 		},
2897 	};
2898 
2899 	const vfu_migration_callbacks_t migr_callbacks = {
2900 		.version = VFU_MIGR_CALLBACKS_VERS,
2901 		.transition = &vfio_user_migration_device_state_transition,
2902 		.get_pending_bytes = &vfio_user_migration_get_pending_bytes,
2903 		.prepare_data = &vfio_user_migration_prepare_data,
2904 		.read_data = &vfio_user_migration_read_data,
2905 		.data_written = &vfio_user_migration_data_written,
2906 		.write_data = &vfio_user_migration_write_data
2907 	};
2908 
2909 	ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0);
2910 	if (ret < 0) {
2911 		SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx);
2912 		return ret;
2913 	}
2914 	vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0);
2915 	/*
2916 	 * 0x02, controller uses the NVM Express programming interface
2917 	 * 0x08, non-volatile memory controller
2918 	 * 0x01, mass storage controller
2919 	 */
2920 	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);
2921 
2922 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
2923 	if (cap_offset < 0) {
2924 		SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx);
2925 		return cap_offset;
2926 	}
2927 
2928 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
2929 	if (cap_offset < 0) {
2930 		SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx);
2931 		return cap_offset;
2932 	}
2933 
2934 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
2935 	if (cap_offset < 0) {
2936 		SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx);
2937 		return cap_offset;
2938 	}
2939 
2940 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
2941 			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
2942 	if (ret < 0) {
2943 		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
2944 		return ret;
2945 	}
2946 
2947 	if (vu_transport->transport_opts.disable_mappable_bar0) {
2948 		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
2949 				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
2950 				       NULL, 0, -1, 0);
2951 	} else {
2952 		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
2953 				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
2954 				       sparse_mmap, 1, endpoint->devmem_fd, 0);
2955 	}
2956 
2957 	if (ret < 0) {
2958 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
2959 		return ret;
2960 	}
2961 
2962 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE,
2963 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
2964 	if (ret < 0) {
2965 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
2966 		return ret;
2967 	}
2968 
2969 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE,
2970 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
2971 	if (ret < 0) {
2972 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
2973 		return ret;
2974 	}
2975 
2976 	ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
2977 	if (ret < 0) {
2978 		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
2979 		return ret;
2980 	}
2981 
2982 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
2983 	if (ret < 0) {
2984 		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
2985 		return ret;
2986 	}
2987 
2988 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
2989 	if (ret < 0) {
2990 		SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
2991 		return ret;
2992 	}
2993 
2994 	vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb);
2995 
2996 	migr_sparse_mmap.iov_base = (void *)4096;
2997 	migr_sparse_mmap.iov_len = vfio_user_migr_data_len();
2998 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX,
2999 			       vfu_get_migr_register_area_size() + vfio_user_migr_data_len(),
3000 			       NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap,
3001 			       1, endpoint->migr_fd, 0);
3002 	if (ret < 0) {
3003 		SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx);
3004 		return ret;
3005 	}
3006 
3007 	ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks,
3008 			vfu_get_migr_register_area_size());
3009 	if (ret < 0) {
3010 		SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx);
3011 		return ret;
3012 	}
3013 
3014 	ret = vfu_realize_ctx(vfu_ctx);
3015 	if (ret < 0) {
3016 		SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx);
3017 		return ret;
3018 	}
3019 
3020 	endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx);
3021 	assert(endpoint->pci_config_space != NULL);
3022 	init_pci_config_space(endpoint->pci_config_space);
3023 
3024 	assert(cap_offset != 0);
3025 	endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset);
3026 
3027 	return 0;
3028 }
3029 
3030 static void
3031 _free_ctrlr(void *ctx)
3032 {
3033 	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
3034 	struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint;
3035 
3036 	spdk_poller_unregister(&ctrlr->vfu_ctx_poller);
3037 	free(ctrlr);
3038 
3039 	if (endpoint && endpoint->need_async_destroy) {
3040 		nvmf_vfio_user_destroy_endpoint(endpoint);
3041 	}
3042 }
3043 
3044 static void
3045 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
3046 {
3047 	int i;
3048 	assert(ctrlr != NULL);
3049 
3050 	SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr));
3051 
3052 	for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
3053 		free_qp(ctrlr, i);
3054 	}
3055 
3056 	if (ctrlr->thread == spdk_get_thread()) {
3057 		_free_ctrlr(ctrlr);
3058 	} else {
3059 		spdk_thread_send_msg(ctrlr->thread, _free_ctrlr, ctrlr);
3060 	}
3061 }
3062 
3063 static void
3064 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport,
3065 			    struct nvmf_vfio_user_endpoint *endpoint)
3066 {
3067 	struct nvmf_vfio_user_ctrlr *ctrlr;
3068 	int err = 0;
3069 
3070 	/* First, construct a vfio-user CUSTOM transport controller */
3071 	ctrlr = calloc(1, sizeof(*ctrlr));
3072 	if (ctrlr == NULL) {
3073 		err = -ENOMEM;
3074 		goto out;
3075 	}
3076 	/* We can only support one connection for now */
3077 	ctrlr->cntlid = 0x1;
3078 	ctrlr->transport = transport;
3079 	ctrlr->endpoint = endpoint;
3080 	ctrlr->doorbells = endpoint->doorbells;
3081 	TAILQ_INIT(&ctrlr->connected_sqs);
3082 
3083 	/* Then, construct an admin queue pair */
3084 	err = init_sq(ctrlr, &transport->transport, 0);
3085 	if (err != 0) {
3086 		free(ctrlr);
3087 		goto out;
3088 	}
3089 
3090 	err = init_cq(ctrlr, 0);
3091 	if (err != 0) {
3092 		free(ctrlr);
3093 		goto out;
3094 	}
3095 
3096 	ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
3097 
3098 	err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]);
3099 	if (err != 0) {
3100 		free(ctrlr);
3101 		goto out;
3102 	}
3103 	endpoint->ctrlr = ctrlr;
3104 
3105 	/* Notify the generic layer about the new admin queue pair */
3106 	spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair);
3107 
3108 out:
3109 	if (err != 0) {
3110 		SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n",
3111 			    endpoint_id(endpoint), strerror(-err));
3112 	}
3113 }
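
/*
 * Connection bring-up note (documentation only): spdk_nvmf_tgt_new_qpair()
 * above hands the admin queue pair to a poll group asynchronously; once the
 * internally generated connect request completes, handle_queue_connect_rsp()
 * records the controller's cntlid, owning thread and vfu_ctx poller and moves
 * the controller to VFIO_USER_CTRLR_RUNNING.
 */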
3114 
3115 static int
3116 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport,
3117 		      const struct spdk_nvme_transport_id *trid,
3118 		      struct spdk_nvmf_listen_opts *listen_opts)
3119 {
3120 	struct nvmf_vfio_user_transport *vu_transport;
3121 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
3122 	char path[PATH_MAX] = {};
3123 	char uuid[PATH_MAX] = {};
3124 	int ret;
3125 
3126 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
3127 					transport);
3128 
3129 	pthread_mutex_lock(&vu_transport->lock);
3130 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
3131 		/* Only compare traddr */
3132 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
3133 			pthread_mutex_unlock(&vu_transport->lock);
3134 			return -EEXIST;
3135 		}
3136 	}
3137 	pthread_mutex_unlock(&vu_transport->lock);
3138 
3139 	endpoint = calloc(1, sizeof(*endpoint));
3140 	if (!endpoint) {
3141 		return -ENOMEM;
3142 	}
3143 
3144 	pthread_mutex_init(&endpoint->lock, NULL);
3145 	endpoint->devmem_fd = -1;
3146 	memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));
3147 
3148 	ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint));
3149 	if (ret < 0 || ret >= PATH_MAX) {
3150 		SPDK_ERRLOG("%s: failed to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno));
3151 		ret = -1;
3152 		goto out;
3153 	}
3154 
3155 	ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
3156 	if (ret == -1) {
3157 		SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n",
3158 			    endpoint_id(endpoint), path, spdk_strerror(errno));
3159 		goto out;
3160 	}
3161 
3162 	endpoint->devmem_fd = ret;
3163 	ret = ftruncate(endpoint->devmem_fd,
3164 			NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE);
3165 	if (ret != 0) {
3166 		SPDK_ERRLOG("%s: failed to ftruncate file %s: %s.\n", endpoint_id(endpoint), path,
3167 			    spdk_strerror(errno));
3168 		goto out;
3169 	}
3170 
3171 	endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
3172 				   PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET);
3173 	if (endpoint->doorbells == MAP_FAILED) {
3174 		SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
3175 		endpoint->doorbells = NULL;
3176 		ret = -1;
3177 		goto out;
3178 	}
3179 
3180 	ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint));
3181 	if (ret < 0 || ret >= PATH_MAX) {
3182 		SPDK_ERRLOG("%s: failed to get migration file path: %s.\n", endpoint_id(endpoint),
3183 			    spdk_strerror(errno));
3184 		ret = -1;
3185 		goto out;
3186 	}
3187 	ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
3188 	if (ret == -1) {
3189 		SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n",
3190 			    endpoint_id(endpoint), path, spdk_strerror(errno));
3191 		goto out;
3192 	}
3193 
3194 	endpoint->migr_fd = ret;
3195 	ret = ftruncate(endpoint->migr_fd,
3196 			vfu_get_migr_register_area_size() + vfio_user_migr_data_len());
3197 	if (ret != 0) {
3198 		SPDK_ERRLOG("%s: failed to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path,
3199 			    spdk_strerror(errno));
3200 		goto out;
3201 	}
3202 
3203 	endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(),
3204 				   PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size());
3205 	if (endpoint->migr_data == MAP_FAILED) {
3206 		SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
3207 		endpoint->migr_data = NULL;
3208 		ret = -1;
3209 		goto out;
3210 	}
3211 
3212 	ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
3213 	if (ret < 0 || ret >= PATH_MAX) {
3214 		SPDK_ERRLOG("%s: failed to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno));
3215 		ret = -1;
3216 		goto out;
3217 	}
3218 
3219 	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
3220 					   endpoint, VFU_DEV_TYPE_PCI);
3221 	if (endpoint->vfu_ctx == NULL) {
3222 		SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
3223 			    endpoint_id(endpoint));
3224 		ret = -1;
3225 		goto out;
3226 	}
3227 	vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, vfio_user_get_log_level());
3228 
3229 	ret = vfio_user_dev_info_fill(vu_transport, endpoint);
3230 	if (ret < 0) {
3231 		goto out;
3232 	}
3233 
3234 	pthread_mutex_lock(&vu_transport->lock);
3235 	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
3236 	pthread_mutex_unlock(&vu_transport->lock);
3237 
3238 	SPDK_DEBUGLOG(nvmf_vfio, "%s: doorbells %p\n", uuid, endpoint->doorbells);
3239 
3240 out:
3241 	if (ret != 0) {
3242 		nvmf_vfio_user_destroy_endpoint(endpoint);
3243 	}
3244 
3245 	return ret;
3246 }
3247 
3248 static void
3249 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
3250 			   const struct spdk_nvme_transport_id *trid)
3251 {
3252 	struct nvmf_vfio_user_transport *vu_transport;
3253 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
3254 
3255 	assert(trid != NULL);
3256 	assert(trid->traddr != NULL);
3257 
3258 	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
3259 
3260 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
3261 					transport);
3262 
3263 	pthread_mutex_lock(&vu_transport->lock);
3264 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
3265 		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
3266 			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
3267 			/* Defer freeing the endpoint resources until the controller
3268 			 * is freed.  There are two cases that reach here:
3269 			 * 1. the nvmf target is killed while a VM is connected
3270 			 * 2. the listener is removed via an RPC call
3271 			 * In both cases the NVMf library will disconnect all queue pairs.
3272 			 */
3273 			if (endpoint->ctrlr) {
3274 				assert(!endpoint->need_async_destroy);
3275 				endpoint->need_async_destroy = true;
3276 				pthread_mutex_unlock(&vu_transport->lock);
3277 				return;
3278 			}
3279 
3280 			nvmf_vfio_user_destroy_endpoint(endpoint);
3281 			pthread_mutex_unlock(&vu_transport->lock);
3282 			return;
3283 		}
3284 	}
3285 	pthread_mutex_unlock(&vu_transport->lock);
3286 
3287 	SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr);
3288 }
3289 
3290 static void
3291 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport,
3292 			  struct spdk_nvmf_subsystem *subsystem,
3293 			  struct spdk_nvmf_ctrlr_data *cdata)
3294 {
3295 	cdata->vid = SPDK_PCI_VID_NUTANIX;
3296 	cdata->ssvid = SPDK_PCI_VID_NUTANIX;
3297 	cdata->ieee[0] = 0x8d;
3298 	cdata->ieee[1] = 0x6b;
3299 	cdata->ieee[2] = 0x50;
3300 	memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls));
3301 	cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED;
3302 	/* libvfio-user can only support 1 connection for now */
3303 	cdata->oncs.reservations = 0;
3304 }
3305 
3306 static int
3307 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport,
3308 				const struct spdk_nvmf_subsystem *subsystem,
3309 				const struct spdk_nvme_transport_id *trid)
3310 {
3311 	struct nvmf_vfio_user_transport *vu_transport;
3312 	struct nvmf_vfio_user_endpoint *endpoint;
3313 
3314 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);
3315 
3316 	pthread_mutex_lock(&vu_transport->lock);
3317 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
3318 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
3319 			break;
3320 		}
3321 	}
3322 	pthread_mutex_unlock(&vu_transport->lock);
3323 
3324 	if (endpoint == NULL) {
3325 		return -ENOENT;
3326 	}
3327 
3328 	endpoint->subsystem = subsystem;
3329 
3330 	return 0;
3331 }
3332 
3333 /*
3334  * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US
3335  * frequency.
3336  *
3337  * For each transport endpoint (which at the libvfio-user level corresponds to
3338  * a socket), if we don't currently have a controller set up, peek to see if the
3339  * socket is able to accept a new connection.
3340  *
3341  * This poller also takes care of handling the creation of any pending new
3342  * qpairs.
3343  */
3344 static int
3345 nvmf_vfio_user_accept(void *ctx)
3346 {
3347 	struct spdk_nvmf_transport *transport = ctx;
3348 	struct nvmf_vfio_user_transport *vu_transport;
3349 	struct nvmf_vfio_user_endpoint *endpoint;
3350 	uint32_t count = 0;
3351 	int err;
3352 
3353 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
3354 					transport);
3355 
3356 	pthread_mutex_lock(&vu_transport->lock);
3357 
3358 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
3359 		if (endpoint->ctrlr != NULL) {
3360 			continue;
3361 		}
3362 
3363 		err = vfu_attach_ctx(endpoint->vfu_ctx);
3364 		if (err != 0) {
3365 			if (errno == EAGAIN || errno == EWOULDBLOCK) {
3366 				continue;
3367 			}
3368 
3369 			pthread_mutex_unlock(&vu_transport->lock);
3370 			return SPDK_POLLER_BUSY;
3371 		}
3372 
3373 		count++;
3374 
3375 		/* Construct a controller */
3376 		nvmf_vfio_user_create_ctrlr(vu_transport, endpoint);
3377 	}
3378 
3379 	pthread_mutex_unlock(&vu_transport->lock);
3380 
3381 	return count > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
3382 }
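
/*
 * Client-side usage note (an assumption, not defined by this file): with a
 * QEMU build that supports vfio-user, a guest typically attaches to the
 * "cntrl" socket created by nvmf_vfio_user_listen() with something like
 *
 *   -device vfio-user-pci,socket=<traddr>/cntrl
 *
 * (exact device and option names depend on the QEMU vfio-user client in use).
 * Once the client connects, vfu_attach_ctx() above succeeds and a controller
 * is constructed for the endpoint.
 */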
3383 
3384 static void
3385 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport,
3386 			struct spdk_nvme_transport_id *trid,
3387 			struct spdk_nvmf_discovery_log_page_entry *entry)
3388 { }
3389 
3390 static struct spdk_nvmf_transport_poll_group *
3391 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport)
3392 {
3393 	struct nvmf_vfio_user_transport *vu_transport;
3394 	struct nvmf_vfio_user_poll_group *vu_group;
3395 
3396 	SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n");
3397 
3398 	vu_group = calloc(1, sizeof(*vu_group));
3399 	if (vu_group == NULL) {
3400 		SPDK_ERRLOG("Error allocating poll group: %m\n");
3401 		return NULL;
3402 	}
3403 
3404 	TAILQ_INIT(&vu_group->sqs);
3405 
3406 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
3407 					transport);
3408 	pthread_mutex_lock(&vu_transport->pg_lock);
3409 	TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link);
3410 	if (vu_transport->next_pg == NULL) {
3411 		vu_transport->next_pg = vu_group;
3412 	}
3413 	pthread_mutex_unlock(&vu_transport->pg_lock);
3414 
3415 	return &vu_group->group;
3416 }
3417 
3418 static struct spdk_nvmf_transport_poll_group *
3419 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair)
3420 {
3421 	struct nvmf_vfio_user_transport *vu_transport;
3422 	struct nvmf_vfio_user_poll_group **vu_group;
3423 	struct nvmf_vfio_user_sq *sq;
3424 	struct nvmf_vfio_user_cq *cq;
3425 
3426 	struct spdk_nvmf_transport_poll_group *result;
3427 
3428 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
3429 	cq = sq->ctrlr->cqs[sq->cqid];
3430 	assert(cq != NULL);
3431 	vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport);
3432 
3433 	pthread_mutex_lock(&vu_transport->pg_lock);
3434 	if (TAILQ_EMPTY(&vu_transport->poll_groups)) {
3435 		pthread_mutex_unlock(&vu_transport->pg_lock);
3436 		return NULL;
3437 	}
3438 
3439 	/* If this is shared IO CQ case, just return the used CQ's poll group */
3440 	if (!nvmf_qpair_is_admin_queue(qpair)) {
3441 		if (cq->group) {
3442 			pthread_mutex_unlock(&vu_transport->pg_lock);
3443 			return cq->group;
3444 		}
3445 	}
3446 
3447 	vu_group = &vu_transport->next_pg;
3448 	assert(*vu_group != NULL);
3449 
3450 	result = &(*vu_group)->group;
3451 	*vu_group = TAILQ_NEXT(*vu_group, link);
3452 	if (*vu_group == NULL) {
3453 		*vu_group = TAILQ_FIRST(&vu_transport->poll_groups);
3454 	}
3455 
3456 	if (cq->group == NULL) {
3457 		cq->group = result;
3458 	}
3459 
3460 	pthread_mutex_unlock(&vu_transport->pg_lock);
3461 	return result;
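
/*
 * Poll group placement note (documentation only): admin queue pairs and the
 * first SQ of each CQ are distributed round-robin across poll groups via
 * next_pg; any further SQ that shares an already-placed CQ is pinned to that
 * CQ's poll group, so a shared CQ is only ever handled by a single poll group.
 */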
3462 }
3463 
3464 /* called when process exits */
3465 static void
3466 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
3467 {
3468 	struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup;
3469 	struct nvmf_vfio_user_transport *vu_transport;
3470 
3471 	SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n");
3472 
3473 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
3474 	vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport,
3475 					transport);
3476 
3477 	pthread_mutex_lock(&vu_transport->pg_lock);
3478 	next_tgroup = TAILQ_NEXT(vu_group, link);
3479 	TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link);
3480 	if (next_tgroup == NULL) {
3481 		next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups);
3482 	}
3483 	if (vu_transport->next_pg == vu_group) {
3484 		vu_transport->next_pg = next_tgroup;
3485 	}
3486 	pthread_mutex_unlock(&vu_transport->pg_lock);
3487 
3488 	free(vu_group);
3489 }
3490 
3491 static void
3492 _vfio_user_qpair_disconnect(void *ctx)
3493 {
3494 	struct nvmf_vfio_user_sq *sq = ctx;
3495 
3496 	spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL);
3497 }
3498 
3499 /* This function is used when the socket connection is destroyed */
3500 static int
3501 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
3502 {
3503 	struct nvmf_vfio_user_sq *sq;
3504 	struct nvmf_vfio_user_endpoint *endpoint;
3505 
3506 	SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr));
3507 
3508 	endpoint = ctrlr->endpoint;
3509 	assert(endpoint != NULL);
3510 
3511 	pthread_mutex_lock(&endpoint->lock);
3512 	if (TAILQ_EMPTY(&ctrlr->connected_sqs)) {
3513 		endpoint->ctrlr = NULL;
3514 		free_ctrlr(ctrlr);
3515 		pthread_mutex_unlock(&endpoint->lock);
3516 		return 0;
3517 	}
3518 
3519 	TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) {
3520 		/* defer the disconnect to another thread-poll round to avoid taking the endpoint lock recursively */
3521 		spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq);
3522 	}
3523 	pthread_mutex_unlock(&endpoint->lock);
3524 
3525 	return 0;
3526 }
3527 
3528 /*
3529  * Poll for and process any incoming vfio-user messages.
3530  */
3531 static int
3532 vfio_user_poll_vfu_ctx(void *ctx)
3533 {
3534 	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
3535 	int ret;
3536 
3537 	assert(ctrlr != NULL);
3538 
3539 	/* This will call access_bar0_fn() if there are any writes
3540 	 * to the portion of the BAR that is not mmap'd */
3541 	ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
3542 	if (spdk_unlikely(ret == -1)) {
3543 		if (errno == EBUSY) {
3544 			return SPDK_POLLER_BUSY;
3545 		}
3546 
3547 		spdk_poller_unregister(&ctrlr->vfu_ctx_poller);
3548 
3549 		/* initiator shutdown or reset; wait for another re-connect */
3550 		if (errno == ENOTCONN) {
3551 			vfio_user_destroy_ctrlr(ctrlr);
3552 			return SPDK_POLLER_BUSY;
3553 		}
3554 
3555 		fail_ctrlr(ctrlr);
3556 	}
3557 
3558 	return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
3559 }
3560 
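/*
 * Context for deferring post_completion() to the thread that owns the target
 * CQ, via spdk_thread_send_msg().
 */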
3561 struct vfio_user_post_cpl_ctx {
3562 	struct nvmf_vfio_user_ctrlr	*ctrlr;
3563 	struct nvmf_vfio_user_cq	*cq;
3564 	struct spdk_nvme_cpl		cpl;
3565 };
3566 
3567 static void
3568 _post_completion_msg(void *ctx)
3569 {
3570 	struct vfio_user_post_cpl_ctx *cpl_ctx = ctx;
3571 
3572 	post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid,
3573 			cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct);
3574 	free(cpl_ctx);
3575 }
3576 
3577 static int
3578 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
3579 {
3580 	struct nvmf_vfio_user_poll_group *vu_group;
3581 	struct nvmf_vfio_user_sq *sq = cb_arg;
3582 	struct nvmf_vfio_user_cq *cq;
3583 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
3584 	struct nvmf_vfio_user_endpoint *endpoint;
3585 
3586 	assert(sq != NULL);
3587 	assert(req != NULL);
3588 
3589 	vu_ctrlr = sq->ctrlr;
3590 	assert(vu_ctrlr != NULL);
3591 	endpoint = vu_ctrlr->endpoint;
3592 	assert(endpoint != NULL);
3593 
3594 	if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) {
3595 		SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct);
3596 		endpoint->ctrlr = NULL;
3597 		free_ctrlr(vu_ctrlr);
3598 		return -1;
3599 	}
3600 
3601 	vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group);
3602 	TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link);
3603 
3604 	cq = vu_ctrlr->cqs[0];
3605 	assert(cq != NULL);
3606 
3607 	pthread_mutex_lock(&endpoint->lock);
3608 	if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
3609 		vu_ctrlr->cntlid = sq->qpair.ctrlr->cntlid;
3610 		vu_ctrlr->thread = spdk_get_thread();
3611 		vu_ctrlr->ctrlr = sq->qpair.ctrlr;
3612 		vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
3613 		vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, vu_ctrlr, 0);
3614 		cq->thread = spdk_get_thread();
3615 	} else {
3616 		/* For I/O queues, this command was generated in response to an
3617 		 * admin CREATE I/O SUBMISSION QUEUE command which has not yet
3618 		 * been completed. Complete it now.
3619 		 */
3620 		if (sq->post_create_io_sq_completion) {
3621 			assert(cq->thread != NULL);
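			/* Completions must be posted from the thread that owns the CQ. */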
3622 			if (cq->thread != spdk_get_thread()) {
3623 				struct vfio_user_post_cpl_ctx *cpl_ctx;
3624 
3625 				cpl_ctx = calloc(1, sizeof(*cpl_ctx));
3626 				if (!cpl_ctx) {
3627 					return -ENOMEM;
3628 				}
3629 				cpl_ctx->ctrlr = vu_ctrlr;
3630 				cpl_ctx->cq = cq;
3631 				cpl_ctx->cpl.sqid = 0;
3632 				cpl_ctx->cpl.cdw0 = 0;
3633 				cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid;
3634 				cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
3635 				cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
3636 
3637 				spdk_thread_send_msg(cq->thread, _post_completion_msg, cpl_ctx);
3638 			} else {
3639 				post_completion(vu_ctrlr, cq, 0, 0,
3640 						sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
3641 			}
3642 			sq->post_create_io_sq_completion = false;
3643 		}
3644 		sq->sq_state = VFIO_USER_SQ_ACTIVE;
3645 	}
3646 
3647 	TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq);
3648 	pthread_mutex_unlock(&endpoint->lock);
3649 
3650 	free(req->req.data);
3651 	req->req.data = NULL;
3652 
3653 	return 0;
3654 }
3655 
3656 /*
3657  * Add the given qpair to the given poll group. New qpairs are added via
3658  * spdk_nvmf_tgt_new_qpair(), which picks a poll group, then calls back
3659  * here via nvmf_transport_poll_group_add().
3660  */
3661 static int
3662 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
3663 			      struct spdk_nvmf_qpair *qpair)
3664 {
3665 	struct nvmf_vfio_user_sq *sq;
3666 	struct nvmf_vfio_user_req *vu_req;
3667 	struct nvmf_vfio_user_ctrlr *ctrlr;
3668 	struct spdk_nvmf_request *req;
3669 	struct spdk_nvmf_fabric_connect_data *data;
3670 	bool admin;
3671 
3672 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
3673 	sq->group = group;
3674 	ctrlr = sq->ctrlr;
3675 
3676 	SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n",
3677 		      ctrlr_id(ctrlr), sq->qpair.qid,
3678 		      sq, qpair, group);
3679 
3680 	admin = nvmf_qpair_is_admin_queue(&sq->qpair);
3681 
3682 	vu_req = get_nvmf_vfio_user_req(sq);
3683 	if (vu_req == NULL) {
3684 		return -1;
3685 	}
3686 
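	/*
	 * The guest never sends an NVMe-oF CONNECT command itself, so build a
	 * fabrics CONNECT here and execute it; handle_queue_connect_rsp()
	 * finishes queue setup once the core NVMf layer completes it.
	 */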
3687 	req = &vu_req->req;
3688 	req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
3689 	req->cmd->connect_cmd.cid = 0;
3690 	req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
3691 	req->cmd->connect_cmd.recfmt = 0;
3692 	req->cmd->connect_cmd.sqsize = sq->size - 1;
3693 	req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;
3694 
3695 	req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
3696 	req->data = calloc(1, req->length);
3697 	if (req->data == NULL) {
3698 		nvmf_vfio_user_req_free(req);
3699 		return -ENOMEM;
3700 	}
3701 
3702 	data = (struct spdk_nvmf_fabric_connect_data *)req->data;
3703 	data->cntlid = ctrlr->cntlid;
3704 	snprintf(data->subnqn, sizeof(data->subnqn), "%s",
3705 		 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));
3706 
3707 	vu_req->cb_fn = handle_queue_connect_rsp;
3708 	vu_req->cb_arg = sq;
3709 
3710 	SPDK_DEBUGLOG(nvmf_vfio,
3711 		      "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n",
3712 		      ctrlr_id(ctrlr), qpair->qid, data->cntlid);
3713 
3714 	spdk_nvmf_request_exec_fabrics(req);
3715 	return 0;
3716 }
3717 
3718 static int
3719 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
3720 				 struct spdk_nvmf_qpair *qpair)
3721 {
3722 	struct nvmf_vfio_user_sq *sq;
3723 	struct nvmf_vfio_user_poll_group *vu_group;
3724 
3725 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
3726 
3727 	SPDK_DEBUGLOG(nvmf_vfio,
3728 		      "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n",
3729 		      ctrlr_id(sq->ctrlr), qpair->qid, qpair, group);
3730 
3731 
3732 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
3733 	TAILQ_REMOVE(&vu_group->sqs, sq, link);
3734 
3735 	return 0;
3736 }
3737 
3738 static void
3739 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req)
3740 {
3741 	memset(&vu_req->cmd, 0, sizeof(vu_req->cmd));
3742 	memset(&vu_req->rsp, 0, sizeof(vu_req->rsp));
3743 	vu_req->iovcnt = 0;
3744 	vu_req->state = VFIO_USER_REQUEST_STATE_FREE;
3745 
3746 	TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link);
3747 }
3748 
3749 static int
3750 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req)
3751 {
3752 	struct nvmf_vfio_user_sq *sq;
3753 	struct nvmf_vfio_user_req *vu_req;
3754 
3755 	assert(req != NULL);
3756 
3757 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
3758 	sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);
3759 
3760 	_nvmf_vfio_user_req_free(sq, vu_req);
3761 
3762 	return 0;
3763 }
3764 
3765 static int
3766 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req)
3767 {
3768 	struct nvmf_vfio_user_sq *sq;
3769 	struct nvmf_vfio_user_req *vu_req;
3770 
3771 	assert(req != NULL);
3772 
3773 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
3774 	sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);
3775 
3776 	if (vu_req->cb_fn != NULL) {
3777 		if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) {
3778 			fail_ctrlr(sq->ctrlr);
3779 		}
3780 	}
3781 
3782 	_nvmf_vfio_user_req_free(sq, vu_req);
3783 
3784 	return 0;
3785 }
3786 
3787 static void
3788 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
3789 			   spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
3790 {
3791 	struct nvmf_vfio_user_sq *sq;
3792 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
3793 	struct nvmf_vfio_user_endpoint *endpoint;
3794 
3795 	assert(qpair != NULL);
3796 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
3797 	vu_ctrlr = sq->ctrlr;
3798 	endpoint = vu_ctrlr->endpoint;
3799 
3800 	pthread_mutex_lock(&endpoint->lock);
3801 	TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq);
3802 	delete_sq_done(vu_ctrlr, sq);
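	/* If this was the last connected SQ, the controller itself can now be freed. */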
3803 	if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) {
3804 		endpoint->ctrlr = NULL;
3805 		free_ctrlr(vu_ctrlr);
3806 	}
3807 	pthread_mutex_unlock(&endpoint->lock);
3808 
3809 	if (cb_fn) {
3810 		cb_fn(cb_arg);
3811 	}
3812 }
3813 
3814 /**
3815  * Returns a preallocated request, or NULL if there isn't one available.
3816  */
3817 static struct nvmf_vfio_user_req *
3818 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq)
3819 {
3820 	struct nvmf_vfio_user_req *req;
3821 
3822 	if (sq == NULL) {
3823 		return NULL;
3824 	}
3825 
3826 	req = TAILQ_FIRST(&sq->free_reqs);
3827 	if (req == NULL) {
3828 		return NULL;
3829 	}
3830 
3831 	TAILQ_REMOVE(&sq->free_reqs, req, link);
3832 
3833 	return req;
3834 }
3835 
3836 static int
3837 get_nvmf_io_req_length(struct spdk_nvmf_request *req)
3838 {
3839 	uint16_t nr;
3840 	uint32_t nlb, nsid;
3841 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
3842 	struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
3843 	struct spdk_nvmf_ns *ns;
3844 
3845 	nsid = cmd->nsid;
3846 	ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
3847 	if (ns == NULL || ns->bdev == NULL) {
3848 		SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid);
3849 		return -EINVAL;
3850 	}
3851 
3852 	if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
3853 		nr = cmd->cdw10_bits.dsm.nr + 1;
3854 		return nr * sizeof(struct spdk_nvme_dsm_range);
3855 	}
3856 
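	/* NLB in CDW12[15:0] is a 0's-based value. */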
3857 	nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
3858 	return nlb * spdk_bdev_get_block_size(ns->bdev);
3859 }
3860 
3861 static int
3862 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
3863 {
3864 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
3865 	uint32_t len = 0;
3866 	uint8_t fid;
3867 	int iovcnt;
3868 
3869 	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
3870 	req->length = 0;
3871 	req->data = NULL;
3872 
3873 	if (req->xfer == SPDK_NVME_DATA_NONE) {
3874 		return 0;
3875 	}
3876 
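	/*
	 * Data transfer sizes for the admin commands mapped here follow the
	 * NVMe specification, e.g. Identify returns a 4096-byte structure.
	 */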
3877 	switch (cmd->opc) {
3878 	case SPDK_NVME_OPC_IDENTIFY:
3879 		len = 4096;
3880 		break;
3881 	case SPDK_NVME_OPC_GET_LOG_PAGE:
3882 		len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4;
3883 		break;
3884 	case SPDK_NVME_OPC_GET_FEATURES:
3885 	case SPDK_NVME_OPC_SET_FEATURES:
3886 		fid = cmd->cdw10_bits.set_features.fid;
3887 		switch (fid) {
3888 		case SPDK_NVME_FEAT_LBA_RANGE_TYPE:
3889 			len = 4096;
3890 			break;
3891 		case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
3892 			len = 256;
3893 			break;
3894 		case SPDK_NVME_FEAT_TIMESTAMP:
3895 			len = 8;
3896 			break;
3897 		case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
3898 			len = 512;
3899 			break;
3900 		case SPDK_NVME_FEAT_HOST_IDENTIFIER:
3901 			if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) {
3902 				len = 16;
3903 			} else {
3904 				len = 8;
3905 			}
3906 			break;
3907 		default:
3908 			return 0;
3909 		}
3910 		break;
3911 	default:
3912 		return 0;
3913 	}
3914 
3915 	/* Admin commands must not use SGLs (PSDT must be zero). */
3916 	if (cmd->psdt != 0) {
3917 		return -EINVAL;
3918 	}
3919 
3920 	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len);
3921 	if (iovcnt < 0) {
3922 		SPDK_ERRLOG("%s: map Admin Opc %x failed\n",
3923 			    ctrlr_id(ctrlr), cmd->opc);
3924 		return -1;
3925 	}
3926 	req->length = len;
3927 	req->data = req->iov[0].iov_base;
3928 	req->iovcnt = iovcnt;
3929 
3930 	return 0;
3931 }
3932 
3933 /*
3934  * Map an I/O command's buffers.
3935  *
3936  * Returns 0 on success and -errno on failure.
3937  */
3938 static int
3939 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
3940 {
3941 	int len, iovcnt;
3942 	struct spdk_nvme_cmd *cmd;
3943 
3944 	assert(ctrlr != NULL);
3945 	assert(req != NULL);
3946 
3947 	cmd = &req->cmd->nvme_cmd;
3948 	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
3949 	req->length = 0;
3950 	req->data = NULL;
3951 
3952 	if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
3953 		return 0;
3954 	}
3955 
3956 	len = get_nvmf_io_req_length(req);
3957 	if (len < 0) {
3958 		return -EINVAL;
3959 	}
3960 	req->length = len;
3961 
3962 	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length);
3963 	if (iovcnt < 0) {
3964 		SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc);
3965 		return -EFAULT;
3966 	}
3967 	req->data = req->iov[0].iov_base;
3968 	req->iovcnt = iovcnt;
3969 
3970 	return 0;
3971 }
3972 
3973 static int
3974 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
3975 	       struct nvmf_vfio_user_sq *sq)
3976 {
3977 	int err;
3978 	struct nvmf_vfio_user_req *vu_req;
3979 	struct spdk_nvmf_request *req;
3980 
3981 	assert(ctrlr != NULL);
3982 	assert(cmd != NULL);
3983 
3984 	vu_req = get_nvmf_vfio_user_req(sq);
3985 	if (spdk_unlikely(vu_req == NULL)) {
3986 		SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc);
3987 		return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid,
3988 				       SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC);
3989 
3990 	}
3991 	req = &vu_req->req;
3992 
3993 	assert(req->qpair != NULL);
3994 	SPDK_DEBUGLOG(nvmf_vfio, "%s: handle qid%u, req opc=%#x cid=%d\n",
3995 		      ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid);
3996 
3997 	vu_req->cb_fn = handle_cmd_rsp;
3998 	vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);
3999 	req->cmd->nvme_cmd = *cmd;
4000 
4001 	if (nvmf_qpair_is_admin_queue(req->qpair)) {
4002 		err = map_admin_cmd_req(ctrlr, req);
4003 	} else {
4004 		switch (cmd->opc) {
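		/* Reservation commands are not supported by this transport. */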
4005 		case SPDK_NVME_OPC_RESERVATION_REGISTER:
4006 		case SPDK_NVME_OPC_RESERVATION_REPORT:
4007 		case SPDK_NVME_OPC_RESERVATION_ACQUIRE:
4008 		case SPDK_NVME_OPC_RESERVATION_RELEASE:
4009 			err = -ENOTSUP;
4010 			break;
4011 		default:
4012 			err = map_io_cmd_req(ctrlr, req);
4013 			break;
4014 		}
4015 	}
4016 
4017 	if (spdk_unlikely(err < 0)) {
4018 		SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n",
4019 			    ctrlr_id(ctrlr), cmd->opc);
4020 		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
4021 		req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
4022 		err = handle_cmd_rsp(vu_req, vu_req->cb_arg);
4023 		_nvmf_vfio_user_req_free(sq, vu_req);
4024 		return err;
4025 	}
4026 
4027 	vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING;
4028 	spdk_nvmf_request_exec(req);
4029 
4030 	return 0;
4031 }
4032 
4033 /* Returns the number of commands processed, or a negative value on error. */
4034 static int
4035 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq)
4036 {
4037 	struct nvmf_vfio_user_ctrlr *ctrlr;
4038 	uint32_t new_tail;
4039 	int count = 0;
4040 
4041 	assert(sq != NULL);
4042 
4043 	ctrlr = sq->ctrlr;
4044 
4045 	/* On aarch64 platforms, doorbell updates from the guest VM may not be
4046 	 * visible on the SPDK target side. This is caused by a memory type
4047 	 * mismatch: the guest maps the doorbells as device memory, while the
4048 	 * SPDK target maps them as normal memory, and that mismatch causes
4049 	 * problems on ARM platforms.
4050 	 * Refer to "https://developer.arm.com/documentation/102376/0100/
4051 	 * Memory-aliasing-and-mismatched-memory-types". Using spdk_mb() alone
4052 	 * cannot fix this; invalidating the cache line with "dc civac"
4053 	 * does.
4054 	 */
4055 	spdk_ivdt_dcache(sq_dbl_tailp(ctrlr, sq));
4056 
4057 	/* Load-Acquire. */
4058 	new_tail = *sq_dbl_tailp(ctrlr, sq);
4059 
4060 	/*
4061 	 * Ensure that changes to the queue are visible to us.
4062 	 * The host driver should write the queue first, do a wmb(), and then
4063 	 * update the SQ tail doorbell (their Store-Release).
4064 	 */
4065 	spdk_rmb();
4066 
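	/* Only the low 16 bits of the doorbell register hold the SQ tail;
	 * the upper bits are reserved.
	 */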
4067 	new_tail = new_tail & 0xffffu;
4068 	if (spdk_unlikely(new_tail >= sq->size)) {
4069 		union spdk_nvme_async_event_completion event = {};
4070 
4071 		SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid SQ%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid,
4072 			      new_tail);
4073 		event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR;
4074 		event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE;
4075 		nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event);
4076 
4077 		return 0;
4078 	}
4079 
4080 	if (*sq_headp(sq) == new_tail) {
4081 		return 0;
4082 	}
4083 
4084 	count = handle_sq_tdbl_write(ctrlr, new_tail, sq);
4085 	if (count < 0) {
4086 		fail_ctrlr(ctrlr);
4087 	}
4088 
4089 	return count;
4090 }
4091 
4092 /*
4093  * vfio-user transport poll handler. Note that the library context is polled in
4094  * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
4095  * active qpairs.
4096  *
4097  * Returns the number of commands processed, or a negative value on error.
4098  */
4099 static int
4100 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
4101 {
4102 	struct nvmf_vfio_user_poll_group *vu_group;
4103 	struct nvmf_vfio_user_sq *sq, *tmp;
4104 	int count = 0;
4105 
4106 	assert(group != NULL);
4107 
4108 	spdk_rmb();
4109 
4110 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
4111 
4112 	TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) {
4113 		int ret;
4114 
4115 		if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
4116 			continue;
4117 		}
4118 
4119 		ret = nvmf_vfio_user_sq_poll(sq);
4120 
4121 		if (ret < 0) {
4122 			return ret;
4123 		}
4124 
4125 		count += ret;
4126 	}
4127 
4128 	return count;
4129 }
4130 
4131 static int
4132 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
4133 				    struct spdk_nvme_transport_id *trid)
4134 {
4135 	struct nvmf_vfio_user_sq *sq;
4136 	struct nvmf_vfio_user_ctrlr *ctrlr;
4137 
4138 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
4139 	ctrlr = sq->ctrlr;
4140 
4141 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
4142 	return 0;
4143 }
4144 
4145 static int
4146 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
4147 				   struct spdk_nvme_transport_id *trid)
4148 {
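	/* There is no peer transport ID to report for vfio-user. */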
4149 	return 0;
4150 }
4151 
4152 static int
4153 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
4154 				     struct spdk_nvme_transport_id *trid)
4155 {
4156 	struct nvmf_vfio_user_sq *sq;
4157 	struct nvmf_vfio_user_ctrlr *ctrlr;
4158 
4159 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
4160 	ctrlr = sq->ctrlr;
4161 
4162 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
4163 	return 0;
4164 }
4165 
4166 static void
4167 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
4168 				   struct spdk_nvmf_request *req)
4169 {
4170 	struct spdk_nvmf_request *req_to_abort = NULL;
4171 	uint16_t cid;
4172 
4173 	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
4174 
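	/* Look for an executing request whose CID matches the one given in the ABORT command. */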
4175 	TAILQ_FOREACH(req, &qpair->outstanding, link) {
4176 		struct nvmf_vfio_user_req *vu_req;
4177 
4178 		vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
4179 
4180 		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
4181 			req_to_abort = req;
4182 			break;
4183 		}
4184 	}
4185 
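	/* No matching command was found; just complete the ABORT request itself. */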
4186 	if (req_to_abort == NULL) {
4187 		spdk_nvmf_request_complete(req);
4188 		return;
4189 	}
4190 
4191 	req->req_to_abort = req_to_abort;
4192 	nvmf_ctrlr_abort_request(req);
4193 }
4194 
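/*
 * Default transport options. In-capsule data and shared buffers are left at
 * zero since command data is mapped directly from guest memory rather than
 * copied into transport buffers.
 */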
4195 static void
4196 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
4197 {
4198 	opts->max_queue_depth =		NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
4199 	opts->max_qpairs_per_ctrlr =	NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
4200 	opts->in_capsule_data_size =	0;
4201 	opts->max_io_size =		NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
4202 	opts->io_unit_size =		NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
4203 	opts->max_aq_depth =		NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
4204 	opts->num_shared_buffers =	0;
4205 	opts->buf_cache_size =		0;
4206 	opts->association_timeout =	0;
4207 	opts->transport_specific =      NULL;
4208 }
4209 
4210 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
4211 	.name = "VFIOUSER",
4212 	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
4213 	.opts_init = nvmf_vfio_user_opts_init,
4214 	.create = nvmf_vfio_user_create,
4215 	.destroy = nvmf_vfio_user_destroy,
4216 
4217 	.listen = nvmf_vfio_user_listen,
4218 	.stop_listen = nvmf_vfio_user_stop_listen,
4219 	.cdata_init = nvmf_vfio_user_cdata_init,
4220 	.listen_associate = nvmf_vfio_user_listen_associate,
4221 
4222 	.listener_discover = nvmf_vfio_user_discover,
4223 
4224 	.poll_group_create = nvmf_vfio_user_poll_group_create,
4225 	.get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group,
4226 	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
4227 	.poll_group_add = nvmf_vfio_user_poll_group_add,
4228 	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
4229 	.poll_group_poll = nvmf_vfio_user_poll_group_poll,
4230 
4231 	.req_free = nvmf_vfio_user_req_free,
4232 	.req_complete = nvmf_vfio_user_req_complete,
4233 
4234 	.qpair_fini = nvmf_vfio_user_close_qpair,
4235 	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
4236 	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
4237 	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
4238 	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,
4239 };
4240 
4241 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
4242 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
4243