xref: /spdk/lib/nvmf/vfio_user.c (revision 1a00f5c09488e7466a331b8c75cde4969740357f)
1 /*-
2  *   BSD LICENSE
3  *   Copyright (c) Intel Corporation. All rights reserved.
4  *   Copyright (c) 2019, Nutanix Inc. All rights reserved.
5  *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * NVMe over vfio-user transport
36  */
37 
38 #include <vfio-user/libvfio-user.h>
39 #include <vfio-user/pci_defs.h>
40 
41 #include "spdk/barrier.h"
42 #include "spdk/stdinc.h"
43 #include "spdk/assert.h"
44 #include "spdk/thread.h"
45 #include "spdk/nvmf_transport.h"
46 #include "spdk/sock.h"
47 #include "spdk/string.h"
48 #include "spdk/util.h"
49 #include "spdk/log.h"
50 
51 #include "transport.h"
52 
53 #include "nvmf_internal.h"
54 
55 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
56 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
57 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
58 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE
59 
60 #define NVME_DOORBELLS_OFFSET	0x1000
61 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000
62 
63 /*
64  * The NVMe driver reads 4096 bytes, the size of the extended PCI configuration
65  * space available on PCI-X 2.0 and PCI Express buses.
66  */
67 #define NVME_REG_CFG_SIZE       0x1000
68 #define NVME_REG_BAR0_SIZE      (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE)
69 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8)
70 #define NVME_IRQ_MSIX_NUM	NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR
71 /* MSIX Table Size */
72 #define NVME_BAR4_SIZE		SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000)
73 /* MSIX Pending Bit Array Size */
74 #define NVME_BAR5_SIZE		SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000)
75 
76 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4)
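/*
 * Illustrative arithmetic, derived from the macros above: each queue pair
 * consumes two 4-byte doorbells (SQ tail + CQ head), so the 0x1000-byte
 * doorbell window allows 0x1000 / 8 = 512 queue pairs per controller, and the
 * default advertised limit is 512 / 4 = 128. NVME_IRQ_MSIX_NUM is likewise
 * 512, giving a 512 * 16 = 8192-byte MSI-X table (BAR4, already 0x2000
 * aligned) and a 64-byte pending bit array rounded up to a 0x1000-byte BAR5.
 */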
77 
78 struct nvmf_vfio_user_req;
79 
80 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);
81 
82 /* 1 more for PRP2 list itself */
83 #define NVMF_VFIO_USER_MAX_IOVECS	(NVMF_REQ_MAX_BUFFERS + 1)
84 
85 enum nvmf_vfio_user_req_state {
86 	VFIO_USER_REQUEST_STATE_FREE = 0,
87 	VFIO_USER_REQUEST_STATE_EXECUTING,
88 };
89 
90 /* NVMe device state representation */
91 struct nvme_migr_sq_state {
92 	uint16_t	sqid;
93 	uint16_t	cqid;
94 	uint32_t	head;
95 	uint32_t	size;
96 	uint32_t	reserved;
97 	uint64_t	dma_addr;
98 };
99 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size");
100 
101 struct nvme_migr_cq_state {
102 	uint16_t	cqid;
103 	uint16_t	phase;
104 	uint32_t	tail;
105 	uint32_t	size;
106 	uint32_t	iv;
107 	uint32_t	ien;
108 	uint32_t	reserved;
109 	uint64_t	dma_addr;
110 };
111 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size");
112 
113 #define VFIO_USER_NVME_MIGR_MAGIC	0xAFEDBC23
114 
115 /* The device state lives in the VFIO MIGRATION BAR(9) region; keep the device state page aligned.
116  *
117  * The NVMe device migration region is laid out as follows:
118  * -------------------------------------------------------------------------
119  * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs |
120  * -------------------------------------------------------------------------
121  *
122  * vfio_user_nvme_migr_header has a fixed length of 0x1000 bytes; newly added
123  * fields must use the reserved space at the end of the data structure.
124  */
125 struct vfio_user_nvme_migr_header {
126 	/* Magic value to validate migration data */
127 	uint32_t	magic;
128 	/* Version used to verify that source and destination use the same data format */
129 	uint32_t	version;
130 
131 	/* The library uses this field to know how many fields in this
132 	 * structure are valid, counting from the beginning of this data
133 	 * structure.  Newly added fields must use the `unused` memory
134 	 * space.
135 	 */
136 	uint32_t	opts_size;
137 	uint32_t	reserved0;
138 
139 	/* BARs information */
140 	uint64_t	bar_offset[VFU_PCI_DEV_NUM_REGIONS];
141 	uint64_t	bar_len[VFU_PCI_DEV_NUM_REGIONS];
142 
143 	/* Queue pair region offset and length, relative to the beginning
144 	 * of this data structure.
145 	 */
146 	uint64_t	qp_offset;
147 	uint64_t	qp_len;
148 
149 	/* Controller data structure */
150 	uint32_t	num_io_queues;
151 	uint32_t	reserved1;
152 
153 	/* TODO: this part will be moved to common nvmf controller data */
154 	uint16_t	reserved2[3];
155 	uint16_t	nr_aers;
156 	uint16_t	aer_cids[NVMF_MIGR_MAX_PENDING_AERS];
157 
158 	/* NVMf controller data offset and length, if present, relative to
159 	 * the beginning of this data structure.
160 	 */
161 	uint64_t	nvmf_data_offset;
162 	uint64_t	nvmf_data_len;
163 
164 	/* Reserved memory space for newly added fields; this
165 	 * field is always kept at the end of this data structure.
166 	 */
167 	uint8_t		unused[3356];
168 };
169 SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");
170 
171 struct vfio_user_nvme_migr_qp {
172 	struct nvme_migr_sq_state	sq;
173 	struct nvme_migr_cq_state	cq;
174 };
175 
176 /* NVMe state definition used to load/restore from/to NVMe migration BAR region */
177 struct vfio_user_nvme_migr_state {
178 	struct vfio_user_nvme_migr_header	ctrlr_header;
179 	struct nvmf_ctrlr_migr_data		nvmf_data;
180 	struct vfio_user_nvme_migr_qp		qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
181 	uint8_t					bar0[NVME_REG_BAR0_SIZE];
182 	uint8_t					cfg[NVME_REG_CFG_SIZE];
183 };
184 
185 struct nvmf_vfio_user_req  {
186 	struct spdk_nvmf_request		req;
187 	struct spdk_nvme_cpl			rsp;
188 	struct spdk_nvme_cmd			cmd;
189 
190 	enum nvmf_vfio_user_req_state		state;
191 	nvmf_vfio_user_req_cb_fn		cb_fn;
192 	void					*cb_arg;
193 
194 	/* old CC before prop_set_cc fabric command */
195 	union spdk_nvme_cc_register		cc;
196 
197 	TAILQ_ENTRY(nvmf_vfio_user_req)		link;
198 
199 	struct iovec				iov[NVMF_VFIO_USER_MAX_IOVECS];
200 	uint8_t					iovcnt;
201 
202 	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
203 	uint8_t					sg[];
204 };
205 
206 /*
207  * Mapping of an NVMe queue.
208  *
209  * This holds the information tracking a local process mapping of an NVMe queue
210  * shared by the client.
211  */
212 struct nvme_q_mapping {
213 	/* iov of local process mapping. */
214 	struct iovec iov;
215 	/* Stored sg, needed for unmap. */
216 	dma_sg_t *sg;
217 	/* Client PRP of queue. */
218 	uint64_t prp1;
219 };
220 
221 enum nvmf_vfio_user_sq_state {
222 	VFIO_USER_SQ_UNUSED = 0,
223 	VFIO_USER_SQ_CREATED,
224 	VFIO_USER_SQ_DELETED,
225 	VFIO_USER_SQ_ACTIVE,
226 	VFIO_USER_SQ_INACTIVE
227 };
228 
229 enum nvmf_vfio_user_cq_state {
230 	VFIO_USER_CQ_UNUSED = 0,
231 	VFIO_USER_CQ_CREATED,
232 	VFIO_USER_CQ_DELETED,
233 };
234 
235 enum nvmf_vfio_user_ctrlr_state {
236 	VFIO_USER_CTRLR_CREATING = 0,
237 	VFIO_USER_CTRLR_RUNNING,
238 	/* Quiesce requested by libvfio-user */
239 	VFIO_USER_CTRLR_PAUSING,
240 	/* The NVMf subsystem is paused; it is safe to do a PCI reset, memory register,
241 	 * memory unregister, and vfio migration state transitions in this state.
242 	 */
243 	VFIO_USER_CTRLR_PAUSED,
244 	/*
245 	 * Implies that the NVMf subsystem is paused. The device will be unquiesced (PCI
246 	 * reset, memory register and unregister, or the controller in the destination VM
247 	 * has been restored), and an NVMf subsystem resume has been requested.
248 	 */
249 	VFIO_USER_CTRLR_RESUMING,
250 	/*
251 	 * Implies that the NVMf subsystem is paused. Both the controller in the source
252 	 * VM and the one in the destination VM are in this state during live migration.
253 	 */
254 	VFIO_USER_CTRLR_MIGRATING
255 };
256 
257 /* Bookkeeping for the migration region used to record the NVMe device state */
258 struct vfio_user_migration_region {
259 	uint64_t last_data_offset;
260 	uint64_t pending_bytes;
261 };
262 
263 struct nvmf_vfio_user_sq {
264 	struct spdk_nvmf_qpair			qpair;
265 	struct spdk_nvmf_transport_poll_group	*group;
266 	struct nvmf_vfio_user_ctrlr		*ctrlr;
267 
268 	uint32_t				qid;
269 	/* Number of entries in queue. */
270 	uint32_t				size;
271 	struct nvme_q_mapping			mapping;
272 	enum nvmf_vfio_user_sq_state		sq_state;
273 
274 	uint32_t				head;
275 
276 	/* multiple SQs can be mapped to the same CQ */
277 	uint16_t				cqid;
278 
279 	/* handle_queue_connect_rsp() is used both for the CREATE IO SQ response
280 	 * and for the SQ re-connect response in the destination VM. In the former
281 	 * case we post an NVMe completion to the VM; this flag is not set when
282 	 * re-connecting SQs in the destination VM.
283 	 */
284 	bool					post_create_io_sq_completion;
285 	/* Copy of Create IO SQ command, this field is used together with
286 	 * `post_create_io_sq_completion` flag.
287 	 */
288 	struct spdk_nvme_cmd			create_io_sq_cmd;
289 
290 	/* Currently unallocated reqs. */
291 	TAILQ_HEAD(, nvmf_vfio_user_req)	free_reqs;
292 	/* Poll group entry */
293 	TAILQ_ENTRY(nvmf_vfio_user_sq)		link;
294 	/* Connected SQ entry */
295 	TAILQ_ENTRY(nvmf_vfio_user_sq)		tailq;
296 };
297 
298 struct nvmf_vfio_user_cq {
299 	struct spdk_nvmf_transport_poll_group	*group;
300 	struct spdk_thread			*thread;
301 	uint32_t				cq_ref;
302 
303 	uint32_t				qid;
304 	/* Number of entries in queue. */
305 	uint32_t				size;
306 	struct nvme_q_mapping			mapping;
307 	enum nvmf_vfio_user_cq_state		cq_state;
308 
309 	uint32_t				tail;
310 	bool					phase;
311 
312 	uint16_t				iv;
313 	bool					ien;
314 };
315 
316 struct nvmf_vfio_user_poll_group {
317 	struct spdk_nvmf_transport_poll_group	group;
318 	TAILQ_ENTRY(nvmf_vfio_user_poll_group)	link;
319 	TAILQ_HEAD(, nvmf_vfio_user_sq)		sqs;
320 };
321 
322 struct nvmf_vfio_user_ctrlr {
323 	struct nvmf_vfio_user_endpoint		*endpoint;
324 	struct nvmf_vfio_user_transport		*transport;
325 
326 	/* Connected SQs list */
327 	TAILQ_HEAD(, nvmf_vfio_user_sq)		connected_sqs;
328 	enum nvmf_vfio_user_ctrlr_state		state;
329 
330 	struct vfio_user_migration_region	migr_reg;
331 	/* Controller is in source VM when doing live migration */
332 	bool					in_source_vm;
333 
334 	struct spdk_thread			*thread;
335 	struct spdk_poller			*vfu_ctx_poller;
336 	struct spdk_interrupt			*intr;
337 	int					intr_fd;
338 
339 	bool					queued_quiesce;
340 
341 	bool					reset_shn;
342 
343 	uint16_t				cntlid;
344 	struct spdk_nvmf_ctrlr			*ctrlr;
345 
346 	struct nvmf_vfio_user_sq		*sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
347 	struct nvmf_vfio_user_cq		*cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
348 
349 	TAILQ_ENTRY(nvmf_vfio_user_ctrlr)	link;
350 
351 	volatile uint32_t			*doorbells;
352 };
353 
354 struct nvmf_vfio_user_endpoint {
355 	struct nvmf_vfio_user_transport		*transport;
356 	vfu_ctx_t				*vfu_ctx;
357 	struct spdk_poller			*accept_poller;
358 	struct spdk_thread			*accept_thread;
359 	struct msixcap				*msix;
360 	vfu_pci_config_space_t			*pci_config_space;
361 	int					devmem_fd;
362 	int					accept_intr_fd;
363 	struct spdk_interrupt			*accept_intr;
364 
365 	volatile uint32_t			*doorbells;
366 
367 	int					migr_fd;
368 	void					*migr_data;
369 
370 	struct spdk_nvme_transport_id		trid;
371 	const struct spdk_nvmf_subsystem	*subsystem;
372 
373 	struct nvmf_vfio_user_ctrlr		*ctrlr;
374 	pthread_mutex_t				lock;
375 
376 	bool					need_async_destroy;
377 
378 	TAILQ_ENTRY(nvmf_vfio_user_endpoint)	link;
379 };
380 
381 struct nvmf_vfio_user_transport_opts {
382 	bool					disable_mappable_bar0;
383 };
384 
385 struct nvmf_vfio_user_transport {
386 	struct spdk_nvmf_transport		transport;
387 	struct nvmf_vfio_user_transport_opts    transport_opts;
388 	bool					intr_mode_supported;
389 	pthread_mutex_t				lock;
390 	TAILQ_HEAD(, nvmf_vfio_user_endpoint)	endpoints;
391 
392 	pthread_mutex_t				pg_lock;
393 	TAILQ_HEAD(, nvmf_vfio_user_poll_group)	poll_groups;
394 	struct nvmf_vfio_user_poll_group	*next_pg;
395 };
396 
397 /*
398  * function prototypes
399  */
400 static int
401 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);
402 
403 static struct nvmf_vfio_user_req *
404 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq);
405 
406 /*
407  * Local process virtual address of a queue.
408  */
409 static inline void *
410 q_addr(struct nvme_q_mapping *mapping)
411 {
412 	return mapping->iov.iov_base;
413 }
414 
415 static inline int
416 queue_index(uint16_t qid, bool is_cq)
417 {
418 	return (qid * 2) + is_cq;
419 }
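
/*
 * Illustrative example, assuming the standard 4-byte doorbell stride
 * (CAP.DSTRD = 0): the SQ tail doorbell for qid 3 is
 * doorbells[queue_index(3, false)] = doorbells[6], i.e. BAR0 offset
 * NVME_DOORBELLS_OFFSET + 6 * 4 = 0x1018, and the matching CQ head doorbell
 * is doorbells[queue_index(3, true)] = doorbells[7] at offset 0x101c.
 */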
420 
421 static inline volatile uint32_t *
422 sq_headp(struct nvmf_vfio_user_sq *sq)
423 {
424 	assert(sq != NULL);
425 	return &sq->head;
426 }
427 
428 static inline volatile uint32_t *
429 sq_dbl_tailp(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq)
430 {
431 	assert(ctrlr != NULL);
432 	assert(sq != NULL);
433 	return &ctrlr->doorbells[queue_index(sq->qid, false)];
434 }
435 
436 static inline volatile uint32_t *
437 cq_dbl_headp(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq)
438 {
439 	assert(ctrlr != NULL);
440 	assert(cq != NULL);
441 	return &ctrlr->doorbells[queue_index(cq->qid, true)];
442 }
443 
444 static inline volatile uint32_t *
445 cq_tailp(struct nvmf_vfio_user_cq *cq)
446 {
447 	assert(cq != NULL);
448 	return &cq->tail;
449 }
450 
451 static inline void
452 sq_head_advance(struct nvmf_vfio_user_sq *sq)
453 {
454 	assert(sq != NULL);
455 
456 	assert(*sq_headp(sq) < sq->size);
457 	(*sq_headp(sq))++;
458 
459 	if (spdk_unlikely(*sq_headp(sq) == sq->size)) {
460 		*sq_headp(sq) = 0;
461 	}
462 }
463 
464 static inline void
465 cq_tail_advance(struct nvmf_vfio_user_cq *cq)
466 {
467 	assert(cq != NULL);
468 
469 	assert(*cq_tailp(cq) < cq->size);
470 	(*cq_tailp(cq))++;
471 
472 	if (spdk_unlikely(*cq_tailp(cq) == cq->size)) {
473 		*cq_tailp(cq) = 0;
474 		cq->phase = !cq->phase;
475 	}
476 }
477 
478 static inline bool
479 cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq)
480 {
481 	uint32_t qindex;
482 
483 	assert(ctrlr != NULL);
484 	assert(cq != NULL);
485 
486 	qindex = *cq_tailp(cq) + 1;
487 	if (spdk_unlikely(qindex == cq->size)) {
488 		qindex = 0;
489 	}
490 
491 	return qindex == *cq_dbl_headp(ctrlr, cq);
492 }
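
/*
 * Example: for a CQ of size 4 with the head doorbell at 0, cq_is_full()
 * reports full once the tail reaches 3, because advancing the tail once more
 * would wrap it onto the head; as in the usual NVMe circular-queue
 * convention, one slot is always left unused.
 */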
493 
494 static inline size_t
495 vfio_user_migr_data_len(void)
496 {
497 	return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE);
498 }
499 
500 static int
501 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
502 		  uint32_t max_iovcnt, uint32_t len, size_t mps,
503 		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
504 {
505 	uint64_t prp1, prp2;
506 	void *vva;
507 	uint32_t i;
508 	uint32_t residue_len, nents;
509 	uint64_t *prp_list;
510 	uint32_t iovcnt;
511 
512 	assert(max_iovcnt > 0);
513 
514 	prp1 = cmd->dptr.prp.prp1;
515 	prp2 = cmd->dptr.prp.prp2;
516 
517 	/* PRP1 may start at a page-unaligned address */
518 	residue_len = mps - (prp1 % mps);
519 	residue_len = spdk_min(len, residue_len);
520 
521 	vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE);
522 	if (spdk_unlikely(vva == NULL)) {
523 		SPDK_ERRLOG("GPA to VVA failed\n");
524 		return -EINVAL;
525 	}
526 	len -= residue_len;
527 	if (len && max_iovcnt < 2) {
528 		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
529 		return -ERANGE;
530 	}
531 	iovs[0].iov_base = vva;
532 	iovs[0].iov_len = residue_len;
533 
534 	if (len) {
535 		if (spdk_unlikely(prp2 == 0)) {
536 			SPDK_ERRLOG("no PRP2, %d remaining\n", len);
537 			return -EINVAL;
538 		}
539 
540 		if (len <= mps) {
541 			/* 2 PRPs used */
542 			iovcnt = 2;
543 			vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE);
544 			if (spdk_unlikely(vva == NULL)) {
545 				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n",
546 					    prp2, len);
547 				return -EINVAL;
548 			}
549 			iovs[1].iov_base = vva;
550 			iovs[1].iov_len = len;
551 		} else {
552 			/* PRP list used */
553 			nents = (len + mps - 1) / mps;
554 			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
555 				SPDK_ERRLOG("Too many page entries\n");
556 				return -ERANGE;
557 			}
558 
559 			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ);
560 			if (spdk_unlikely(vva == NULL)) {
561 				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
562 					    prp2, nents);
563 				return -EINVAL;
564 			}
565 			prp_list = vva;
566 			i = 0;
567 			while (len != 0) {
568 				residue_len = spdk_min(len, mps);
569 				vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE);
570 				if (spdk_unlikely(vva == NULL)) {
571 					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
572 						    prp_list[i], residue_len);
573 					return -EINVAL;
574 				}
575 				iovs[i + 1].iov_base = vva;
576 				iovs[i + 1].iov_len = residue_len;
577 				len -= residue_len;
578 				i++;
579 			}
580 			iovcnt = i + 1;
581 		}
582 	} else {
583 		/* 1 PRP used */
584 		iovcnt = 1;
585 	}
586 
587 	assert(iovcnt <= max_iovcnt);
588 	return iovcnt;
589 }
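
/*
 * Worked example with illustrative values: for mps = 4096, a PRP1 that is
 * offset 0x200 into its page, and len = 9728, the first iovec covers
 * 4096 - 0x200 = 3584 bytes; 6144 bytes remain (more than one page), so PRP2
 * is interpreted as a PRP list with nents = 2 entries and
 * nvme_cmd_map_prps() returns iovcnt = 3.
 */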
590 
591 static int
592 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
593 		       struct iovec *iovs, uint32_t max_iovcnt,
594 		       void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
595 {
596 	uint32_t i;
597 	void *vva;
598 
599 	if (spdk_unlikely(max_iovcnt < num_sgls)) {
600 		return -ERANGE;
601 	}
602 
603 	for (i = 0; i < num_sgls; i++) {
604 		if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) {
605 			SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type);
606 			return -EINVAL;
607 		}
608 		vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE);
609 		if (spdk_unlikely(vva == NULL)) {
610 			SPDK_ERRLOG("GPA to VVA failed\n");
611 			return -EINVAL;
612 		}
613 		iovs[i].iov_base = vva;
614 		iovs[i].iov_len = sgls[i].unkeyed.length;
615 	}
616 
617 	return num_sgls;
618 }
619 
620 static int
621 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
622 		  uint32_t len, size_t mps,
623 		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
624 {
625 	struct spdk_nvme_sgl_descriptor *sgl, *last_sgl;
626 	uint32_t num_sgls, seg_len;
627 	void *vva;
628 	int ret;
629 	uint32_t total_iovcnt = 0;
630 
631 	/* SGL cases */
632 	sgl = &cmd->dptr.sgl1;
633 
634 	/* only one SGL segment */
635 	if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
636 		assert(max_iovcnt > 0);
637 		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE);
638 		if (spdk_unlikely(vva == NULL)) {
639 			SPDK_ERRLOG("GPA to VVA failed\n");
640 			return -EINVAL;
641 		}
642 		iovs[0].iov_base = vva;
643 		iovs[0].iov_len = sgl->unkeyed.length;
644 		assert(sgl->unkeyed.length == len);
645 
646 		return 1;
647 	}
648 
649 	for (;;) {
650 		if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) &&
651 				  (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) {
652 			SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type);
653 			return -EINVAL;
654 		}
655 
656 		seg_len = sgl->unkeyed.length;
657 		if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) {
658 			SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len);
659 			return -EINVAL;
660 		}
661 
662 		num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor);
663 		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ);
664 		if (spdk_unlikely(vva == NULL)) {
665 			SPDK_ERRLOG("GPA to VVA failed\n");
666 			return -EINVAL;
667 		}
668 
669 		/* sgl now points to the first descriptor of this segment */
670 		sgl = (struct spdk_nvme_sgl_descriptor *)vva;
671 		last_sgl = &sgl[num_sgls - 1];
672 
673 		/* we are done */
674 		if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
675 			/* map whole sgl list */
676 			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt],
677 						     max_iovcnt - total_iovcnt, gpa_to_vva);
678 			if (spdk_unlikely(ret < 0)) {
679 				return ret;
680 			}
681 			total_iovcnt += ret;
682 
683 			return total_iovcnt;
684 		}
685 
686 		if (num_sgls > 1) {
687 			/* map the whole segment, excluding last_sgl */
688 			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt],
689 						     max_iovcnt - total_iovcnt, gpa_to_vva);
690 			if (spdk_unlikely(ret < 0)) {
691 				return ret;
692 			}
693 			total_iovcnt += ret;
694 		}
695 
696 		/* move to next level's segments */
697 		sgl = last_sgl;
698 	}
699 
700 	return 0;
701 }
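
/*
 * Illustrative case: if SGL1 is a SEGMENT descriptor whose segment contains
 * three DATA_BLOCK descriptors, nvme_cmd_map_sgls() maps that segment once
 * (read-only), sees that its last descriptor is a data block, maps all three
 * data blocks into iovs[0..2] and returns 3.
 */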
702 
703 static int
704 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
705 	     uint32_t len, size_t mps,
706 	     void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
707 {
708 	if (cmd->psdt == SPDK_NVME_PSDT_PRP) {
709 		return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
710 	}
711 
712 	return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
713 }
714 
715 static char *
716 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint)
717 {
718 	return endpoint->trid.traddr;
719 }
720 
721 static char *
722 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr)
723 {
724 	if (!ctrlr || !ctrlr->endpoint) {
725 		return "Null Ctrlr";
726 	}
727 
728 	return endpoint_id(ctrlr->endpoint);
729 }
730 
731 static void
732 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
733 {
734 	const struct spdk_nvmf_registers *regs;
735 
736 	assert(vu_ctrlr != NULL);
737 	assert(vu_ctrlr->ctrlr != NULL);
738 
739 	regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr);
740 	if (regs->csts.bits.cfs == 0) {
741 		SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr));
742 	}
743 
744 	nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr);
745 }
746 
747 static inline bool
748 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
749 {
750 	assert(vu_ctrlr != NULL);
751 	assert(vu_ctrlr->endpoint != NULL);
752 
753 	vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space;
754 
755 	return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe);
756 }
757 
758 static void
759 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint)
760 {
761 	SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint));
762 
763 	spdk_interrupt_unregister(&endpoint->accept_intr);
764 	spdk_poller_unregister(&endpoint->accept_poller);
765 
766 	if (endpoint->doorbells) {
767 		munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
768 	}
769 
770 	if (endpoint->devmem_fd > 0) {
771 		close(endpoint->devmem_fd);
772 	}
773 
774 	if (endpoint->migr_data) {
775 		munmap(endpoint->migr_data, vfio_user_migr_data_len());
776 	}
777 
778 	if (endpoint->migr_fd > 0) {
779 		close(endpoint->migr_fd);
780 	}
781 
782 	if (endpoint->vfu_ctx) {
783 		vfu_destroy_ctx(endpoint->vfu_ctx);
784 	}
785 
786 	pthread_mutex_destroy(&endpoint->lock);
787 	free(endpoint);
788 }
789 
790 /* called when process exits */
791 static int
792 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport,
793 		       spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
794 {
795 	struct nvmf_vfio_user_transport *vu_transport;
796 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
797 
798 	SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n");
799 
800 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
801 					transport);
802 
803 	pthread_mutex_destroy(&vu_transport->lock);
804 	pthread_mutex_destroy(&vu_transport->pg_lock);
805 
806 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
807 		TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
808 		nvmf_vfio_user_destroy_endpoint(endpoint);
809 	}
810 
811 	free(vu_transport);
812 
813 	if (cb_fn) {
814 		cb_fn(cb_arg);
815 	}
816 
817 	return 0;
818 }
819 
820 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = {
821 	{
822 		"disable_mappable_bar0",
823 		offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0),
824 		spdk_json_decode_bool, true
825 	},
826 };
827 
828 static struct spdk_nvmf_transport *
829 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts)
830 {
831 	struct nvmf_vfio_user_transport *vu_transport;
832 	int err;
833 
834 	if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) {
835 		SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n",
836 			    opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR);
837 		return NULL;
838 	}
839 
840 	vu_transport = calloc(1, sizeof(*vu_transport));
841 	if (vu_transport == NULL) {
842 		SPDK_ERRLOG("Transport alloc fail: %m\n");
843 		return NULL;
844 	}
845 
846 	err = pthread_mutex_init(&vu_transport->lock, NULL);
847 	if (err != 0) {
848 		SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
849 		goto err;
850 	}
851 	TAILQ_INIT(&vu_transport->endpoints);
852 
853 	err = pthread_mutex_init(&vu_transport->pg_lock, NULL);
854 	if (err != 0) {
855 		pthread_mutex_destroy(&vu_transport->lock);
856 		SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
857 		goto err;
858 	}
859 	TAILQ_INIT(&vu_transport->poll_groups);
860 
861 	if (opts->transport_specific != NULL &&
862 	    spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder,
863 					    SPDK_COUNTOF(vfio_user_transport_opts_decoder),
864 					    vu_transport)) {
865 		SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n");
866 		goto cleanup;
867 	}
868 
869 	SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n",
870 		      vu_transport->transport_opts.disable_mappable_bar0);
871 
872 	/*
873 	 * To support interrupt mode, the transport must be configured with
874 	 * mappable BAR0 disabled: we need a vfio-user message to wake us up
875 	 * when a client writes new doorbell values to BAR0, via the
876 	 * libvfio-user socket fd.
877 	 */
878 	vu_transport->intr_mode_supported =
879 		vu_transport->transport_opts.disable_mappable_bar0;
880 
881 	return &vu_transport->transport;
882 
883 cleanup:
884 	pthread_mutex_destroy(&vu_transport->lock);
885 	pthread_mutex_destroy(&vu_transport->pg_lock);
886 err:
887 	free(vu_transport);
888 	return NULL;
889 }
890 
891 static uint32_t
892 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr)
893 {
894 	assert(vu_ctrlr != NULL);
895 	assert(vu_ctrlr->ctrlr != NULL);
896 
897 	return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1;
898 }
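
/*
 * CAP.MQES is zero-based, so e.g. an advertised MQES of 255 means
 * max_queue_size() returns 256, i.e. I/O queues of up to 256 entries.
 */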
899 
900 static void *
901 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov, int prot)
902 {
903 	int ret;
904 
905 	assert(ctx != NULL);
906 	assert(sg != NULL);
907 	assert(iov != NULL);
908 
909 	ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot);
910 	if (ret < 0) {
911 		return NULL;
912 	}
913 
914 	ret = vfu_map_sg(ctx, sg, iov, 1, 0);
915 	if (ret != 0) {
916 		return NULL;
917 	}
918 
919 	assert(iov->iov_base != NULL);
920 	return iov->iov_base;
921 }
922 
923 static int
924 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping,
925       uint32_t q_size, bool is_cq, bool unmap)
926 {
927 	uint64_t len;
928 	void *ret;
929 
930 	assert(q_size);
931 	assert(q_addr(mapping) == NULL);
932 
933 	if (is_cq) {
934 		len = q_size * sizeof(struct spdk_nvme_cpl);
935 	} else {
936 		len = q_size * sizeof(struct spdk_nvme_cmd);
937 	}
938 
939 	ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len,
940 		      mapping->sg, &mapping->iov,
941 		      is_cq ? PROT_READ | PROT_WRITE : PROT_READ);
942 	if (ret == NULL) {
943 		return -EFAULT;
944 	}
945 
946 	if (unmap) {
947 		memset(q_addr(mapping), 0, len);
948 	}
949 
950 	return 0;
951 }
952 
953 static inline void
954 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping)
955 {
956 	if (q_addr(mapping) != NULL) {
957 		vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, mapping->sg,
958 			     &mapping->iov, 1);
959 		mapping->iov.iov_base = NULL;
960 	}
961 }
962 
963 static int
964 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr)
965 {
966 	struct nvmf_vfio_user_sq *sq;
967 	const struct spdk_nvmf_registers *regs;
968 	int ret;
969 
970 	assert(ctrlr != NULL);
971 
972 	sq = ctrlr->sqs[0];
973 
974 	assert(sq != NULL);
975 	assert(q_addr(&sq->mapping) == NULL);
976 	/* XXX ctrlr->asq == 0 is a valid memory address */
977 
978 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
979 	sq->qid = 0;
980 	sq->size = regs->aqa.bits.asqs + 1;
981 	sq->mapping.prp1 = regs->asq;
982 	*sq_headp(sq) = 0;
983 	sq->cqid = 0;
984 
985 	ret = map_q(ctrlr, &sq->mapping, sq->size, false, true);
986 	if (ret) {
987 		return ret;
988 	}
989 
990 	*sq_dbl_tailp(ctrlr, sq) = 0;
991 
992 	return 0;
993 }
994 
995 static int
996 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr)
997 {
998 	struct nvmf_vfio_user_cq *cq;
999 	const struct spdk_nvmf_registers *regs;
1000 	int ret;
1001 
1002 	assert(ctrlr != NULL);
1003 
1004 	cq = ctrlr->cqs[0];
1005 
1006 	assert(cq != NULL);
1007 
1008 	assert(q_addr(&cq->mapping) == NULL);
1009 
1010 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
1011 	assert(regs != NULL);
1012 	cq->qid = 0;
1013 	cq->size = regs->aqa.bits.acqs + 1;
1014 	cq->mapping.prp1 = regs->acq;
1015 	*cq_tailp(cq) = 0;
1016 	cq->ien = true;
1017 	cq->phase = true;
1018 
1019 	ret = map_q(ctrlr, &cq->mapping, cq->size, true, true);
1020 	if (ret) {
1021 		return ret;
1022 	}
1023 
1024 	*cq_dbl_headp(ctrlr, cq) = 0;
1025 
1026 	return 0;
1027 }
1028 
1029 static inline dma_sg_t *
1030 vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt)
1031 {
1032 	return (dma_sg_t *)(vu_req->sg + iovcnt * dma_sg_size());
1033 }
1034 
1035 static void *
1036 _map_one(void *prv, uint64_t addr, uint64_t len, int prot)
1037 {
1038 	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
1039 	struct spdk_nvmf_qpair *qpair;
1040 	struct nvmf_vfio_user_req *vu_req;
1041 	struct nvmf_vfio_user_sq *sq;
1042 	void *ret;
1043 
1044 	assert(req != NULL);
1045 	qpair = req->qpair;
1046 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
1047 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
1048 
1049 	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
1050 	ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len,
1051 		      vu_req_to_sg_t(vu_req, vu_req->iovcnt),
1052 		      &vu_req->iov[vu_req->iovcnt], prot);
1053 	if (spdk_likely(ret != NULL)) {
1054 		vu_req->iovcnt++;
1055 	}
1056 	return ret;
1057 }
1058 
1059 static int
1060 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
1061 		  struct iovec *iov, uint32_t length)
1062 {
1063 	/* Map the command's data buffers from guest physical memory to
1064 	 * local virtual memory addresses.
1065 	 */
1066 	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
1067 			    length, 4096, _map_one);
1068 }
1069 
1070 static int
1071 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
1072 	       struct nvmf_vfio_user_sq *sq);
1073 
1074 /*
1075  * Posts a CQE in the completion queue.
1076  *
1077  * @ctrlr: the vfio-user controller
1078  * @cq: the completion queue
1079  * @cdw0: cdw0 as reported by NVMf
1080  * @sqid: submission queue ID
1081  * @cid: command identifier in NVMe command
1082  * @sc: the NVMe CQE status code
1083  * @sct: the NVMe CQE status code type
1084  */
1085 static int
1086 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq,
1087 		uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct)
1088 {
1089 	struct spdk_nvme_cpl *cpl;
1090 	const struct spdk_nvmf_registers *regs;
1091 	int err;
1092 
1093 	assert(ctrlr != NULL);
1094 
1095 	if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) {
1096 		return 0;
1097 	}
1098 
1099 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
1100 	if (regs->csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
1101 		SPDK_DEBUGLOG(nvmf_vfio,
1102 			      "%s: ignore completion SQ%d cid=%d status=%#x\n",
1103 			      ctrlr_id(ctrlr), sqid, cid, sc);
1104 		return 0;
1105 	}
1106 
1107 	if (cq_is_full(ctrlr, cq)) {
1108 		SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
1109 			    ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq),
1110 			    *cq_dbl_headp(ctrlr, cq));
1111 		return -1;
1112 	}
1113 
1114 	cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq);
1115 
1116 	assert(ctrlr->sqs[sqid] != NULL);
1117 	SPDK_DEBUGLOG(nvmf_vfio,
1118 		      "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
1119 		      ctrlr_id(ctrlr), sqid, cid, sc, *sq_headp(ctrlr->sqs[sqid]),
1120 		      *cq_tailp(cq));
1121 
1122 	cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]);
1123 	cpl->sqid = sqid;
1124 	cpl->cid = cid;
1125 	cpl->cdw0 = cdw0;
1126 	cpl->status.dnr = 0x0;
1127 	cpl->status.m = 0x0;
1128 	cpl->status.sct = sct;
1129 	cpl->status.sc = sc;
1130 	cpl->status.p = cq->phase;
1131 
1132 	/* Ensure the Completion Queue Entry is visible. */
1133 	spdk_wmb();
1134 	cq_tail_advance(cq);
1135 
1136 	/*
1137 	 * This function now executes in SPDK thread context, but we
1138 	 * might be triggering interrupts from vfio-user thread context, so
1139 	 * check for race conditions.
1140 	 */
1141 	if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) {
1142 		err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
1143 		if (err != 0) {
1144 			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
1145 				    ctrlr_id(ctrlr));
1146 			return err;
1147 		}
1148 	}
1149 
1150 	return 0;
1151 }
1152 
1153 static bool
1154 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq)
1155 {
1156 	assert(vu_ctrlr != NULL);
1157 
1158 	if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) {
1159 		return false;
1160 	}
1161 
1162 	if (is_cq) {
1163 		if (vu_ctrlr->cqs[qid] == NULL) {
1164 			return false;
1165 		}
1166 
1167 		return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED &&
1168 			vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED);
1169 	}
1170 
1171 	if (vu_ctrlr->sqs[qid] == NULL) {
1172 		return false;
1173 	}
1174 
1175 	return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED &&
1176 		vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED);
1177 }
1178 
1179 static void
1180 free_sq_reqs(struct nvmf_vfio_user_sq *sq)
1181 {
1182 	while (!TAILQ_EMPTY(&sq->free_reqs)) {
1183 		struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs);
1184 		TAILQ_REMOVE(&sq->free_reqs, vu_req, link);
1185 		free(vu_req);
1186 	}
1187 }
1188 
1189 /* Deletes an SQ. If this SQ is the last user of the associated CQ
1190  * and the controller is being shut down or reset, then the CQ is
1191  * also deleted.
1192  */
1193 static void
1194 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq)
1195 {
1196 	struct nvmf_vfio_user_cq *cq;
1197 	uint16_t cqid;
1198 
1199 	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete SQ%d=%p done\n", ctrlr_id(vu_ctrlr),
1200 		      sq->qid, sq);
1201 
1202 	/* Free SQ resources */
1203 	unmap_q(vu_ctrlr, &sq->mapping);
1204 
1205 	free_sq_reqs(sq);
1206 
1207 	sq->size = 0;
1208 
1209 	sq->sq_state = VFIO_USER_SQ_DELETED;
1210 
1211 	/* Controller RESET and SHUTDOWN are special cases: the
1212 	 * VM may not send DELETE IO SQ/CQ commands, so the NVMf
1213 	 * library will disconnect the IO queue pairs.
1214 	 */
1215 	if (vu_ctrlr->reset_shn) {
1216 		cqid = sq->cqid;
1217 		cq = vu_ctrlr->cqs[cqid];
1218 
1219 		SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete CQ%d=%p\n", ctrlr_id(vu_ctrlr),
1220 			      cq->qid, cq);
1221 
1222 		if (cq->cq_ref) {
1223 			cq->cq_ref--;
1224 		}
1225 		if (cq->cq_ref == 0) {
1226 			unmap_q(vu_ctrlr, &cq->mapping);
1227 			cq->size = 0;
1228 			cq->cq_state = VFIO_USER_CQ_DELETED;
1229 			cq->group = NULL;
1230 		}
1231 	}
1232 }
1233 
1234 static void
1235 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid)
1236 {
1237 	struct nvmf_vfio_user_sq *sq;
1238 	struct nvmf_vfio_user_cq *cq;
1239 
1240 	if (ctrlr == NULL) {
1241 		return;
1242 	}
1243 
1244 	sq = ctrlr->sqs[qid];
1245 	if (sq) {
1246 		SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid);
1247 		unmap_q(ctrlr, &sq->mapping);
1248 
1249 		free_sq_reqs(sq);
1250 
1251 		free(sq->mapping.sg);
1252 		free(sq);
1253 		ctrlr->sqs[qid] = NULL;
1254 	}
1255 
1256 	cq = ctrlr->cqs[qid];
1257 	if (cq) {
1258 		SPDK_DEBUGLOG(nvmf_vfio, "%s: Free CQ %u\n", ctrlr_id(ctrlr), qid);
1259 		unmap_q(ctrlr, &cq->mapping);
1260 		free(cq->mapping.sg);
1261 		free(cq);
1262 		ctrlr->cqs[qid] = NULL;
1263 	}
1264 }
1265 
1266 static int
1267 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport,
1268 	const uint16_t id)
1269 {
1270 	struct nvmf_vfio_user_sq *sq;
1271 
1272 	assert(ctrlr != NULL);
1273 	assert(transport != NULL);
1274 	assert(ctrlr->sqs[id] == NULL);
1275 
1276 	sq = calloc(1, sizeof(*sq));
1277 	if (sq == NULL) {
1278 		return -ENOMEM;
1279 	}
1280 	sq->mapping.sg = calloc(1, dma_sg_size());
1281 	if (sq->mapping.sg == NULL) {
1282 		free(sq);
1283 		return -ENOMEM;
1284 	}
1285 
1286 	sq->qid = id;
1287 	sq->qpair.qid = id;
1288 	sq->qpair.transport = transport;
1289 	sq->ctrlr = ctrlr;
1290 	ctrlr->sqs[id] = sq;
1291 
1292 	TAILQ_INIT(&sq->free_reqs);
1293 
1294 	return 0;
1295 }
1296 
1297 static int
1298 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id)
1299 {
1300 	struct nvmf_vfio_user_cq *cq;
1301 
1302 	assert(vu_ctrlr != NULL);
1303 	assert(vu_ctrlr->cqs[id] == NULL);
1304 
1305 	cq = calloc(1, sizeof(*cq));
1306 	if (cq == NULL) {
1307 		return -ENOMEM;
1308 	}
1309 	cq->mapping.sg = calloc(1, dma_sg_size());
1310 	if (cq->mapping.sg == NULL) {
1311 		free(cq);
1312 		return -ENOMEM;
1313 	}
1314 
1315 	cq->qid = id;
1316 	vu_ctrlr->cqs[id] = cq;
1317 
1318 	return 0;
1319 }
1320 
1321 static int
1322 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq)
1323 {
1324 	struct nvmf_vfio_user_req *vu_req, *tmp;
1325 	size_t req_size;
1326 	uint32_t i;
1327 
1328 	req_size = sizeof(struct nvmf_vfio_user_req) +
1329 		   (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS);
1330 
1331 	for (i = 0; i < sq->size; i++) {
1332 		struct spdk_nvmf_request *req;
1333 
1334 		vu_req = calloc(1, req_size);
1335 		if (vu_req == NULL) {
1336 			goto err;
1337 		}
1338 
1339 		req = &vu_req->req;
1340 		req->qpair = &sq->qpair;
1341 		req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
1342 		req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;
1343 
1344 		TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link);
1345 	}
1346 
1347 	return 0;
1348 
1349 err:
1350 	TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) {
1351 		free(vu_req);
1352 	}
1353 	return -ENOMEM;
1354 }
1355 
1356 static uint16_t
1357 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr,
1358 		    struct spdk_nvme_cmd *cmd, uint16_t *sct)
1359 {
1360 	struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport;
1361 	struct nvmf_vfio_user_sq *sq;
1362 	uint32_t qsize;
1363 	uint16_t cqid;
1364 	uint16_t qid;
1365 	int err;
1366 
1367 	qid = cmd->cdw10_bits.create_io_q.qid;
1368 	cqid = cmd->cdw11_bits.create_io_sq.cqid;
1369 	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
1370 
1371 	if (ctrlr->sqs[qid] == NULL) {
1372 		err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid);
1373 		if (err != 0) {
1374 			*sct = SPDK_NVME_SCT_GENERIC;
1375 			return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1376 		}
1377 	}
1378 
1379 	if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) {
1380 		SPDK_ERRLOG("%s: invalid CQID %u\n", ctrlr_id(ctrlr), cqid);
1381 		*sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1382 		return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1383 	}
1384 
1385 	/* CQ must be created before SQ. */
1386 	if (!io_q_exists(ctrlr, cqid, true)) {
1387 		SPDK_ERRLOG("%s: CQ%u does not exist\n", ctrlr_id(ctrlr), cqid);
1388 		*sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1389 		return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
1390 	}
1391 
1392 	if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
1393 		SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
1394 		*sct = SPDK_NVME_SCT_GENERIC;
1395 		return SPDK_NVME_SC_INVALID_FIELD;
1396 	}
1397 
1398 	sq = ctrlr->sqs[qid];
1399 	sq->size = qsize;
1400 
1401 	SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
1402 		      qid, cqid);
1403 
1404 	sq->mapping.prp1 = cmd->dptr.prp.prp1;
1405 
1406 	err = map_q(ctrlr, &sq->mapping, sq->size, false, true);
1407 	if (err) {
1408 		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
1409 		*sct = SPDK_NVME_SCT_GENERIC;
1410 		return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1411 	}
1412 
1413 	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped SQ%d IOVA=%#lx vaddr=%p\n",
1414 		      ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1,
1415 		      q_addr(&sq->mapping));
1416 
1417 	err = alloc_sq_reqs(ctrlr, sq);
1418 	if (err < 0) {
1419 		SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr));
1420 		*sct = SPDK_NVME_SCT_GENERIC;
1421 		return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1422 	}
1423 
1424 	sq->cqid = cqid;
1425 	ctrlr->cqs[sq->cqid]->cq_ref++;
1426 	sq->sq_state = VFIO_USER_SQ_CREATED;
1427 	*sq_headp(sq) = 0;
1428 	*sq_dbl_tailp(ctrlr, sq) = 0;
1429 
1430 	/*
1431 	 * Create our new I/O qpair. This asynchronously invokes, on a suitable
1432 	 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will
1433 	 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics
1434 	 * connect command. This command is then eventually completed via
1435 	 * handle_queue_connect_rsp().
1436 	 */
1437 	sq->create_io_sq_cmd = *cmd;
1438 	sq->post_create_io_sq_completion = true;
1439 
1440 	spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt,
1441 				&sq->qpair);
1442 
1443 	*sct = SPDK_NVME_SCT_GENERIC;
1444 	return SPDK_NVME_SC_SUCCESS;
1445 }
1446 
1447 static uint16_t
1448 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr,
1449 		    struct spdk_nvme_cmd *cmd, uint16_t *sct)
1450 {
1451 	struct nvmf_vfio_user_cq *cq;
1452 	uint32_t qsize;
1453 	uint16_t qid;
1454 	int err;
1455 
1456 	qid = cmd->cdw10_bits.create_io_q.qid;
1457 	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
1458 
1459 	if (ctrlr->cqs[qid] == NULL) {
1460 		err = init_cq(ctrlr, qid);
1461 		if (err != 0) {
1462 			*sct = SPDK_NVME_SCT_GENERIC;
1463 			return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1464 		}
1465 	}
1466 
1467 	if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
1468 		SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr));
1469 		*sct = SPDK_NVME_SCT_GENERIC;
1470 		return SPDK_NVME_SC_INVALID_FIELD;
1471 	}
1472 
1473 	if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) {
1474 		SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr));
1475 		*sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1476 		return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR;
1477 	}
1478 
1479 	cq = ctrlr->cqs[qid];
1480 	cq->size = qsize;
1481 
1482 	cq->mapping.prp1 = cmd->dptr.prp.prp1;
1483 
1484 	err = map_q(ctrlr, &cq->mapping, cq->size, true, true);
1485 	if (err) {
1486 		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
1487 		*sct = SPDK_NVME_SCT_GENERIC;
1488 		return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1489 	}
1490 
1491 	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped CQ%d IOVA=%#lx vaddr=%p\n",
1492 		      ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1,
1493 		      q_addr(&cq->mapping));
1494 
1495 	cq->ien = cmd->cdw11_bits.create_io_cq.ien;
1496 	cq->iv = cmd->cdw11_bits.create_io_cq.iv;
1497 	cq->phase = true;
1498 	cq->cq_state = VFIO_USER_CQ_CREATED;
1499 
1500 	*cq_tailp(cq) = 0;
1501 	*cq_dbl_headp(ctrlr, cq) = 0;
1502 
1503 	*sct = SPDK_NVME_SCT_GENERIC;
1504 	return SPDK_NVME_SC_SUCCESS;
1505 }
1506 
1507 /*
1508  * Creates a completion or submission I/O queue. Returns 0 on success, -errno
1509  * on error.
1510  */
1511 static int
1512 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
1513 		   struct spdk_nvme_cmd *cmd, const bool is_cq)
1514 {
1515 	struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport;
1516 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
1517 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
1518 	uint32_t qsize;
1519 	uint16_t qid;
1520 
1521 	assert(ctrlr != NULL);
1522 	assert(cmd != NULL);
1523 
1524 	qid = cmd->cdw10_bits.create_io_q.qid;
1525 	if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) {
1526 		SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr),
1527 			    qid, vu_transport->transport.opts.max_qpairs_per_ctrlr);
1528 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1529 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1530 		goto out;
1531 	}
1532 
1533 	if (io_q_exists(ctrlr, qid, is_cq)) {
1534 		SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr),
1535 			    is_cq ? 'C' : 'S', qid);
1536 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1537 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1538 		goto out;
1539 	}
1540 
1541 	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
1542 	if (qsize == 1 || qsize > max_queue_size(ctrlr)) {
1543 		SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize);
1544 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1545 		sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE;
1546 		goto out;
1547 	}
1548 
1549 	if (is_cq) {
1550 		sc = handle_create_io_cq(ctrlr, cmd, &sct);
1551 	} else {
1552 		sc = handle_create_io_sq(ctrlr, cmd, &sct);
1553 
1554 		if (sct == SPDK_NVME_SCT_GENERIC &&
1555 		    sc == SPDK_NVME_SC_SUCCESS) {
1556 			/* Completion posted asynchronously. */
1557 			return 0;
1558 		}
1559 	}
1560 
1561 out:
1562 	return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct);
1563 }
1564 
1565 /* For the DELETE I/O SUBMISSION QUEUE admin command, the NVMf library will disconnect
1566  * and free the queue pair, so save the command in a context.
1567  */
1568 struct vfio_user_delete_sq_ctx {
1569 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
1570 	struct spdk_nvme_cmd delete_io_sq_cmd;
1571 };
1572 
1573 static void
1574 vfio_user_qpair_delete_cb(void *cb_arg)
1575 {
1576 	struct vfio_user_delete_sq_ctx *ctx = cb_arg;
1577 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr;
1578 
1579 	post_completion(vu_ctrlr, vu_ctrlr->cqs[0], 0, 0, ctx->delete_io_sq_cmd.cid,
1580 			SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
1581 	free(ctx);
1582 }
1583 
1584 /*
1585  * Deletes a completion or submission I/O queue.
1586  */
1587 static int
1588 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
1589 		struct spdk_nvme_cmd *cmd, const bool is_cq)
1590 {
1591 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
1592 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
1593 	struct nvmf_vfio_user_sq *sq;
1594 	struct nvmf_vfio_user_cq *cq;
1595 	struct vfio_user_delete_sq_ctx *ctx;
1596 
1597 	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n",
1598 		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
1599 		      cmd->cdw10_bits.delete_io_q.qid);
1600 
1601 	if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) {
1602 		SPDK_ERRLOG("%s: I/O %cQ%d does not exist\n", ctrlr_id(ctrlr),
1603 			    is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid);
1604 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1605 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1606 		goto out;
1607 	}
1608 
1609 	if (is_cq) {
1610 		cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid];
1611 		if (cq->cq_ref) {
1612 			SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr));
1613 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1614 			sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION;
1615 			goto out;
1616 		}
1617 
1618 		unmap_q(ctrlr, &cq->mapping);
1619 		cq->size = 0;
1620 		cq->cq_state = VFIO_USER_CQ_DELETED;
1621 		cq->group = NULL;
1622 	} else {
1623 		ctx = calloc(1, sizeof(*ctx));
1624 		if (!ctx) {
1625 			sct = SPDK_NVME_SCT_GENERIC;
1626 			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1627 			goto out;
1628 		}
1629 		ctx->vu_ctrlr = ctrlr;
1630 		ctx->delete_io_sq_cmd = *cmd;
1631 
1632 		sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid];
1633 		sq->sq_state = VFIO_USER_SQ_DELETED;
1634 		assert(ctrlr->cqs[sq->cqid]->cq_ref);
1635 		ctrlr->cqs[sq->cqid]->cq_ref--;
1636 
1637 		spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx);
1638 		return 0;
1639 	}
1640 
1641 out:
1642 	return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct);
1643 }
1644 
1645 /*
1646  * Returns 0 on success and -errno on error.
1647  */
1648 static int
1649 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd)
1650 {
1651 	assert(ctrlr != NULL);
1652 	assert(cmd != NULL);
1653 
1654 	if (cmd->fuse != 0) {
1655 		/* Fused admin commands are not supported. */
1656 		return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid,
1657 				       SPDK_NVME_SC_INVALID_FIELD,
1658 				       SPDK_NVME_SCT_GENERIC);
1659 	}
1660 
1661 	switch (cmd->opc) {
1662 	case SPDK_NVME_OPC_CREATE_IO_CQ:
1663 	case SPDK_NVME_OPC_CREATE_IO_SQ:
1664 		return handle_create_io_q(ctrlr, cmd,
1665 					  cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ);
1666 	case SPDK_NVME_OPC_DELETE_IO_SQ:
1667 	case SPDK_NVME_OPC_DELETE_IO_CQ:
1668 		return handle_del_io_q(ctrlr, cmd,
1669 				       cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ);
1670 	default:
1671 		return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]);
1672 	}
1673 }
1674 
1675 static int
1676 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg)
1677 {
1678 	struct nvmf_vfio_user_sq *sq = cb_arg;
1679 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr;
1680 	uint16_t sqid, cqid;
1681 
1682 	assert(sq != NULL);
1683 	assert(vu_req != NULL);
1684 	assert(vu_ctrlr != NULL);
1685 
1686 	if (spdk_likely(vu_req->iovcnt)) {
1687 		vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx,
1688 			     vu_req_to_sg_t(vu_req, 0),
1689 			     vu_req->iov, vu_req->iovcnt);
1690 	}
1691 	sqid = sq->qid;
1692 	cqid = sq->cqid;
1693 
1694 	return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid],
1695 			       vu_req->req.rsp->nvme_cpl.cdw0,
1696 			       sqid,
1697 			       vu_req->req.cmd->nvme_cmd.cid,
1698 			       vu_req->req.rsp->nvme_cpl.status.sc,
1699 			       vu_req->req.rsp->nvme_cpl.status.sct);
1700 }
1701 
1702 static int
1703 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq,
1704 	    struct spdk_nvme_cmd *cmd)
1705 {
1706 	assert(sq != NULL);
1707 	if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
1708 		return consume_admin_cmd(ctrlr, cmd);
1709 	}
1710 
1711 	return handle_cmd_req(ctrlr, cmd, sq);
1712 }
1713 
1714 /* Returns the number of commands processed, or a negative value on error. */
1715 static int
1716 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail,
1717 		     struct nvmf_vfio_user_sq *sq)
1718 {
1719 	struct spdk_nvme_cmd *queue;
1720 	int count = 0;
1721 
1722 	assert(ctrlr != NULL);
1723 	assert(sq != NULL);
1724 
1725 	queue = q_addr(&sq->mapping);
1726 	while (*sq_headp(sq) != new_tail) {
1727 		int err;
1728 		struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)];
1729 
1730 		count++;
1731 
1732 		/*
1733 		 * SQHD must contain the new head pointer, so we must increase
1734 		 * it before we generate a completion.
1735 		 */
1736 		sq_head_advance(sq);
1737 
1738 		err = consume_cmd(ctrlr, sq, cmd);
1739 		if (err != 0) {
1740 			return err;
1741 		}
1742 	}
1743 
1744 	return count;
1745 }
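
/*
 * Example: if the SQ head is 5 and the guest writes 8 to this SQ's tail
 * doorbell, the loop in handle_sq_tdbl_write() consumes the commands in
 * slots 5, 6 and 7, advancing the head before each one so that SQHD in the
 * posted completion already reflects the consumed slot, and returns 3.
 */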
1746 
1747 static int
1748 enable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1749 {
1750 	int err;
1751 
1752 	assert(ctrlr != NULL);
1753 
1754 	err = acq_setup(ctrlr);
1755 	if (err != 0) {
1756 		return err;
1757 	}
1758 
1759 	err = asq_setup(ctrlr);
1760 	if (err != 0) {
1761 		return err;
1762 	}
1763 
1764 	return 0;
1765 }
1766 
1767 static void
1768 disable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1769 {
1770 	assert(ctrlr->sqs[0] != NULL);
1771 	assert(ctrlr->cqs[0] != NULL);
1772 
1773 	unmap_q(ctrlr, &ctrlr->sqs[0]->mapping);
1774 	unmap_q(ctrlr, &ctrlr->cqs[0]->mapping);
1775 
1776 	ctrlr->sqs[0]->size = 0;
1777 	*sq_headp(ctrlr->sqs[0]) = 0;
1778 	ctrlr->cqs[0]->size = 0;
1779 	*cq_dbl_headp(ctrlr, ctrlr->cqs[0]) = 0;
1780 }
1781 
1782 static void
1783 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
1784 {
1785 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1786 	struct nvmf_vfio_user_ctrlr *ctrlr;
1787 	struct nvmf_vfio_user_sq *sq;
1788 	struct nvmf_vfio_user_cq *cq;
1789 	void *map_start, *map_end;
1790 	int ret;
1791 
1792 	/*
1793 	 * We're not interested in any DMA regions that aren't mappable (we don't
1794 	 * support clients that don't share their memory).
1795 	 */
1796 	if (!info->vaddr) {
1797 		return;
1798 	}
1799 
1800 	map_start = info->mapping.iov_base;
1801 	map_end = info->mapping.iov_base + info->mapping.iov_len;
1802 
1803 	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
1804 	    (info->mapping.iov_len & MASK_2MB)) {
1805 		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n",
1806 			      info->vaddr, map_start, map_end);
1807 		return;
1808 	}
1809 
1810 	assert(endpoint != NULL);
1811 	if (endpoint->ctrlr == NULL) {
1812 		return;
1813 	}
1814 	ctrlr = endpoint->ctrlr;
1815 
1816 	SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint),
1817 		      map_start, map_end);
1818 
1819 	/* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering memory
1820 	 * with VFIO, so also check the protection bits here before registering.
1821 	 */
1822 	if (info->prot == (PROT_WRITE | PROT_READ)) {
1823 		ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len);
1824 		if (ret) {
1825 			SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n",
1826 				    map_start, map_end, ret);
1827 		}
1828 	}
1829 
1830 	pthread_mutex_lock(&endpoint->lock);
1831 	TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) {
1832 		if (sq->sq_state != VFIO_USER_SQ_INACTIVE) {
1833 			continue;
1834 		}
1835 
1836 		cq = ctrlr->cqs[sq->cqid];
1837 
1838 		/* For the shared CQ case, use q_addr() to avoid mapping the CQ multiple times */
1839 		if (cq->size && q_addr(&cq->mapping) == NULL) {
1840 			ret = map_q(ctrlr, &cq->mapping, cq->size, true, false);
1841 			if (ret) {
1842 				SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n",
1843 					      cq->qid, cq->mapping.prp1,
1844 					      cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl));
1845 				continue;
1846 			}
1847 		}
1848 
1849 		if (sq->size) {
1850 			ret = map_q(ctrlr, &sq->mapping, sq->size, false, false);
1851 			if (ret) {
1852 				SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n",
1853 					      sq->qid, sq->mapping.prp1,
1854 					      sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd));
1855 				continue;
1856 			}
1857 		}
1858 		sq->sq_state = VFIO_USER_SQ_ACTIVE;
1859 		SPDK_DEBUGLOG(nvmf_vfio, "Remap SQ %u successfully\n", sq->qid);
1860 	}
1861 	pthread_mutex_unlock(&endpoint->lock);
1862 }
1863 
1864 static void
1865 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
1866 {
1867 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1868 	struct nvmf_vfio_user_sq *sq;
1869 	struct nvmf_vfio_user_cq *cq;
1870 	void *map_start, *map_end;
1871 	int ret = 0;
1872 
1873 	if (!info->vaddr) {
1874 		return;
1875 	}
1876 
1877 	map_start = info->mapping.iov_base;
1878 	map_end = info->mapping.iov_base + info->mapping.iov_len;
1879 
1880 	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
1881 	    (info->mapping.iov_len & MASK_2MB)) {
1882 		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n",
1883 			      info->vaddr, map_start, map_end);
1884 		return;
1885 	}
1886 
1887 	assert(endpoint != NULL);
1888 	SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint),
1889 		      map_start, map_end);
1890 
1891 	if (endpoint->ctrlr != NULL) {
1892 		struct nvmf_vfio_user_ctrlr *ctrlr;
1893 		ctrlr = endpoint->ctrlr;
1894 
1895 		pthread_mutex_lock(&endpoint->lock);
1896 		TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) {
1897 			if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) {
1898 				unmap_q(ctrlr, &sq->mapping);
1899 				sq->sq_state = VFIO_USER_SQ_INACTIVE;
1900 			}
1901 
1902 			cq = ctrlr->cqs[sq->cqid];
1903 			if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) {
1904 				unmap_q(ctrlr, &cq->mapping);
1905 			}
1906 		}
1907 		pthread_mutex_unlock(&endpoint->lock);
1908 	}
1909 
1910 	if (info->prot == (PROT_WRITE | PROT_READ)) {
1911 		ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len);
1912 		if (ret) {
1913 			SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n",
1914 				    map_start, map_end, ret);
1915 		}
1916 	}
1917 }
1918 
1919 static int
1920 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
1921 {
1922 	struct nvmf_vfio_user_sq *sq = cb_arg;
1923 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
1924 	int ret;
1925 
1926 	assert(sq != NULL);
1927 	assert(req != NULL);
1928 
1929 	if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) {
1930 		assert(sq->ctrlr != NULL);
1931 		assert(req != NULL);
1932 
1933 		memcpy(req->req.data,
1934 		       &req->req.rsp->prop_get_rsp.value.u64,
1935 		       req->req.length);
1936 	} else {
1937 		assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET);
1938 		assert(sq->ctrlr != NULL);
1939 		vu_ctrlr = sq->ctrlr;
1940 
1941 		if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) {
1942 			union spdk_nvme_cc_register cc, diff;
1943 
1944 			cc.raw = req->req.cmd->prop_set_cmd.value.u64;
1945 			diff.raw = cc.raw ^ req->cc.raw;
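			/* Bits set in diff are the CC fields the host just changed. */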
1946 
1947 			if (diff.bits.en) {
1948 				if (cc.bits.en) {
1949 					SPDK_DEBUGLOG(nvmf_vfio, "%s: MAP Admin queue\n", ctrlr_id(vu_ctrlr));
1950 					ret = enable_admin_queue(vu_ctrlr);
1951 					if (ret) {
1952 						SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(vu_ctrlr));
1953 						return ret;
1954 					}
1955 					sq->sq_state = VFIO_USER_SQ_ACTIVE;
1956 					vu_ctrlr->reset_shn = false;
1957 				} else {
1958 					vu_ctrlr->reset_shn = true;
1959 				}
1960 			}
1961 
1962 			if (diff.bits.shn) {
1963 				if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) {
1964 					vu_ctrlr->reset_shn = true;
1965 				}
1966 			}
1967 
1968 			if (vu_ctrlr->reset_shn) {
1969 				SPDK_DEBUGLOG(nvmf_vfio,
1970 					      "%s: UNMAP Admin queue\n",
1971 					      ctrlr_id(vu_ctrlr));
1972 				sq->sq_state = VFIO_USER_SQ_INACTIVE;
1973 				disable_admin_queue(vu_ctrlr);
1974 				/* For PCIe controller reset or shutdown, we will drop all AER responses */
1975 				nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr);
1976 			}
1977 		}
1978 	}
1979 
1980 	return 0;
1981 }
1982 
1983 /*
1984  * Handles an access at offset 0x1000 or more; this is the non-mapped path when a
1985  * doorbell is read or written via access_bar0_fn().
1986  *
1987  * DSTRD is set to the fixed value 0 for NVMe-oF, so doorbells are packed 4 bytes
1988  * apart.
1989  */
1990 static int
1991 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf,
1992 		  const size_t count, loff_t pos, const bool is_write)
1993 {
1994 	assert(ctrlr != NULL);
1995 	assert(buf != NULL);
1996 
1997 	if (count != sizeof(uint32_t)) {
1998 		SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n",
1999 			    ctrlr_id(ctrlr), count);
2000 		errno = EINVAL;
2001 		return -1;
2002 	}
2003 
2004 	pos -= NVME_DOORBELLS_OFFSET;
2005 
2006 	/* pos must be dword aligned */
2007 	if ((pos & 0x3) != 0) {
2008 		SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos);
2009 		errno = EINVAL;
2010 		return -1;
2011 	}
2012 
2013 	/* convert byte offset to array index */
2014 	pos >>= 2;
2015 
2016 	if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) {
2017 		SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos);
2018 		errno = EINVAL;
2019 		return -1;
2020 	}
2021 
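	/*
	 * With DSTRD == 0, even doorbell indices are SQ tail doorbells and odd
	 * indices are CQ head doorbells (index = 2 * qid + is_cq).
	 */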
2022 	if (is_write) {
2023 		ctrlr->doorbells[pos] = *buf;
2024 		spdk_wmb();
2025 	} else {
2026 		spdk_rmb();
2027 		*buf = ctrlr->doorbells[pos];
2028 	}
2029 	return 0;
2030 }
2031 
2032 static size_t
2033 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr,
2034 			  char *buf, size_t count, loff_t pos,
2035 			  bool is_write)
2036 {
2037 	struct nvmf_vfio_user_req *req;
2038 	const struct spdk_nvmf_registers *regs;
2039 
2040 	/* Construct a Fabric Property Get/Set command and send it */
2041 	req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]);
2042 	if (req == NULL) {
2043 		errno = ENOBUFS;
2044 		return -1;
2045 	}
2046 	regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr);
2047 	req->cc.raw = regs->cc.raw;
2048 
2049 	req->cb_fn = nvmf_vfio_user_prop_req_rsp;
2050 	req->cb_arg = vu_ctrlr->sqs[0];
2051 	req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC;
2052 	req->req.cmd->prop_set_cmd.cid = 0;
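	/* attrib.size encodes the property width: 0 for a 4-byte, 1 for an 8-byte property. */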
2053 	req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1;
2054 	req->req.cmd->prop_set_cmd.ofst = pos;
2055 	if (is_write) {
2056 		req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET;
2057 		if (req->req.cmd->prop_set_cmd.attrib.size) {
2058 			req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf;
2059 		} else {
2060 			req->req.cmd->prop_set_cmd.value.u32.high = 0;
2061 			req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf;
2062 		}
2063 	} else {
2064 		req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET;
2065 	}
2066 	req->req.length = count;
2067 	req->req.data = buf;
2068 
2069 	spdk_nvmf_request_exec_fabrics(&req->req);
2070 
2071 	return count;
2072 }
2073 
2074 static ssize_t
2075 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos,
2076 	       bool is_write)
2077 {
2078 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2079 	struct nvmf_vfio_user_ctrlr *ctrlr;
2080 	int ret;
2081 
2082 	ctrlr = endpoint->ctrlr;
2083 	if (endpoint->need_async_destroy || !ctrlr) {
2084 		errno = EIO;
2085 		return -1;
2086 	}
2087 
2088 	SPDK_DEBUGLOG(nvmf_vfio,
2089 		      "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n",
2090 		      endpoint_id(endpoint), is_write ? "write" : "read",
2091 		      ctrlr, count, pos);
2092 
2093 	if (pos >= NVME_DOORBELLS_OFFSET) {
2094 		/*
2095 		 * The fact that the doorbells can be memory mapped doesn't mean
2096 		 * that the client (VFIO in QEMU) is obliged to memory map them,
2097 		 * it might still elect to access them via regular read/write;
2098 		 * we might also have had disable_mappable_bar0 set.
2099 		 */
2100 		ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count,
2101 					pos, is_write);
2102 		if (ret == 0) {
2103 			return count;
2104 		}
2105 		return ret;
2106 	}
2107 
2108 	return vfio_user_property_access(ctrlr, buf, count, pos, is_write);
2109 }
2110 
2111 static ssize_t
2112 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset,
2113 		  bool is_write)
2114 {
2115 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2116 
2117 	if (is_write) {
2118 		SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n",
2119 			    endpoint_id(endpoint), offset, offset + count);
2120 		errno = EINVAL;
2121 		return -1;
2122 	}
2123 
2124 	if (offset + count > NVME_REG_CFG_SIZE) {
2125 		SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n",
2126 			    endpoint_id(endpoint), offset, count,
2127 			    NVME_REG_CFG_SIZE);
2128 		errno = ERANGE;
2129 		return -1;
2130 	}
2131 
2132 	memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count);
2133 
2134 	return count;
2135 }
2136 
2137 static void
2138 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg)
2139 {
2140 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2141 
2142 	if (level >= LOG_DEBUG) {
2143 		SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
2144 	} else if (level >= LOG_INFO) {
2145 		SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
2146 	} else if (level >= LOG_NOTICE) {
2147 		SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg);
2148 	} else if (level >= LOG_WARNING) {
2149 		SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg);
2150 	} else {
2151 		SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg);
2152 	}
2153 }
2154 
2155 static int
2156 vfio_user_get_log_level(void)
2157 {
2158 	int level;
2159 
2160 	if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) {
2161 		return LOG_DEBUG;
2162 	}
2163 
2164 	level = spdk_log_to_syslog_level(spdk_log_get_level());
2165 	if (level < 0) {
2166 		return LOG_ERR;
2167 	}
2168 
2169 	return level;
2170 }
2171 
2172 static void
2173 init_pci_config_space(vfu_pci_config_space_t *p)
2174 {
2175 	/* MLBAR */
2176 	p->hdr.bars[0].raw = 0x0;
2177 	/* MUBAR */
2178 	p->hdr.bars[1].raw = 0x0;
2179 
2180 	/* vendor specific, let's set them to zero for now */
2181 	p->hdr.bars[3].raw = 0x0;
2182 	p->hdr.bars[4].raw = 0x0;
2183 	p->hdr.bars[5].raw = 0x0;
2184 
2185 	/* enable INTx */
2186 	p->hdr.intr.ipin = 0x1;
2187 }
2188 
2189 static void
2190 vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
2191 			   void *cb_arg, int status);
2192 
2193 static void
2194 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem,
2195 			       void *cb_arg, int status)
2196 {
2197 	struct nvmf_vfio_user_endpoint *endpoint = cb_arg;
2198 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
2199 	int ret;
2200 
2201 	SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status);
2202 
2203 	if (!vu_ctrlr) {
2204 		return;
2205 	}
2206 	vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2207 
2208 	/* Basically, once we call `vfu_device_quiesced` the device is unquiesced from
2209 	 * libvfio-user's perspective, so from the moment `vfio_user_dev_quiesce_done` returns
2210 	 * libvfio-user might quiesce the device again. However, because resuming the NVMf
2211 	 * subsystem is an asynchronous operation, that quiesce might come _before_ the NVMf
2212 	 * subsystem has been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we
2213 	 * need to check whether a quiesce was requested.
2214 	 */
2215 	if (vu_ctrlr->queued_quiesce) {
2216 		SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, pause again\n", ctrlr_id(vu_ctrlr));
2217 		vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
2218 		ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0,
2219 						vfio_user_dev_quiesce_done, vu_ctrlr);
2220 		if (ret < 0) {
2221 			vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2222 			SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret);
2223 		}
2224 	}
2225 }
2226 
2227 static void
2228 vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
2229 			   void *cb_arg, int status)
2230 {
2231 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg;
2232 	struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
2233 	int ret;
2234 
2235 	SPDK_DEBUGLOG(nvmf_vfio, "%s paused done with status %d\n", ctrlr_id(vu_ctrlr), status);
2236 
2237 	assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING);
2238 	vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
2239 	vfu_device_quiesced(endpoint->vfu_ctx, status);
2240 	vu_ctrlr->queued_quiesce = false;
2241 
2242 	/* `vfu_device_quiesced` can change the migration state,
2243 	 * so we need to re-check `vu_ctrlr->state`.
2244 	 */
2245 	if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) {
2246 		SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr));
2247 		return;
2248 	}
2249 
2250 	SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr));
2251 	vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
2252 	ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
2253 					 vfio_user_endpoint_resume_done, endpoint);
2254 	if (ret < 0) {
2255 		vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
2256 		SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret);
2257 	}
2258 }
2259 
2260 static int
2261 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx)
2262 {
2263 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2264 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
2265 	int ret;
2266 
2267 	if (!vu_ctrlr) {
2268 		return 0;
2269 	}
2270 
2271 	/* The NVMf library destroys the controller when it has no
2272 	 * connected queue pairs.
2273 	 */
2274 	if (!nvmf_subsystem_get_ctrlr((struct spdk_nvmf_subsystem *)endpoint->subsystem,
2275 				      vu_ctrlr->cntlid)) {
2276 		return 0;
2277 	}
2278 
2279 	SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr));
2280 
2281 	/* There is no race condition here as device quiesce callback
2282 	 * and nvmf_prop_set_cc() are running in the same thread context.
2283 	 */
2284 	if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) {
2285 		return 0;
2286 	} else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) {
2287 		return 0;
2288 	} else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) {
2289 		return 0;
2290 	}
2291 
2292 	switch (vu_ctrlr->state) {
2293 	case VFIO_USER_CTRLR_PAUSED:
2294 	case VFIO_USER_CTRLR_MIGRATING:
2295 		return 0;
2296 	case VFIO_USER_CTRLR_RUNNING:
2297 		vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
2298 		ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0,
2299 						vfio_user_dev_quiesce_done, vu_ctrlr);
2300 		if (ret < 0) {
2301 			vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2302 			SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret);
2303 			return 0;
2304 		}
2305 		break;
2306 	case VFIO_USER_CTRLR_RESUMING:
2307 		vu_ctrlr->queued_quiesce = true;
2308 		SPDK_DEBUGLOG(nvmf_vfio, "%s is busy, queueing quiesce request, current state %u\n", ctrlr_id(vu_ctrlr),
2309 			      vu_ctrlr->state);
2310 		break;
2311 	default:
2312 		assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING);
2313 		break;
2314 	}
2315 
2316 	errno = EBUSY;
2317 	return -1;
2318 }
2319 
2320 static void
2321 vfio_user_ctrlr_dump_migr_data(const char *name, struct vfio_user_nvme_migr_state *migr_data)
2322 {
2323 	struct spdk_nvme_registers *regs;
2324 	struct nvme_migr_sq_state *sq;
2325 	struct nvme_migr_cq_state *cq;
2326 	uint32_t *doorbell_base;
2327 	uint32_t i;
2328 
2329 	SPDK_NOTICELOG("Dump %s\n", name);
2330 
2331 	regs = (struct spdk_nvme_registers *)migr_data->bar0;
2332 	doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl;
2333 
2334 	SPDK_NOTICELOG("Registers\n");
2335 	SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw);
2336 	SPDK_NOTICELOG("CAP  0x%"PRIx64"\n", regs->cap.raw);
2337 	SPDK_NOTICELOG("VS   0x%x\n", regs->vs.raw);
2338 	SPDK_NOTICELOG("CC   0x%x\n", regs->cc.raw);
2339 	SPDK_NOTICELOG("AQA  0x%x\n", regs->aqa.raw);
2340 	SPDK_NOTICELOG("ASQ  0x%"PRIx64"\n", regs->asq);
2341 	SPDK_NOTICELOG("ACQ  0x%"PRIx64"\n", regs->acq);
2342 
2343 	SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues);
2344 	for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
2345 		sq = &migr_data->qps[i].sq;
2346 		cq = &migr_data->qps[i].cq;
2347 
2348 		if (sq->size) {
2349 			SPDK_NOTICELOG("SQID %u, SQ DOORBELL %u\n", sq->sqid, doorbell_base[i * 2]);
2350 			SPDK_NOTICELOG("SQ SQID %u, CQID %u, HEAD %u, SIZE %u, DMA ADDR 0x%"PRIx64"\n",
2351 				       sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr);
2352 		}
2353 
2354 		if (cq->size) {
2355 			SPDK_NOTICELOG("CQID %u, CQ DOORBELL %u\n", cq->cqid, doorbell_base[i * 2 + 1]);
2356 			SPDK_NOTICELOG("CQ CQID %u, PHASE %u, TAIL %u, SIZE %u, IV %u, IEN %u, DMA ADDR 0x%"PRIx64"\n",
2357 				       cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr);
2358 		}
2359 	}
2360 
2361 	SPDK_NOTICELOG("%s Dump Done\n", name);
2362 }
2363 
2364 /* Read region 9 content and restore it to migration data structures */
2365 static int
2366 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint,
2367 			      struct vfio_user_nvme_migr_state *migr_state)
2368 {
2369 	void *data_ptr = endpoint->migr_data;
2370 
2371 	/* Load vfio_user_nvme_migr_header first */
2372 	memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header));
2373 	/* TODO: version check */
2374 	if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) {
2375 		SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic);
2376 		return -EINVAL;
2377 	}
2378 
2379 	/* Load nvmf controller data */
2380 	data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset;
2381 	memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len);
2382 
2383 	/* Load queue pairs */
2384 	data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset;
2385 	memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len);
2386 
2387 	/* Load BAR0 */
2388 	data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX];
2389 	memcpy(&migr_state->bar0, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]);
2390 
2391 	/* Load CFG */
2392 	data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX];
2393 	memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]);
2394 
2395 	return 0;
2396 }
2397 
2398 
2399 static void
2400 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
2401 {
2402 	struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr;
2403 	struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
2404 	struct nvmf_vfio_user_sq *sq;
2405 	struct nvmf_vfio_user_cq *cq;
2406 	struct vfio_user_nvme_migr_state migr_state = {};
2407 	uint64_t data_offset;
2408 	void *data_ptr;
2409 	int num_aers;
2410 	struct spdk_nvme_registers *regs;
2411 	uint32_t *doorbell_base;
2412 	uint32_t i = 0;
2413 	uint16_t sqid, cqid;
2414 
2415 	/* Gather everything into vfio_user_nvme_migr_state first, then copy it to
2416 	 * the device migration region at the end.
2417 	 */
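	/*
	 * Layout of the migration data region:
	 * [migr header][nvmf ctrlr data][queue pair states][BAR0][PCI config space]
	 */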
2418 
2419 	/* save magic number */
2420 	migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC;
2421 
2422 	/* save controller data */
2423 	num_aers = nvmf_ctrlr_save_aers(ctrlr, migr_state.ctrlr_header.aer_cids,
2424 					256);
2425 	assert(num_aers >= 0);
2426 	migr_state.ctrlr_header.nr_aers = num_aers;
2427 
2428 	/* save nvmf controller data */
2429 	nvmf_ctrlr_save_migr_data(ctrlr, (struct nvmf_ctrlr_migr_data *)&migr_state.nvmf_data);
2430 
2431 	/* save connected queue pairs */
2432 	TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) {
2433 		/* save sq */
2434 		sqid = sq->qid;
2435 		migr_state.qps[sqid].sq.sqid = sq->qid;
2436 		migr_state.qps[sqid].sq.cqid = sq->cqid;
2437 		migr_state.qps[sqid].sq.head = *sq_headp(sq);
2438 		migr_state.qps[sqid].sq.size = sq->size;
2439 		migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1;
2440 
2441 		/* save cq; in the shared CQ case the same CQ may be saved multiple times */
2442 		cqid = sq->cqid;
2443 		cq = vu_ctrlr->cqs[cqid];
2444 		migr_state.qps[cqid].cq.cqid = cqid;
2445 		migr_state.qps[cqid].cq.tail = *cq_tailp(cq);
2446 		migr_state.qps[cqid].cq.ien = cq->ien;
2447 		migr_state.qps[cqid].cq.iv = cq->iv;
2448 		migr_state.qps[cqid].cq.size = cq->size;
2449 		migr_state.qps[cqid].cq.phase = cq->phase;
2450 		migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1;
2451 		i++;
2452 	}
2453 
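	/* i counts all connected SQs including the admin queue, hence num_io_queues is i - 1. */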
2454 	assert(i > 0);
2455 	migr_state.ctrlr_header.num_io_queues = i - 1;
2456 
2457 	regs = (struct spdk_nvme_registers *)&migr_state.bar0;
2458 	/* Save mandatory registers to bar0 */
2459 	regs->csts.raw = ctrlr->vcprop.csts.raw;
2460 	regs->cap.raw = ctrlr->vcprop.cap.raw;
2461 	regs->vs.raw = ctrlr->vcprop.vs.raw;
2462 	regs->cc.raw = ctrlr->vcprop.cc.raw;
2463 	regs->aqa.raw = ctrlr->vcprop.aqa.raw;
2464 	regs->asq = ctrlr->vcprop.asq;
2465 	regs->acq = ctrlr->vcprop.acq;
2466 	/* Save doorbells */
2467 	doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl;
2468 	memcpy(doorbell_base, (void *)vu_ctrlr->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
2469 
2470 	/* Save PCI configuration space */
2471 	memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE);
2472 
2473 	/* Save all data to device migration region */
2474 	data_ptr = endpoint->migr_data;
2475 
2476 	/* Copy nvmf controller data */
2477 	data_offset = sizeof(struct vfio_user_nvme_migr_header);
2478 	data_ptr += data_offset;
2479 	migr_state.ctrlr_header.nvmf_data_offset = data_offset;
2480 	migr_state.ctrlr_header.nvmf_data_len = sizeof(struct nvmf_ctrlr_migr_data);
2481 	memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct nvmf_ctrlr_migr_data));
2482 
2483 	/* Copy queue pairs */
2484 	data_offset += sizeof(struct nvmf_ctrlr_migr_data);
2485 	data_ptr += sizeof(struct nvmf_ctrlr_migr_data);
2486 	migr_state.ctrlr_header.qp_offset = data_offset;
2487 	migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof(
2488 			struct nvme_migr_cq_state));
2489 	memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len);
2490 
2491 	/* Copy BAR0 */
2492 	data_offset += migr_state.ctrlr_header.qp_len;
2493 	data_ptr += migr_state.ctrlr_header.qp_len;
2494 	migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset;
2495 	migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVME_REG_BAR0_SIZE;
2496 	memcpy(data_ptr, &migr_state.bar0, NVME_REG_BAR0_SIZE);
2497 
2498 	/* Copy CFG */
2499 	data_offset += NVME_REG_BAR0_SIZE;
2500 	data_ptr += NVME_REG_BAR0_SIZE;
2501 	migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset;
2502 	migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE;
2503 	memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE);
2504 
2505 	/* Copy nvme migration header finally */
2506 	memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header));
2507 
2508 	if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) {
2509 		vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state);
2510 	}
2511 }
2512 
2513 /*
2514  * If we are about to close the connection, we need to unregister the interrupt,
2515  * as the library will subsequently close the file descriptor we registered.
2516  */
2517 static int
2518 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type)
2519 {
2520 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2521 	struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr;
2522 
2523 	SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type);
2524 
2525 	if (type == VFU_RESET_LOST_CONN) {
2526 		if (ctrlr != NULL) {
2527 			spdk_interrupt_unregister(&ctrlr->intr);
2528 			ctrlr->intr_fd = -1;
2529 		}
2530 		return 0;
2531 	}
2532 
2533 	/* FIXME: much more needed here. */
2534 
2535 	return 0;
2536 }
2537 
2538 static int
2539 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr,
2540 				   struct vfio_user_nvme_migr_state *migr_state)
2541 {
2542 	uint32_t i, qsize = 0;
2543 	uint16_t sqid, cqid;
2544 	struct vfio_user_nvme_migr_qp migr_qp;
2545 	void *addr;
2546 	uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {};
2547 	int ret;
2548 
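	/*
	 * Restore in two passes: first rebuild every SQ, counting how many SQs
	 * reference each CQ in cqs_ref[]; then rebuild the CQs, setting cq_ref
	 * from those counts.
	 */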
2549 	if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) {
2550 		vfio_user_ctrlr_dump_migr_data("RESUME", migr_state);
2551 	}
2552 
2553 	/* restore submission queues */
2554 	for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
2555 		migr_qp =  migr_state->qps[i];
2556 
2557 		qsize = migr_qp.sq.size;
2558 		if (qsize) {
2559 			struct nvmf_vfio_user_sq *sq;
2560 
2561 			sqid = migr_qp.sq.sqid;
2562 			if (sqid != i) {
2563 				SPDK_ERRLOG("Expected sqid %u but got %u\n", i, sqid);
2564 				return -EINVAL;
2565 			}
2566 
2567 			/* allocate sq if necessary */
2568 			if (vu_ctrlr->sqs[sqid] == NULL) {
2569 				ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid);
2570 				if (ret) {
2571 					SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid);
2572 					return -EFAULT;
2573 				}
2574 			}
2575 
2576 			sq = vu_ctrlr->sqs[sqid];
2577 			sq->size = qsize;
2578 
2579 			ret = alloc_sq_reqs(vu_ctrlr, sq);
2580 			if (ret) {
2581 				SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid);
2582 				return -EFAULT;
2583 			}
2584 
2585 			/* restore sq */
2586 			sq->sq_state = VFIO_USER_SQ_CREATED;
2587 			sq->cqid = migr_qp.sq.cqid;
2588 			*sq_headp(sq) = migr_qp.sq.head;
2589 			sq->mapping.prp1 = migr_qp.sq.dma_addr;
2590 			addr = map_one(vu_ctrlr->endpoint->vfu_ctx,
2591 				       sq->mapping.prp1, sq->size * 64,
2592 				       sq->mapping.sg, &sq->mapping.iov,
2593 				       PROT_READ);
2594 			if (addr == NULL) {
2595 				SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n",
2596 					    sqid, sq->mapping.prp1, sq->size);
2597 				return -EFAULT;
2598 			}
2599 			cqs_ref[sq->cqid]++;
2600 		}
2601 	}
2602 
2603 	/* restore completion queues */
2604 	for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
2605 		migr_qp =  migr_state->qps[i];
2606 
2607 		qsize = migr_qp.cq.size;
2608 		if (qsize) {
2609 			struct nvmf_vfio_user_cq *cq;
2610 
2611 			/* restore cq */
2612 			cqid = migr_qp.sq.cqid;
2613 			assert(cqid == i);
2614 
2615 			/* allocate cq if necessary */
2616 			if (vu_ctrlr->cqs[cqid] == NULL) {
2617 				ret = init_cq(vu_ctrlr, cqid);
2618 				if (ret) {
2619 					SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid);
2620 					return -EFAULT;
2621 				}
2622 			}
2623 
2624 			cq = vu_ctrlr->cqs[cqid];
2625 
2626 			cq->size = qsize;
2627 
2628 			cq->cq_state = VFIO_USER_CQ_CREATED;
2629 			cq->cq_ref = cqs_ref[cqid];
2630 			*cq_tailp(cq) = migr_qp.cq.tail;
2631 			cq->mapping.prp1 = migr_qp.cq.dma_addr;
2632 			cq->ien = migr_qp.cq.ien;
2633 			cq->iv = migr_qp.cq.iv;
2634 			cq->phase = migr_qp.cq.phase;
2635 			addr = map_one(vu_ctrlr->endpoint->vfu_ctx,
2636 				       cq->mapping.prp1, cq->size * 16,
2637 				       cq->mapping.sg, &cq->mapping.iov,
2638 				       PROT_READ | PROT_WRITE);
2639 			if (addr == NULL) {
2640 				SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n",
2641 					    cqid, cq->mapping.prp1, cq->size);
2642 				return -EFAULT;
2643 			}
2644 		}
2645 	}
2646 
2647 	return 0;
2648 }
2649 
2650 static int
2651 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
2652 {
2653 	struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
2654 	struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr;
2655 	uint32_t *doorbell_base;
2656 	struct vfio_user_nvme_migr_state migr_state = {};
2657 	struct spdk_nvme_registers *regs;
2658 	struct spdk_nvme_cmd cmd;
2659 	uint16_t i;
2660 	int rc = 0;
2661 
2662 	assert(endpoint->migr_data != NULL);
2663 	assert(ctrlr != NULL);
2664 	rc = vfio_user_migr_stream_to_data(endpoint, &migr_state);
2665 	if (rc) {
2666 		return rc;
2667 	}
2668 
2669 	rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state);
2670 	if (rc) {
2671 		return rc;
2672 	}
2673 
2674 	/* restore PCI configuration space */
2675 	memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE);
2676 
2677 	regs = (struct spdk_nvme_registers *)&migr_state.bar0;
2678 	doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl;
2679 	/* restore doorbells from saved registers */
2680 	memcpy((void *)vu_ctrlr->doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE);
2681 
2682 	/* restore controller registers after ADMIN queue connection */
2683 	ctrlr->vcprop.csts.raw = regs->csts.raw;
2684 	ctrlr->vcprop.cap.raw = regs->cap.raw;
2685 	ctrlr->vcprop.vs.raw = regs->vs.raw;
2686 	ctrlr->vcprop.cc.raw = regs->cc.raw;
2687 	ctrlr->vcprop.aqa.raw = regs->aqa.raw;
2688 	ctrlr->vcprop.asq = regs->asq;
2689 	ctrlr->vcprop.acq = regs->acq;
2690 
2691 	/* restore nvmf controller data */
2692 	rc = nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data);
2693 	if (rc) {
2694 		return rc;
2695 	}
2696 
2697 	/* resubmit pending AERs */
2698 	for (i = 0; i < migr_state.ctrlr_header.nr_aers; i++) {
2699 		SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr),
2700 			      migr_state.ctrlr_header.aer_cids[i]);
2701 		memset(&cmd, 0, sizeof(cmd));
2702 		cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST;
2703 		cmd.cid = migr_state.ctrlr_header.aer_cids[i];
2704 		rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]);
2705 		if (rc) {
2706 			break;
2707 		}
2708 	}
2709 
2710 	return rc;
2711 }
2712 
2713 static void
2714 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
2715 {
2716 	uint32_t i;
2717 	struct nvmf_vfio_user_sq *sq;
2718 
2719 	for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
2720 		sq = vu_ctrlr->sqs[i];
2721 		if (!sq || !sq->size) {
2722 			continue;
2723 		}
2724 
2725 		if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
2726 			/* ADMIN queue pair is always in the poll group, just enable it */
2727 			sq->sq_state = VFIO_USER_SQ_ACTIVE;
2728 		} else {
2729 			spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair);
2730 		}
2731 	}
2732 }
2733 
2734 static int
2735 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
2736 {
2737 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2738 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
2739 	struct nvmf_vfio_user_sq *sq;
2740 	int ret = 0;
2741 
2742 	SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint),
2743 		      vu_ctrlr->state, state);
2744 
2745 	switch (state) {
2746 	case VFU_MIGR_STATE_STOP_AND_COPY:
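		/* Source VM: the device is stopped, snapshot all controller state into the migration region. */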
2747 		vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
2748 		vfio_user_migr_ctrlr_save_data(vu_ctrlr);
2749 		break;
2750 	case VFU_MIGR_STATE_STOP:
2751 		vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
2752 		break;
2753 	case VFU_MIGR_STATE_PRE_COPY:
2754 		assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED);
2755 		vu_ctrlr->migr_reg.pending_bytes = vfio_user_migr_data_len();
2756 		vu_ctrlr->migr_reg.last_data_offset = 0;
2757 		vu_ctrlr->in_source_vm = true;
2758 		break;
2759 	case VFU_MIGR_STATE_RESUME:
2760 		/*
2761 		 * The destination ADMIN queue pair is connected when the VM starts,
2762 		 * but it isn't enabled yet in the destination VM, so the poll group
2763 		 * does nothing with the ADMIN queue pair for now.
2764 		 */
2765 		if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
2766 			break;
2767 		}
2768 
2769 		assert(!vu_ctrlr->in_source_vm);
2770 		vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
2771 
2772 		sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs);
2773 		assert(sq != NULL);
2774 		assert(sq->qpair.qid == 0);
2775 		sq->sq_state = VFIO_USER_SQ_INACTIVE;
2776 
2777 		/* Free the ADMIN SQ resources first; they will be reallocated
2778 		 * based on the queue size from the source VM.
2779 		 */
2780 		free_sq_reqs(sq);
2781 		sq->size = 0;
2782 		break;
2783 	case VFU_MIGR_STATE_RUNNING:
2784 		if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) {
2785 			break;
2786 		}
2787 
2788 		if (!vu_ctrlr->in_source_vm) {
2789 			/* Restore destination VM from BAR9 */
2790 			ret = vfio_user_migr_ctrlr_restore(vu_ctrlr);
2791 			if (ret) {
2792 				break;
2793 			}
2794 			vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr);
2795 			vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2796 		} else {
2797 			/* Rollback source VM */
2798 			vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
2799 			ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
2800 							 vfio_user_endpoint_resume_done, endpoint);
2801 			if (ret < 0) {
2802 				/* TODO: fail controller with CFS bit set */
2803 				vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
2804 				SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret);
2805 				break;
2806 			}
2807 		}
2808 		break;
2809 
2810 	default:
2811 		return -EINVAL;
2812 	}
2813 
2814 	return ret;
2815 }
2816 
2817 static uint64_t
2818 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx)
2819 {
2820 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2821 	struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr;
2822 	struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg;
2823 
2824 	SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u, pending bytes 0x%"PRIx64"\n", endpoint_id(endpoint),
2825 		      ctrlr->state, migr_reg->pending_bytes);
2826 
2827 	return migr_reg->pending_bytes;
2828 }
2829 
2830 static int
2831 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size)
2832 {
2833 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2834 	struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr;
2835 	struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg;
2836 
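	/*
	 * The migration data area is reported as a single chunk: the first call
	 * returns offset 0 and the full length (saving the data if we are
	 * migrating); once last_data_offset reaches the data length, a zero-sized
	 * chunk is reported and pending_bytes is cleared.
	 */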
2837 	if (migr_reg->last_data_offset == vfio_user_migr_data_len()) {
2838 		*offset = vfio_user_migr_data_len();
2839 		if (size) {
2840 			*size = 0;
2841 		}
2842 		migr_reg->pending_bytes = 0;
2843 	} else {
2844 		*offset = 0;
2845 		if (size) {
2846 			*size = vfio_user_migr_data_len();
2847 			if (ctrlr->state == VFIO_USER_CTRLR_MIGRATING) {
2848 				vfio_user_migr_ctrlr_save_data(ctrlr);
2849 				migr_reg->last_data_offset = vfio_user_migr_data_len();
2850 			}
2851 		}
2852 	}
2853 
2854 	SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state);
2855 
2856 	return 0;
2857 }
2858 
2859 static ssize_t
2860 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset)
2861 {
2862 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2863 	struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr;
2864 	struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg;
2865 
2866 	memcpy(buf, endpoint->migr_data, count);
2867 	migr_reg->pending_bytes = 0;
2868 
2869 	return 0;
2870 }
2871 
2872 static ssize_t
2873 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset)
2874 {
2875 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2876 
2877 	memcpy(endpoint->migr_data, buf, count);
2878 
2879 	return 0;
2880 }
2881 
2882 static int
2883 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx, uint64_t count)
2884 {
2885 	SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count);
2886 
2887 	return 0;
2888 }
2889 
2890 static int
2891 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport,
2892 			struct nvmf_vfio_user_endpoint *endpoint)
2893 {
2894 	int ret;
2895 	ssize_t cap_offset;
2896 	vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx;
2897 	struct iovec migr_sparse_mmap = {};
2898 
2899 	struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 };
2900 	struct pxcap pxcap = {
2901 		.hdr.id = PCI_CAP_ID_EXP,
2902 		.pxcaps.ver = 0x2,
2903 		.pxdcap = {.rer = 0x1, .flrc = 0x1},
2904 		.pxdcap2.ctds = 0x1
2905 	};
2906 
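	/* The MSI-X table is placed in BAR4 (tbir = 0x4) and the pending bit array in BAR5 (pbir = 0x5). */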
2907 	struct msixcap msixcap = {
2908 		.hdr.id = PCI_CAP_ID_MSIX,
2909 		.mxc.ts = NVME_IRQ_MSIX_NUM - 1,
2910 		.mtab = {.tbir = 0x4, .to = 0x0},
2911 		.mpba = {.pbir = 0x5, .pbao = 0x0}
2912 	};
2913 
2914 	struct iovec sparse_mmap[] = {
2915 		{
2916 			.iov_base = (void *)NVME_DOORBELLS_OFFSET,
2917 			.iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE,
2918 		},
2919 	};
2920 
2921 	const vfu_migration_callbacks_t migr_callbacks = {
2922 		.version = VFU_MIGR_CALLBACKS_VERS,
2923 		.transition = &vfio_user_migration_device_state_transition,
2924 		.get_pending_bytes = &vfio_user_migration_get_pending_bytes,
2925 		.prepare_data = &vfio_user_migration_prepare_data,
2926 		.read_data = &vfio_user_migration_read_data,
2927 		.data_written = &vfio_user_migration_data_written,
2928 		.write_data = &vfio_user_migration_write_data
2929 	};
2930 
2931 	ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0);
2932 	if (ret < 0) {
2933 		SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx);
2934 		return ret;
2935 	}
2936 	vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0);
2937 	/*
2938 	 * Base class     0x01: mass storage controller
2939 	 * Sub class      0x08: non-volatile memory controller
2940 	 * Prog interface 0x02: NVM Express programming interface
2941 	 */
2942 	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);
2943 
2944 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
2945 	if (cap_offset < 0) {
2946 		SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx);
2947 		return cap_offset;
2948 	}
2949 
2950 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
2951 	if (cap_offset < 0) {
2952 		SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx);
2953 		return cap_offset;
2954 	}
2955 
2956 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
2957 	if (cap_offset < 0) {
2958 		SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx);
2959 		return cap_offset;
2960 	}
2961 
2962 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
2963 			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
2964 	if (ret < 0) {
2965 		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
2966 		return ret;
2967 	}
2968 
2969 	if (vu_transport->transport_opts.disable_mappable_bar0) {
2970 		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
2971 				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
2972 				       NULL, 0, -1, 0);
2973 	} else {
2974 		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
2975 				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
2976 				       sparse_mmap, 1, endpoint->devmem_fd, 0);
2977 	}
2978 
2979 	if (ret < 0) {
2980 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
2981 		return ret;
2982 	}
2983 
2984 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE,
2985 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
2986 	if (ret < 0) {
2987 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
2988 		return ret;
2989 	}
2990 
2991 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE,
2992 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
2993 	if (ret < 0) {
2994 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
2995 		return ret;
2996 	}
2997 
2998 	ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
2999 	if (ret < 0) {
3000 		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
3001 		return ret;
3002 	}
3003 
3004 	ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset);
3005 	if (ret < 0) {
3006 		SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx);
3007 		return ret;
3008 	}
3009 
3010 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
3011 	if (ret < 0) {
3012 		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
3013 		return ret;
3014 	}
3015 
3016 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
3017 	if (ret < 0) {
3018 		SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
3019 		return ret;
3020 	}
3021 
3022 	vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb);
3023 
3024 	migr_sparse_mmap.iov_base = (void *)4096;
3025 	migr_sparse_mmap.iov_len = vfio_user_migr_data_len();
3026 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX,
3027 			       vfu_get_migr_register_area_size() + vfio_user_migr_data_len(),
3028 			       NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap,
3029 			       1, endpoint->migr_fd, 0);
3030 	if (ret < 0) {
3031 		SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx);
3032 		return ret;
3033 	}
3034 
3035 	ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks,
3036 			vfu_get_migr_register_area_size());
3037 	if (ret < 0) {
3038 		SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx);
3039 		return ret;
3040 	}
3041 
3042 	ret = vfu_realize_ctx(vfu_ctx);
3043 	if (ret < 0) {
3044 		SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx);
3045 		return ret;
3046 	}
3047 
3048 	endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx);
3049 	assert(endpoint->pci_config_space != NULL);
3050 	init_pci_config_space(endpoint->pci_config_space);
3051 
3052 	assert(cap_offset != 0);
3053 	endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset);
3054 
3055 	return 0;
3056 }
3057 
3058 static int nvmf_vfio_user_accept(void *ctx);
3059 
3060 static void
3061 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode)
3062 {
3063 	/* Nothing for us to do here. */
3064 }
3065 
3066 /*
3067  * Register an "accept" poller: this is polling for incoming vfio-user socket
3068  * connections (on the listening socket).
3069  *
3070  * We need to do this on first listening, and also after destroying a
3071  * controller, so we can accept another connection.
3072  */
3073 static int
3074 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint)
3075 {
3076 	uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate;
3077 
3078 	SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n");
3079 
3080 	endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept,
3081 				  endpoint, poll_rate_us);
3082 
3083 	if (!endpoint->accept_poller) {
3084 		return -1;
3085 	}
3086 
3087 	endpoint->accept_thread = spdk_get_thread();
3088 
3089 	if (!spdk_interrupt_mode_is_enabled()) {
3090 		return 0;
3091 	}
3092 
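	/*
	 * In interrupt mode, also register the libvfio-user poll fd so an incoming
	 * connection wakes the accept callback instead of waiting for the timed poller.
	 */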
3093 	endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx);
3094 	assert(endpoint->accept_intr_fd != -1);
3095 
3096 	endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd,
3097 				nvmf_vfio_user_accept, endpoint);
3098 
3099 	assert(endpoint->accept_intr != NULL);
3100 
3101 	spdk_poller_register_interrupt(endpoint->accept_poller,
3102 				       set_intr_mode_noop, NULL);
3103 	return 0;
3104 }
3105 
3106 static void
3107 _vfio_user_relisten(void *ctx)
3108 {
3109 	struct nvmf_vfio_user_endpoint *endpoint = ctx;
3110 
3111 	vfio_user_register_accept_poller(endpoint);
3112 }
3113 
3114 static void
3115 _free_ctrlr(void *ctx)
3116 {
3117 	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
3118 	struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint;
3119 
3120 	spdk_interrupt_unregister(&ctrlr->intr);
3121 	ctrlr->intr_fd = -1;
3122 	spdk_poller_unregister(&ctrlr->vfu_ctx_poller);
3123 
3124 	free(ctrlr);
3125 
3126 	if (endpoint == NULL) {
3127 		return;
3128 	}
3129 
3130 	if (endpoint->need_async_destroy) {
3131 		nvmf_vfio_user_destroy_endpoint(endpoint);
3132 	} else {
3133 		spdk_thread_send_msg(endpoint->accept_thread,
3134 				     _vfio_user_relisten, endpoint);
3135 	}
3136 }
3137 
3138 static void
3139 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
3140 {
3141 	int i;
3142 	assert(ctrlr != NULL);
3143 
3144 	SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr));
3145 
3146 	for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
3147 		free_qp(ctrlr, i);
3148 	}
3149 
3150 	spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr);
3151 }
3152 
3153 static int
3154 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport,
3155 			    struct nvmf_vfio_user_endpoint *endpoint)
3156 {
3157 	struct nvmf_vfio_user_ctrlr *ctrlr;
3158 	int err = 0;
3159 
3160 	SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint));
3161 
3162 	/* First, construct a vfio-user CUSTOM transport controller */
3163 	ctrlr = calloc(1, sizeof(*ctrlr));
3164 	if (ctrlr == NULL) {
3165 		err = -ENOMEM;
3166 		goto out;
3167 	}
3168 	/* We can only support one connection for now */
3169 	ctrlr->cntlid = 0x1;
3170 	ctrlr->intr_fd = -1;
3171 	ctrlr->transport = transport;
3172 	ctrlr->endpoint = endpoint;
3173 	ctrlr->doorbells = endpoint->doorbells;
3174 	TAILQ_INIT(&ctrlr->connected_sqs);
3175 
3176 	/* Then, construct an admin queue pair */
3177 	err = init_sq(ctrlr, &transport->transport, 0);
3178 	if (err != 0) {
3179 		free(ctrlr);
3180 		goto out;
3181 	}
3182 
3183 	err = init_cq(ctrlr, 0);
3184 	if (err != 0) {
3185 		free(ctrlr);
3186 		goto out;
3187 	}
3188 
3189 	ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
3190 
3191 	err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]);
3192 	if (err != 0) {
3193 		free(ctrlr);
3194 		goto out;
3195 	}
3196 	endpoint->ctrlr = ctrlr;
3197 
3198 	/* Notify the generic layer about the new admin queue pair */
3199 	spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair);
3200 
3201 out:
3202 	if (err != 0) {
3203 		SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n",
3204 			    endpoint_id(endpoint), strerror(-err));
3205 	}
3206 
3207 	return err;
3208 }
3209 
3210 static int
3211 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport,
3212 		      const struct spdk_nvme_transport_id *trid,
3213 		      struct spdk_nvmf_listen_opts *listen_opts)
3214 {
3215 	struct nvmf_vfio_user_transport *vu_transport;
3216 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
3217 	char path[PATH_MAX] = {};
3218 	char uuid[PATH_MAX] = {};
3219 	int ret;
3220 
3221 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
3222 					transport);
3223 
3224 	pthread_mutex_lock(&vu_transport->lock);
3225 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
3226 		/* Only compare traddr */
3227 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
3228 			pthread_mutex_unlock(&vu_transport->lock);
3229 			return -EEXIST;
3230 		}
3231 	}
3232 	pthread_mutex_unlock(&vu_transport->lock);
3233 
3234 	endpoint = calloc(1, sizeof(*endpoint));
3235 	if (!endpoint) {
3236 		return -ENOMEM;
3237 	}
3238 
3239 	pthread_mutex_init(&endpoint->lock, NULL);
3240 	endpoint->devmem_fd = -1;
3241 	memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));
3242 	endpoint->transport = vu_transport;
3243 
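	/*
	 * Back BAR0 with a file so the doorbell page can be exposed to the client
	 * as a sparse mmap region (devmem_fd is passed to vfu_setup_region() later,
	 * unless mappable BAR0 is disabled).
	 */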
3244 	ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint));
3245 	if (ret < 0 || ret >= PATH_MAX) {
3246 		SPDK_ERRLOG("%s: failed to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno));
3247 		ret = -1;
3248 		goto out;
3249 	}
3250 
3251 	ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
3252 	if (ret == -1) {
3253 		SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n",
3254 			    endpoint_id(endpoint), path, spdk_strerror(errno));
3255 		goto out;
3256 	}
3257 
3258 	endpoint->devmem_fd = ret;
3259 	ret = ftruncate(endpoint->devmem_fd,
3260 			NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE);
3261 	if (ret != 0) {
3262 		SPDK_ERRLOG("%s: failed to ftruncate file %s: %s.\n", endpoint_id(endpoint), path,
3263 			    spdk_strerror(errno));
3264 		goto out;
3265 	}
3266 
3267 	endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
3268 				   PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET);
3269 	if (endpoint->doorbells == MAP_FAILED) {
3270 		SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
3271 		endpoint->doorbells = NULL;
3272 		ret = -1;
3273 		goto out;
3274 	}
3275 
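	/*
	 * The migration region is likewise file-backed; only the data area past
	 * the register area is mmap'd into migr_data.
	 */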
3276 	ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint));
3277 	if (ret < 0 || ret >= PATH_MAX) {
3278 		SPDK_ERRLOG("%s: failed to get migration file path: %s.\n", endpoint_id(endpoint),
3279 			    spdk_strerror(errno));
3280 		ret = -1;
3281 		goto out;
3282 	}
3283 	ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
3284 	if (ret == -1) {
3285 		SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n",
3286 			    endpoint_id(endpoint), path, spdk_strerror(errno));
3287 		goto out;
3288 	}
3289 
3290 	endpoint->migr_fd = ret;
3291 	ret = ftruncate(endpoint->migr_fd,
3292 			vfu_get_migr_register_area_size() + vfio_user_migr_data_len());
3293 	if (ret != 0) {
3294 		SPDK_ERRLOG("%s: failed to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path,
3295 			    spdk_strerror(errno));
3296 		goto out;
3297 	}
3298 
3299 	endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(),
3300 				   PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size());
3301 	if (endpoint->migr_data == MAP_FAILED) {
3302 		SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
3303 		endpoint->migr_data = NULL;
3304 		ret = -1;
3305 		goto out;
3306 	}
3307 
3308 	ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
3309 	if (ret < 0 || ret >= PATH_MAX) {
3310 		SPDK_ERRLOG("%s: failed to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno));
3311 		ret = -1;
3312 		goto out;
3313 	}
3314 
3315 	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
3316 					   endpoint, VFU_DEV_TYPE_PCI);
3317 	if (endpoint->vfu_ctx == NULL) {
3318 		SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
3319 			    endpoint_id(endpoint));
3320 		ret = -1;
3321 		goto out;
3322 	}
3323 	vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, vfio_user_get_log_level());
3324 
3325 	ret = vfio_user_dev_info_fill(vu_transport, endpoint);
3326 	if (ret < 0) {
3327 		goto out;
3328 	}
3329 
3330 	ret = vfio_user_register_accept_poller(endpoint);
3331 
3332 	if (ret != 0) {
3333 		goto out;
3334 	}
3335 
3336 	pthread_mutex_lock(&vu_transport->lock);
3337 	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
3338 	pthread_mutex_unlock(&vu_transport->lock);
3339 
3340 out:
3341 	if (ret != 0) {
3342 		nvmf_vfio_user_destroy_endpoint(endpoint);
3343 	}
3344 
3345 	return ret;
3346 }
3347 
3348 static void
3349 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
3350 			   const struct spdk_nvme_transport_id *trid)
3351 {
3352 	struct nvmf_vfio_user_transport *vu_transport;
3353 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
3354 
3355 	assert(trid != NULL);
3356 	assert(trid->traddr != NULL);
3357 
3358 	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
3359 
3360 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
3361 					transport);
3362 
3363 	pthread_mutex_lock(&vu_transport->lock);
3364 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
3365 		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
3366 			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
3367 			/* Defer freeing the endpoint resources until the controller
3368 			 * is freed.  There are two cases that reach here:
3369 			 * 1. the nvmf target is killed while a VM is connected
3370 			 * 2. the listener is removed via an RPC call
3371 			 * Either way the NVMf library will disconnect all queue pairs.
3372 			 */
3373 			if (endpoint->ctrlr) {
3374 				assert(!endpoint->need_async_destroy);
3375 				endpoint->need_async_destroy = true;
3376 				pthread_mutex_unlock(&vu_transport->lock);
3377 				return;
3378 			}
3379 
3380 			nvmf_vfio_user_destroy_endpoint(endpoint);
3381 			pthread_mutex_unlock(&vu_transport->lock);
3382 			return;
3383 		}
3384 	}
3385 	pthread_mutex_unlock(&vu_transport->lock);
3386 
3387 	SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr);
3388 }
3389 
3390 static void
3391 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport,
3392 			  struct spdk_nvmf_subsystem *subsystem,
3393 			  struct spdk_nvmf_ctrlr_data *cdata)
3394 {
3395 	cdata->vid = SPDK_PCI_VID_NUTANIX;
3396 	cdata->ssvid = SPDK_PCI_VID_NUTANIX;
3397 	cdata->ieee[0] = 0x8d;
3398 	cdata->ieee[1] = 0x6b;
3399 	cdata->ieee[2] = 0x50;
3400 	memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls));
3401 	cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED;
3402 	/* libvfio-user can only support 1 connection for now */
3403 	cdata->oncs.reservations = 0;
3404 }
3405 
3406 static int
3407 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport,
3408 				const struct spdk_nvmf_subsystem *subsystem,
3409 				const struct spdk_nvme_transport_id *trid)
3410 {
3411 	struct nvmf_vfio_user_transport *vu_transport;
3412 	struct nvmf_vfio_user_endpoint *endpoint;
3413 
3414 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);
3415 
3416 	pthread_mutex_lock(&vu_transport->lock);
3417 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
3418 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
3419 			break;
3420 		}
3421 	}
3422 	pthread_mutex_unlock(&vu_transport->lock);
3423 
3424 	if (endpoint == NULL) {
3425 		return -ENOENT;
3426 	}
3427 
3428 	endpoint->subsystem = subsystem;
3429 
3430 	return 0;
3431 }
3432 
3433 /*
3434  * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US
3435  * frequency.
3436  *
3437  * For this endpoint (which at the libvfio-user level corresponds to a socket),
3438  * if we don't currently have a controller set up, peek to see if the socket is
3439  * able to accept a new connection.
3440  */
3441 static int
3442 nvmf_vfio_user_accept(void *ctx)
3443 {
3444 	struct nvmf_vfio_user_endpoint *endpoint = ctx;
3445 	struct nvmf_vfio_user_transport *vu_transport;
3446 	int err;
3447 
3448 	vu_transport = endpoint->transport;
3449 
3450 	if (endpoint->ctrlr != NULL) {
3451 		return SPDK_POLLER_IDLE;
3452 	}
3453 
3454 	err = vfu_attach_ctx(endpoint->vfu_ctx);
3455 
3456 	if (err == 0) {
3457 		SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n");
3458 
3459 		err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint);
3460 
3461 		if (err == 0) {
3462 			/*
3463 			 * Unregister ourselves: now we've accepted a
3464 			 * connection, there is nothing for us to poll for, and
3465 			 * we will poll the connection via vfu_run_ctx()
3466 			 * instead.
3467 			 */
3468 			spdk_interrupt_unregister(&endpoint->accept_intr);
3469 			spdk_poller_unregister(&endpoint->accept_poller);
3470 		}
3471 
3472 		return SPDK_POLLER_BUSY;
3473 	}
3474 
3475 	if (errno == EAGAIN || errno == EWOULDBLOCK) {
3476 		return SPDK_POLLER_IDLE;
3477 	}
3478 
3479 	return SPDK_POLLER_BUSY;
3480 }
3481 
3482 static void
3483 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport,
3484 			struct spdk_nvme_transport_id *trid,
3485 			struct spdk_nvmf_discovery_log_page_entry *entry)
3486 { }
3487 
3488 static struct spdk_nvmf_transport_poll_group *
3489 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport,
3490 				 struct spdk_nvmf_poll_group *group)
3491 {
3492 	struct nvmf_vfio_user_transport *vu_transport;
3493 	struct nvmf_vfio_user_poll_group *vu_group;
3494 
3495 	SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n");
3496 
3497 	vu_group = calloc(1, sizeof(*vu_group));
3498 	if (vu_group == NULL) {
3499 		SPDK_ERRLOG("Error allocating poll group: %m\n");
3500 		return NULL;
3501 	}
3502 
3503 	TAILQ_INIT(&vu_group->sqs);
3504 
3505 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
3506 					transport);
3507 	pthread_mutex_lock(&vu_transport->pg_lock);
3508 	TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link);
3509 	if (vu_transport->next_pg == NULL) {
3510 		vu_transport->next_pg = vu_group;
3511 	}
3512 	pthread_mutex_unlock(&vu_transport->pg_lock);
3513 
3514 	if (!spdk_interrupt_mode_is_enabled()) {
3515 		return &vu_group->group;
3516 	}
3517 
3518 	/*
3519 	 * Only allow the poll group to work in interrupt mode if the transport
3520 	 * supports it. It's our responsibility to register the actual interrupt
3521 	 * later (in handle_queue_connect_rsp()) that processes everything in
3522 	 * the poll group: for us, that's the libvfio-user context, and the
3523 	 * actual qpairs.
3524 	 *
3525 	 * Note that this only works in the case that nothing else shares the
3526 	 * spdk_nvmf_poll_group.
3527 	 *
3528 	 * If not supported, this will effectively always wake up to poll the
3529 	 * poll group.
3530 	 */
3531 
3532 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
3533 					transport);
3534 
3535 	if (!vu_transport->intr_mode_supported) {
3536 		SPDK_WARNLOG("vfio-user interrupt mode not supported\n");
3537 		return &vu_group->group;
3538 	}
3539 
3540 	spdk_poller_register_interrupt(group->poller, set_intr_mode_noop,
3541 				       NULL);
3542 
3543 	return &vu_group->group;
3544 }
3545 
3546 static struct spdk_nvmf_transport_poll_group *
3547 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair)
3548 {
3549 	struct nvmf_vfio_user_transport *vu_transport;
3550 	struct nvmf_vfio_user_poll_group **vu_group;
3551 	struct nvmf_vfio_user_sq *sq;
3552 	struct nvmf_vfio_user_cq *cq;
3553 
3554 	struct spdk_nvmf_transport_poll_group *result = NULL;
3555 
3556 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
3557 	cq = sq->ctrlr->cqs[sq->cqid];
3558 	assert(cq != NULL);
3559 	vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport);
3560 
3561 	pthread_mutex_lock(&vu_transport->pg_lock);
3562 	if (TAILQ_EMPTY(&vu_transport->poll_groups)) {
3563 		goto out;
3564 	}
3565 
3566 	if (!nvmf_qpair_is_admin_queue(qpair)) {
3567 		/*
3568 		 * If this I/O SQ shares a CQ that is already assigned to a
3569 		 * poll group, return that poll group, so I/O completions don't
3570 		 * have to be posted via spdk_thread_send_msg().
3571 		 */
3572 		if (cq->group != NULL) {
3573 			result = cq->group;
3574 			goto out;
3575 		}
3576 
3577 		/*
3578 		 * If we're in interrupt mode, align all qpairs for a controller
3579 		 * on the same poll group, to avoid complications in
3580 		 * vfio_user_handle_intr().
3581 		 */
3582 		if (spdk_interrupt_mode_is_enabled() &&
3583 		    vu_transport->intr_mode_supported) {
3584 			result = sq->ctrlr->sqs[0]->group;
3585 			goto out;
3586 		}
3587 
3588 	}
3589 
3590 	vu_group = &vu_transport->next_pg;
3591 	assert(*vu_group != NULL);
3592 
3593 	result = &(*vu_group)->group;
3594 	*vu_group = TAILQ_NEXT(*vu_group, link);
3595 	if (*vu_group == NULL) {
3596 		*vu_group = TAILQ_FIRST(&vu_transport->poll_groups);
3597 	}
3598 
3599 	if (cq->group == NULL) {
3600 		cq->group = result;
3601 	}
3602 
3603 out:
3604 	pthread_mutex_unlock(&vu_transport->pg_lock);
3605 	return result;
3606 }
3607 
3608 /* Called when the process exits. */
3609 static void
3610 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
3611 {
3612 	struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup;
3613 	struct nvmf_vfio_user_transport *vu_transport;
3614 
3615 	SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n");
3616 
3617 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
3618 	vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport,
3619 					transport);
3620 
3621 	pthread_mutex_lock(&vu_transport->pg_lock);
3622 	next_tgroup = TAILQ_NEXT(vu_group, link);
3623 	TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link);
3624 	if (next_tgroup == NULL) {
3625 		next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups);
3626 	}
3627 	if (vu_transport->next_pg == vu_group) {
3628 		vu_transport->next_pg = next_tgroup;
3629 	}
3630 	pthread_mutex_unlock(&vu_transport->pg_lock);
3631 
3632 	free(vu_group);
3633 }
3634 
3635 static void
3636 _vfio_user_qpair_disconnect(void *ctx)
3637 {
3638 	struct nvmf_vfio_user_sq *sq = ctx;
3639 
3640 	spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL);
3641 }
3642 
3643 /* This function is called when the socket connection is destroyed. */
3644 static int
3645 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
3646 {
3647 	struct nvmf_vfio_user_sq *sq;
3648 	struct nvmf_vfio_user_endpoint *endpoint;
3649 
3650 	SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr));
3651 
3652 	endpoint = ctrlr->endpoint;
3653 	assert(endpoint != NULL);
3654 
3655 	pthread_mutex_lock(&endpoint->lock);
3656 	if (TAILQ_EMPTY(&ctrlr->connected_sqs)) {
3657 		endpoint->ctrlr = NULL;
3658 		free_ctrlr(ctrlr);
3659 		pthread_mutex_unlock(&endpoint->lock);
3660 		return 0;
3661 	}
3662 
3663 	TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) {
3664 		/* Defer the disconnect to a later poll to avoid taking the endpoint lock recursively. */
3665 		spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq);
3666 	}
3667 	pthread_mutex_unlock(&endpoint->lock);
3668 
3669 	return 0;
3670 }
3671 
3672 /*
3673  * Poll for and process any incoming vfio-user messages.
3674  */
3675 static int
3676 vfio_user_poll_vfu_ctx(void *ctx)
3677 {
3678 	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
3679 	int ret;
3680 
3681 	assert(ctrlr != NULL);
3682 
3683 	/* This will call access_bar0_fn() if there are any writes
3684 	 * to the portion of the BAR that is not mmap'd */
3685 	ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
3686 	if (spdk_unlikely(ret == -1)) {
3687 		if (errno == EBUSY) {
3688 			return SPDK_POLLER_IDLE;
3689 		}
3690 
3691 		spdk_poller_unregister(&ctrlr->vfu_ctx_poller);
3692 
3693 		/*
3694 		 * We lost the client; the reset callback will already have
3695 		 * unregistered the interrupt.
3696 		 */
3697 		if (errno == ENOTCONN) {
3698 			vfio_user_destroy_ctrlr(ctrlr);
3699 			return SPDK_POLLER_BUSY;
3700 		}
3701 
3702 		/*
3703 		 * We might not have got a reset callback in this case, so
3704 		 * explicitly unregister the interrupt here.
3705 		 */
3706 		spdk_interrupt_unregister(&ctrlr->intr);
3707 		ctrlr->intr_fd = -1;
3708 		fail_ctrlr(ctrlr);
3709 	}
3710 
3711 	return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
3712 }
3713 
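/*
 * Context used to post a completion from the thread that owns the CQ when
 * that is not the current thread (see handle_queue_connect_rsp()).
 */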
3714 struct vfio_user_post_cpl_ctx {
3715 	struct nvmf_vfio_user_ctrlr	*ctrlr;
3716 	struct nvmf_vfio_user_cq	*cq;
3717 	struct spdk_nvme_cpl		cpl;
3718 };
3719 
3720 static void
3721 _post_completion_msg(void *ctx)
3722 {
3723 	struct vfio_user_post_cpl_ctx *cpl_ctx = ctx;
3724 
3725 	post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid,
3726 			cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct);
3727 	free(cpl_ctx);
3728 }
3729 
3730 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group);
3731 
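/*
 * Interrupt handler registered against the libvfio-user poll fd: drain any
 * pending vfio-user messages, then poll the admin queue's poll group, which
 * in interrupt mode hosts all of this controller's qpairs.
 */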
3732 static int
3733 vfio_user_handle_intr(void *ctx)
3734 {
3735 	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
3736 	int ret;
3737 
3738 	assert(ctrlr != NULL);
3739 	assert(ctrlr->sqs[0] != NULL);
3740 	assert(ctrlr->sqs[0]->group != NULL);
3741 
3742 	vfio_user_poll_vfu_ctx(ctrlr);
3743 
3744 	/*
3745 	 * See nvmf_vfio_user_get_optimal_poll_group() for why it's OK to only
3746 	 * poll this poll group.
3747 	 */
3748 	ret = nvmf_vfio_user_poll_group_poll(ctrlr->sqs[0]->group);
3749 
3750 	return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
3751 }
3752 
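/*
 * Completion callback for the internal fabrics CONNECT command issued by
 * nvmf_vfio_user_poll_group_add(). For the admin queue, this finishes
 * controller setup (registering the vfu_ctx poller and, if enabled, the
 * interrupt handler). For I/O queues, it posts the completion of the
 * CREATE I/O SUBMISSION QUEUE command that triggered the connect.
 */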
3753 static int
3754 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
3755 {
3756 	struct nvmf_vfio_user_poll_group *vu_group;
3757 	struct nvmf_vfio_user_sq *sq = cb_arg;
3758 	struct nvmf_vfio_user_cq *cq;
3759 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
3760 	struct nvmf_vfio_user_endpoint *endpoint;
3761 
3762 	assert(sq != NULL);
3763 	assert(req != NULL);
3764 
3765 	vu_ctrlr = sq->ctrlr;
3766 	assert(vu_ctrlr != NULL);
3767 	endpoint = vu_ctrlr->endpoint;
3768 	assert(endpoint != NULL);
3769 
3770 	if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) {
3771 		SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct);
3772 		endpoint->ctrlr = NULL;
3773 		free_ctrlr(vu_ctrlr);
3774 		return -1;
3775 	}
3776 
3777 	vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group);
3778 	TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link);
3779 
3780 	cq = vu_ctrlr->cqs[0];
3781 	assert(cq != NULL);
3782 
3783 	pthread_mutex_lock(&endpoint->lock);
3784 	if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
3785 		vu_ctrlr->cntlid = sq->qpair.ctrlr->cntlid;
3786 		vu_ctrlr->thread = spdk_get_thread();
3787 		vu_ctrlr->ctrlr = sq->qpair.ctrlr;
3788 		vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
3789 		vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, vu_ctrlr, 0);
3790 
3791 		cq->thread = spdk_get_thread();
3792 
3793 		if (spdk_interrupt_mode_is_enabled() &&
3794 		    endpoint->transport->intr_mode_supported) {
3795 			vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx);
3796 			assert(vu_ctrlr->intr_fd != -1);
3797 
3798 			vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd,
3799 					 vfio_user_handle_intr,
3800 					 vu_ctrlr);
3801 
3802 			assert(vu_ctrlr->intr != NULL);
3803 
3804 			spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller,
3805 						       set_intr_mode_noop,
3806 						       vu_ctrlr);
3807 		}
3808 	} else {
3809 		/* For I/O queues, this connect command was generated in
3810 		 * response to a CREATE I/O SUBMISSION QUEUE admin command that
3811 		 * has not yet been completed. Complete it now.
3812 		 */
3813 		if (sq->post_create_io_sq_completion) {
3814 			assert(cq->thread != NULL);
3815 			if (cq->thread != spdk_get_thread()) {
3816 				struct vfio_user_post_cpl_ctx *cpl_ctx;
3817 
3818 				cpl_ctx = calloc(1, sizeof(*cpl_ctx));
3819 				if (!cpl_ctx) {
3820 					return -ENOMEM;
3821 				}
3822 				cpl_ctx->ctrlr = vu_ctrlr;
3823 				cpl_ctx->cq = cq;
3824 				cpl_ctx->cpl.sqid = 0;
3825 				cpl_ctx->cpl.cdw0 = 0;
3826 				cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid;
3827 				cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
3828 				cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
3829 
3830 				spdk_thread_send_msg(cq->thread, _post_completion_msg, cpl_ctx);
3831 			} else {
3832 				post_completion(vu_ctrlr, cq, 0, 0,
3833 						sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
3834 			}
3835 			sq->post_create_io_sq_completion = false;
3836 		}
3837 		sq->sq_state = VFIO_USER_SQ_ACTIVE;
3838 	}
3839 
3840 	TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq);
3841 	pthread_mutex_unlock(&endpoint->lock);
3842 
3843 	free(req->req.data);
3844 	req->req.data = NULL;
3845 
3846 	return 0;
3847 }
3848 
3849 /*
3850  * Add the given qpair to the given poll group. New qpairs are added via
3851  * spdk_nvmf_tgt_new_qpair(), which picks a poll group via
3852  * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via
3853  * nvmf_transport_poll_group_add().
3854  */
3855 static int
3856 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
3857 			      struct spdk_nvmf_qpair *qpair)
3858 {
3859 	struct nvmf_vfio_user_sq *sq;
3860 	struct nvmf_vfio_user_req *vu_req;
3861 	struct nvmf_vfio_user_ctrlr *ctrlr;
3862 	struct spdk_nvmf_request *req;
3863 	struct spdk_nvmf_fabric_connect_data *data;
3864 	bool admin;
3865 
3866 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
3867 	sq->group = group;
3868 	ctrlr = sq->ctrlr;
3869 
3870 	SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n",
3871 		      ctrlr_id(ctrlr), sq->qpair.qid,
3872 		      sq, qpair, group);
3873 
3874 	admin = nvmf_qpair_is_admin_queue(&sq->qpair);
3875 
3876 	vu_req = get_nvmf_vfio_user_req(sq);
3877 	if (vu_req == NULL) {
3878 		return -1;
3879 	}
3880 
3881 	req = &vu_req->req;
3882 	req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
3883 	req->cmd->connect_cmd.cid = 0;
3884 	req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
3885 	req->cmd->connect_cmd.recfmt = 0;
3886 	req->cmd->connect_cmd.sqsize = sq->size - 1;
3887 	req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;
3888 
3889 	req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
3890 	req->data = calloc(1, req->length);
3891 	if (req->data == NULL) {
3892 		nvmf_vfio_user_req_free(req);
3893 		return -ENOMEM;
3894 	}
3895 
3896 	data = (struct spdk_nvmf_fabric_connect_data *)req->data;
3897 	data->cntlid = ctrlr->cntlid;
3898 	snprintf(data->subnqn, sizeof(data->subnqn), "%s",
3899 		 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));
3900 
3901 	vu_req->cb_fn = handle_queue_connect_rsp;
3902 	vu_req->cb_arg = sq;
3903 
3904 	SPDK_DEBUGLOG(nvmf_vfio,
3905 		      "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n",
3906 		      ctrlr_id(ctrlr), qpair->qid, data->cntlid);
3907 
3908 	spdk_nvmf_request_exec_fabrics(req);
3909 	return 0;
3910 }
3911 
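/* Undo nvmf_vfio_user_poll_group_add(): drop the SQ from the group's list. */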
3912 static int
3913 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
3914 				 struct spdk_nvmf_qpair *qpair)
3915 {
3916 	struct nvmf_vfio_user_sq *sq;
3917 	struct nvmf_vfio_user_poll_group *vu_group;
3918 
3919 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
3920 
3921 	SPDK_DEBUGLOG(nvmf_vfio,
3922 		      "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n",
3923 		      ctrlr_id(sq->ctrlr), qpair->qid, qpair, group);
3924 
3926 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
3927 	TAILQ_REMOVE(&vu_group->sqs, sq, link);
3928 
3929 	return 0;
3930 }
3931 
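/* Reset a request's per-command state and return it to the SQ's free list. */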
3932 static void
3933 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req)
3934 {
3935 	memset(&vu_req->cmd, 0, sizeof(vu_req->cmd));
3936 	memset(&vu_req->rsp, 0, sizeof(vu_req->rsp));
3937 	vu_req->iovcnt = 0;
3938 	vu_req->state = VFIO_USER_REQUEST_STATE_FREE;
3939 
3940 	TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link);
3941 }
3942 
3943 static int
3944 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req)
3945 {
3946 	struct nvmf_vfio_user_sq *sq;
3947 	struct nvmf_vfio_user_req *vu_req;
3948 
3949 	assert(req != NULL);
3950 
3951 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
3952 	sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);
3953 
3954 	_nvmf_vfio_user_req_free(sq, vu_req);
3955 
3956 	return 0;
3957 }
3958 
3959 static int
3960 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req)
3961 {
3962 	struct nvmf_vfio_user_sq *sq;
3963 	struct nvmf_vfio_user_req *vu_req;
3964 
3965 	assert(req != NULL);
3966 
3967 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
3968 	sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);
3969 
3970 	if (vu_req->cb_fn != NULL) {
3971 		if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) {
3972 			fail_ctrlr(sq->ctrlr);
3973 		}
3974 	}
3975 
3976 	_nvmf_vfio_user_req_free(sq, vu_req);
3977 
3978 	return 0;
3979 }
3980 
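/*
 * qpair_fini callback: detach the SQ from its controller and, if this was
 * the controller's last connected SQ, free the controller itself.
 */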
3981 static void
3982 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
3983 			   spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
3984 {
3985 	struct nvmf_vfio_user_sq *sq;
3986 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
3987 	struct nvmf_vfio_user_endpoint *endpoint;
3988 
3989 	assert(qpair != NULL);
3990 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
3991 	vu_ctrlr = sq->ctrlr;
3992 	endpoint = vu_ctrlr->endpoint;
3993 
3994 	pthread_mutex_lock(&endpoint->lock);
3995 	TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq);
3996 	delete_sq_done(vu_ctrlr, sq);
3997 	if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) {
3998 		endpoint->ctrlr = NULL;
3999 		free_ctrlr(vu_ctrlr);
4000 	}
4001 	pthread_mutex_unlock(&endpoint->lock);
4002 
4003 	if (cb_fn) {
4004 		cb_fn(cb_arg);
4005 	}
4006 }
4007 
4008 /**
4009  * Returns a preallocated request, or NULL if there isn't one available.
4010  */
4011 static struct nvmf_vfio_user_req *
4012 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq)
4013 {
4014 	struct nvmf_vfio_user_req *req;
4015 
4016 	if (sq == NULL) {
4017 		return NULL;
4018 	}
4019 
4020 	req = TAILQ_FIRST(&sq->free_reqs);
4021 	if (req == NULL) {
4022 		return NULL;
4023 	}
4024 
4025 	TAILQ_REMOVE(&sq->free_reqs, req, link);
4026 
4027 	return req;
4028 }
4029 
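/*
 * Derive the data transfer length, in bytes, implied by an I/O command.
 * Dataset Management carries (NR + 1) range descriptors; for other opcodes
 * the low 16 bits of CDW12 are treated as a 0-based block count, e.g. a
 * namespace with 4KiB blocks and (CDW12 & 0xffff) == 7 yields
 * (7 + 1) * 4096 = 32768 bytes.
 */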
4030 static int
4031 get_nvmf_io_req_length(struct spdk_nvmf_request *req)
4032 {
4033 	uint16_t nr;
4034 	uint32_t nlb, nsid;
4035 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
4036 	struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
4037 	struct spdk_nvmf_ns *ns;
4038 
4039 	nsid = cmd->nsid;
4040 	ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
4041 	if (ns == NULL || ns->bdev == NULL) {
4042 		SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid);
4043 		return -EINVAL;
4044 	}
4045 
4046 	if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
4047 		nr = cmd->cdw10_bits.dsm.nr + 1;
4048 		return nr * sizeof(struct spdk_nvme_dsm_range);
4049 	}
4050 
4051 	nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
4052 	return nlb * spdk_bdev_get_block_size(ns->bdev);
4053 }
4054 
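/*
 * Map the buffer of an admin command. The expected payload length is derived
 * per opcode rather than from the command itself: e.g. Identify is always
 * 4096 bytes, while Get Log Page transfers (NUMD + 1) dwords, so NUMDL of
 * 0x3ff with NUMDU of 0 maps 1024 dwords (4096 bytes). Opcodes without a
 * recognized payload are left with a zero length.
 */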
4055 static int
4056 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
4057 {
4058 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
4059 	uint32_t len = 0;
4060 	uint8_t fid;
4061 	int iovcnt;
4062 
4063 	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
4064 	req->length = 0;
4065 	req->data = NULL;
4066 
4067 	if (req->xfer == SPDK_NVME_DATA_NONE) {
4068 		return 0;
4069 	}
4070 
4071 	switch (cmd->opc) {
4072 	case SPDK_NVME_OPC_IDENTIFY:
4073 		len = 4096;
4074 		break;
4075 	case SPDK_NVME_OPC_GET_LOG_PAGE:
4076 		len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4;
4077 		break;
4078 	case SPDK_NVME_OPC_GET_FEATURES:
4079 	case SPDK_NVME_OPC_SET_FEATURES:
4080 		fid = cmd->cdw10_bits.set_features.fid;
4081 		switch (fid) {
4082 		case SPDK_NVME_FEAT_LBA_RANGE_TYPE:
4083 			len = 4096;
4084 			break;
4085 		case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
4086 			len = 256;
4087 			break;
4088 		case SPDK_NVME_FEAT_TIMESTAMP:
4089 			len = 8;
4090 			break;
4091 		case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
4092 			len = 512;
4093 			break;
4094 		case SPDK_NVME_FEAT_HOST_IDENTIFIER:
4095 			if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) {
4096 				len = 16;
4097 			} else {
4098 				len = 8;
4099 			}
4100 			break;
4101 		default:
4102 			return 0;
4103 		}
4104 		break;
4105 	default:
4106 		return 0;
4107 	}
4108 
4109 	/* ADMIN command will not use SGL */
4110 	/* Admin commands do not use SGLs. */
4111 		return -EINVAL;
4112 	}
4113 
4114 	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len);
4115 	if (iovcnt < 0) {
4116 		SPDK_ERRLOG("%s: map Admin Opc %x failed\n",
4117 			    ctrlr_id(ctrlr), cmd->opc);
4118 		return -1;
4119 	}
4120 	req->length = len;
4121 	req->data = req->iov[0].iov_base;
4122 	req->iovcnt = iovcnt;
4123 
4124 	return 0;
4125 }
4126 
4127 /*
4128  * Map an I/O command's buffers.
4129  *
4130  * Returns 0 on success and -errno on failure.
4131  */
4132 static int
4133 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
4134 {
4135 	int len, iovcnt;
4136 	struct spdk_nvme_cmd *cmd;
4137 
4138 	assert(ctrlr != NULL);
4139 	assert(req != NULL);
4140 
4141 	cmd = &req->cmd->nvme_cmd;
4142 	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
4143 	req->length = 0;
4144 	req->data = NULL;
4145 
4146 	if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
4147 		return 0;
4148 	}
4149 
4150 	len = get_nvmf_io_req_length(req);
4151 	if (len < 0) {
4152 		return -EINVAL;
4153 	}
4154 	req->length = len;
4155 
4156 	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length);
4157 	if (iovcnt < 0) {
4158 		SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc);
4159 		return -EFAULT;
4160 	}
4161 	req->data = req->iov[0].iov_base;
4162 	req->iovcnt = iovcnt;
4163 
4164 	return 0;
4165 }
4166 
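/*
 * Dispatch a fetched submission queue entry: take a preallocated request,
 * map its buffers (admin and I/O commands are handled separately), and hand
 * it to the generic NVMf layer. Reservation commands are rejected as
 * unsupported.
 */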
4167 static int
4168 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
4169 	       struct nvmf_vfio_user_sq *sq)
4170 {
4171 	int err;
4172 	struct nvmf_vfio_user_req *vu_req;
4173 	struct spdk_nvmf_request *req;
4174 
4175 	assert(ctrlr != NULL);
4176 	assert(cmd != NULL);
4177 
4178 	vu_req = get_nvmf_vfio_user_req(sq);
4179 	if (spdk_unlikely(vu_req == NULL)) {
4180 		SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc);
4181 		return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid,
4182 				       SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC);
4183 
4184 	}
4185 	req = &vu_req->req;
4186 
4187 	assert(req->qpair != NULL);
4188 	SPDK_DEBUGLOG(nvmf_vfio, "%s: handle qid%u, req opc=%#x cid=%d\n",
4189 		      ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid);
4190 
4191 	vu_req->cb_fn = handle_cmd_rsp;
4192 	vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);
4193 	req->cmd->nvme_cmd = *cmd;
4194 
4195 	if (nvmf_qpair_is_admin_queue(req->qpair)) {
4196 		err = map_admin_cmd_req(ctrlr, req);
4197 	} else {
4198 		switch (cmd->opc) {
4199 		case SPDK_NVME_OPC_RESERVATION_REGISTER:
4200 		case SPDK_NVME_OPC_RESERVATION_REPORT:
4201 		case SPDK_NVME_OPC_RESERVATION_ACQUIRE:
4202 		case SPDK_NVME_OPC_RESERVATION_RELEASE:
4203 			err = -ENOTSUP;
4204 			break;
4205 		default:
4206 			err = map_io_cmd_req(ctrlr, req);
4207 			break;
4208 		}
4209 	}
4210 
4211 	if (spdk_unlikely(err < 0)) {
4212 		SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n",
4213 			    ctrlr_id(ctrlr), cmd->opc);
4214 		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
4215 		req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
4216 		err = handle_cmd_rsp(vu_req, vu_req->cb_arg);
4217 		_nvmf_vfio_user_req_free(sq, vu_req);
4218 		return err;
4219 	}
4220 
4221 	vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING;
4222 	spdk_nvmf_request_exec(req);
4223 
4224 	return 0;
4225 }
4226 
4227 /* Returns the number of commands processed, or a negative value on error. */
4228 static int
4229 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq)
4230 {
4231 	struct nvmf_vfio_user_ctrlr *ctrlr;
4232 	uint32_t new_tail;
4233 	int count = 0;
4234 
4235 	assert(sq != NULL);
4236 
4237 	ctrlr = sq->ctrlr;
4238 
4239 	/* On aarch64 platforms, doorbell updates from the guest VM may not be
4240 	 * visible on the SPDK target side. This is caused by a memory type
4241 	 * mismatch: the guest treats the doorbells as device memory, while
4242 	 * the SPDK target treats them as normal memory, which causes problems
4243 	 * on ARM platforms.
4244 	 * Refer to "https://developer.arm.com/documentation/102376/0100/
4245 	 * Memory-aliasing-and-mismatched-memory-types". Using spdk_mb() alone
4246 	 * cannot fix this; invalidating the cache with "dc civac" may solve
4247 	 * it.
4248 	 */
4249 	spdk_ivdt_dcache(sq_dbl_tailp(ctrlr, sq));
4250 
4251 	/* Load-Acquire. */
4252 	new_tail = *sq_dbl_tailp(ctrlr, sq);
4253 
4254 	/*
4255 	 * Ensure that changes to the queue are visible to us.
4256 	 * The host driver should write the queue first, do a wmb(), and then
4257 	 * update the SQ tail doorbell (their Store-Release).
4258 	 */
4259 	spdk_rmb();
4260 
4261 	new_tail = new_tail & 0xffffu;
4262 	if (spdk_unlikely(new_tail >= sq->size)) {
4263 		union spdk_nvme_async_event_completion event = {};
4264 
4265 		SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid SQ%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid,
4266 			      new_tail);
4267 		event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR;
4268 		event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE;
4269 		nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event);
4270 
4271 		return 0;
4272 	}
4273 
4274 	if (*sq_headp(sq) == new_tail) {
4275 		return 0;
4276 	}
4277 
4278 	count = handle_sq_tdbl_write(ctrlr, new_tail, sq);
4279 	if (count < 0) {
4280 		fail_ctrlr(ctrlr);
4281 	}
4282 
4283 	return count;
4284 }
4285 
4286 /*
4287  * vfio-user transport poll handler. Note that the library context is polled in
4288  * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
4289  * active qpairs.
4290  *
4291  * Returns the number of commands processed, or a negative value on error.
4292  */
4293 static int
4294 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
4295 {
4296 	struct nvmf_vfio_user_poll_group *vu_group;
4297 	struct nvmf_vfio_user_sq *sq, *tmp;
4298 	int count = 0;
4299 
4300 	assert(group != NULL);
4301 
4302 	spdk_rmb();
4303 
4304 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
4305 
4306 	TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) {
4307 		int ret;
4308 
4309 		if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
4310 			continue;
4311 		}
4312 
4313 		ret = nvmf_vfio_user_sq_poll(sq);
4314 
4315 		if (ret < 0) {
4316 			return ret;
4317 		}
4318 
4319 		count += ret;
4320 	}
4321 
4322 	return count;
4323 }
4324 
4325 static int
4326 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
4327 				    struct spdk_nvme_transport_id *trid)
4328 {
4329 	struct nvmf_vfio_user_sq *sq;
4330 	struct nvmf_vfio_user_ctrlr *ctrlr;
4331 
4332 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
4333 	ctrlr = sq->ctrlr;
4334 
4335 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
4336 	return 0;
4337 }
4338 
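/* There is no meaningful peer transport ID for vfio-user, so leave it untouched. */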
4339 static int
4340 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
4341 				   struct spdk_nvme_transport_id *trid)
4342 {
4343 	return 0;
4344 }
4345 
4346 static int
4347 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
4348 				     struct spdk_nvme_transport_id *trid)
4349 {
4350 	struct nvmf_vfio_user_sq *sq;
4351 	struct nvmf_vfio_user_ctrlr *ctrlr;
4352 
4353 	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
4354 	ctrlr = sq->ctrlr;
4355 
4356 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
4357 	return 0;
4358 }
4359 
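/*
 * Handle an ABORT command: look for an outstanding request on this qpair
 * that is still executing and matches the CID given in CDW10. If none is
 * found, the abort is simply completed; otherwise it is handed to the
 * generic controller code.
 */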
4360 static void
4361 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
4362 				   struct spdk_nvmf_request *req)
4363 {
4364 	struct spdk_nvmf_request *req_to_abort = NULL;
4365 	struct spdk_nvmf_request *temp_req = NULL;
4366 	uint16_t cid;
4367 
4368 	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
4369 
4370 	TAILQ_FOREACH(temp_req, &qpair->outstanding, link) {
4371 		struct nvmf_vfio_user_req *vu_req;
4372 
4373 		vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req);
4374 
4375 		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
4376 			req_to_abort = temp_req;
4377 			break;
4378 		}
4379 	}
4380 
4381 	if (req_to_abort == NULL) {
4382 		spdk_nvmf_request_complete(req);
4383 		return;
4384 	}
4385 
4386 	req->req_to_abort = req_to_abort;
4387 	nvmf_ctrlr_abort_request(req);
4388 }
4389 
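/*
 * Default transport options. In-capsule data and shared buffers are left at
 * zero, presumably because command payloads are mapped directly from guest
 * memory rather than copied into transport-owned buffers.
 */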
4390 static void
4391 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
4392 {
4393 	opts->max_queue_depth =		NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
4394 	opts->max_qpairs_per_ctrlr =	NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
4395 	opts->in_capsule_data_size =	0;
4396 	opts->max_io_size =		NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
4397 	opts->io_unit_size =		NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
4398 	opts->max_aq_depth =		NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
4399 	opts->num_shared_buffers =	0;
4400 	opts->buf_cache_size =		0;
4401 	opts->association_timeout =	0;
4402 	opts->transport_specific =      NULL;
4403 }
4404 
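/*
 * Transport operations table registered with the generic NVMf layer. As a
 * rough usage sketch (illustrative only; exact RPC arguments and the socket
 * directory path depend on the SPDK version and target configuration), the
 * transport is typically enabled via JSON-RPC:
 *
 *   scripts/rpc.py nvmf_create_transport -t VFIOUSER
 *   scripts/rpc.py nvmf_subsystem_add_listener <nqn> \
 *       -t VFIOUSER -a /var/run/vfio-user -s 0
 *
 * where /var/run/vfio-user is an example directory that will hold the
 * endpoint's vfio-user socket and BAR files.
 */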
4405 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
4406 	.name = "VFIOUSER",
4407 	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
4408 	.opts_init = nvmf_vfio_user_opts_init,
4409 	.create = nvmf_vfio_user_create,
4410 	.destroy = nvmf_vfio_user_destroy,
4411 
4412 	.listen = nvmf_vfio_user_listen,
4413 	.stop_listen = nvmf_vfio_user_stop_listen,
4414 	.cdata_init = nvmf_vfio_user_cdata_init,
4415 	.listen_associate = nvmf_vfio_user_listen_associate,
4416 
4417 	.listener_discover = nvmf_vfio_user_discover,
4418 
4419 	.poll_group_create = nvmf_vfio_user_poll_group_create,
4420 	.get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group,
4421 	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
4422 	.poll_group_add = nvmf_vfio_user_poll_group_add,
4423 	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
4424 	.poll_group_poll = nvmf_vfio_user_poll_group_poll,
4425 
4426 	.req_free = nvmf_vfio_user_req_free,
4427 	.req_complete = nvmf_vfio_user_req_complete,
4428 
4429 	.qpair_fini = nvmf_vfio_user_close_qpair,
4430 	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
4431 	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
4432 	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
4433 	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,
4434 };
4435 
4436 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
4437 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
4438