xref: /spdk/lib/nvmf/vfio_user.c (revision 8bb0ded3e55c182cea67af1f6790f8de5f38c05f)
1 /*-
2  *   BSD LICENSE
3  *   Copyright (c) Intel Corporation. All rights reserved.
4  *   Copyright (c) 2019, Nutanix Inc. All rights reserved.
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *     * Redistributions of source code must retain the above copyright
11  *       notice, this list of conditions and the following disclaimer.
12  *     * Redistributions in binary form must reproduce the above copyright
13  *       notice, this list of conditions and the following disclaimer in
14  *       the documentation and/or other materials provided with the
15  *       distribution.
16  *     * Neither the name of Intel Corporation nor the names of its
17  *       contributors may be used to endorse or promote products derived
18  *       from this software without specific prior written permission.
19  *
20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * NVMe over vfio-user transport
35  */
36 
37 #include <vfio-user/libvfio-user.h>
38 #include <vfio-user/pci_defs.h>
39 
40 #include "spdk/barrier.h"
41 #include "spdk/stdinc.h"
42 #include "spdk/assert.h"
43 #include "spdk/thread.h"
44 #include "spdk/nvmf_transport.h"
45 #include "spdk/sock.h"
46 #include "spdk/string.h"
47 #include "spdk/util.h"
48 #include "spdk/log.h"
49 
50 #include "transport.h"
51 
52 #include "nvmf_internal.h"
53 
54 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
55 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
56 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR 64
57 #define NVMF_VFIO_USER_DEFAULT_IN_CAPSULE_DATA_SIZE 0
58 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 131072
59 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE 131072
60 #define NVMF_VFIO_USER_DEFAULT_NUM_SHARED_BUFFERS 512 /* internal buffer count */
61 #define NVMF_VFIO_USER_DEFAULT_BUFFER_CACHE_SIZE 0
62 
63 #define NVMF_VFIO_USER_DOORBELLS_OFFSET	0x1000
64 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000
65 
66 #define NVME_REG_CFG_SIZE       0x1000
67 #define NVME_REG_BAR0_SIZE      0x4000
68 #define NVME_IRQ_INTX_NUM       1
69 #define NVME_IRQ_MSIX_NUM	NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR
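
/*
 * Sketch of the BAR0 layout implied by the constants above (for orientation
 * only; offsets are taken from these definitions):
 *
 *   0x0000 - 0x0fff   NVMe controller registers (CAP, VS, CC, CSTS, AQA, ASQ, ACQ, ...)
 *   0x1000 - 0x1fff   queue doorbells (NVMF_VFIO_USER_DOORBELLS_OFFSET/_SIZE),
 *                     one 32-bit SQ tail and one 32-bit CQ head per queue pair
 *   0x2000 - 0x3fff   remainder of NVME_REG_BAR0_SIZE, apparently unused here
 */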
70 
71 struct nvmf_vfio_user_req;
72 struct nvmf_vfio_user_qpair;
73 
74 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);
75 
76 #define NVMF_VFIO_USER_MDTS	32
77 #define NVMF_VFIO_USER_MAX_IOVECS	(NVMF_VFIO_USER_MDTS + 1)
78 
79 struct nvmf_vfio_user_req  {
80 	struct spdk_nvmf_request		req;
81 	struct spdk_nvme_cpl			rsp;
82 	struct spdk_nvme_cmd			cmd;
83 	uint16_t				cid;
84 
85 	nvmf_vfio_user_req_cb_fn		cb_fn;
86 	void					*cb_arg;
87 
88 	dma_sg_t				sg[NVMF_VFIO_USER_MAX_IOVECS];
89 	struct iovec				iov[NVMF_VFIO_USER_MAX_IOVECS];
90 	uint8_t					iovcnt;
91 
92 	TAILQ_ENTRY(nvmf_vfio_user_req)		link;
93 };
94 
95 /*
96  * An NVMe queue.
97  */
98 struct nvme_q {
99 	bool is_cq;
100 
101 	void *addr;
102 
103 	dma_sg_t sg;
104 	struct iovec iov;
105 
106 	uint32_t size;
107 	uint64_t prp1;
108 
109 	union {
110 		struct {
111 			uint32_t head;
112 			/* multiple SQs can be mapped to the same CQ */
113 			uint16_t cqid;
114 		};
115 		struct {
116 			uint32_t tail;
117 			uint16_t iv;
118 			bool ien;
119 		};
120 	};
121 };
122 
123 enum nvmf_vfio_user_qpair_state {
124 	VFIO_USER_QPAIR_UNINITIALIZED = 0,
125 	VFIO_USER_QPAIR_ACTIVE,
126 	VFIO_USER_QPAIR_DELETED,
127 	VFIO_USER_QPAIR_INACTIVE,
128 	VFIO_USER_QPAIR_ERROR,
129 };
130 
131 struct nvmf_vfio_user_qpair {
132 	struct spdk_nvmf_qpair			qpair;
133 	struct spdk_nvmf_transport_poll_group	*group;
134 	struct nvmf_vfio_user_ctrlr		*ctrlr;
135 	struct nvmf_vfio_user_req		*reqs_internal;
136 	uint16_t				qsize;
137 	struct nvme_q				cq;
138 	struct nvme_q				sq;
139 	enum nvmf_vfio_user_qpair_state		state;
140 
141 	TAILQ_HEAD(, nvmf_vfio_user_req)	reqs;
142 	TAILQ_ENTRY(nvmf_vfio_user_qpair)	link;
143 };
144 
145 struct nvmf_vfio_user_poll_group {
146 	struct spdk_nvmf_transport_poll_group	group;
147 	TAILQ_HEAD(, nvmf_vfio_user_qpair)	qps;
148 };
149 
150 struct nvmf_vfio_user_ctrlr {
151 	struct nvmf_vfio_user_endpoint		*endpoint;
152 	struct nvmf_vfio_user_transport		*transport;
153 
154 	/* True when the admin queue is connected */
155 	bool					ready;
156 
157 	uint16_t				cntlid;
158 
159 	struct nvmf_vfio_user_qpair		*qp[NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR];
160 
161 	TAILQ_ENTRY(nvmf_vfio_user_ctrlr)	link;
162 
163 	volatile uint32_t			*doorbells;
164 
165 	/* internal CSTS.CFS register for vfio-user fatal errors */
166 	uint32_t				cfs : 1;
167 };
168 
169 struct nvmf_vfio_user_endpoint {
170 	vfu_ctx_t				*vfu_ctx;
171 	struct msixcap				*msix;
172 	vfu_pci_config_space_t			*pci_config_space;
173 	int					fd;
174 	volatile uint32_t			*doorbells;
175 
176 	struct spdk_nvme_transport_id		trid;
177 	const struct spdk_nvmf_subsystem	*subsystem;
178 
179 	struct nvmf_vfio_user_ctrlr		*ctrlr;
180 
181 	TAILQ_ENTRY(nvmf_vfio_user_endpoint)	link;
182 };
183 
184 struct nvmf_vfio_user_transport {
185 	struct spdk_nvmf_transport		transport;
186 	pthread_mutex_t				lock;
187 	TAILQ_HEAD(, nvmf_vfio_user_endpoint)	endpoints;
188 
189 	TAILQ_HEAD(, nvmf_vfio_user_qpair)	new_qps;
190 };
191 
192 /*
193  * function prototypes
194  */
195 static volatile uint32_t *
196 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);
197 
198 static volatile uint32_t *
199 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);
200 
201 static int
202 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);
203 
204 static struct nvmf_vfio_user_req *
205 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair);
206 
207 static int
208 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
209 		struct nvme_q *cq, uint32_t cdw0, uint16_t sc,
210 		uint16_t sct);
211 
212 static void
213 map_dma(vfu_ctx_t *vfu_ctx, uint64_t iova, uint64_t len);
214 
215 static int
216 unmap_dma(vfu_ctx_t *vfu_ctx, uint64_t iova, uint64_t len);
217 
218 static char *
219 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint)
220 {
221 	return endpoint->trid.traddr;
222 }
223 
224 static char *
225 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr)
226 {
227 	if (!ctrlr || !ctrlr->endpoint) {
228 		return "Null Ctrlr";
229 	}
230 
231 	return endpoint_id(ctrlr->endpoint);
232 }
233 
234 static uint16_t
235 io_q_id(struct nvme_q *q)
236 {
237 
238 	struct nvmf_vfio_user_qpair *vfio_user_qpair;
239 
240 	assert(q);
241 
242 	if (q->is_cq) {
243 		vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, cq);
244 	} else {
245 		vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, sq);
246 	}
247 	assert(vfio_user_qpair);
248 	return vfio_user_qpair->qpair.qid;
249 }
250 
251 static void
252 fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
253 {
254 	assert(ctrlr != NULL);
255 
256 	if (ctrlr->cfs == 0) {
257 		SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(ctrlr));
258 	}
259 
260 	ctrlr->ready = false;
261 	ctrlr->cfs = 1U;
262 }
263 
264 static bool
265 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *ctrlr)
266 {
267 	assert(ctrlr != NULL);
268 	assert(ctrlr->endpoint != NULL);
269 
270 	vfu_pci_config_space_t *pci = ctrlr->endpoint->pci_config_space;
271 
272 	return (!pci->hdr.cmd.id || ctrlr->endpoint->msix->mxc.mxe);
273 }
274 
275 static void
276 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint)
277 {
278 	if (endpoint->doorbells) {
279 		munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
280 	}
281 
282 	if (endpoint->fd > 0) {
283 		close(endpoint->fd);
284 	}
285 
286 	vfu_destroy_ctx(endpoint->vfu_ctx);
287 
288 	free(endpoint);
289 }
290 
291 /* called when process exits */
292 static int
293 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport,
294 		       spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
295 {
296 	struct nvmf_vfio_user_transport *vu_transport;
297 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
298 
299 	SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n");
300 
301 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
302 					transport);
303 
304 	(void)pthread_mutex_destroy(&vu_transport->lock);
305 
306 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
307 		TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
308 		nvmf_vfio_user_destroy_endpoint(endpoint);
309 	}
310 
311 	free(vu_transport);
312 
313 	if (cb_fn) {
314 		cb_fn(cb_arg);
315 	}
316 
317 	return 0;
318 }
319 
320 static struct spdk_nvmf_transport *
321 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts)
322 {
323 	struct nvmf_vfio_user_transport *vu_transport;
324 	int err;
325 
326 	vu_transport = calloc(1, sizeof(*vu_transport));
327 	if (vu_transport == NULL) {
328 		SPDK_ERRLOG("Transport alloc fail: %m\n");
329 		return NULL;
330 	}
331 
332 	err = pthread_mutex_init(&vu_transport->lock, NULL);
333 	if (err != 0) {
334 		SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
335 		goto err;
336 	}
337 
338 	TAILQ_INIT(&vu_transport->endpoints);
339 	TAILQ_INIT(&vu_transport->new_qps);
340 
341 	return &vu_transport->transport;
342 
343 err:
344 	free(vu_transport);
345 
346 	return NULL;
347 }
348 
349 static uint16_t
350 max_queue_size(struct nvmf_vfio_user_ctrlr const *ctrlr)
351 {
352 	assert(ctrlr != NULL);
353 	assert(ctrlr->qp[0] != NULL);
354 	assert(ctrlr->qp[0]->qpair.ctrlr != NULL);
355 
356 	return ctrlr->qp[0]->qpair.ctrlr->vcprop.cap.bits.mqes + 1;
357 }
358 
359 static void *
360 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov)
361 {
362 	int ret;
363 
364 	assert(ctx != NULL);
365 	assert(sg != NULL);
366 	assert(iov != NULL);
367 
368 	ret = vfu_addr_to_sg(ctx, addr, len, sg, 1, PROT_READ | PROT_WRITE);
369 	if (ret != 1) {
370 		errno = ret;
371 		return NULL;
372 	}
373 
374 	ret = vfu_map_sg(ctx, sg, iov, 1);
375 	if (ret != 0) {
376 		errno = ret;
377 		return NULL;
378 	}
379 
380 	assert(iov->iov_base != NULL);
381 	return iov->iov_base;
382 }
383 
384 static uint32_t
385 sq_head(struct nvmf_vfio_user_qpair *qpair)
386 {
387 	assert(qpair != NULL);
388 	return qpair->sq.head;
389 }
390 
391 static void
392 sqhd_advance(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair)
393 {
394 	assert(ctrlr != NULL);
395 	assert(qpair != NULL);
396 	qpair->sq.head = (qpair->sq.head + 1) % qpair->sq.size;
397 }
398 
399 static void
400 insert_queue(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q,
401 	     const bool is_cq, const uint16_t id)
402 {
403 	struct nvme_q *_q;
404 	struct nvmf_vfio_user_qpair *qpair;
405 
406 	assert(ctrlr != NULL);
407 	assert(q != NULL);
408 
409 	qpair = ctrlr->qp[id];
410 
411 	q->is_cq = is_cq;
412 	if (is_cq) {
413 		_q = &qpair->cq;
414 		*_q = *q;
415 		*hdbl(ctrlr, _q) = 0;
416 	} else {
417 		_q = &qpair->sq;
418 		*_q = *q;
419 		*tdbl(ctrlr, _q) = 0;
420 	}
421 }
422 
423 static int
424 asq_map(struct nvmf_vfio_user_ctrlr *ctrlr)
425 {
426 	struct nvme_q q;
427 	const struct spdk_nvmf_registers *regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
428 
429 	assert(ctrlr != NULL);
430 	assert(ctrlr->qp[0]->sq.addr == NULL);
431 	/* XXX ctrlr->asq == 0 is a valid memory address */
432 
433 	q.size = regs->aqa.bits.asqs + 1;
434 	q.head = ctrlr->doorbells[0] = 0;
435 	q.cqid = 0;
436 	q.addr = map_one(ctrlr->endpoint->vfu_ctx, regs->asq,
437 			 q.size * sizeof(struct spdk_nvme_cmd), &q.sg, &q.iov);
438 	if (q.addr == NULL) {
439 		SPDK_ERRLOG("Map ASQ failed, ASQ %"PRIx64", errno %d\n", regs->asq, errno);
440 		return -1;
441 	}
442 	memset(q.addr, 0, q.size * sizeof(struct spdk_nvme_cmd));
443 	insert_queue(ctrlr, &q, false, 0);
444 	return 0;
445 }
446 
447 static uint16_t
448 cq_next(struct nvme_q *q)
449 {
450 	assert(q != NULL);
451 	assert(q->is_cq);
452 	return (q->tail + 1) % q->size;
453 }
454 
455 static int
456 queue_index(uint16_t qid, int is_cq)
457 {
458 	return (qid * 2) + is_cq;
459 }
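
/*
 * Worked example for queue_index() (illustrative): with a doorbell stride of
 * 0 the doorbell array is laid out as SQ0 tail, CQ0 head, SQ1 tail, CQ1 head,
 * and so on, so for QID 3:
 *
 *   queue_index(3, false) == 6  ->  SQ3 tail doorbell, byte offset 0x1018 in BAR0
 *   queue_index(3, true)  == 7  ->  CQ3 head doorbell, byte offset 0x101c in BAR0
 *
 * tdbl() and hdbl() below simply index ctrlr->doorbells with these values.
 */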
460 
461 static volatile uint32_t *
462 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
463 {
464 	assert(ctrlr != NULL);
465 	assert(q != NULL);
466 	assert(!q->is_cq);
467 
468 	return &ctrlr->doorbells[queue_index(io_q_id(q), false)];
469 }
470 
471 static volatile uint32_t *
472 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
473 {
474 	assert(ctrlr != NULL);
475 	assert(q != NULL);
476 	assert(q->is_cq);
477 
478 	return &ctrlr->doorbells[queue_index(io_q_id(q), true)];
479 }
480 
481 static bool
482 cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
483 {
484 	assert(ctrlr != NULL);
485 	assert(q != NULL);
486 	return cq_next(q) == *hdbl(ctrlr, q);
487 }
488 
489 static void
490 cq_tail_advance(struct nvme_q *q)
491 {
492 	assert(q != NULL);
493 	q->tail = cq_next(q);
494 }
495 
496 static int
497 acq_map(struct nvmf_vfio_user_ctrlr *ctrlr)
498 {
499 	struct nvme_q *q;
500 	const struct spdk_nvmf_registers *regs;
501 
502 	assert(ctrlr != NULL);
503 	assert(ctrlr->qp[0] != NULL);
504 	assert(ctrlr->qp[0]->cq.addr == NULL);
505 
506 	q = &ctrlr->qp[0]->cq;
507 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
508 	assert(regs != NULL);
509 	assert(regs->acq != 0);
510 
511 	q->size = regs->aqa.bits.acqs + 1;
512 	q->tail = 0;
513 	q->addr = map_one(ctrlr->endpoint->vfu_ctx, regs->acq,
514 			  q->size * sizeof(struct spdk_nvme_cpl), &q->sg, &q->iov);
515 	if (q->addr == NULL) {
516 		SPDK_ERRLOG("Map ACQ failed, ACQ %"PRIx64", errno %d\n", regs->acq, errno);
517 		return -1;
518 	}
519 	memset(q->addr, 0, q->size * sizeof(struct spdk_nvme_cpl));
520 	q->is_cq = true;
521 	q->ien = true;
522 	insert_queue(ctrlr, q, true, 0);
523 	return 0;
524 }
525 
526 static void *
527 _map_one(void *prv, uint64_t addr, uint64_t len)
528 {
529 	struct nvmf_vfio_user_req *vu_req;
530 	struct nvmf_vfio_user_qpair *vu_qpair;
531 	void *ret;
532 
533 	assert(prv != NULL);
534 
535 	vu_req = SPDK_CONTAINEROF(prv, struct nvmf_vfio_user_req, cmd);
536 	vu_qpair = SPDK_CONTAINEROF(vu_req->req.qpair, struct nvmf_vfio_user_qpair, qpair);
537 
538 	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
539 	ret = map_one(vu_qpair->ctrlr->endpoint->vfu_ctx, addr, len,
540 		      &vu_req->sg[vu_req->iovcnt],
541 		      &vu_req->iov[vu_req->iovcnt]);
542 	if (spdk_likely(ret != NULL)) {
543 		vu_req->iovcnt++;
544 	}
545 	return ret;
546 }
547 
548 static int
549 vfio_user_map_prps(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
550 		   struct iovec *iov, uint32_t length)
551 {
552 	/* Map the PRP list from guest physical memory to
553 	 * process virtual memory addresses.
554 	 */
555 	return spdk_nvme_map_prps(cmd, cmd, iov, length,
556 				  4096, _map_one);
557 }
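
/*
 * Example of the PRP walk performed above (illustrative sketch): for a 12 KiB
 * transfer with a 4 KiB page size, PRP1 holds the guest-physical address of
 * the first page and PRP2 points to a PRP list containing the remaining two
 * page addresses. spdk_nvme_map_prps() invokes _map_one() for each
 * guest-physical range it extracts, and every call fills one vu_req->iov[]
 * entry with the process-virtual mapping obtained from
 * vfu_addr_to_sg()/vfu_map_sg(), so such a request ends up with iovcnt == 3.
 */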
558 
559 static struct spdk_nvmf_request *
560 get_nvmf_req(struct nvmf_vfio_user_qpair *qp);
561 
562 static int
563 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
564 	       struct spdk_nvmf_request *req);
565 
566 static void
567 handle_identify_ctrlr_rsp(struct spdk_nvme_ctrlr_data *data)
568 {
569 	assert(data != NULL);
570 
571 	data->sgls.supported = SPDK_NVME_SGLS_NOT_SUPPORTED;
572 }
573 
574 /*
575  * Posts a CQE in the completion queue.
576  *
577  * @ctrlr: the vfio-user controller
578  * @cmd: the NVMe command for which the completion is posted
579  * @cq: the completion queue
580  * @cdw0: cdw0 as reported by NVMf (only for SPDK_NVME_OPC_SET_FEATURES and
581  *        SPDK_NVME_OPC_ABORT)
582  * @sc: the NVMe CQE status code
583  * @sct: the NVMe CQE status code type
584  */
585 static int
586 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
587 		struct nvme_q *cq, uint32_t cdw0, uint16_t sc,
588 		uint16_t sct)
589 {
590 	struct spdk_nvme_cpl *cpl;
591 	uint16_t qid;
592 	int err;
593 
594 	assert(ctrlr != NULL);
595 	assert(cmd != NULL);
596 
597 	qid = io_q_id(cq);
598 
599 	if (ctrlr->qp[0]->qpair.ctrlr->vcprop.csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
600 		SPDK_DEBUGLOG(nvmf_vfio,
601 			      "%s: ignore completion SQ%d cid=%d status=%#x\n",
602 			      ctrlr_id(ctrlr), qid, cmd->cid, sc);
603 		return 0;
604 	}
605 
606 	if (cq_is_full(ctrlr, cq)) {
607 		SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
608 			    ctrlr_id(ctrlr), qid, cq->tail, *hdbl(ctrlr, cq));
609 		return -1;
610 	}
611 
612 	cpl = ((struct spdk_nvme_cpl *)cq->addr) + cq->tail;
613 
614 	SPDK_DEBUGLOG(nvmf_vfio,
615 		      "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
616 		      ctrlr_id(ctrlr), qid, cmd->cid, sc, ctrlr->qp[qid]->sq.head,
617 		      cq->tail);
618 
619 	if (qid == 0) {
620 		switch (cmd->opc) {
621 		case SPDK_NVME_OPC_ABORT:
622 		case SPDK_NVME_OPC_SET_FEATURES:
623 		case SPDK_NVME_OPC_GET_FEATURES:
624 			cpl->cdw0 = cdw0;
625 			break;
626 		}
627 	}
628 
629 
630 	assert(ctrlr->qp[qid] != NULL);
631 
632 	cpl->sqhd = ctrlr->qp[qid]->sq.head;
633 	cpl->cid = cmd->cid;
634 	cpl->status.dnr = 0x0;
635 	cpl->status.m = 0x0;
636 	cpl->status.sct = sct;
637 	cpl->status.p = ~cpl->status.p;
638 	cpl->status.sc = sc;
639 
640 	cq_tail_advance(cq);
641 
642 	/*
643 	 * This function now executes in SPDK thread context, but we
644 	 * might be triggering interrupts from vfio-user thread context,
645 	 * so check for race conditions.
646 	 */
647 	if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) {
648 		err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
649 		if (err != 0) {
650 			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
651 				    ctrlr_id(ctrlr));
652 			return err;
653 		}
654 	}
655 
656 	return 0;
657 }
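
/*
 * How the guest notices the CQE posted above (sketch): CQ memory is zeroed
 * when the queue is mapped, so every Phase Tag starts at 0. The first time
 * post_completion() writes a given slot, `~cpl->status.p` flips the tag to 1;
 * after the tail wraps around, the same slot flips back to 0. The guest reads
 * the entry at its current CQ head and treats it as new whenever the Phase
 * Tag differs from the value it saw on the previous pass, per the usual NVMe
 * phase-tag convention.
 */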
658 
659 static struct nvme_q *
660 lookup_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, const uint16_t qid, const bool is_cq)
661 {
662 	struct nvme_q *q;
663 
664 	assert(ctrlr != NULL);
665 
666 	if (qid >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
667 		return NULL;
668 	}
669 
670 	if (ctrlr->qp[qid] == NULL) {
671 		return NULL;
672 	}
673 
674 	if (is_cq) {
675 		q = &ctrlr->qp[qid]->cq;
676 	} else {
677 		q = &ctrlr->qp[qid]->sq;
678 	}
679 
680 	if (q->addr == NULL) {
681 		return NULL;
682 	}
683 
684 	return q;
685 }
686 
687 static void
688 unmap_q(vfu_ctx_t *vfu_ctx, struct nvme_q *q)
689 {
690 	if (q == NULL) {
691 		return;
692 	}
693 	if (q->addr != NULL) {
694 		vfu_unmap_sg(vfu_ctx, &q->sg, &q->iov, 1);
695 		q->addr = NULL;
696 	}
697 }
698 
699 static void
700 unmap_qp(struct nvmf_vfio_user_qpair *qp)
701 {
702 	if (qp->ctrlr == NULL) {
703 		return;
704 	}
705 
706 	SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy I/O QP%d\n",
707 		      ctrlr_id(qp->ctrlr), qp->qpair.qid);
708 
709 	unmap_q(qp->ctrlr->endpoint->vfu_ctx, &qp->sq);
710 	unmap_q(qp->ctrlr->endpoint->vfu_ctx, &qp->cq);
711 }
712 
713 /*
714  * TODO we can immediately remove the QP from the list because this function
715  * is now executed by the SPDK thread.
716  */
717 static void
718 destroy_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid)
719 {
720 	struct nvmf_vfio_user_qpair *qpair;
721 
722 	if (ctrlr == NULL) {
723 		return;
724 	}
725 
726 	qpair = ctrlr->qp[qid];
727 	if (qpair == NULL) {
728 		return;
729 	}
730 
731 	SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy QP%d=%p\n", ctrlr_id(ctrlr),
732 		      qid, qpair);
733 
734 	unmap_qp(qpair);
735 	free(qpair->reqs_internal);
736 	ctrlr->qp[qid] = NULL;
737 }
738 
739 /* This function can only fail because of memory allocation errors. */
740 static int
741 init_qp(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport,
742 	const uint16_t qsize, const uint16_t id)
743 {
744 	int err = 0, i;
745 	struct nvmf_vfio_user_qpair *qpair;
746 	struct nvmf_vfio_user_req *vu_req;
747 	struct spdk_nvmf_request *req;
748 
749 	assert(ctrlr != NULL);
750 	assert(transport != NULL);
751 
752 	qpair = calloc(1, sizeof(*qpair));
753 	if (qpair == NULL) {
754 		return -ENOMEM;
755 	}
756 
757 	qpair->qpair.qid = id;
758 	qpair->qpair.transport = transport;
759 	qpair->ctrlr = ctrlr;
760 	qpair->qsize = qsize;
761 
762 	TAILQ_INIT(&qpair->reqs);
763 
764 	qpair->reqs_internal = calloc(qsize, sizeof(struct nvmf_vfio_user_req));
765 	if (qpair->reqs_internal == NULL) {
766 		SPDK_ERRLOG("%s: error allocating reqs: %m\n", ctrlr_id(ctrlr));
767 		err = -ENOMEM;
768 		goto out;
769 	}
770 
771 	for (i = 0; i < qsize; i++) {
772 		vu_req = &qpair->reqs_internal[i];
773 		req = &vu_req->req;
774 
775 		vu_req->cid = i;
776 		req->qpair = &qpair->qpair;
777 		req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
778 		req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;
779 
780 		TAILQ_INSERT_TAIL(&qpair->reqs, vu_req, link);
781 	}
782 	ctrlr->qp[id] = qpair;
783 out:
784 	if (err != 0) {
785 		free(qpair);
786 	}
787 	return err;
788 }
789 
790 static int
791 add_qp(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport,
792        const uint16_t qsize, const uint16_t qid)
793 {
794 	int err;
795 	struct nvmf_vfio_user_transport *vu_transport;
796 
797 	SPDK_DEBUGLOG(nvmf_vfio, "%s: request add QP%d\n",
798 		      ctrlr_id(ctrlr), qid);
799 
800 	err = init_qp(ctrlr, transport, qsize, qid);
801 	if (err != 0) {
802 		return err;
803 	}
804 
805 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
806 					transport);
807 
808 	/*
809 	 * After we've returned from the nvmf_vfio_user_poll_group_poll thread, once
810 	 * nvmf_vfio_user_accept executes it will pick up this QP and will eventually
811 	 * call nvmf_vfio_user_poll_group_add. The rest of the operation needed to
812 	 * complete the addition of the queue will be continued in the
813 	 * completion callback (see the flow sketch after this function).
814 	 */
815 	TAILQ_INSERT_TAIL(&vu_transport->new_qps, ctrlr->qp[qid], link);
816 
817 	return 0;
818 }
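
/*
 * Sketch of the I/O queue creation flow, assembled from the functions above
 * and below (illustrative only):
 *
 *   guest rings SQ0's tail doorbell with a Create I/O CQ/SQ command
 *     -> handle_sq_tdbl_write() -> consume_admin_cmd() -> handle_create_io_q()
 *       -> add_qp(): init_qp() allocates the qpair and its requests, then the
 *          qpair is appended to vu_transport->new_qps
 *   nvmf_vfio_user_accept() later pops new_qps and calls
 *     spdk_nvmf_tgt_new_qpair(), which leads to nvmf_vfio_user_poll_group_add()
 *     sending a fabrics CONNECT; handle_queue_connect_rsp() finally marks the
 *     qpair VFIO_USER_QPAIR_ACTIVE.
 */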
819 
820 /*
821  * Creates a completion or submission I/O queue. Returns 0 on success, -errno
822  * on error.
823  *
824  * XXX SPDK thread context.
825  */
826 static int
827 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
828 		   struct spdk_nvme_cmd *cmd, const bool is_cq)
829 {
830 	size_t entry_size;
831 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
832 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
833 	int err = 0;
834 	struct nvme_q io_q = {};
835 
836 	assert(ctrlr != NULL);
837 	assert(cmd != NULL);
838 
839 	SPDK_DEBUGLOG(nvmf_vfio,
840 		      "%s: create I/O %cQ%d: QSIZE=%#x\n", ctrlr_id(ctrlr),
841 		      is_cq ? 'C' : 'S', cmd->cdw10_bits.create_io_q.qid,
842 		      cmd->cdw10_bits.create_io_q.qsize);
843 
844 	if (cmd->cdw10_bits.create_io_q.qid >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
845 		SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr),
846 			    cmd->cdw10_bits.create_io_q.qid,
847 			    NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR);
848 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
849 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
850 		goto out;
851 	}
852 
853 	if (lookup_io_q(ctrlr, cmd->cdw10_bits.create_io_q.qid, is_cq)) {
854 		SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr),
855 			    is_cq ? 'C' : 'S', cmd->cdw10_bits.create_io_q.qid);
856 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
857 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
858 		goto out;
859 	}
860 
861 	/* TODO break rest of this function into smaller functions */
862 	if (is_cq) {
863 		entry_size = sizeof(struct spdk_nvme_cpl);
864 		if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
865 			/*
866 			 * TODO CAP.CMBS is currently set to zero, however we
867 			 * should zero it out explicitly when CAP is read.
868 			 * Support for CAP.CMBS is not mentioned in the NVMf
869 			 * spec.
870 			 */
871 			SPDK_ERRLOG("%s: non-PC CQ not supporred\n", ctrlr_id(ctrlr));
872 			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
873 			goto out;
874 		}
875 		io_q.ien = cmd->cdw11_bits.create_io_cq.ien;
876 		io_q.iv = cmd->cdw11_bits.create_io_cq.iv;
877 	} else {
878 		/* CQ must be created before SQ */
879 		if (!lookup_io_q(ctrlr, cmd->cdw11_bits.create_io_sq.cqid, true)) {
880 			SPDK_ERRLOG("%s: CQ%d does not exist\n", ctrlr_id(ctrlr),
881 				    cmd->cdw11_bits.create_io_sq.cqid);
882 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
883 			sc = SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
884 			goto out;
885 		}
886 
887 		entry_size = sizeof(struct spdk_nvme_cmd);
888 		if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
889 			SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
890 			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
891 			goto out;
892 		}
893 
894 		io_q.cqid = cmd->cdw11_bits.create_io_sq.cqid;
895 		SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
896 			      cmd->cdw10_bits.create_io_q.qid, io_q.cqid);
897 	}
898 
899 	io_q.size = cmd->cdw10_bits.create_io_q.qsize + 1;
900 	if (io_q.size > max_queue_size(ctrlr)) {
901 		SPDK_ERRLOG("%s: queue too big, want=%d, max=%d\n", ctrlr_id(ctrlr),
902 			    io_q.size, max_queue_size(ctrlr));
903 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
904 		sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE;
905 		goto out;
906 	}
907 
908 	io_q.addr = map_one(ctrlr->endpoint->vfu_ctx, cmd->dptr.prp.prp1,
909 			    io_q.size * entry_size, &io_q.sg, &io_q.iov);
910 	if (io_q.addr == NULL) {
911 		sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
912 		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
913 		goto out;
914 	}
915 	io_q.prp1 = cmd->dptr.prp.prp1;
916 	memset(io_q.addr, 0, io_q.size * entry_size);
917 
918 	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped %cQ%d IOVA=%#lx vaddr=%#llx\n",
919 		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
920 		      cmd->cdw10_bits.create_io_q.qid, cmd->dptr.prp.prp1,
921 		      (unsigned long long)io_q.addr);
922 
923 	if (is_cq) {
924 		err = add_qp(ctrlr, ctrlr->qp[0]->qpair.transport, io_q.size,
925 			     cmd->cdw10_bits.create_io_q.qid);
926 		if (err != 0) {
927 			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
928 			goto out;
929 		}
930 	}
931 
932 	insert_queue(ctrlr, &io_q, is_cq, cmd->cdw10_bits.create_io_q.qid);
933 
934 out:
935 	return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
936 }
937 
938 /*
939  * Deletes a completion or submission I/O queue.
940  */
941 static int
942 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
943 		struct spdk_nvme_cmd *cmd, const bool is_cq)
944 {
945 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
946 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
947 
948 	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n",
949 		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
950 		      cmd->cdw10_bits.delete_io_q.qid);
951 
952 	if (lookup_io_q(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq) == NULL) {
953 		SPDK_ERRLOG("%s: %cQ%d does not exist\n", ctrlr_id(ctrlr),
954 			    is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid);
955 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
956 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
957 		goto out;
958 	}
959 
960 	if (is_cq) {
961 		/* SQ must have been deleted first */
962 		if (ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state != VFIO_USER_QPAIR_DELETED) {
963 			SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr));
964 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
965 			sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION;
966 			goto out;
967 		}
968 	} else {
969 		/*
970 		 * This doesn't actually delete the I/O queue; we can't
971 		 * do that anyway because NVMf doesn't support it. We're merely
972 		 * telling the poll_group_poll function to skip checking this
973 		 * queue. The only workflow in which this works is when CC.EN is
974 		 * set to 0 and we're stopping the subsystem, so we know that the
975 		 * relevant callbacks to destroy the queues will be called.
976 		 */
977 		assert(ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state == VFIO_USER_QPAIR_ACTIVE);
978 		ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state = VFIO_USER_QPAIR_DELETED;
979 	}
980 
981 out:
982 	return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
983 }
984 
985 /* TODO need to honor the Abort Command Limit field */
986 static int
987 handle_abort_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd)
988 {
989 	assert(ctrlr != NULL);
990 
991 	SPDK_DEBUGLOG(nvmf_vfio, "%s: abort CID %u in SQID %u\n", ctrlr_id(ctrlr),
992 		      cmd->cdw10_bits.abort.cid, cmd->cdw10_bits.abort.sqid);
993 
994 	/* abort command not yet implemented */
995 	return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 1,
996 			       SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
997 }
998 
999 /*
1000  * Returns 0 on success and -errno on error.
1001  *
1002  * XXX SPDK thread context
1003  */
1004 static int
1005 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd)
1006 {
1007 	assert(ctrlr != NULL);
1008 	assert(cmd != NULL);
1009 
1010 	SPDK_DEBUGLOG(nvmf_vfio, "%s: handle admin req opc=%#x cid=%d\n",
1011 		      ctrlr_id(ctrlr), cmd->opc, cmd->cid);
1012 
1013 	switch (cmd->opc) {
1014 	case SPDK_NVME_OPC_CREATE_IO_CQ:
1015 	case SPDK_NVME_OPC_CREATE_IO_SQ:
1016 		return handle_create_io_q(ctrlr, cmd,
1017 					  cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ);
1018 	case SPDK_NVME_OPC_ABORT:
1019 		return handle_abort_cmd(ctrlr, cmd);
1020 	case SPDK_NVME_OPC_DELETE_IO_SQ:
1021 	case SPDK_NVME_OPC_DELETE_IO_CQ:
1022 		return handle_del_io_q(ctrlr, cmd,
1023 				       cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ);
1024 	default:
1025 		return handle_cmd_req(ctrlr, cmd, get_nvmf_req(ctrlr->qp[0]));
1026 	}
1027 }
1028 
1029 static int
1030 handle_cmd_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
1031 {
1032 	struct nvmf_vfio_user_qpair *qpair = cb_arg;
1033 	struct spdk_nvme_cmd *cmd = &req->req.cmd->nvme_cmd;
1034 
1035 	assert(qpair != NULL);
1036 	assert(req != NULL);
1037 
1038 	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
1039 		switch (cmd->opc) {
1040 		case SPDK_NVME_OPC_IDENTIFY:
1041 			if ((cmd->cdw10 & 0xFF) == SPDK_NVME_IDENTIFY_CTRLR) {
1042 				handle_identify_ctrlr_rsp(req->req.data);
1043 			}
1044 			break;
1045 		default:
1046 			break;
1047 		}
1048 	}
1049 
1050 	vfu_unmap_sg(qpair->ctrlr->endpoint->vfu_ctx, req->sg, req->iov, req->iovcnt);
1051 
1052 	return post_completion(qpair->ctrlr, &req->req.cmd->nvme_cmd,
1053 			       &qpair->ctrlr->qp[req->req.qpair->qid]->cq,
1054 			       req->req.rsp->nvme_cpl.cdw0,
1055 			       req->req.rsp->nvme_cpl.status.sc,
1056 			       req->req.rsp->nvme_cpl.status.sct);
1057 }
1058 
1059 static int
1060 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair,
1061 	    struct spdk_nvme_cmd *cmd)
1062 {
1063 	assert(qpair != NULL);
1064 	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
1065 		return consume_admin_cmd(ctrlr, cmd);
1066 	}
1067 
1068 	return handle_cmd_req(ctrlr, cmd, get_nvmf_req(qpair));
1069 }
1070 
1071 static ssize_t
1072 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail,
1073 		     struct nvmf_vfio_user_qpair *qpair)
1074 {
1075 	struct spdk_nvme_cmd *queue;
1076 
1077 	assert(ctrlr != NULL);
1078 	assert(qpair != NULL);
1079 
1080 	queue = qpair->sq.addr;
1081 	while (sq_head(qpair) != new_tail) {
1082 		int err;
1083 		struct spdk_nvme_cmd *cmd = &queue[sq_head(qpair)];
1084 
1085 		/*
1086 		 * SQHD must contain the new head pointer, so we must increase
1087 		 * it before we generate a completion.
1088 		 */
1089 		sqhd_advance(ctrlr, qpair);
1090 
1091 		err = consume_cmd(ctrlr, qpair, cmd);
1092 		if (err != 0) {
1093 			return err;
1094 		}
1095 	}
1096 
1097 	return 0;
1098 }
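
/*
 * Example (illustrative): if sq.head is 3 and the guest writes 6 to the SQ
 * tail doorbell, the loop above consumes the commands in slots 3, 4 and 5,
 * advancing sq.head modulo sq.size after each one so that the SQHD reported
 * in subsequent completions is already up to date.
 */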
1099 
1100 static int
1101 map_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1102 {
1103 	int err;
1104 
1105 	assert(ctrlr != NULL);
1106 
1107 	err = acq_map(ctrlr);
1108 	if (err != 0) {
1109 		SPDK_ERRLOG("%s: failed to map CQ0: %d\n", ctrlr_id(ctrlr), err);
1110 		return err;
1111 	}
1112 	err = asq_map(ctrlr);
1113 	if (err != 0) {
1114 		SPDK_ERRLOG("%s: failed to map SQ0: %d\n", ctrlr_id(ctrlr), err);
1115 		return err;
1116 	}
1117 	return 0;
1118 }
1119 
1120 static void
1121 unmap_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1122 {
1123 	assert(ctrlr->qp[0] != NULL);
1124 
1125 	unmap_qp(ctrlr->qp[0]);
1126 }
1127 
1128 static int
1129 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
1130 {
1131 	struct nvmf_vfio_user_qpair *qpair = cb_arg;
1132 
1133 	assert(qpair != NULL);
1134 	assert(req != NULL);
1135 
1136 	if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) {
1137 		assert(qpair->ctrlr != NULL);
1138 		assert(req != NULL);
1139 
1140 		memcpy(req->req.data,
1141 		       &req->req.rsp->prop_get_rsp.value.u64,
1142 		       req->req.length);
1143 	} else {
1144 		assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET);
1145 		assert(qpair->ctrlr != NULL);
1146 
1147 		if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) {
1148 			union spdk_nvme_cc_register *cc;
1149 
1150 			cc = (union spdk_nvme_cc_register *)&req->req.cmd->prop_set_cmd.value.u64;
1151 
1152 			if (cc->bits.en == 1 && cc->bits.shn == 0) {
1153 				SPDK_DEBUGLOG(nvmf_vfio,
1154 					      "%s: MAP Admin queue\n",
1155 					      ctrlr_id(qpair->ctrlr));
1156 				map_admin_queue(qpair->ctrlr);
1157 			} else if ((cc->bits.en == 0 && cc->bits.shn == 0) ||
1158 				   (cc->bits.en == 1 && cc->bits.shn != 0)) {
1159 				SPDK_DEBUGLOG(nvmf_vfio,
1160 					      "%s: UNMAP Admin queue\n",
1161 					      ctrlr_id(qpair->ctrlr));
1162 				unmap_admin_queue(qpair->ctrlr);
1163 			}
1164 		}
1165 	}
1166 
1167 	qpair->ctrlr->ready = true;
1168 	return 0;
1169 }
1170 
1171 /*
1172  * XXX Do NOT remove, see comment in access_bar0_fn.
1173  *
1174  * Handles a doorbell access (read or write) at offset 0x1000 or more.
1175  *
1176  * DSTRD is set to fixed value 0 for NVMf.
1177  *
1178  */
1179 static int
1180 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf,
1181 		  const size_t count, loff_t pos, const bool is_write)
1182 {
1183 	assert(ctrlr != NULL);
1184 	assert(buf != NULL);
1185 
1186 	if (count != sizeof(uint32_t)) {
1187 		SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n",
1188 			    ctrlr_id(ctrlr), count);
1189 		return -EINVAL;
1190 	}
1191 
1192 	pos -= NVMF_VFIO_USER_DOORBELLS_OFFSET;
1193 
1194 	/* pos must be dword aligned */
1195 	if ((pos & 0x3) != 0) {
1196 		SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos);
1197 		return -EINVAL;
1198 	}
1199 
1200 	/* convert byte offset to array index */
1201 	pos >>= 2;
1202 
1203 	if (pos >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR * 2) {
1204 		/*
1205 		 * TODO: need to emit a "Write to Invalid Doorbell Register"
1206 		 * asynchronous event
1207 		 */
1208 		SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos);
1209 		return -EINVAL;
1210 	}
1211 
1212 	if (is_write) {
1213 		ctrlr->doorbells[pos] = *buf;
1214 		spdk_wmb();
1215 	} else {
1216 		spdk_rmb();
1217 		*buf = ctrlr->doorbells[pos];
1218 	}
1219 	return 0;
1220 }
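
/*
 * Worked example (illustrative): a 4-byte write to BAR0 offset 0x1008 gives
 * pos = 0x1008 - NVMF_VFIO_USER_DOORBELLS_OFFSET = 8, which is dword aligned
 * and becomes array index 2, i.e. the SQ1 tail doorbell (see queue_index()).
 * The value is simply stored in ctrlr->doorbells[2]; the SQ is drained later
 * when the poll group polls this controller, not from this callback.
 */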
1221 
1222 static ssize_t
1223 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos,
1224 	       bool is_write)
1225 {
1226 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1227 	struct nvmf_vfio_user_ctrlr *ctrlr;
1228 	struct nvmf_vfio_user_req *req;
1229 	int ret;
1230 
1231 	ctrlr = endpoint->ctrlr;
1232 
1233 	SPDK_DEBUGLOG(nvmf_vfio,
1234 		      "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n",
1235 		      endpoint_id(endpoint), is_write ? "write" : "read",
1236 		      ctrlr, count, pos);
1237 
1238 	if (pos >= NVMF_VFIO_USER_DOORBELLS_OFFSET) {
1239 		/*
1240 		 * XXX The fact that the doorbells can be memory mapped doesn't
1241 		 * mean that the client (VFIO in QEMU) is obliged to memory
1242 		 * map them, it might still elect to access them via regular
1243 		 * read/write.
1244 		 */
1245 		ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count,
1246 					pos, is_write);
1247 		if (ret == 0) {
1248 			return count;
1249 		}
1250 		assert(ret < 0);
1251 		return ret;
1252 	}
1253 
1254 	/* Construct a Fabric Property Get/Set command and send it */
1255 	req = get_nvmf_vfio_user_req(ctrlr->qp[0]);
1256 	if (req == NULL) {
1257 		return -1;
1258 	}
1259 
1260 	req->cb_fn = nvmf_vfio_user_prop_req_rsp;
1261 	req->cb_arg = ctrlr->qp[0];
1262 	req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC;
1263 	req->req.cmd->prop_set_cmd.cid = 0;
1264 	req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1;
1265 	req->req.cmd->prop_set_cmd.ofst = pos;
1266 	if (is_write) {
1267 		req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET;
1268 		if (req->req.cmd->prop_set_cmd.attrib.size) {
1269 			req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf;
1270 		} else {
1271 			req->req.cmd->prop_set_cmd.value.u32.high = 0;
1272 			req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf;
1273 		}
1274 	} else {
1275 		req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET;
1276 	}
1277 	req->req.length = count;
1278 	req->req.data = buf;
1279 
1280 	/* Mark the controller as busy to limit the queue depth for fabric get/set to 1 */
1281 	ctrlr->ready = false;
1282 
1283 	spdk_nvmf_request_exec_fabrics(&req->req);
1284 
1285 	return count;
1286 }
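
/*
 * Example of the register path above (illustrative): a 4-byte guest write to
 * BAR0 offset 0x14 (CC) produces prop_set_cmd.ofst = 0x14, attrib.size =
 * (4 / 4) - 1 = 0 and value.u32.low = the written value. The NVMf layer
 * executes it as a fabrics Property Set, and nvmf_vfio_user_prop_req_rsp()
 * then maps or unmaps the admin queue depending on the new CC.EN/CC.SHN bits.
 */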
1287 
1288 /*
1289  * The NVMe driver reads 4096 bytes, which is the size of the extended PCI
1290  * configuration space available on PCI-X 2.0 and PCI Express buses.
1291  */
1292 static ssize_t
1293 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset,
1294 		  bool is_write)
1295 {
1296 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1297 
1298 	if (is_write) {
1299 		SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n",
1300 			    endpoint_id(endpoint), offset, offset + count);
1301 		return -EINVAL;
1302 	}
1303 
1304 	if (offset + count > PCI_CFG_SPACE_EXP_SIZE) {
1305 		SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n",
1306 			    endpoint_id(endpoint), offset, count,
1307 			    PCI_CFG_SPACE_EXP_SIZE);
1308 		return -ERANGE;
1309 	}
1310 
1311 	memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count);
1312 
1313 	return count;
1314 }
1315 
1316 static void
1317 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg)
1318 {
1319 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1320 
1321 	if (level >= SPDK_LOG_DEBUG) {
1322 		SPDK_DEBUGLOG(nvmf_vfio, "%s: %s", endpoint_id(endpoint), msg);
1323 	} else if (level >= SPDK_LOG_NOTICE) {
1324 		SPDK_NOTICELOG("%s: %s", endpoint_id(endpoint), msg);
1325 	} else {
1326 		SPDK_ERRLOG("%s: %s", endpoint_id(endpoint), msg);
1327 	}
1328 }
1329 
1330 static void
1331 init_pci_config_space(vfu_pci_config_space_t *p)
1332 {
1333 	/* MLBAR */
1334 	p->hdr.bars[0].raw = 0x0;
1335 	/* MUBAR */
1336 	p->hdr.bars[1].raw = 0x0;
1337 
1338 	/* vendor specific, let's set them to zero for now */
1339 	p->hdr.bars[3].raw = 0x0;
1340 	p->hdr.bars[4].raw = 0x0;
1341 	p->hdr.bars[5].raw = 0x0;
1342 
1343 	/* enable INTx */
1344 	p->hdr.intr.ipin = 0x1;
1345 }
1346 
1347 static int
1348 vfio_user_dev_info_fill(struct nvmf_vfio_user_endpoint *endpoint)
1349 {
1350 	int ret;
1351 	size_t offset;
1352 	vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx;
1353 
1354 	static vfu_cap_t pm = {
1355 		.pm = {
1356 			.hdr.id = PCI_CAP_ID_PM,
1357 			.pmcs.nsfrst = 0x1
1358 		}
1359 	};
1360 	static vfu_cap_t px = {
1361 		.px = {
1362 			.hdr.id = PCI_CAP_ID_EXP,
1363 			.pxcaps.ver = 0x2,
1364 			.pxdcap = {.per = 0x1, .flrc = 0x1},
1365 			.pxdcap2.ctds = 0x1
1366 		}
1367 	};
1368 	static vfu_cap_t msix = {
1369 		.msix = {
1370 			.hdr.id = PCI_CAP_ID_MSIX,
1371 			.mxc.ts = NVME_IRQ_MSIX_NUM - 1,
1372 			.mtab = {.tbir = 0x4, .to = 0x0},
1373 			.mpba = {.pbir = 0x5, .pbao = 0x0}
1374 		}
1375 	};
1376 
1377 	static vfu_cap_t *caps[] = {&pm, &msix, &px};
1378 	static struct iovec sparse_mmap[] = {
1379 		{
1380 			.iov_base = (void *)NVMF_VFIO_USER_DOORBELLS_OFFSET,
1381 			.iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE,
1382 		},
1383 	};
1384 
1385 	ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0);
1386 	if (ret < 0) {
1387 		SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx);
1388 		return ret;
1389 	}
1390 	vfu_pci_set_id(vfu_ctx, 0x4e58, 0x0001, 0, 0);
1391 	/*
1392 	 * 0x02, controller uses the NVM Express programming interface
1393 	 * 0x08, non-volatile memory controller
1394 	 * 0x01, mass storage controller
1395 	 */
1396 	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);
1397 
1398 	ret = vfu_pci_setup_caps(vfu_ctx, caps, 3);
1399 	if (ret < 0) {
1400 		SPDK_ERRLOG("vfu_ctx %p failed to setup cap list\n", vfu_ctx);
1401 		return ret;
1402 	}
1403 
1404 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
1405 			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1);
1406 	if (ret < 0) {
1407 		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
1408 		return ret;
1409 	}
1410 
1411 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
1412 			       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
1413 			       sparse_mmap, 1, endpoint->fd);
1414 	if (ret < 0) {
1415 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
1416 		return ret;
1417 	}
1418 
1419 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, PAGE_SIZE,
1420 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1);
1421 	if (ret < 0) {
1422 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
1423 		return ret;
1424 	}
1425 
1426 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, PAGE_SIZE,
1427 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1);
1428 	if (ret < 0) {
1429 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
1430 		return ret;
1431 	}
1432 
1433 	ret = vfu_setup_device_dma_cb(vfu_ctx, map_dma, unmap_dma);
1434 	if (ret < 0) {
1435 		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
1436 		return ret;
1437 	}
1438 
1439 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
1440 	if (ret < 0) {
1441 		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
1442 		return ret;
1443 	}
1444 
1445 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
1446 	if (ret < 0) {
1447 		SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
1448 		return ret;
1449 	}
1450 
1451 	ret = vfu_realize_ctx(vfu_ctx);
1452 	if (ret < 0) {
1453 		SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx);
1454 		return ret;
1455 	}
1456 
1457 	endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx);
1458 	assert(endpoint->pci_config_space != NULL);
1459 	init_pci_config_space(endpoint->pci_config_space);
1460 
1461 	offset = vfu_pci_find_capability(endpoint->vfu_ctx, 0, PCI_CAP_ID_MSIX);
1462 	assert(offset != 0);
1463 	endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + offset);
1464 
1465 	return 0;
1466 }
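
/*
 * Resulting PCI presentation (summary of the setup above, for orientation):
 * a PCIe endpoint with vendor/device 0x4e58/0x0001 and class code 01h/08h/02h
 * (NVMe), carrying PM, MSI-X and PCIe capabilities. BAR0 holds the NVMe
 * registers plus the sparse-mmappable doorbell page backed by endpoint->fd,
 * BAR4 holds the MSI-X table (mtab.tbir = 0x4) and BAR5 the MSI-X PBA
 * (mpba.pbir = 0x5), each one PAGE_SIZE long.
 */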
1467 
1468 static int
1469 destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
1470 {
1471 	int i;
1472 
1473 	if (ctrlr == NULL) {
1474 		return 0;
1475 	}
1476 
1477 	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1478 		destroy_qp(ctrlr, i);
1479 	}
1480 
1481 	if (ctrlr->endpoint) {
1482 		ctrlr->endpoint->ctrlr = NULL;
1483 	}
1484 
1485 	free(ctrlr);
1486 	return 0;
1487 }
1488 
1489 static void
1490 map_dma(vfu_ctx_t *vfu_ctx, uint64_t iova, uint64_t len)
1491 {
1492 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1493 	struct nvmf_vfio_user_ctrlr *ctrlr;
1494 	struct nvmf_vfio_user_qpair *qpair;
1495 	int i, ret;
1496 
1497 	assert(endpoint != NULL);
1498 
1499 	if (endpoint->ctrlr == NULL) {
1500 		return;
1501 	}
1502 
1503 	ctrlr = endpoint->ctrlr;
1504 
1505 	SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %#lx-%#lx\n",
1506 		      ctrlr_id(ctrlr), iova, len);
1507 
1508 	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1509 		qpair = ctrlr->qp[i];
1510 		if (qpair == NULL) {
1511 			continue;
1512 		}
1513 
1514 		if (qpair->state != VFIO_USER_QPAIR_INACTIVE) {
1515 			continue;
1516 		}
1517 
1518 		if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
1519 			ret = map_admin_queue(ctrlr);
1520 			if (ret) {
1521 				continue;
1522 			}
1523 			qpair->state = VFIO_USER_QPAIR_ACTIVE;
1524 		} else {
1525 			struct nvme_q *sq = &qpair->sq;
1526 			struct nvme_q *cq = &qpair->cq;
1527 
1528 			sq->addr = map_one(ctrlr->endpoint->vfu_ctx, sq->prp1, sq->size * 64, &sq->sg, &sq->iov);
1529 			if (!sq->addr) {
1530 				SPDK_NOTICELOG("Failed to map SQID %d %#lx-%#lx, will try again in next poll\n",
1531 					       i, sq->prp1, sq->prp1 + sq->size * 64);
1532 				continue;
1533 			}
1534 			cq->addr = map_one(ctrlr->endpoint->vfu_ctx, cq->prp1, cq->size * 16, &cq->sg, &cq->iov);
1535 			if (!cq->addr) {
1536 				SPDK_NOTICELOG("Failed to map CQID %d %#lx-%#lx, will try again in next poll\n",
1537 					       i, cq->prp1, cq->prp1 + cq->size * 16);
1538 				continue;
1539 			}
1540 
1541 			qpair->state = VFIO_USER_QPAIR_ACTIVE;
1542 		}
1543 	}
1544 }
1545 
1546 static int
1547 unmap_dma(vfu_ctx_t *vfu_ctx, uint64_t iova, uint64_t len)
1548 {
1549 
1550 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1551 	struct nvmf_vfio_user_ctrlr *ctrlr;
1552 	int i;
1553 
1554 	assert(endpoint != NULL);
1555 
1556 	if (endpoint->ctrlr == NULL) {
1557 		return 0;
1558 	}
1559 
1560 	ctrlr = endpoint->ctrlr;
1561 
1562 	SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %#lx\n",
1563 		      ctrlr_id(ctrlr), iova);
1564 
1565 	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1566 		if (ctrlr->qp[i] == NULL) {
1567 			continue;
1568 		}
1569 		if (ctrlr->qp[i]->cq.sg.dma_addr == iova ||
1570 		    ctrlr->qp[i]->sq.sg.dma_addr == iova) {
1571 			unmap_qp(ctrlr->qp[i]);
1572 			ctrlr->qp[i]->state = VFIO_USER_QPAIR_INACTIVE;
1573 		}
1574 	}
1575 
1576 	return 0;
1577 }
1578 
1579 static void
1580 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport,
1581 			    struct nvmf_vfio_user_endpoint *endpoint)
1582 {
1583 	struct nvmf_vfio_user_ctrlr *ctrlr;
1584 	int err;
1585 
1586 	/* First, construct a vfio-user CUSTOM transport controller */
1587 	ctrlr = calloc(1, sizeof(*ctrlr));
1588 	if (ctrlr == NULL) {
1589 		err = -ENOMEM;
1590 		goto out;
1591 	}
1592 	ctrlr->cntlid = 0xffff;
1593 	ctrlr->transport = transport;
1594 	ctrlr->endpoint = endpoint;
1595 	ctrlr->doorbells = endpoint->doorbells;
1596 
1597 	/* Then, construct an admin queue pair */
1598 	err = init_qp(ctrlr, &transport->transport, NVMF_VFIO_USER_DEFAULT_AQ_DEPTH, 0);
1599 	if (err != 0) {
1600 		goto out;
1601 	}
1602 	endpoint->ctrlr = ctrlr;
1603 
1604 	/* Notify the generic layer about the new admin queue pair */
1605 	TAILQ_INSERT_TAIL(&ctrlr->transport->new_qps, ctrlr->qp[0], link);
1606 
1607 out:
1608 	if (err != 0) {
1609 		SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n",
1610 			    endpoint_id(endpoint), strerror(-err));
1611 		if (destroy_ctrlr(ctrlr) != 0) {
1612 			SPDK_ERRLOG("%s: failed to clean up\n",
1613 				    endpoint_id(endpoint));
1614 		}
1615 	}
1616 }
1617 
1618 static int
1619 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport,
1620 		      const struct spdk_nvme_transport_id *trid,
1621 		      struct spdk_nvmf_listen_opts *listen_opts)
1622 {
1623 	struct nvmf_vfio_user_transport *vu_transport;
1624 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
1625 	char *path = NULL;
1626 	char uuid[PATH_MAX] = {};
1627 	int fd;
1628 	int err;
1629 
1630 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
1631 					transport);
1632 
1633 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
1634 		/* Only compare traddr */
1635 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
1636 			return -EEXIST;
1637 		}
1638 	}
1639 
1640 	endpoint = calloc(1, sizeof(*endpoint));
1641 	if (!endpoint) {
1642 		return -ENOMEM;
1643 	}
1644 
1645 	endpoint->fd = -1;
1646 	memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));
1647 
1648 	err = asprintf(&path, "%s/bar0", endpoint_id(endpoint));
1649 	if (err == -1) {
1650 		goto out;
1651 	}
1652 
1653 	fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
1654 	if (fd == -1) {
1655 		SPDK_ERRLOG("%s: failed to open device memory at %s: %m\n",
1656 			    endpoint_id(endpoint), path);
1657 		err = fd;
1658 		free(path);
1659 		goto out;
1660 	}
1661 	free(path);
1662 
1663 	err = ftruncate(fd, NVMF_VFIO_USER_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE);
1664 	if (err != 0) {
1665 		goto out;
1666 	}
1667 
1668 	endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
1669 				   PROT_READ | PROT_WRITE, MAP_SHARED, fd, NVMF_VFIO_USER_DOORBELLS_OFFSET);
1670 	if (endpoint->doorbells == MAP_FAILED) {
1671 		endpoint->doorbells = NULL;
1672 		err = -errno;
1673 		goto out;
1674 	}
1675 
1676 	endpoint->fd = fd;
1677 
1678 	snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
1679 	SPDK_DEBUGLOG(nvmf_vfio, "%s: doorbells %p\n", uuid, endpoint->doorbells);
1680 
1681 	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
1682 					   endpoint, VFU_DEV_TYPE_PCI);
1683 	if (endpoint->vfu_ctx == NULL) {
1684 		SPDK_ERRLOG("%s: error creating libmuser context: %m\n",
1685 			    endpoint_id(endpoint));
1686 		err = -1;
1687 		goto out;
1688 	}
1689 	vfu_setup_log(endpoint->vfu_ctx, vfio_user_log,
1690 		      SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio") ? SPDK_LOG_DEBUG : SPDK_LOG_ERROR);
1691 
1692 	err = vfio_user_dev_info_fill(endpoint);
1693 	if (err < 0) {
1694 		goto out;
1695 	}
1696 
1697 	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
1698 
1699 out:
1700 	if (err != 0) {
1701 		nvmf_vfio_user_destroy_endpoint(endpoint);
1702 	}
1703 
1704 	return err;
1705 }
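
/*
 * Example of the artifacts created by a listen call (illustrative; the path
 * below is hypothetical): for trid.traddr = "/var/run/vfio-user/nvme0" this
 * function creates the file /var/run/vfio-user/nvme0/bar0, truncated to
 * 0x2000 bytes so the doorbell page can be mmapped by both sides, and the
 * vfio-user socket /var/run/vfio-user/nvme0/cntrl that the client (e.g. a
 * QEMU vfio-user device) attaches to via nvmf_vfio_user_accept().
 */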
1706 
1707 static void
1708 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
1709 			   const struct spdk_nvme_transport_id *trid)
1710 {
1711 	struct nvmf_vfio_user_transport *vu_transport;
1712 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
1713 	int err;
1714 
1715 	assert(trid != NULL);
1716 	assert(trid->traddr != NULL);
1717 
1718 	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
1719 
1720 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
1721 					transport);
1722 
1723 	pthread_mutex_lock(&vu_transport->lock);
1724 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
1725 		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
1726 			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
1727 			if (endpoint->ctrlr) {
1728 				err = destroy_ctrlr(endpoint->ctrlr);
1729 				if (err != 0) {
1730 					SPDK_ERRLOG("%s: failed destroy controller: %s\n",
1731 						    endpoint_id(endpoint), strerror(-err));
1732 				}
1733 			}
1734 			nvmf_vfio_user_destroy_endpoint(endpoint);
1735 			pthread_mutex_unlock(&vu_transport->lock);
1736 
1737 			return;
1738 		}
1739 	}
1740 	pthread_mutex_unlock(&vu_transport->lock);
1741 
1742 	SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr);
1743 }
1744 
1745 static int
1746 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport,
1747 				const struct spdk_nvmf_subsystem *subsystem,
1748 				const struct spdk_nvme_transport_id *trid)
1749 {
1750 	struct nvmf_vfio_user_transport *vu_transport;
1751 	struct nvmf_vfio_user_endpoint *endpoint;
1752 
1753 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);
1754 
1755 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
1756 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
1757 			break;
1758 		}
1759 	}
1760 
1761 	if (endpoint == NULL) {
1762 		return -ENOENT;
1763 	}
1764 
1765 	endpoint->subsystem = subsystem;
1766 
1767 	return 0;
1768 }
1769 
1770 /*
1771  * Executed periodically.
1772  *
1773  * XXX SPDK thread context.
1774  */
1775 static uint32_t
1776 nvmf_vfio_user_accept(struct spdk_nvmf_transport *transport)
1777 {
1778 	int err;
1779 	struct nvmf_vfio_user_transport *vu_transport;
1780 	struct nvmf_vfio_user_qpair *qp, *tmp_qp;
1781 	struct nvmf_vfio_user_endpoint *endpoint;
1782 
1783 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
1784 					transport);
1785 
1786 	pthread_mutex_lock(&vu_transport->lock);
1787 
1788 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
1789 		/* we need to try to attach the controller again after a reset or shutdown */
1790 		if (endpoint->ctrlr != NULL && endpoint->ctrlr->ready) {
1791 			continue;
1792 		}
1793 
1794 		err = vfu_attach_ctx(endpoint->vfu_ctx);
1795 		if (err == -1) {
1796 			if (errno == EAGAIN || errno == EWOULDBLOCK) {
1797 				continue;
1798 			}
1799 
1800 			pthread_mutex_unlock(&vu_transport->lock);
1801 			return -EFAULT;
1802 		}
1803 
1804 		/* Construct a controller */
1805 		nvmf_vfio_user_create_ctrlr(vu_transport, endpoint);
1806 	}
1807 
1808 	TAILQ_FOREACH_SAFE(qp, &vu_transport->new_qps, link, tmp_qp) {
1809 		TAILQ_REMOVE(&vu_transport->new_qps, qp, link);
1810 		spdk_nvmf_tgt_new_qpair(transport->tgt, &qp->qpair);
1811 	}
1812 
1813 	pthread_mutex_unlock(&vu_transport->lock);
1814 
1815 	return 0;
1816 }
1817 
1818 static void
1819 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport,
1820 			struct spdk_nvme_transport_id *trid,
1821 			struct spdk_nvmf_discovery_log_page_entry *entry)
1822 { }
1823 
1824 static struct spdk_nvmf_transport_poll_group *
1825 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport)
1826 {
1827 	struct nvmf_vfio_user_poll_group *vu_group;
1828 
1829 	SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n");
1830 
1831 	vu_group = calloc(1, sizeof(*vu_group));
1832 	if (vu_group == NULL) {
1833 		SPDK_ERRLOG("Error allocating poll group: %m");
1834 		return NULL;
1835 	}
1836 
1837 	TAILQ_INIT(&vu_group->qps);
1838 
1839 	return &vu_group->group;
1840 }
1841 
1842 /* called when process exits */
1843 static void
1844 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
1845 {
1846 	struct nvmf_vfio_user_poll_group *vu_group;
1847 
1848 	SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n");
1849 
1850 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
1851 
1852 	free(vu_group);
1853 }
1854 
1855 static int
1856 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
1857 {
1858 	struct nvmf_vfio_user_poll_group *vu_group;
1859 	struct nvmf_vfio_user_qpair *qpair = cb_arg;
1860 	struct nvmf_vfio_user_ctrlr *ctrlr;
1861 
1862 	assert(qpair != NULL);
1863 	assert(req != NULL);
1864 
1865 	vu_group = SPDK_CONTAINEROF(qpair->group, struct nvmf_vfio_user_poll_group, group);
1866 	TAILQ_INSERT_TAIL(&vu_group->qps, qpair, link);
1867 	qpair->state = VFIO_USER_QPAIR_ACTIVE;
1868 
1869 	ctrlr = qpair->ctrlr;
1870 	assert(ctrlr != NULL);
1871 
1872 	if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) {
1873 		SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct);
1874 		destroy_qp(ctrlr, qpair->qpair.qid);
1875 		destroy_ctrlr(ctrlr);
1876 		return -1;
1877 	}
1878 
1879 	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
1880 		ctrlr->cntlid = qpair->qpair.ctrlr->cntlid;
1881 		ctrlr->ready = true;
1882 	}
1883 
1884 	free(req->req.data);
1885 	req->req.data = NULL;
1886 
1887 	return 0;
1888 }
1889 
1890 /*
1891  * Called by spdk_nvmf_transport_poll_group_add.
1892  */
1893 static int
1894 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
1895 			      struct spdk_nvmf_qpair *qpair)
1896 {
1897 	struct nvmf_vfio_user_qpair *vu_qpair;
1898 	struct nvmf_vfio_user_req *vu_req;
1899 	struct nvmf_vfio_user_ctrlr *ctrlr;
1900 	struct spdk_nvmf_request *req;
1901 	struct spdk_nvmf_fabric_connect_data *data;
1902 	bool admin;
1903 
1904 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
1905 	vu_qpair->group = group;
1906 	ctrlr = vu_qpair->ctrlr;
1907 
1908 	SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n",
1909 		      ctrlr_id(ctrlr), vu_qpair->qpair.qid,
1910 		      vu_qpair, qpair, group);
1911 
1912 	admin = nvmf_qpair_is_admin_queue(&vu_qpair->qpair);
1913 
1914 	vu_req = get_nvmf_vfio_user_req(vu_qpair);
1915 	if (vu_req == NULL) {
1916 		return -1;
1917 	}
1918 
1919 	req = &vu_req->req;
1920 	req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
1921 	req->cmd->connect_cmd.cid = vu_req->cid;
1922 	req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
1923 	req->cmd->connect_cmd.recfmt = 0;
1924 	req->cmd->connect_cmd.sqsize = vu_qpair->qsize - 1;
1925 	req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;
1926 
1927 	req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
1928 	req->data = calloc(1, req->length);
1929 	if (req->data == NULL) {
1930 		nvmf_vfio_user_req_free(req);
1931 		return -ENOMEM;
1932 	}
1933 
1934 	data = (struct spdk_nvmf_fabric_connect_data *)req->data;
1935 	data->cntlid = admin ? 0xFFFF : ctrlr->cntlid;
1936 	snprintf(data->subnqn, sizeof(data->subnqn), "%s",
1937 		 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));
1938 
1939 	vu_req->cb_fn = handle_queue_connect_rsp;
1940 	vu_req->cb_arg = vu_qpair;
1941 
1942 	SPDK_DEBUGLOG(nvmf_vfio,
1943 		      "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n",
1944 		      ctrlr_id(ctrlr), qpair->qid, data->cntlid);
1945 
1946 	spdk_nvmf_request_exec_fabrics(req);
1947 	return 0;
1948 }
1949 
1950 static int
1951 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
1952 				 struct spdk_nvmf_qpair *qpair)
1953 {
1954 	struct nvmf_vfio_user_qpair *vu_qpair;
1955 	struct nvmf_vfio_user_poll_group *vu_group;
1956 
1957 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
1958 
1959 	SPDK_DEBUGLOG(nvmf_vfio,
1960 		      "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n",
1961 		      ctrlr_id(vu_qpair->ctrlr), qpair->qid, qpair, group);
1962 
1964 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
1965 
1966 	TAILQ_REMOVE(&vu_group->qps, vu_qpair, link);
1967 
1968 	return 0;
1969 }
1970 
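/* Return a request to its queue pair's free list without completing it. */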
1971 static int
1972 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req)
1973 {
1974 	struct nvmf_vfio_user_qpair *qpair;
1975 	struct nvmf_vfio_user_req *vfio_user_req;
1976 
1977 	assert(req != NULL);
1978 
1979 	vfio_user_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
1980 	qpair = SPDK_CONTAINEROF(vfio_user_req->req.qpair, struct nvmf_vfio_user_qpair, qpair);
1981 
1982 	TAILQ_INSERT_TAIL(&qpair->reqs, vfio_user_req, link);
1983 
1984 	return 0;
1985 }
1986 
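/*
 * Run the request's completion callback (failing the controller if it reports
 * an error) and then recycle the request back onto the free list.
 */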
1987 static int
1988 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req)
1989 {
1990 	struct nvmf_vfio_user_qpair *qpair;
1991 	struct nvmf_vfio_user_req *vfio_user_req;
1992 
1993 	assert(req != NULL);
1994 
1995 	vfio_user_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
1996 	qpair = SPDK_CONTAINEROF(vfio_user_req->req.qpair, struct nvmf_vfio_user_qpair, qpair);
1997 
1998 	if (vfio_user_req->cb_fn != NULL) {
1999 		if (vfio_user_req->cb_fn(vfio_user_req, vfio_user_req->cb_arg) != 0) {
2000 			fail_ctrlr(qpair->ctrlr);
2001 		}
2002 	}
2003 
2004 	TAILQ_INSERT_TAIL(&qpair->reqs, vfio_user_req, link);
2005 
2006 	return 0;
2007 }
2008 
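/*
 * Destroy the transport-level queue pair state and invoke the completion
 * callback, if one was provided.
 */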
2009 static void
2010 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
2011 			   spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
2012 {
2013 	struct nvmf_vfio_user_qpair *vu_qpair;
2014 
2015 	assert(qpair != NULL);
2016 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2017 	destroy_qp(vu_qpair->ctrlr, qpair->qid);
2018 
2019 	if (cb_fn) {
2020 		cb_fn(cb_arg);
2021 	}
2022 }
2023 
2024 /**
2025  * Returns a preallocated spdk_nvmf_request or NULL if there isn't one available.
2026  */
2027 static struct nvmf_vfio_user_req *
2028 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair)
2029 {
2030 	struct nvmf_vfio_user_req *req;
2031 
2032 	assert(qpair != NULL);
2033 
2034 	if (TAILQ_EMPTY(&qpair->reqs)) {
2035 		return NULL;
2036 	}
2037 
2038 	req = TAILQ_FIRST(&qpair->reqs);
2039 	TAILQ_REMOVE(&qpair->reqs, req, link);
2040 	memset(&req->cmd, 0, sizeof(req->cmd));
2041 	memset(&req->rsp, 0, sizeof(req->rsp));
2042 	req->iovcnt = 0;
2043 
2044 	return req;
2045 }
2046 
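/* Like get_nvmf_vfio_user_req(), but returns the embedded generic NVMf request. */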
2047 static struct spdk_nvmf_request *
2048 get_nvmf_req(struct nvmf_vfio_user_qpair *qpair)
2049 {
2050 	struct nvmf_vfio_user_req *req = get_nvmf_vfio_user_req(qpair);
2051 
2052 	if (req == NULL) {
2053 		return NULL;
2054 	}
2055 	return &req->req;
2056 }
2057 
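/*
 * Compute the data length of a read/write command: NLB (cdw12 bits 15:0) is
 * zero-based, so e.g. an NLB field of 7 on a 512-byte-block namespace means
 * (7 + 1) * 512 = 4096 bytes.
 */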
2058 static int
2059 get_nvmf_io_req_length(struct spdk_nvmf_request *req)
2060 {
2061 	uint16_t nlb;
2062 	uint32_t nsid;
2063 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
2064 	struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
2065 	struct spdk_nvmf_ns *ns;
2066 
2067 	nsid = cmd->nsid;
2068 	nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
2069 	ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
2070 	if (ns == NULL || ns->bdev == NULL) {
2071 		SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid);
2072 		return -EINVAL;
2073 	}
2074 
2075 	return nlb * spdk_bdev_get_block_size(ns->bdev);
2076 }
2077 
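/*
 * Prepare an admin command for execution: derive the transfer direction from
 * opcode bits 1:0 and, for Identify and Get Log Page (where only NUMDL is
 * considered), map the guest's PRPs into req->iov.
 */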
2078 static int
2079 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
2080 {
2081 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
2082 	uint32_t len = 0;
2083 	int iovcnt;
2084 
2085 	req->xfer = cmd->opc & 0x3;
2086 	req->length = 0;
2087 	req->data = NULL;
2088 
2089 	switch (cmd->opc) {
2090 	case SPDK_NVME_OPC_IDENTIFY:
2091 		len = 4096; /* TODO: there should be a define somewhere for this */
2092 		break;
2093 	case SPDK_NVME_OPC_GET_LOG_PAGE:
2094 		len = (cmd->cdw10_bits.get_log_page.numdl + 1) * 4;
2095 		break;
2096 	}
2097 
2098 	if (!cmd->dptr.prp.prp1 || !len) {
2099 		return 0;
2100 	}
2101 
2102 	iovcnt = vfio_user_map_prps(ctrlr, cmd, req->iov, len);
2103 	if (iovcnt < 0) {
2104 		SPDK_ERRLOG("%s: map Admin Opc %x failed\n",
2105 			    ctrlr_id(ctrlr), cmd->opc);
2106 		return -1;
2107 	}
2108 
2109 	req->length = len;
2110 	req->data = req->iov[0].iov_base;
2111 
2112 	return 0;
2113 }
2114 
2115 /*
2116  * Handles an I/O command: sets the transfer direction from the opcode and,
2117  * for commands that carry data, maps the guest's PRPs into req->iov.
2118  *
2119  * Returns 0 on success and -errno on failure.
2120  */
2121 static int
2122 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
2123 {
2124 	int err = 0;
2125 	bool remap = true;
2126 
2127 	assert(ctrlr != NULL);
2128 	assert(req != NULL);
2129 
2130 	switch (req->cmd->nvme_cmd.opc) {
2131 	case SPDK_NVME_OPC_FLUSH:
2132 		req->xfer = SPDK_NVME_DATA_NONE;
2133 		remap = false;
2134 		break;
2135 	case SPDK_NVME_OPC_READ:
2136 		req->xfer = SPDK_NVME_DATA_CONTROLLER_TO_HOST;
2137 		break;
2138 	case SPDK_NVME_OPC_WRITE:
2139 		req->xfer = SPDK_NVME_DATA_HOST_TO_CONTROLLER;
2140 		break;
2141 	default:
2142 		SPDK_ERRLOG("%s: SQ%d invalid I/O request type 0x%x\n",
2143 			    ctrlr_id(ctrlr), req->qpair->qid,
2144 			    req->cmd->nvme_cmd.opc);
2145 		return -EINVAL;
2146 	}
2147 
2148 	req->data = NULL;
2149 	if (remap) {
2150 		assert(req->cmd->nvme_cmd.psdt == 0);
2151 		err = get_nvmf_io_req_length(req);
2152 		if (err < 0) {
2153 			return -EINVAL;
2154 		}
2155 
2156 		req->length = err;
2157 		err = vfio_user_map_prps(ctrlr, &req->cmd->nvme_cmd, req->iov,
2158 					 req->length);
2159 		if (err < 0) {
2160 			SPDK_ERRLOG("%s: failed to map PRP: %d\n",
2161 				    ctrlr_id(ctrlr), err);
2162 			return -EFAULT;
2163 		}
2164 		req->iovcnt = err;
2165 	}
2166 
2167 	return 0;
2168 }
2169 
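/*
 * Map a command fetched from a submission queue onto a free NVMf request and
 * submit it; if mapping fails, complete it immediately with an internal
 * device error.
 */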
2170 static int
2171 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
2172 	       struct spdk_nvmf_request *req)
2173 {
2174 	int err;
2175 	struct nvmf_vfio_user_req *vfio_user_req;
2176 
2177 	assert(ctrlr != NULL);
2178 	assert(cmd != NULL);
2179 
2180 	/*
2181 	 * TODO: req == NULL means that there are no free requests available, and
2182 	 * returning -1 will fail the controller. Theoretically this error can
2183 	 * be avoided completely by ensuring we have as many requests as slots
2184 	 * in the SQ, plus one for the property request.
2185 	 */
2186 	if (spdk_unlikely(req == NULL)) {
2187 		return -1;
2188 	}
2189 
2190 	vfio_user_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
2191 	vfio_user_req->cb_fn = handle_cmd_rsp;
2192 	vfio_user_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
2193 	req->cmd->nvme_cmd = *cmd;
2194 	if (nvmf_qpair_is_admin_queue(req->qpair)) {
2195 		err = map_admin_cmd_req(ctrlr, req);
2196 	} else {
2197 		err = map_io_cmd_req(ctrlr, req);
2198 	}
2199 
2200 	if (spdk_unlikely(err < 0)) {
2201 		SPDK_ERRLOG("%s: map NVMe command opc 0x%x failed\n",
2202 			    ctrlr_id(ctrlr), cmd->opc);
2203 		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
2204 		req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2205 		return handle_cmd_rsp(vfio_user_req, vfio_user_req->cb_arg);
2206 	}
2207 
2208 	spdk_nvmf_request_exec(req);
2209 
2210 	return 0;
2211 }
2212 
2213 static int
2214 nvmf_vfio_user_ctrlr_poll(struct nvmf_vfio_user_ctrlr *ctrlr)
2215 {
2216 	if (ctrlr == NULL) {
2217 		return 0;
2218 	}
2219 
2220 	/* This will call access_bar0_fn() if there are any writes
2221 	 * to the portion of the BAR that is not mmap'd */
2222 	return vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
2223 }
2224 
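/*
 * Check the queue pair's submission queue tail doorbell and process any new
 * entries; a processing error fails the whole controller.
 */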
2225 static void
2226 nvmf_vfio_user_qpair_poll(struct nvmf_vfio_user_qpair *qpair)
2227 {
2228 	struct nvmf_vfio_user_ctrlr *ctrlr;
2229 	uint32_t new_tail;
2230 
2231 	assert(qpair != NULL);
2232 
2233 	ctrlr = qpair->ctrlr;
2234 
2235 	new_tail = *tdbl(ctrlr, &qpair->sq);
2236 	if (sq_head(qpair) != new_tail) {
2237 		int err = handle_sq_tdbl_write(ctrlr, new_tail, qpair);
2238 		if (err != 0) {
2239 			fail_ctrlr(ctrlr);
2240 			return;
2241 		}
2242 	}
2243 }
2244 
2245 /*
2246  * Called unconditionally, periodically, and very frequently from SPDK to ask
2247  * whether there's work to be done. This function consumes property requests
2248  * generated by read_bar0/write_bar0, which set ctrlr->prop_req.dir and then
2249  * wait synchronously for the result (read_bar0 always does, write_bar0 only
2250  * occasionally, though this may change). It also polls the SQ doorbells.
2251  */
2252 static int
2253 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
2254 {
2255 	struct nvmf_vfio_user_poll_group *vu_group;
2256 	struct nvmf_vfio_user_qpair *vu_qpair, *tmp;
2257 	struct nvmf_vfio_user_ctrlr *ctrlr;
2258 
2259 	assert(group != NULL);
2260 
2261 	spdk_rmb();
2262 
2263 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
2264 
2265 	TAILQ_FOREACH_SAFE(vu_qpair, &vu_group->qps, link, tmp) {
2266 		ctrlr = vu_qpair->ctrlr;
2267 		if (!ctrlr->ready) {
2268 			continue;
2269 		}
2270 
2271 		if (nvmf_qpair_is_admin_queue(&vu_qpair->qpair)) {
2272 			int err;
2273 
2274 			err = nvmf_vfio_user_ctrlr_poll(ctrlr);
2275 			if (spdk_unlikely(err != 0)) {
2276 				if (err == -ENOTCONN) {
2277 					TAILQ_REMOVE(&vu_group->qps, vu_qpair, link);
2278 					ctrlr->ready = false;
2279 					continue;
2280 				}
2281 
2282 				fail_ctrlr(ctrlr);
2283 				return -1;
2284 			}
2285 		}
2286 
2287 		if (vu_qpair->state != VFIO_USER_QPAIR_ACTIVE || !vu_qpair->sq.size) {
2288 			continue;
2289 		}
2290 
2291 		nvmf_vfio_user_qpair_poll(vu_qpair);
2292 	}
2293 
2294 	return 0;
2295 }
2296 
2297 static int
2298 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
2299 				    struct spdk_nvme_transport_id *trid)
2300 {
2301 	struct nvmf_vfio_user_qpair *vu_qpair;
2302 	struct nvmf_vfio_user_ctrlr *ctrlr;
2303 
2304 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2305 	ctrlr = vu_qpair->ctrlr;
2306 
2307 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
2308 	return 0;
2309 }
2310 
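/* There is no peer transport ID to report for vfio-user, so this is currently a no-op. */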
2311 static int
2312 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
2313 				   struct spdk_nvme_transport_id *trid)
2314 {
2315 	return 0;
2316 }
2317 
2318 static int
2319 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
2320 				     struct spdk_nvme_transport_id *trid)
2321 {
2322 	struct nvmf_vfio_user_qpair *vu_qpair;
2323 	struct nvmf_vfio_user_ctrlr *ctrlr;
2324 
2325 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2326 	ctrlr = vu_qpair->ctrlr;
2327 
2328 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
2329 	return 0;
2330 }
2331 
2332 static void
2333 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
2334 {
2335 	opts->max_queue_depth =		NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
2336 	opts->max_qpairs_per_ctrlr =	NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
2337 	opts->in_capsule_data_size =	NVMF_VFIO_USER_DEFAULT_IN_CAPSULE_DATA_SIZE;
2338 	opts->max_io_size =		NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
2339 	opts->io_unit_size =		NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
2340 	opts->max_aq_depth =		NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
2341 	opts->num_shared_buffers =	NVMF_VFIO_USER_DEFAULT_NUM_SHARED_BUFFERS;
2342 	opts->buf_cache_size =		NVMF_VFIO_USER_DEFAULT_BUFFER_CACHE_SIZE;
2343 }
2344 
2345 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
2346 	.name = "VFIOUSER",
2347 	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
2348 	.opts_init = nvmf_vfio_user_opts_init,
2349 	.create = nvmf_vfio_user_create,
2350 	.destroy = nvmf_vfio_user_destroy,
2351 
2352 	.listen = nvmf_vfio_user_listen,
2353 	.stop_listen = nvmf_vfio_user_stop_listen,
2354 	.accept = nvmf_vfio_user_accept,
2355 	.listen_associate = nvmf_vfio_user_listen_associate,
2356 
2357 	.listener_discover = nvmf_vfio_user_discover,
2358 
2359 	.poll_group_create = nvmf_vfio_user_poll_group_create,
2360 	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
2361 	.poll_group_add = nvmf_vfio_user_poll_group_add,
2362 	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
2363 	.poll_group_poll = nvmf_vfio_user_poll_group_poll,
2364 
2365 	.req_free = nvmf_vfio_user_req_free,
2366 	.req_complete = nvmf_vfio_user_req_complete,
2367 
2368 	.qpair_fini = nvmf_vfio_user_close_qpair,
2369 	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
2370 	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
2371 	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
2372 };
2373 
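/*
 * Register the transport with the NVMf library (the "muser" token below is
 * only the registration identifier; users select the transport by its .name,
 * "VFIOUSER"). As a rough, illustrative sketch only (RPC names and arguments
 * may differ between SPDK versions and setups), a target using this transport
 * could be configured along these lines:
 *
 *   scripts/rpc.py nvmf_create_transport -t VFIOUSER
 *   scripts/rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a
 *   scripts/rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 \
 *       -t VFIOUSER -a /var/run/vfio-user
 */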
2374 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
2375 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
2376