xref: /spdk/lib/nvmf/vfio_user.c (revision 6cebe9d06b14ad173e45d2b9be49b04f64b5fba3)
1 /*-
2  *   BSD LICENSE
3  *   Copyright (c) Intel Corporation. All rights reserved.
4  *   Copyright (c) 2019, Nutanix Inc. All rights reserved.
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *     * Redistributions of source code must retain the above copyright
11  *       notice, this list of conditions and the following disclaimer.
12  *     * Redistributions in binary form must reproduce the above copyright
13  *       notice, this list of conditions and the following disclaimer in
14  *       the documentation and/or other materials provided with the
15  *       distribution.
16  *     * Neither the name of Intel Corporation nor the names of its
17  *       contributors may be used to endorse or promote products derived
18  *       from this software without specific prior written permission.
19  *
20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * NVMe over vfio-user transport
35  */
36 
37 #include <vfio-user/libvfio-user.h>
38 #include <vfio-user/pci_defs.h>
39 
40 #include "spdk/barrier.h"
41 #include "spdk/stdinc.h"
42 #include "spdk/assert.h"
43 #include "spdk/thread.h"
44 #include "spdk/nvmf_transport.h"
45 #include "spdk/sock.h"
46 #include "spdk/string.h"
47 #include "spdk/util.h"
48 #include "spdk/log.h"
49 
50 #include "transport.h"
51 
52 #include "nvmf_internal.h"
53 
54 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
55 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
56 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR 64
57 #define NVMF_VFIO_USER_DEFAULT_IN_CAPSULE_DATA_SIZE 0
58 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
59 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE
60 #define NVMF_VFIO_USER_DEFAULT_NUM_SHARED_BUFFERS 512 /* internal buf size */
61 #define NVMF_VFIO_USER_DEFAULT_BUFFER_CACHE_SIZE 0
62 
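/*
 * BAR0 layout: the NVMe register set occupies the start of the BAR, while the
 * doorbells live in the 4KB page at offset 0x1000; that page is also exposed
 * to the client as a sparse mmap region backed by the per-endpoint bar0 file.
 */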
63 #define NVMF_VFIO_USER_DOORBELLS_OFFSET	0x1000
64 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000
65 
66 #define NVME_REG_CFG_SIZE       0x1000
67 #define NVME_REG_BAR0_SIZE      0x4000
68 #define NVME_IRQ_INTX_NUM       1
69 #define NVME_IRQ_MSIX_NUM	NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR
70 
71 struct nvmf_vfio_user_req;
72 struct nvmf_vfio_user_qpair;
73 
74 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);
75 
76 /* 1 more for PRP2 list itself */
77 #define NVMF_VFIO_USER_MAX_IOVECS	(NVMF_REQ_MAX_BUFFERS + 1)
78 
79 enum nvmf_vfio_user_req_state {
80 	VFIO_USER_REQUEST_STATE_FREE = 0,
81 	VFIO_USER_REQUEST_STATE_EXECUTING,
82 };
83 
84 struct nvmf_vfio_user_req  {
85 	struct spdk_nvmf_request		req;
86 	struct spdk_nvme_cpl			rsp;
87 	struct spdk_nvme_cmd			cmd;
88 
89 	enum nvmf_vfio_user_req_state		state;
90 	nvmf_vfio_user_req_cb_fn		cb_fn;
91 	void					*cb_arg;
92 
93 	/* placeholder for the gpa_to_vva memory map table; the I/O buffer doesn't use it */
94 	dma_sg_t				sg[NVMF_VFIO_USER_MAX_IOVECS];
95 	struct iovec				iov[NVMF_VFIO_USER_MAX_IOVECS];
96 	uint8_t					iovcnt;
97 
98 	TAILQ_ENTRY(nvmf_vfio_user_req)		link;
99 };
100 
101 /*
102  * A NVMe queue.
103  */
104 struct nvme_q {
105 	bool is_cq;
106 
107 	void *addr;
108 
109 	dma_sg_t sg;
110 	struct iovec iov;
111 
112 	uint32_t size;
113 	uint64_t prp1;
114 
115 	union {
116 		struct {
117 			uint32_t head;
118 			/* multiple SQs can be mapped to the same CQ */
119 			uint16_t cqid;
120 		};
121 		struct {
122 			uint32_t tail;
123 			uint16_t iv;
124 			bool ien;
125 		};
126 	};
127 };
128 
129 enum nvmf_vfio_user_qpair_state {
130 	VFIO_USER_QPAIR_UNINITIALIZED = 0,
131 	VFIO_USER_QPAIR_ACTIVE,
132 	VFIO_USER_QPAIR_DELETED,
133 	VFIO_USER_QPAIR_INACTIVE,
134 	VFIO_USER_QPAIR_ERROR,
135 };
136 
137 struct nvmf_vfio_user_qpair {
138 	struct spdk_nvmf_qpair			qpair;
139 	struct spdk_nvmf_transport_poll_group	*group;
140 	struct nvmf_vfio_user_ctrlr		*ctrlr;
141 	struct nvmf_vfio_user_req		*reqs_internal;
142 	uint16_t				qsize;
143 	struct nvme_q				cq;
144 	struct nvme_q				sq;
145 	enum nvmf_vfio_user_qpair_state		state;
146 
147 	TAILQ_HEAD(, nvmf_vfio_user_req)	reqs;
148 	TAILQ_ENTRY(nvmf_vfio_user_qpair)	link;
149 };
150 
151 struct nvmf_vfio_user_poll_group {
152 	struct spdk_nvmf_transport_poll_group	group;
153 	TAILQ_HEAD(, nvmf_vfio_user_qpair)	qps;
154 };
155 
156 struct nvmf_vfio_user_ctrlr {
157 	struct nvmf_vfio_user_endpoint		*endpoint;
158 	struct nvmf_vfio_user_transport		*transport;
159 
160 	/* True when the socket connection is active */
161 	bool					ready;
162 	/* Number of connected queue pairs */
163 	uint32_t				num_connected_qps;
164 
165 	struct spdk_thread			*thread;
166 	struct spdk_poller			*mmio_poller;
167 
168 	uint16_t				cntlid;
169 
170 	struct nvmf_vfio_user_qpair		*qp[NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR];
171 
172 	TAILQ_ENTRY(nvmf_vfio_user_ctrlr)	link;
173 
174 	volatile uint32_t			*doorbells;
175 
176 	/* internal CSTS.CFS register for vfio-user fatal errors */
177 	uint32_t				cfs : 1;
178 };
179 
180 struct nvmf_vfio_user_endpoint {
181 	vfu_ctx_t				*vfu_ctx;
182 	struct msixcap				*msix;
183 	vfu_pci_config_space_t			*pci_config_space;
184 	int					fd;
185 	volatile uint32_t			*doorbells;
186 
187 	struct spdk_nvme_transport_id		trid;
188 	const struct spdk_nvmf_subsystem	*subsystem;
189 
190 	struct nvmf_vfio_user_ctrlr		*ctrlr;
191 	pthread_mutex_t				lock;
192 
193 	TAILQ_ENTRY(nvmf_vfio_user_endpoint)	link;
194 };
195 
196 struct nvmf_vfio_user_transport {
197 	struct spdk_nvmf_transport		transport;
198 	pthread_mutex_t				lock;
199 	TAILQ_HEAD(, nvmf_vfio_user_endpoint)	endpoints;
200 
201 	TAILQ_HEAD(, nvmf_vfio_user_qpair)	new_qps;
202 };
203 
204 /*
205  * function prototypes
206  */
207 static volatile uint32_t *
208 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);
209 
210 static volatile uint32_t *
211 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);
212 
213 static int
214 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);
215 
216 static struct nvmf_vfio_user_req *
217 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair);
218 
219 static int
220 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
221 		struct nvme_q *cq, uint32_t cdw0, uint16_t sc,
222 		uint16_t sct);
223 
224 static char *
225 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint)
226 {
227 	return endpoint->trid.traddr;
228 }
229 
230 static char *
231 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr)
232 {
233 	if (!ctrlr || !ctrlr->endpoint) {
234 		return "Null Ctrlr";
235 	}
236 
237 	return endpoint_id(ctrlr->endpoint);
238 }
239 
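/*
 * Returns the queue pair ID that owns the given submission or completion
 * queue, recovered via SPDK_CONTAINEROF from the embedding nvmf_vfio_user_qpair.
 */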
240 static uint16_t
241 io_q_id(struct nvme_q *q)
242 {
243 
244 	struct nvmf_vfio_user_qpair *vfio_user_qpair;
245 
246 	assert(q);
247 
248 	if (q->is_cq) {
249 		vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, cq);
250 	} else {
251 		vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, sq);
252 	}
253 	assert(vfio_user_qpair);
254 	return vfio_user_qpair->qpair.qid;
255 }
256 
257 static void
258 fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
259 {
260 	assert(ctrlr != NULL);
261 
262 	if (ctrlr->cfs == 0) {
263 		SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(ctrlr));
264 	}
265 
266 	ctrlr->ready = false;
267 	ctrlr->cfs = 1U;
268 }
269 
270 static bool
271 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *ctrlr)
272 {
273 	assert(ctrlr != NULL);
274 	assert(ctrlr->endpoint != NULL);
275 
276 	vfu_pci_config_space_t *pci = ctrlr->endpoint->pci_config_space;
277 
278 	return (!pci->hdr.cmd.id || ctrlr->endpoint->msix->mxc.mxe);
279 }
280 
281 static void
282 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint)
283 {
284 	if (endpoint->doorbells) {
285 		munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
286 	}
287 
288 	if (endpoint->fd > 0) {
289 		close(endpoint->fd);
290 	}
291 
292 	vfu_destroy_ctx(endpoint->vfu_ctx);
293 
294 	pthread_mutex_destroy(&endpoint->lock);
295 	free(endpoint);
296 }
297 
298 /* called when process exits */
299 static int
300 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport,
301 		       spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
302 {
303 	struct nvmf_vfio_user_transport *vu_transport;
304 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
305 
306 	SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n");
307 
308 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
309 					transport);
310 
311 	(void)pthread_mutex_destroy(&vu_transport->lock);
312 
313 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
314 		TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
315 		nvmf_vfio_user_destroy_endpoint(endpoint);
316 	}
317 
318 	free(vu_transport);
319 
320 	if (cb_fn) {
321 		cb_fn(cb_arg);
322 	}
323 
324 	return 0;
325 }
326 
327 static struct spdk_nvmf_transport *
328 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts)
329 {
330 	struct nvmf_vfio_user_transport *vu_transport;
331 	int err;
332 
333 	vu_transport = calloc(1, sizeof(*vu_transport));
334 	if (vu_transport == NULL) {
335 		SPDK_ERRLOG("Transport alloc fail: %m\n");
336 		return NULL;
337 	}
338 
339 	err = pthread_mutex_init(&vu_transport->lock, NULL);
340 	if (err != 0) {
341 		SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
342 		goto err;
343 	}
344 
345 	TAILQ_INIT(&vu_transport->endpoints);
346 	TAILQ_INIT(&vu_transport->new_qps);
347 
348 	return &vu_transport->transport;
349 
350 err:
351 	free(vu_transport);
352 
353 	return NULL;
354 }
355 
356 static uint16_t
357 max_queue_size(struct nvmf_vfio_user_ctrlr const *ctrlr)
358 {
359 	assert(ctrlr != NULL);
360 	assert(ctrlr->qp[0] != NULL);
361 	assert(ctrlr->qp[0]->qpair.ctrlr != NULL);
362 
363 	return ctrlr->qp[0]->qpair.ctrlr->vcprop.cap.bits.mqes + 1;
364 }
365 
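/*
 * Maps a single guest-physical address range into the server's address space:
 * the address is first translated to a scatter-gather entry and then mapped,
 * returning the resulting host virtual address or NULL on failure.
 */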
366 static void *
367 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov)
368 {
369 	int ret;
370 
371 	assert(ctx != NULL);
372 	assert(sg != NULL);
373 	assert(iov != NULL);
374 
375 	ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, PROT_READ | PROT_WRITE);
376 	if (ret != 1) {
377 		return NULL;
378 	}
379 
380 	ret = vfu_map_sg(ctx, sg, iov, 1);
381 	if (ret != 0) {
382 		return NULL;
383 	}
384 
385 	assert(iov->iov_base != NULL);
386 	return iov->iov_base;
387 }
388 
389 static uint32_t
390 sq_head(struct nvmf_vfio_user_qpair *qpair)
391 {
392 	assert(qpair != NULL);
393 	return qpair->sq.head;
394 }
395 
396 static void
397 sqhd_advance(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair)
398 {
399 	assert(ctrlr != NULL);
400 	assert(qpair != NULL);
401 	qpair->sq.head = (qpair->sq.head + 1) % qpair->sq.size;
402 }
403 
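/*
 * Copies the freshly initialized queue into the qpair identified by @id and
 * zeroes the corresponding doorbell (SQ tail or CQ head).
 */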
404 static void
405 insert_queue(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q,
406 	     const bool is_cq, const uint16_t id)
407 {
408 	struct nvme_q *_q;
409 	struct nvmf_vfio_user_qpair *qpair;
410 
411 	assert(ctrlr != NULL);
412 	assert(q != NULL);
413 
414 	qpair = ctrlr->qp[id];
415 
416 	q->is_cq = is_cq;
417 	if (is_cq) {
418 		_q = &qpair->cq;
419 		*_q = *q;
420 		*hdbl(ctrlr, _q) = 0;
421 	} else {
422 		_q = &qpair->sq;
423 		*_q = *q;
424 		*tdbl(ctrlr, _q) = 0;
425 	}
426 }
427 
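/* Maps the admin submission queue described by the AQA/ASQ registers. */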
428 static int
429 asq_map(struct nvmf_vfio_user_ctrlr *ctrlr)
430 {
431 	struct nvme_q q = {};
432 	const struct spdk_nvmf_registers *regs;
433 
434 	assert(ctrlr != NULL);
435 	assert(ctrlr->qp[0] != NULL);
436 	assert(ctrlr->qp[0]->sq.addr == NULL);
437 	/* XXX ctrlr->asq == 0 is a valid memory address */
438 
439 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
440 	q.size = regs->aqa.bits.asqs + 1;
441 	q.head = ctrlr->doorbells[0] = 0;
442 	q.cqid = 0;
443 	q.addr = map_one(ctrlr->endpoint->vfu_ctx, regs->asq,
444 			 q.size * sizeof(struct spdk_nvme_cmd), &q.sg, &q.iov);
445 	if (q.addr == NULL) {
446 		return -1;
447 	}
448 	memset(q.addr, 0, q.size * sizeof(struct spdk_nvme_cmd));
449 	insert_queue(ctrlr, &q, false, 0);
450 
451 	return 0;
452 }
453 
454 static uint16_t
455 cq_next(struct nvme_q *q)
456 {
457 	assert(q != NULL);
458 	assert(q->is_cq);
459 	return (q->tail + 1) % q->size;
460 }
461 
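/*
 * Index into the doorbell array for the given queue. With DSTRD fixed to 0,
 * SQ y's tail doorbell lives at doorbells[2 * y] and CQ y's head doorbell at
 * doorbells[2 * y + 1]; for example, I/O queue pair 3 uses doorbells[6] and
 * doorbells[7].
 */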
462 static int
463 queue_index(uint16_t qid, int is_cq)
464 {
465 	return (qid * 2) + is_cq;
466 }
467 
468 static volatile uint32_t *
469 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
470 {
471 	assert(ctrlr != NULL);
472 	assert(q != NULL);
473 	assert(!q->is_cq);
474 
475 	return &ctrlr->doorbells[queue_index(io_q_id(q), false)];
476 }
477 
478 static volatile uint32_t *
479 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
480 {
481 	assert(ctrlr != NULL);
482 	assert(q != NULL);
483 	assert(q->is_cq);
484 
485 	return &ctrlr->doorbells[queue_index(io_q_id(q), true)];
486 }
487 
488 static bool
489 cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
490 {
491 	assert(ctrlr != NULL);
492 	assert(q != NULL);
493 	return cq_next(q) == *hdbl(ctrlr, q);
494 }
495 
496 static void
497 cq_tail_advance(struct nvme_q *q)
498 {
499 	assert(q != NULL);
500 	q->tail = cq_next(q);
501 }
502 
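/* Maps the admin completion queue described by the AQA/ACQ registers. */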
503 static int
504 acq_map(struct nvmf_vfio_user_ctrlr *ctrlr)
505 {
506 	struct nvme_q q = {};
507 	const struct spdk_nvmf_registers *regs;
508 
509 	assert(ctrlr != NULL);
510 	assert(ctrlr->qp[0] != NULL);
511 	assert(ctrlr->qp[0]->cq.addr == NULL);
512 
513 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
514 	assert(regs != NULL);
515 
516 	q.size = regs->aqa.bits.acqs + 1;
517 	q.tail = 0;
518 	q.addr = map_one(ctrlr->endpoint->vfu_ctx, regs->acq,
519 			 q.size * sizeof(struct spdk_nvme_cpl), &q.sg, &q.iov);
520 	if (q.addr == NULL) {
521 		return -1;
522 	}
523 	memset(q.addr, 0, q.size * sizeof(struct spdk_nvme_cpl));
524 	q.is_cq = true;
525 	q.ien = true;
526 	insert_queue(ctrlr, &q, true, 0);
527 
528 	return 0;
529 }
530 
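/*
 * Callback passed to spdk_nvme_map_cmd(): maps one PRP entry of the request
 * and records the resulting iovec so it can be unmapped when the request
 * completes.
 */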
531 static void *
532 _map_one(void *prv, uint64_t addr, uint64_t len)
533 {
534 	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
535 	struct spdk_nvmf_qpair *qpair;
536 	struct nvmf_vfio_user_req *vu_req;
537 	struct nvmf_vfio_user_qpair *vu_qpair;
538 	void *ret;
539 
540 	assert(req != NULL);
541 	qpair = req->qpair;
542 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
543 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
544 
545 	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
546 	ret = map_one(vu_qpair->ctrlr->endpoint->vfu_ctx, addr, len,
547 		      &vu_req->sg[vu_req->iovcnt],
548 		      &vu_req->iov[vu_req->iovcnt]);
549 	if (spdk_likely(ret != NULL)) {
550 		vu_req->iovcnt++;
551 	}
552 	return ret;
553 }
554 
555 static int
556 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
557 		  struct iovec *iov, uint32_t length)
558 {
559 	/* Map the PRP list from guest physical memory to
560 	 * host virtual memory addresses.
561 	 */
562 	return spdk_nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
563 				 length, 4096, _map_one);
564 }
565 
566 static struct spdk_nvmf_request *
567 get_nvmf_req(struct nvmf_vfio_user_qpair *qp);
568 
569 static int
570 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
571 	       struct spdk_nvmf_request *req);
572 
573 /*
574  * Posts a CQE in the completion queue.
575  *
576  * @ctrlr: the vfio-user controller
577  * @cmd: the NVMe command for which the completion is posted
578  * @cq: the completion queue
579  * @cdw0: cdw0 as reported by NVMf (only for SPDK_NVME_OPC_GET/SET_FEATURES)
580  * @sc: the NVMe CQE status code
581  * @sct: the NVMe CQE status code type
582  */
583 static int
584 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
585 		struct nvme_q *cq, uint32_t cdw0, uint16_t sc,
586 		uint16_t sct)
587 {
588 	struct spdk_nvme_cpl *cpl;
589 	uint16_t qid;
590 	int err;
591 
592 	assert(ctrlr != NULL);
593 	assert(cmd != NULL);
594 
595 	qid = io_q_id(cq);
596 
597 	if (ctrlr->qp[0]->qpair.ctrlr->vcprop.csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
598 		SPDK_DEBUGLOG(nvmf_vfio,
599 			      "%s: ignore completion SQ%d cid=%d status=%#x\n",
600 			      ctrlr_id(ctrlr), qid, cmd->cid, sc);
601 		return 0;
602 	}
603 
604 	if (cq_is_full(ctrlr, cq)) {
605 		SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
606 			    ctrlr_id(ctrlr), qid, cq->tail, *hdbl(ctrlr, cq));
607 		return -1;
608 	}
609 
610 	cpl = ((struct spdk_nvme_cpl *)cq->addr) + cq->tail;
611 
612 	SPDK_DEBUGLOG(nvmf_vfio,
613 		      "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
614 		      ctrlr_id(ctrlr), qid, cmd->cid, sc, ctrlr->qp[qid]->sq.head,
615 		      cq->tail);
616 
617 	if (qid == 0) {
618 		switch (cmd->opc) {
619 		case SPDK_NVME_OPC_SET_FEATURES:
620 		case SPDK_NVME_OPC_GET_FEATURES:
621 			cpl->cdw0 = cdw0;
622 			break;
623 		}
624 	}
625 
626 
627 	assert(ctrlr->qp[qid] != NULL);
628 
629 	cpl->sqhd = ctrlr->qp[qid]->sq.head;
630 	cpl->cid = cmd->cid;
631 	cpl->status.dnr = 0x0;
632 	cpl->status.m = 0x0;
633 	cpl->status.sct = sct;
634 	cpl->status.p = ~cpl->status.p;
635 	cpl->status.sc = sc;
636 
637 	cq_tail_advance(cq);
638 
639 	/*
640 	 * This function now executes in SPDK thread context, but we
641 	 * might be triggering interrupts from vfio-user thread context, so
642 	 * check for race conditions.
643 	 */
644 	if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) {
645 		err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
646 		if (err != 0) {
647 			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
648 				    ctrlr_id(ctrlr));
649 			return err;
650 		}
651 	}
652 
653 	return 0;
654 }
655 
656 static struct nvme_q *
657 lookup_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, const uint16_t qid, const bool is_cq)
658 {
659 	struct nvme_q *q;
660 
661 	assert(ctrlr != NULL);
662 
663 	if (qid > NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
664 		return NULL;
665 	}
666 
667 	if (ctrlr->qp[qid] == NULL) {
668 		return NULL;
669 	}
670 
671 	if (is_cq) {
672 		q = &ctrlr->qp[qid]->cq;
673 	} else {
674 		q = &ctrlr->qp[qid]->sq;
675 	}
676 
677 	if (q->addr == NULL) {
678 		return NULL;
679 	}
680 
681 	return q;
682 }
683 
684 static void
685 unmap_qp(struct nvmf_vfio_user_qpair *qp)
686 {
687 	struct nvmf_vfio_user_ctrlr *ctrlr;
688 
689 	if (qp->ctrlr == NULL) {
690 		return;
691 	}
692 	ctrlr = qp->ctrlr;
693 
694 	SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy I/O QP%d\n",
695 		      ctrlr_id(ctrlr), qp->qpair.qid);
696 
697 	if (qp->sq.addr != NULL) {
698 		vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, &qp->sq.sg, &qp->sq.iov, 1);
699 		qp->sq.addr = NULL;
700 	}
701 
702 	if (qp->cq.addr != NULL) {
703 		vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, &qp->cq.sg, &qp->cq.iov, 1);
704 		qp->cq.addr = NULL;
705 	}
706 }
707 
708 /*
709  * TODO we can immediately remove the QP from the list because this function
710  * is now executed by the SPDK thread.
711  */
712 static void
713 destroy_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid)
714 {
715 	struct nvmf_vfio_user_qpair *qpair;
716 
717 	if (ctrlr == NULL) {
718 		return;
719 	}
720 
721 	qpair = ctrlr->qp[qid];
722 	if (qpair == NULL) {
723 		return;
724 	}
725 
726 	SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy QP%d=%p\n", ctrlr_id(ctrlr),
727 		      qid, qpair);
728 
729 	unmap_qp(qpair);
730 	free(qpair->reqs_internal);
731 	free(qpair);
732 	ctrlr->qp[qid] = NULL;
733 }
734 
735 /* This function can only fail because of memory allocation errors. */
736 static int
737 init_qp(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport,
738 	const uint16_t qsize, const uint16_t id)
739 {
740 	int err = 0, i;
741 	struct nvmf_vfio_user_qpair *qpair;
742 	struct nvmf_vfio_user_req *vu_req;
743 	struct spdk_nvmf_request *req;
744 
745 	assert(ctrlr != NULL);
746 	assert(transport != NULL);
747 
748 	qpair = calloc(1, sizeof(*qpair));
749 	if (qpair == NULL) {
750 		return -ENOMEM;
751 	}
752 
753 	qpair->qpair.qid = id;
754 	qpair->qpair.transport = transport;
755 	qpair->ctrlr = ctrlr;
756 	qpair->qsize = qsize;
757 
758 	TAILQ_INIT(&qpair->reqs);
759 
760 	qpair->reqs_internal = calloc(qsize, sizeof(struct nvmf_vfio_user_req));
761 	if (qpair->reqs_internal == NULL) {
762 		SPDK_ERRLOG("%s: error allocating reqs: %m\n", ctrlr_id(ctrlr));
763 		err = -ENOMEM;
764 		goto out;
765 	}
766 
767 	for (i = 0; i < qsize; i++) {
768 		vu_req = &qpair->reqs_internal[i];
769 		req = &vu_req->req;
770 
771 		req->qpair = &qpair->qpair;
772 		req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
773 		req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;
774 
775 		TAILQ_INSERT_TAIL(&qpair->reqs, vu_req, link);
776 	}
777 	ctrlr->qp[id] = qpair;
778 out:
779 	if (err != 0) {
780 		free(qpair);
781 	}
782 	return err;
783 }
784 
785 /*
786  * Creates a completion or submission I/O queue. Returns 0 on success, -errno
787  * on error.
788  *
789  * XXX SPDK thread context.
790  */
791 static int
792 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
793 		   struct spdk_nvme_cmd *cmd, const bool is_cq)
794 {
795 	size_t entry_size;
796 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
797 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
798 	int err = 0;
799 	struct nvme_q io_q = {};
800 
801 	assert(ctrlr != NULL);
802 	assert(cmd != NULL);
803 
804 	SPDK_DEBUGLOG(nvmf_vfio,
805 		      "%s: create I/O %cQ%d: QSIZE=%#x\n", ctrlr_id(ctrlr),
806 		      is_cq ? 'C' : 'S', cmd->cdw10_bits.create_io_q.qid,
807 		      cmd->cdw10_bits.create_io_q.qsize);
808 
809 	if (cmd->cdw10_bits.create_io_q.qid >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
810 		SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr),
811 			    cmd->cdw10_bits.create_io_q.qid,
812 			    NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR);
813 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
814 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
815 		goto out;
816 	}
817 
818 	if (lookup_io_q(ctrlr, cmd->cdw10_bits.create_io_q.qid, is_cq)) {
819 		SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr),
820 			    is_cq ? 'C' : 'S', cmd->cdw10_bits.create_io_q.qid);
821 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
822 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
823 		goto out;
824 	}
825 
826 	/* TODO break rest of this function into smaller functions */
827 	if (is_cq) {
828 		entry_size = sizeof(struct spdk_nvme_cpl);
829 		if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
830 			/*
831 			 * TODO CAP.CMBS is currently set to zero, however we
832 			 * should zero it out explicitly when CAP is read.
833 			 * Support for CAP.CMBS is not mentioned in the NVMf
834 			 * spec.
835 			 */
836 			SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr));
837 			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
838 			goto out;
839 		}
840 		io_q.ien = cmd->cdw11_bits.create_io_cq.ien;
841 		io_q.iv = cmd->cdw11_bits.create_io_cq.iv;
842 	} else {
843 		/* CQ must be created before SQ */
844 		if (!lookup_io_q(ctrlr, cmd->cdw11_bits.create_io_sq.cqid, true)) {
845 			SPDK_ERRLOG("%s: CQ%d does not exist\n", ctrlr_id(ctrlr),
846 				    cmd->cdw11_bits.create_io_sq.cqid);
847 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
848 			sc = SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
849 			goto out;
850 		}
851 
852 		entry_size = sizeof(struct spdk_nvme_cmd);
853 		if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
854 			SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
855 			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
856 			goto out;
857 		}
858 
859 		io_q.cqid = cmd->cdw11_bits.create_io_sq.cqid;
860 		SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
861 			      cmd->cdw10_bits.create_io_q.qid, io_q.cqid);
862 	}
863 
864 	io_q.size = cmd->cdw10_bits.create_io_q.qsize + 1;
865 	if (io_q.size > max_queue_size(ctrlr)) {
866 		SPDK_ERRLOG("%s: queue too big, want=%d, max=%d\n", ctrlr_id(ctrlr),
867 			    io_q.size, max_queue_size(ctrlr));
868 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
869 		sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE;
870 		goto out;
871 	}
872 
873 	io_q.addr = map_one(ctrlr->endpoint->vfu_ctx, cmd->dptr.prp.prp1,
874 			    io_q.size * entry_size, &io_q.sg, &io_q.iov);
875 	if (io_q.addr == NULL) {
876 		sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
877 		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
878 		goto out;
879 	}
880 	io_q.prp1 = cmd->dptr.prp.prp1;
881 	memset(io_q.addr, 0, io_q.size * entry_size);
882 
883 	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped %cQ%d IOVA=%#lx vaddr=%#llx\n",
884 		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
885 		      cmd->cdw10_bits.create_io_q.qid, cmd->dptr.prp.prp1,
886 		      (unsigned long long)io_q.addr);
887 
888 	if (is_cq) {
889 		err = init_qp(ctrlr, ctrlr->qp[0]->qpair.transport, io_q.size,
890 			      cmd->cdw10_bits.create_io_q.qid);
891 		if (err != 0) {
892 			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
893 			goto out;
894 		}
895 	} else {
896 		/*
897 		 * After we've returned from the nvmf_vfio_user_poll_group_poll thread, once
898 		 * nvmf_vfio_user_accept executes it will pick up this QP and will eventually
899 		 * call nvmf_vfio_user_poll_group_add. The rest of the operation needed to
900 		 * complete the addition of the queue will be continued in the
901 		 * completion callback.
902 		 */
903 		TAILQ_INSERT_TAIL(&ctrlr->transport->new_qps, ctrlr->qp[cmd->cdw10_bits.create_io_q.qid], link);
904 
905 	}
906 	insert_queue(ctrlr, &io_q, is_cq, cmd->cdw10_bits.create_io_q.qid);
907 
908 out:
909 	return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
910 }
911 
912 /*
913  * Deletes a completion or submission I/O queue.
914  */
915 static int
916 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
917 		struct spdk_nvme_cmd *cmd, const bool is_cq)
918 {
919 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
920 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
921 
922 	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n",
923 		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
924 		      cmd->cdw10_bits.delete_io_q.qid);
925 
926 	if (lookup_io_q(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq) == NULL) {
927 		SPDK_ERRLOG("%s: %cQ%d does not exist\n", ctrlr_id(ctrlr),
928 			    is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid);
929 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
930 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
931 		goto out;
932 	}
933 
934 	if (is_cq) {
935 		/* SQ must have been deleted first */
936 		if (ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state != VFIO_USER_QPAIR_DELETED) {
937 			SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr));
938 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
939 			sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION;
940 			goto out;
941 		}
942 	} else {
943 		/*
944 		 * This doesn't actually delete the I/O queue; we can't
945 		 * do that anyway because NVMf doesn't support it. We're merely
946 		 * telling the poll_group_poll function to skip checking this
947 		 * queue. The only workflow in which this works is when CC.EN is set
948 		 * to 0 and we're stopping the subsystem, so we know that the
949 		 * relevant callbacks to destroy the queues will be called.
950 		 */
951 		assert(ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state == VFIO_USER_QPAIR_ACTIVE);
952 		ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state = VFIO_USER_QPAIR_DELETED;
953 	}
954 
955 out:
956 	return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
957 }
958 
959 /*
960  * Returns 0 on success and -errno on error.
961  *
962  * XXX SPDK thread context
963  */
964 static int
965 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd)
966 {
967 	assert(ctrlr != NULL);
968 	assert(cmd != NULL);
969 
970 	SPDK_DEBUGLOG(nvmf_vfio, "%s: handle admin req opc=%#x cid=%d\n",
971 		      ctrlr_id(ctrlr), cmd->opc, cmd->cid);
972 
973 	switch (cmd->opc) {
974 	case SPDK_NVME_OPC_CREATE_IO_CQ:
975 	case SPDK_NVME_OPC_CREATE_IO_SQ:
976 		return handle_create_io_q(ctrlr, cmd,
977 					  cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ);
978 	case SPDK_NVME_OPC_DELETE_IO_SQ:
979 	case SPDK_NVME_OPC_DELETE_IO_CQ:
980 		return handle_del_io_q(ctrlr, cmd,
981 				       cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ);
982 	default:
983 		return handle_cmd_req(ctrlr, cmd, get_nvmf_req(ctrlr->qp[0]));
984 	}
985 }
986 
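/*
 * Completion callback for NVMf requests issued on behalf of the guest: unmaps
 * the request's data buffers and posts the corresponding CQE to the guest's
 * completion queue.
 */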
987 static int
988 handle_cmd_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
989 {
990 	struct nvmf_vfio_user_qpair *qpair = cb_arg;
991 
992 	assert(qpair != NULL);
993 	assert(req != NULL);
994 
995 	vfu_unmap_sg(qpair->ctrlr->endpoint->vfu_ctx, req->sg, req->iov, req->iovcnt);
996 
997 	return post_completion(qpair->ctrlr, &req->req.cmd->nvme_cmd,
998 			       &qpair->ctrlr->qp[req->req.qpair->qid]->cq,
999 			       req->req.rsp->nvme_cpl.cdw0,
1000 			       req->req.rsp->nvme_cpl.status.sc,
1001 			       req->req.rsp->nvme_cpl.status.sct);
1002 }
1003 
1004 static int
1005 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair,
1006 	    struct spdk_nvme_cmd *cmd)
1007 {
1008 	assert(qpair != NULL);
1009 	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
1010 		return consume_admin_cmd(ctrlr, cmd);
1011 	}
1012 
1013 	return handle_cmd_req(ctrlr, cmd, get_nvmf_req(qpair));
1014 }
1015 
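/*
 * Handles an SQ tail doorbell write by consuming every command between the
 * current SQ head and the new tail.
 */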
1016 static ssize_t
1017 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail,
1018 		     struct nvmf_vfio_user_qpair *qpair)
1019 {
1020 	struct spdk_nvme_cmd *queue;
1021 
1022 	assert(ctrlr != NULL);
1023 	assert(qpair != NULL);
1024 
1025 	queue = qpair->sq.addr;
1026 	while (sq_head(qpair) != new_tail) {
1027 		int err;
1028 		struct spdk_nvme_cmd *cmd = &queue[sq_head(qpair)];
1029 
1030 		/*
1031 		 * SQHD must contain the new head pointer, so we must increase
1032 		 * it before we generate a completion.
1033 		 */
1034 		sqhd_advance(ctrlr, qpair);
1035 
1036 		err = consume_cmd(ctrlr, qpair, cmd);
1037 		if (err != 0) {
1038 			return err;
1039 		}
1040 	}
1041 
1042 	return 0;
1043 }
1044 
1045 static int
1046 map_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1047 {
1048 	int err;
1049 
1050 	assert(ctrlr != NULL);
1051 
1052 	err = acq_map(ctrlr);
1053 	if (err != 0) {
1054 		return err;
1055 	}
1056 
1057 	err = asq_map(ctrlr);
1058 	if (err != 0) {
1059 		return err;
1060 	}
1061 
1062 	return 0;
1063 }
1064 
1065 static void
1066 unmap_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1067 {
1068 	assert(ctrlr->qp[0] != NULL);
1069 
1070 	unmap_qp(ctrlr->qp[0]);
1071 }
1072 
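/*
 * libvfio-user DMA-region-added callback: registers the new region with SPDK
 * and tries to remap any queues that were unmapped when the region previously
 * went away.
 */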
1073 static void
1074 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
1075 {
1076 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1077 	struct nvmf_vfio_user_ctrlr *ctrlr;
1078 	struct nvmf_vfio_user_qpair *qpair;
1079 	int i, ret;
1080 
1081 	/*
1082 	 * We're not interested in any DMA regions that aren't mappable (we don't
1083 	 * support clients that don't share their memory).
1084 	 */
1085 	if (!info->vaddr) {
1086 		return;
1087 	}
1088 
1089 	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
1090 	    (info->mapping.iov_len & MASK_2MB)) {
1091 		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr,
1092 			      (uintptr_t)info->mapping.iov_base,
1093 			      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1094 		return;
1095 	}
1096 
1097 	assert(endpoint != NULL);
1098 	if (endpoint->ctrlr == NULL) {
1099 		return;
1100 	}
1101 	ctrlr = endpoint->ctrlr;
1102 
1103 	SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %#lx-%#lx\n", ctrlr_id(ctrlr),
1104 		      (uintptr_t)info->mapping.iov_base,
1105 		      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1106 
1107 	/* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO;
1108 	 * here we also check the protection bits before registering.
1109 	 */
1110 	if ((info->prot == (PROT_WRITE | PROT_READ)) &&
1111 	    (spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len))) {
1112 		SPDK_ERRLOG("Memory region register %#lx-%#lx failed\n",
1113 			    (uint64_t)(uintptr_t)info->mapping.iov_base,
1114 			    (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1115 	}
1116 
1117 	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1118 		qpair = ctrlr->qp[i];
1119 		if (qpair == NULL) {
1120 			continue;
1121 		}
1122 
1123 		if (qpair->state != VFIO_USER_QPAIR_INACTIVE) {
1124 			continue;
1125 		}
1126 
1127 		if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
1128 			ret = map_admin_queue(ctrlr);
1129 			if (ret) {
1130 				SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap Admin queue\n");
1131 				continue;
1132 			}
1133 			qpair->state = VFIO_USER_QPAIR_ACTIVE;
1134 		} else {
1135 			struct nvme_q *sq = &qpair->sq;
1136 			struct nvme_q *cq = &qpair->cq;
1137 
1138 			sq->addr = map_one(ctrlr->endpoint->vfu_ctx, sq->prp1, sq->size * 64, &sq->sg, &sq->iov);
1139 			if (!sq->addr) {
1140 				SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n",
1141 					      i, sq->prp1, sq->prp1 + sq->size * 64);
1142 				continue;
1143 			}
1144 			cq->addr = map_one(ctrlr->endpoint->vfu_ctx, cq->prp1, cq->size * 16, &cq->sg, &cq->iov);
1145 			if (!cq->addr) {
1146 				SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n",
1147 					      i, cq->prp1, cq->prp1 + cq->size * 16);
1148 				continue;
1149 			}
1150 			qpair->state = VFIO_USER_QPAIR_ACTIVE;
1151 		}
1152 	}
1153 }
1154 
1155 static int
1156 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
1157 {
1158 
1159 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1160 	struct nvmf_vfio_user_ctrlr *ctrlr;
1161 	struct nvmf_vfio_user_qpair *qpair;
1162 	void *map_start, *map_end;
1163 	int i;
1164 
1165 	if (!info->vaddr) {
1166 		return 0;
1167 	}
1168 
1169 	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
1170 	    (info->mapping.iov_len & MASK_2MB)) {
1171 		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr,
1172 			      (uintptr_t)info->mapping.iov_base,
1173 			      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1174 		return 0;
1175 	}
1176 
1177 	assert(endpoint != NULL);
1178 	if (endpoint->ctrlr == NULL) {
1179 		return 0;
1180 	}
1181 	ctrlr = endpoint->ctrlr;
1182 
1183 	SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %#lx-%#lx\n", ctrlr_id(ctrlr),
1184 		      (uintptr_t)info->mapping.iov_base,
1185 		      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1186 
1187 	if ((info->prot == (PROT_WRITE | PROT_READ)) &&
1188 	    (spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len))) {
1189 		SPDK_ERRLOG("Memory region unregister %#lx-%#lx failed\n",
1190 			    (uint64_t)(uintptr_t)info->mapping.iov_base,
1191 			    (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1192 	}
1193 
1194 	map_start = info->mapping.iov_base;
1195 	map_end = info->mapping.iov_base + info->mapping.iov_len;
1196 	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1197 		qpair = ctrlr->qp[i];
1198 		if (qpair == NULL) {
1199 			continue;
1200 		}
1201 
1202 		if ((qpair->cq.addr >= map_start && qpair->cq.addr < map_end) ||
1203 		    (qpair->sq.addr >= map_start && qpair->sq.addr < map_end)) {
1204 			unmap_qp(qpair);
1205 			qpair->state = VFIO_USER_QPAIR_INACTIVE;
1206 		}
1207 	}
1208 
1209 	return 0;
1210 }
1211 
1212 static int
1213 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
1214 {
1215 	struct nvmf_vfio_user_qpair *qpair = cb_arg;
1216 	int ret;
1217 
1218 	assert(qpair != NULL);
1219 	assert(req != NULL);
1220 
1221 	if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) {
1222 		assert(qpair->ctrlr != NULL);
1223 		assert(req != NULL);
1224 
1225 		memcpy(req->req.data,
1226 		       &req->req.rsp->prop_get_rsp.value.u64,
1227 		       req->req.length);
1228 	} else {
1229 		assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET);
1230 		assert(qpair->ctrlr != NULL);
1231 
1232 		if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) {
1233 			union spdk_nvme_cc_register *cc;
1234 
1235 			cc = (union spdk_nvme_cc_register *)&req->req.cmd->prop_set_cmd.value.u64;
1236 
1237 			if (cc->bits.en == 1 && cc->bits.shn == 0) {
1238 				SPDK_DEBUGLOG(nvmf_vfio,
1239 					      "%s: MAP Admin queue\n",
1240 					      ctrlr_id(qpair->ctrlr));
1241 				ret = map_admin_queue(qpair->ctrlr);
1242 				if (ret) {
1243 					SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(qpair->ctrlr));
1244 					return ret;
1245 				}
1246 				qpair->state = VFIO_USER_QPAIR_ACTIVE;
1247 			} else if ((cc->bits.en == 0 && cc->bits.shn == 0) ||
1248 				   (cc->bits.en == 1 && cc->bits.shn != 0)) {
1249 				SPDK_DEBUGLOG(nvmf_vfio,
1250 					      "%s: UNMAP Admin queue\n",
1251 					      ctrlr_id(qpair->ctrlr));
1252 				unmap_admin_queue(qpair->ctrlr);
1253 				qpair->state = VFIO_USER_QPAIR_INACTIVE;
1254 			}
1255 		}
1256 	}
1257 
1258 	return 0;
1259 }
1260 
1261 /*
1262  * XXX Do NOT remove, see comment in access_bar0_fn.
1263  *
1264  * Handles a doorbell access (read or write) at offset 0x1000 or more.
1265  *
1266  * DSTRD is set to the fixed value 0 for NVMf.
1267  *
1268  */
1269 static int
1270 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf,
1271 		  const size_t count, loff_t pos, const bool is_write)
1272 {
1273 	assert(ctrlr != NULL);
1274 	assert(buf != NULL);
1275 
1276 	if (count != sizeof(uint32_t)) {
1277 		SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n",
1278 			    ctrlr_id(ctrlr), count);
1279 		errno = EINVAL;
1280 		return -1;
1281 	}
1282 
1283 	pos -= NVMF_VFIO_USER_DOORBELLS_OFFSET;
1284 
1285 	/* pos must be dword aligned */
1286 	if ((pos & 0x3) != 0) {
1287 		SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos);
1288 		errno = EINVAL;
1289 		return -1;
1290 	}
1291 
1292 	/* convert byte offset to array index */
1293 	pos >>= 2;
1294 
1295 	if (pos > NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR * 2) {
1296 		/*
1297 		 * TODO: need to emit a "Write to Invalid Doorbell Register"
1298 		 * asynchronous event
1299 		 */
1300 		SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos);
1301 		errno = EINVAL;
1302 		return -1;
1303 	}
1304 
1305 	if (is_write) {
1306 		ctrlr->doorbells[pos] = *buf;
1307 		spdk_wmb();
1308 	} else {
1309 		spdk_rmb();
1310 		*buf = ctrlr->doorbells[pos];
1311 	}
1312 	return 0;
1313 }
1314 
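/*
 * BAR0 access handler: doorbell accesses (offset 0x1000 and above) are served
 * directly, while NVMe register accesses are translated into NVMe-oF Property
 * Get/Set fabrics commands and executed by the generic NVMf layer.
 */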
1315 static ssize_t
1316 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos,
1317 	       bool is_write)
1318 {
1319 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1320 	struct nvmf_vfio_user_ctrlr *ctrlr;
1321 	struct nvmf_vfio_user_req *req;
1322 	int ret;
1323 
1324 	ctrlr = endpoint->ctrlr;
1325 
1326 	SPDK_DEBUGLOG(nvmf_vfio,
1327 		      "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n",
1328 		      endpoint_id(endpoint), is_write ? "write" : "read",
1329 		      ctrlr, count, pos);
1330 
1331 	if (pos >= NVMF_VFIO_USER_DOORBELLS_OFFSET) {
1332 		/*
1333 		 * XXX The fact that the doorbells can be memory mapped doesn't
1334 		 * mean that the client (VFIO in QEMU) is obliged to memory
1335 		 * map them; it might still elect to access them via regular
1336 		 * read/write.
1337 		 */
1338 		ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count,
1339 					pos, is_write);
1340 		if (ret == 0) {
1341 			return count;
1342 		}
1343 		assert(errno != 0);
1344 		return ret;
1345 	}
1346 
1347 	/* Construct a Fabric Property Get/Set command and send it */
1348 	req = get_nvmf_vfio_user_req(ctrlr->qp[0]);
1349 	if (req == NULL) {
1350 		errno = ENOBUFS;
1351 		return -1;
1352 	}
1353 
1354 	req->cb_fn = nvmf_vfio_user_prop_req_rsp;
1355 	req->cb_arg = ctrlr->qp[0];
1356 	req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC;
1357 	req->req.cmd->prop_set_cmd.cid = 0;
1358 	req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1;
1359 	req->req.cmd->prop_set_cmd.ofst = pos;
1360 	if (is_write) {
1361 		req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET;
1362 		if (req->req.cmd->prop_set_cmd.attrib.size) {
1363 			req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf;
1364 		} else {
1365 			req->req.cmd->prop_set_cmd.value.u32.high = 0;
1366 			req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf;
1367 		}
1368 	} else {
1369 		req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET;
1370 	}
1371 	req->req.length = count;
1372 	req->req.data = buf;
1373 
1374 	spdk_nvmf_request_exec_fabrics(&req->req);
1375 
1376 	return count;
1377 }
1378 
1379 /*
1380  * The NVMe driver reads 4096 bytes, which is the size of the extended PCI
1381  * configuration space available on PCI-X 2.0 and PCI Express buses.
1382  */
1383 static ssize_t
1384 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset,
1385 		  bool is_write)
1386 {
1387 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1388 
1389 	if (is_write) {
1390 		SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n",
1391 			    endpoint_id(endpoint), offset, offset + count);
1392 		errno = EINVAL;
1393 		return -1;
1394 	}
1395 
1396 	if (offset + count > PCI_CFG_SPACE_EXP_SIZE) {
1397 		SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n",
1398 			    endpoint_id(endpoint), offset, count,
1399 			    PCI_CFG_SPACE_EXP_SIZE);
1400 		errno = ERANGE;
1401 		return -1;
1402 	}
1403 
1404 	memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count);
1405 
1406 	return count;
1407 }
1408 
1409 static void
1410 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg)
1411 {
1412 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1413 
1414 	if (level >= LOG_DEBUG) {
1415 		SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
1416 	} else if (level >= LOG_INFO) {
1417 		SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
1418 	} else if (level >= LOG_NOTICE) {
1419 		SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg);
1420 	} else if (level >= LOG_WARNING) {
1421 		SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg);
1422 	} else {
1423 		SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg);
1424 	}
1425 }
1426 
1427 static void
1428 init_pci_config_space(vfu_pci_config_space_t *p)
1429 {
1430 	/* MLBAR */
1431 	p->hdr.bars[0].raw = 0x0;
1432 	/* MUBAR */
1433 	p->hdr.bars[1].raw = 0x0;
1434 
1435 	/* vendor specific, let's set them to zero for now */
1436 	p->hdr.bars[3].raw = 0x0;
1437 	p->hdr.bars[4].raw = 0x0;
1438 	p->hdr.bars[5].raw = 0x0;
1439 
1440 	/* enable INTx */
1441 	p->hdr.intr.ipin = 0x1;
1442 }
1443 
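/*
 * Sets up the emulated PCI device: PCI IDs and class code, the PM, PCIe and
 * MSI-X capabilities, the config space and BAR regions (BAR0 with a sparse
 * mmap for the doorbells), DMA and interrupt callbacks, then realizes the
 * context.
 */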
1444 static int
1445 vfio_user_dev_info_fill(struct nvmf_vfio_user_endpoint *endpoint)
1446 {
1447 	int ret;
1448 	ssize_t cap_offset;
1449 	vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx;
1450 
1451 	struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 };
1452 	struct pxcap pxcap = {
1453 		.hdr.id = PCI_CAP_ID_EXP,
1454 		.pxcaps.ver = 0x2,
1455 		.pxdcap = {.per = 0x1, .flrc = 0x1},
1456 		.pxdcap2.ctds = 0x1
1457 	};
1458 
1459 	struct msixcap msixcap = {
1460 		.hdr.id = PCI_CAP_ID_MSIX,
1461 		.mxc.ts = NVME_IRQ_MSIX_NUM - 1,
1462 		.mtab = {.tbir = 0x4, .to = 0x0},
1463 		.mpba = {.pbir = 0x5, .pbao = 0x0}
1464 	};
1465 
1466 	static struct iovec sparse_mmap[] = {
1467 		{
1468 			.iov_base = (void *)NVMF_VFIO_USER_DOORBELLS_OFFSET,
1469 			.iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE,
1470 		},
1471 	};
1472 
1473 	ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0);
1474 	if (ret < 0) {
1475 		SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx);
1476 		return ret;
1477 	}
1478 	vfu_pci_set_id(vfu_ctx, 0x4e58, 0x0001, 0, 0);
1479 	/*
1480 	 * 0x02, controller uses the NVM Express programming interface
1481 	 * 0x08, non-volatile memory controller
1482 	 * 0x01, mass storage controller
1483 	 */
1484 	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);
1485 
1486 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
1487 	if (cap_offset < 0) {
1488 		SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx);
1489 		return ret;
1490 	}
1491 
1492 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
1493 	if (cap_offset < 0) {
1494 		SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx);
1495 		return ret;
1496 	}
1497 
1498 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
1499 	if (cap_offset < 0) {
1500 		SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx);
1501 		return ret;
1502 	}
1503 
1504 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
1505 			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1);
1506 	if (ret < 0) {
1507 		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
1508 		return ret;
1509 	}
1510 
1511 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
1512 			       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
1513 			       sparse_mmap, 1, endpoint->fd);
1514 	if (ret < 0) {
1515 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
1516 		return ret;
1517 	}
1518 
1519 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, PAGE_SIZE,
1520 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1);
1521 	if (ret < 0) {
1522 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
1523 		return ret;
1524 	}
1525 
1526 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, PAGE_SIZE,
1527 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1);
1528 	if (ret < 0) {
1529 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
1530 		return ret;
1531 	}
1532 
1533 	ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
1534 	if (ret < 0) {
1535 		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
1536 		return ret;
1537 	}
1538 
1539 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
1540 	if (ret < 0) {
1541 		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
1542 		return ret;
1543 	}
1544 
1545 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
1546 	if (ret < 0) {
1547 		SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
1548 		return ret;
1549 	}
1550 
1551 	ret = vfu_realize_ctx(vfu_ctx);
1552 	if (ret < 0) {
1553 		SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx);
1554 		return ret;
1555 	}
1556 
1557 	endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx);
1558 	assert(endpoint->pci_config_space != NULL);
1559 	init_pci_config_space(endpoint->pci_config_space);
1560 
1561 	assert(cap_offset != 0);
1562 	endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset);
1563 
1564 	return 0;
1565 }
1566 
1567 static void
1568 _destroy_ctrlr(void *ctx)
1569 {
1570 	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
1571 	int i;
1572 
1573 	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1574 		destroy_qp(ctrlr, i);
1575 	}
1576 
1577 	if (ctrlr->endpoint) {
1578 		ctrlr->endpoint->ctrlr = NULL;
1579 	}
1580 
1581 	spdk_poller_unregister(&ctrlr->mmio_poller);
1582 	free(ctrlr);
1583 }
1584 
1585 static int
1586 destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
1587 {
1588 	assert(ctrlr != NULL);
1589 
1590 	SPDK_DEBUGLOG(nvmf_vfio, "destroy %s\n", ctrlr_id(ctrlr));
1591 
1592 	if (ctrlr->thread == spdk_get_thread()) {
1593 		_destroy_ctrlr(ctrlr);
1594 	} else {
1595 		spdk_thread_send_msg(ctrlr->thread, _destroy_ctrlr, ctrlr);
1596 	}
1597 
1598 	return 0;
1599 }
1600 
1601 static void
1602 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport,
1603 			    struct nvmf_vfio_user_endpoint *endpoint)
1604 {
1605 	struct nvmf_vfio_user_ctrlr *ctrlr;
1606 	int err;
1607 
1608 	/* First, construct a vfio-user CUSTOM transport controller */
1609 	ctrlr = calloc(1, sizeof(*ctrlr));
1610 	if (ctrlr == NULL) {
1611 		err = -ENOMEM;
1612 		goto out;
1613 	}
1614 	ctrlr->cntlid = 0xffff;
1615 	ctrlr->transport = transport;
1616 	ctrlr->endpoint = endpoint;
1617 	ctrlr->doorbells = endpoint->doorbells;
1618 
1619 	/* Then, construct an admin queue pair */
1620 	err = init_qp(ctrlr, &transport->transport, NVMF_VFIO_USER_DEFAULT_AQ_DEPTH, 0);
1621 	if (err != 0) {
1622 		goto out;
1623 	}
1624 	endpoint->ctrlr = ctrlr;
1625 	ctrlr->ready = true;
1626 
1627 	/* Notify the generic layer about the new admin queue pair */
1628 	TAILQ_INSERT_TAIL(&ctrlr->transport->new_qps, ctrlr->qp[0], link);
1629 
1630 out:
1631 	if (err != 0) {
1632 		SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n",
1633 			    endpoint_id(endpoint), strerror(-err));
1634 		if (destroy_ctrlr(ctrlr) != 0) {
1635 			SPDK_ERRLOG("%s: failed to clean up\n",
1636 				    endpoint_id(endpoint));
1637 		}
1638 	}
1639 }
1640 
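/*
 * Creates a vfio-user endpoint for the given transport ID: a bar0 file is
 * created and mmap'd to back the doorbells, and a libvfio-user context is set
 * up on the <traddr>/cntrl socket in non-blocking attach mode.
 */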
1641 static int
1642 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport,
1643 		      const struct spdk_nvme_transport_id *trid,
1644 		      struct spdk_nvmf_listen_opts *listen_opts)
1645 {
1646 	struct nvmf_vfio_user_transport *vu_transport;
1647 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
1648 	char *path = NULL;
1649 	char uuid[PATH_MAX] = {};
1650 	int fd;
1651 	int err;
1652 
1653 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
1654 					transport);
1655 
1656 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
1657 		/* Only compare traddr */
1658 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
1659 			return -EEXIST;
1660 		}
1661 	}
1662 
1663 	endpoint = calloc(1, sizeof(*endpoint));
1664 	if (!endpoint) {
1665 		return -ENOMEM;
1666 	}
1667 
1668 	endpoint->fd = -1;
1669 	memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));
1670 
1671 	err = asprintf(&path, "%s/bar0", endpoint_id(endpoint));
1672 	if (err == -1) {
1673 		goto out;
1674 	}
1675 
1676 	fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
1677 	if (fd == -1) {
1678 		SPDK_ERRLOG("%s: failed to open device memory at %s: %m\n",
1679 			    endpoint_id(endpoint), path);
1680 		err = fd;
1681 		free(path);
1682 		goto out;
1683 	}
1684 	free(path);
1685 
1686 	err = ftruncate(fd, NVMF_VFIO_USER_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE);
1687 	if (err != 0) {
1688 		goto out;
1689 	}
1690 
1691 	endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
1692 				   PROT_READ | PROT_WRITE, MAP_SHARED, fd, NVMF_VFIO_USER_DOORBELLS_OFFSET);
1693 	if (endpoint->doorbells == MAP_FAILED) {
1694 		endpoint->doorbells = NULL;
1695 		err = -errno;
1696 		goto out;
1697 	}
1698 
1699 	endpoint->fd = fd;
1700 
1701 	snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
1702 
1703 	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
1704 					   endpoint, VFU_DEV_TYPE_PCI);
1705 	if (endpoint->vfu_ctx == NULL) {
1706 		SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
1707 			    endpoint_id(endpoint));
1708 		err = -1;
1709 		goto out;
1710 	}
1711 	vfu_setup_log(endpoint->vfu_ctx, vfio_user_log,
1712 		      SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio") ? LOG_DEBUG : LOG_ERR);
1713 
1714 	err = vfio_user_dev_info_fill(endpoint);
1715 	if (err < 0) {
1716 		goto out;
1717 	}
1718 
1719 	pthread_mutex_init(&endpoint->lock, NULL);
1720 	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
1721 	SPDK_DEBUGLOG(nvmf_vfio, "%s: doorbells %p\n", uuid, endpoint->doorbells);
1722 
1723 out:
1724 	if (err != 0) {
1725 		nvmf_vfio_user_destroy_endpoint(endpoint);
1726 	}
1727 
1728 	return err;
1729 }
1730 
1731 static void
1732 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
1733 			   const struct spdk_nvme_transport_id *trid)
1734 {
1735 	struct nvmf_vfio_user_transport *vu_transport;
1736 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
1737 	int err;
1738 
1739 	assert(trid != NULL);
1740 	assert(trid->traddr != NULL);
1741 
1742 	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
1743 
1744 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
1745 					transport);
1746 
1747 	pthread_mutex_lock(&vu_transport->lock);
1748 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
1749 		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
1750 			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
1751 			if (endpoint->ctrlr) {
1752 				err = destroy_ctrlr(endpoint->ctrlr);
1753 				if (err != 0) {
1754 					SPDK_ERRLOG("%s: failed to destroy controller: %s\n",
1755 						    endpoint_id(endpoint), strerror(-err));
1756 				}
1757 			}
1758 			nvmf_vfio_user_destroy_endpoint(endpoint);
1759 			pthread_mutex_unlock(&vu_transport->lock);
1760 
1761 			return;
1762 		}
1763 	}
1764 	pthread_mutex_unlock(&vu_transport->lock);
1765 
1766 	SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr);
1767 }
1768 
1769 static void
1770 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport,
1771 			  struct spdk_nvmf_subsystem *subsystem,
1772 			  struct spdk_nvmf_ctrlr_data *cdata)
1773 {
1774 	memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls));
1775 	cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED;
1776 }
1777 
1778 static int
1779 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport,
1780 				const struct spdk_nvmf_subsystem *subsystem,
1781 				const struct spdk_nvme_transport_id *trid)
1782 {
1783 	struct nvmf_vfio_user_transport *vu_transport;
1784 	struct nvmf_vfio_user_endpoint *endpoint;
1785 
1786 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);
1787 
1788 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
1789 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
1790 			break;
1791 		}
1792 	}
1793 
1794 	if (endpoint == NULL) {
1795 		return -ENOENT;
1796 	}
1797 
1798 	endpoint->subsystem = subsystem;
1799 
1800 	return 0;
1801 }
1802 
1803 /*
1804  * Executed periodically.
1805  *
1806  * XXX SPDK thread context.
1807  */
1808 static uint32_t
1809 nvmf_vfio_user_accept(struct spdk_nvmf_transport *transport)
1810 {
1811 	int err;
1812 	struct nvmf_vfio_user_transport *vu_transport;
1813 	struct nvmf_vfio_user_qpair *qp, *tmp_qp;
1814 	struct nvmf_vfio_user_endpoint *endpoint;
1815 
1816 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
1817 					transport);
1818 
1819 	pthread_mutex_lock(&vu_transport->lock);
1820 
1821 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
1822 		/* We need to try to attach the controller again after a reset or shutdown */
1823 		if (endpoint->ctrlr != NULL && endpoint->ctrlr->ready) {
1824 			continue;
1825 		}
1826 
1827 		err = vfu_attach_ctx(endpoint->vfu_ctx);
1828 		if (err != 0) {
1829 			if (errno == EAGAIN || errno == EWOULDBLOCK) {
1830 				continue;
1831 			}
1832 
1833 			pthread_mutex_unlock(&vu_transport->lock);
1834 			return -EFAULT;
1835 		}
1836 
1837 		/* Construct a controller */
1838 		nvmf_vfio_user_create_ctrlr(vu_transport, endpoint);
1839 	}
1840 
1841 	TAILQ_FOREACH_SAFE(qp, &vu_transport->new_qps, link, tmp_qp) {
1842 		TAILQ_REMOVE(&vu_transport->new_qps, qp, link);
1843 		spdk_nvmf_tgt_new_qpair(transport->tgt, &qp->qpair);
1844 	}
1845 
1846 	pthread_mutex_unlock(&vu_transport->lock);
1847 
1848 	return 0;
1849 }
1850 
1851 static void
1852 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport,
1853 			struct spdk_nvme_transport_id *trid,
1854 			struct spdk_nvmf_discovery_log_page_entry *entry)
1855 { }
1856 
1857 static struct spdk_nvmf_transport_poll_group *
1858 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport)
1859 {
1860 	struct nvmf_vfio_user_poll_group *vu_group;
1861 
1862 	SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n");
1863 
1864 	vu_group = calloc(1, sizeof(*vu_group));
1865 	if (vu_group == NULL) {
1866 		SPDK_ERRLOG("Error allocating poll group: %m\n");
1867 		return NULL;
1868 	}
1869 
1870 	TAILQ_INIT(&vu_group->qps);
1871 
1872 	return &vu_group->group;
1873 }
1874 
1875 /* called when the poll group is destroyed, e.g. when the process exits */
1876 static void
1877 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
1878 {
1879 	struct nvmf_vfio_user_poll_group *vu_group;
1880 
1881 	SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n");
1882 
1883 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
1884 
1885 	free(vu_group);
1886 }
1887 
1888 static void
1889 vfio_user_qpair_disconnect_cb(void *ctx)
1890 {
1891 	struct nvmf_vfio_user_endpoint *endpoint = ctx;
1892 	struct nvmf_vfio_user_ctrlr *ctrlr;
1893 
1894 	pthread_mutex_lock(&endpoint->lock);
1895 	ctrlr = endpoint->ctrlr;
1896 	if (!ctrlr) {
1897 		pthread_mutex_unlock(&endpoint->lock);
1898 		return;
1899 	}
1900 
1901 	if (!ctrlr->num_connected_qps) {
1902 		destroy_ctrlr(ctrlr);
1903 		pthread_mutex_unlock(&endpoint->lock);
1904 		return;
1905 	}
1906 	pthread_mutex_unlock(&endpoint->lock);
1907 }
1908 
1909 static int
1910 vfio_user_stop_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
1911 {
1912 	uint32_t i;
1913 	struct nvmf_vfio_user_qpair *qpair;
1914 	struct nvmf_vfio_user_endpoint *endpoint;
1915 
1916 	SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr));
1917 
1918 	ctrlr->ready = false;
1919 	endpoint = ctrlr->endpoint;
1920 	assert(endpoint != NULL);
1921 
1922 	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1923 		qpair = ctrlr->qp[i];
1924 		if (qpair == NULL) {
1925 			continue;
1926 		}
1927 		spdk_nvmf_qpair_disconnect(&qpair->qpair, vfio_user_qpair_disconnect_cb, endpoint);
1928 	}
1929 
1930 	return 0;
1931 }
1932 
1933 static int
1934 vfio_user_poll_mmio(void *ctx)
1935 {
1936 	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
1937 	int ret;
1938 
1939 	assert(ctrlr != NULL);
1940 
1941 	/* This will call access_bar0_fn() if there are any writes
1942 	 * to the portion of the BAR that is not mmap'd */
1943 	ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
1944 	if (spdk_unlikely(ret != 0)) {
1945 		spdk_poller_unregister(&ctrlr->mmio_poller);
1946 
1947 		/* The initiator shut down or reset; wait for it to re-connect. */
1948 		if (errno == ENOTCONN) {
1949 			vfio_user_stop_ctrlr(ctrlr);
1950 			return SPDK_POLLER_BUSY;
1951 		}
1952 
1953 		fail_ctrlr(ctrlr);
1954 	}
1955 
1956 	return SPDK_POLLER_BUSY;
1957 }
1958 
1959 static int
1960 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
1961 {
1962 	struct nvmf_vfio_user_poll_group *vu_group;
1963 	struct nvmf_vfio_user_qpair *qpair = cb_arg;
1964 	struct nvmf_vfio_user_ctrlr *ctrlr;
1965 	struct nvmf_vfio_user_endpoint *endpoint;
1966 
1967 	assert(qpair != NULL);
1968 	assert(req != NULL);
1969 
1970 	ctrlr = qpair->ctrlr;
1971 	endpoint = ctrlr->endpoint;
1972 	assert(ctrlr != NULL);
1973 	assert(endpoint != NULL);
1974 
1975 	if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) {
1976 		SPDK_ERRLOG("queue pair connect failed: SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct);
1977 		destroy_qp(ctrlr, qpair->qpair.qid);
1978 		destroy_ctrlr(ctrlr);
1979 		return -1;
1980 	}
1981 
1982 	vu_group = SPDK_CONTAINEROF(qpair->group, struct nvmf_vfio_user_poll_group, group);
1983 	TAILQ_INSERT_TAIL(&vu_group->qps, qpair, link);
1984 	qpair->state = VFIO_USER_QPAIR_ACTIVE;
1985 
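	/*
	 * For the admin queue, the NVMf controller has just been created: record
	 * its cntlid and the SPDK thread servicing it, and start the poller that
	 * handles non-mmap'd BAR0 accesses.
	 */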
1986 	pthread_mutex_lock(&endpoint->lock);
1987 	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
1988 		ctrlr->cntlid = qpair->qpair.ctrlr->cntlid;
1989 		ctrlr->thread = spdk_get_thread();
1990 		ctrlr->mmio_poller = SPDK_POLLER_REGISTER(vfio_user_poll_mmio, ctrlr, 0);
1991 	}
1992 	ctrlr->num_connected_qps++;
1993 	pthread_mutex_unlock(&endpoint->lock);
1994 
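	/* Free the fabric connect data allocated in nvmf_vfio_user_poll_group_add() */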
1995 	free(req->req.data);
1996 	req->req.data = NULL;
1997 
1998 	return 0;
1999 }
2000 
2001 /*
2002  * Called by spdk_nvmf_transport_poll_group_add.
2003  */
2004 static int
2005 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
2006 			      struct spdk_nvmf_qpair *qpair)
2007 {
2008 	struct nvmf_vfio_user_qpair *vu_qpair;
2009 	struct nvmf_vfio_user_req *vu_req;
2010 	struct nvmf_vfio_user_ctrlr *ctrlr;
2011 	struct spdk_nvmf_request *req;
2012 	struct spdk_nvmf_fabric_connect_data *data;
2013 	bool admin;
2014 
2015 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2016 	vu_qpair->group = group;
2017 	ctrlr = vu_qpair->ctrlr;
2018 
2019 	SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n",
2020 		      ctrlr_id(ctrlr), vu_qpair->qpair.qid,
2021 		      vu_qpair, qpair, group);
2022 
2023 	admin = nvmf_qpair_is_admin_queue(&vu_qpair->qpair);
2024 
2025 	vu_req = get_nvmf_vfio_user_req(vu_qpair);
2026 	if (vu_req == NULL) {
2027 		return -1;
2028 	}
2029 
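	/*
	 * The guest sees a plain NVMe (PCIe-style) controller and never issues a
	 * fabrics CONNECT command itself, so synthesize one here to register the
	 * queue pair with the NVMf layer.
	 */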
2030 	req = &vu_req->req;
2031 	req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
2032 	req->cmd->connect_cmd.cid = 0;
2033 	req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
2034 	req->cmd->connect_cmd.recfmt = 0;
2035 	req->cmd->connect_cmd.sqsize = vu_qpair->qsize - 1;
2036 	req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;
2037 
2038 	req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
2039 	req->data = calloc(1, req->length);
2040 	if (req->data == NULL) {
2041 		nvmf_vfio_user_req_free(req);
2042 		return -ENOMEM;
2043 	}
2044 
2045 	data = (struct spdk_nvmf_fabric_connect_data *)req->data;
2046 	data->cntlid = admin ? 0xFFFF : ctrlr->cntlid;
2047 	snprintf(data->subnqn, sizeof(data->subnqn), "%s",
2048 		 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));
2049 
2050 	vu_req->cb_fn = handle_queue_connect_rsp;
2051 	vu_req->cb_arg = vu_qpair;
2052 
2053 	SPDK_DEBUGLOG(nvmf_vfio,
2054 		      "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n",
2055 		      ctrlr_id(ctrlr), qpair->qid, data->cntlid);
2056 
2057 	spdk_nvmf_request_exec_fabrics(req);
2058 	return 0;
2059 }
2060 
2061 static int
2062 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
2063 				 struct spdk_nvmf_qpair *qpair)
2064 {
2065 	struct nvmf_vfio_user_qpair *vu_qpair;
2066 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
2067 	struct nvmf_vfio_user_endpoint *endpoint;
2068 	struct nvmf_vfio_user_poll_group *vu_group;
2069 
2070 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2071 	vu_ctrlr = vu_qpair->ctrlr;
2072 	endpoint = vu_ctrlr->endpoint;
2073 
2074 	SPDK_DEBUGLOG(nvmf_vfio,
2075 		      "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n",
2076 		      ctrlr_id(vu_qpair->ctrlr), qpair->qid, qpair, group);
2077 
2079 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
2080 	TAILQ_REMOVE(&vu_group->qps, vu_qpair, link);
2081 
2082 	pthread_mutex_lock(&endpoint->lock);
2083 	assert(vu_ctrlr->num_connected_qps);
2084 	vu_ctrlr->num_connected_qps--;
2085 	pthread_mutex_unlock(&endpoint->lock);
2086 
2087 	return 0;
2088 }
2089 
2090 static void
2091 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_qpair *vu_qpair, struct nvmf_vfio_user_req *vu_req)
2092 {
2093 	memset(&vu_req->cmd, 0, sizeof(vu_req->cmd));
2094 	memset(&vu_req->rsp, 0, sizeof(vu_req->rsp));
2095 	vu_req->iovcnt = 0;
2096 	vu_req->state = VFIO_USER_REQUEST_STATE_FREE;
2097 
2098 	TAILQ_INSERT_TAIL(&vu_qpair->reqs, vu_req, link);
2099 }
2100 
2101 static int
2102 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req)
2103 {
2104 	struct nvmf_vfio_user_qpair *vu_qpair;
2105 	struct nvmf_vfio_user_req *vu_req;
2106 
2107 	assert(req != NULL);
2108 
2109 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
2110 	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
2111 
2112 	_nvmf_vfio_user_req_free(vu_qpair, vu_req);
2113 
2114 	return 0;
2115 }
2116 
2117 static int
2118 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req)
2119 {
2120 	struct nvmf_vfio_user_qpair *vu_qpair;
2121 	struct nvmf_vfio_user_req *vu_req;
2122 
2123 	assert(req != NULL);
2124 
2125 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
2126 	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
2127 
2128 	if (vu_req->cb_fn != NULL) {
2129 		if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) {
2130 			fail_ctrlr(vu_qpair->ctrlr);
2131 		}
2132 	}
2133 
2134 	_nvmf_vfio_user_req_free(vu_qpair, vu_req);
2135 
2136 	return 0;
2137 }
2138 
2139 static void
2140 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
2141 			   spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
2142 {
2143 	struct nvmf_vfio_user_qpair *vu_qpair;
2144 
2145 	assert(qpair != NULL);
2146 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2147 	destroy_qp(vu_qpair->ctrlr, qpair->qid);
2148 
2149 	if (cb_fn) {
2150 		cb_fn(cb_arg);
2151 	}
2152 }
2153 
2154 /**
2155  * Returns a preallocated struct nvmf_vfio_user_req, or NULL if none is available.
2156  */
2157 static struct nvmf_vfio_user_req *
2158 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair)
2159 {
2160 	struct nvmf_vfio_user_req *req;
2161 
2162 	assert(qpair != NULL);
2163 
2164 	if (TAILQ_EMPTY(&qpair->reqs)) {
2165 		return NULL;
2166 	}
2167 
2168 	req = TAILQ_FIRST(&qpair->reqs);
2169 	TAILQ_REMOVE(&qpair->reqs, req, link);
2170 
2171 	return req;
2172 }
2173 
2174 static struct spdk_nvmf_request *
2175 get_nvmf_req(struct nvmf_vfio_user_qpair *qpair)
2176 {
2177 	struct nvmf_vfio_user_req *req = get_nvmf_vfio_user_req(qpair);
2178 
2179 	if (req == NULL) {
2180 		return NULL;
2181 	}
2182 	return &req->req;
2183 }
2184 
2185 static int
2186 get_nvmf_io_req_length(struct spdk_nvmf_request *req)
2187 {
2188 	uint16_t nlb, nr;
2189 	uint32_t nsid;
2190 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
2191 	struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
2192 	struct spdk_nvmf_ns *ns;
2193 
2194 	nsid = cmd->nsid;
2195 	ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
2196 	if (ns == NULL || ns->bdev == NULL) {
2197 		SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid);
2198 		return -EINVAL;
2199 	}
2200 
2201 	if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
2202 		nr = cmd->cdw10_bits.dsm.nr + 1;
2203 		return nr * sizeof(struct spdk_nvme_dsm_range);
2204 	}
2205 
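	/* NLB is a 0's-based value in the lower 16 bits of CDW12 */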
2206 	nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
2207 	return nlb * spdk_bdev_get_block_size(ns->bdev);
2208 }
2209 
2210 static int
2211 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
2212 {
2213 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
2214 	uint32_t len = 0;
2215 	int iovcnt;
2216 
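	/* Bits 1:0 of the opcode encode the data transfer direction */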
2217 	req->xfer = cmd->opc & 0x3;
2218 	req->length = 0;
2219 	req->data = NULL;
2220 
2221 	switch (cmd->opc) {
2222 	case SPDK_NVME_OPC_IDENTIFY:
2223 		len = 4096; /* TODO: there should be a define somewhere for this */
2224 		break;
2225 	case SPDK_NVME_OPC_GET_LOG_PAGE:
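		/* NUMDL holds the low bits of the 0's-based dword count; NUMDU is ignored here */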
2226 		len = (cmd->cdw10_bits.get_log_page.numdl + 1) * 4;
2227 		break;
2228 	}
2229 
2230 	if (!cmd->dptr.prp.prp1 || !len) {
2231 		return 0;
2232 	}
2233 	/* Admin commands do not use SGLs */
2234 	assert(req->cmd->nvme_cmd.psdt == 0);
2235 	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len);
2236 	if (iovcnt < 0) {
2237 		SPDK_ERRLOG("%s: map Admin Opc %x failed\n",
2238 			    ctrlr_id(ctrlr), cmd->opc);
2239 		return -1;
2240 	}
2241 
2242 	req->length = len;
2243 	req->data = req->iov[0].iov_base;
2244 
2245 	return 0;
2246 }
2247 
2248 /*
2249  * Maps the data buffers of an I/O command so the request can be handed off
2250  * to NVMf.
2251  *
2252  * Returns 0 on success and -errno on failure.
2253  */
2254 static int
2255 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
2256 {
2257 	int err = 0;
2258 	struct spdk_nvme_cmd *cmd;
2259 
2260 	assert(ctrlr != NULL);
2261 	assert(req != NULL);
2262 
2263 	cmd = &req->cmd->nvme_cmd;
2264 	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
2265 
2266 	if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
2267 		return 0;
2268 	}
2269 
2270 	err = get_nvmf_io_req_length(req);
2271 	if (err < 0) {
2272 		return -EINVAL;
2273 	}
2274 
2275 	req->length = err;
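	/* vfio_user_map_cmd() returns the number of iovecs used on success */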
2276 	err = vfio_user_map_cmd(ctrlr, req, req->iov, req->length);
2277 	if (err < 0) {
2278 		SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc);
2279 		return -EFAULT;
2280 	}
2281 
2282 	req->data = req->iov[0].iov_base;
2283 	req->iovcnt = err;
2284 
2285 	return 0;
2286 }
2287 
2288 static int
2289 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
2290 	       struct spdk_nvmf_request *req)
2291 {
2292 	int err;
2293 	struct nvmf_vfio_user_req *vu_req;
2294 
2295 	assert(ctrlr != NULL);
2296 	assert(cmd != NULL);
2297 
2298 	/*
2299 	 * TODO: this means that there are no free requests available,
2300 	 * returning -1 will fail the controller. Theoretically this error can
2301 	 * be avoided completely by ensuring we have as many requests as slots
2302 	 * in the SQ, plus one for the property request.
2303 	 */
2304 	if (spdk_unlikely(req == NULL)) {
2305 		return -1;
2306 	}
2307 
2308 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
2309 	vu_req->cb_fn = handle_cmd_rsp;
2310 	vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
2311 	req->cmd->nvme_cmd = *cmd;
2312 	if (nvmf_qpair_is_admin_queue(req->qpair)) {
2313 		err = map_admin_cmd_req(ctrlr, req);
2314 	} else {
2315 		err = map_io_cmd_req(ctrlr, req);
2316 	}
2317 
2318 	if (spdk_unlikely(err < 0)) {
2319 		SPDK_ERRLOG("%s: map NVMe command opc 0x%x failed\n",
2320 			    ctrlr_id(ctrlr), cmd->opc);
2321 		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
2322 		req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2323 		return handle_cmd_rsp(vu_req, vu_req->cb_arg);
2324 	}
2325 
2326 	vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING;
2327 	spdk_nvmf_request_exec(req);
2328 
2329 	return 0;
2330 }
2331 
2332 static void
2333 nvmf_vfio_user_qpair_poll(struct nvmf_vfio_user_qpair *qpair)
2334 {
2335 	struct nvmf_vfio_user_ctrlr *ctrlr;
2336 	uint32_t new_tail;
2337 
2338 	assert(qpair != NULL);
2339 
2340 	ctrlr = qpair->ctrlr;
2341 
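	/*
	 * Check whether the guest has advanced the SQ tail doorbell; if so, process
	 * the new submission queue entries.
	 */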
2342 	new_tail = *tdbl(ctrlr, &qpair->sq);
2343 	if (sq_head(qpair) != new_tail) {
2344 		int err = handle_sq_tdbl_write(ctrlr, new_tail, qpair);
2345 		if (err != 0) {
2346 			fail_ctrlr(ctrlr);
2347 			return;
2348 		}
2349 	}
2350 }
2351 
2352 /*
2353  * Called unconditionally, periodically, and very frequently from SPDK to
2354  * check whether there is work to be done.  This function consumes requests
2355  * generated by read_bar0()/write_bar0(), which set ctrlr->prop_req.dir and
2356  * wait synchronously for them to be completed (though this may change).  It
2357  * also consumes requests by looking at the doorbells.
2358  */
2359 static int
2360 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
2361 {
2362 	struct nvmf_vfio_user_poll_group *vu_group;
2363 	struct nvmf_vfio_user_qpair *vu_qpair, *tmp;
2364 
2365 	assert(group != NULL);
2366 
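	/* Read barrier so that doorbell values written by the guest are observed before polling */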
2367 	spdk_rmb();
2368 
2369 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
2370 
2371 	TAILQ_FOREACH_SAFE(vu_qpair, &vu_group->qps, link, tmp) {
2372 		if (spdk_unlikely(vu_qpair->state != VFIO_USER_QPAIR_ACTIVE || !vu_qpair->sq.size)) {
2373 			continue;
2374 		}
2375 		nvmf_vfio_user_qpair_poll(vu_qpair);
2376 	}
2377 
2378 	return 0;
2379 }
2380 
2381 static int
2382 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
2383 				    struct spdk_nvme_transport_id *trid)
2384 {
2385 	struct nvmf_vfio_user_qpair *vu_qpair;
2386 	struct nvmf_vfio_user_ctrlr *ctrlr;
2387 
2388 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2389 	ctrlr = vu_qpair->ctrlr;
2390 
2391 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
2392 	return 0;
2393 }
2394 
2395 static int
2396 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
2397 				   struct spdk_nvme_transport_id *trid)
2398 {
2399 	return 0;
2400 }
2401 
2402 static int
2403 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
2404 				     struct spdk_nvme_transport_id *trid)
2405 {
2406 	struct nvmf_vfio_user_qpair *vu_qpair;
2407 	struct nvmf_vfio_user_ctrlr *ctrlr;
2408 
2409 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2410 	ctrlr = vu_qpair->ctrlr;
2411 
2412 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
2413 	return 0;
2414 }
2415 
2416 static void
2417 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
2418 				   struct spdk_nvmf_request *req)
2419 {
2420 	struct nvmf_vfio_user_qpair *vu_qpair;
2421 	struct nvmf_vfio_user_req *vu_req, *vu_req_to_abort = NULL;
2422 	uint16_t i, cid;
2423 
2424 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2425 
2426 	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
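	/* Look for an in-flight request whose CID matches the one targeted by the Abort command */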
2427 	for (i = 0; i < vu_qpair->qsize; i++) {
2428 		vu_req = &vu_qpair->reqs_internal[i];
2429 		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
2430 			vu_req_to_abort = vu_req;
2431 			break;
2432 		}
2433 	}
2434 
2435 	if (vu_req_to_abort == NULL) {
2436 		spdk_nvmf_request_complete(req);
2437 		return;
2438 	}
2439 
2440 	req->req_to_abort = &vu_req_to_abort->req;
2441 	nvmf_ctrlr_abort_request(req);
2442 }
2443 
2444 static void
2445 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
2446 {
2447 	opts->max_queue_depth =		NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
2448 	opts->max_qpairs_per_ctrlr =	NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
2449 	opts->in_capsule_data_size =	NVMF_VFIO_USER_DEFAULT_IN_CAPSULE_DATA_SIZE;
2450 	opts->max_io_size =		NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
2451 	opts->io_unit_size =		NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
2452 	opts->max_aq_depth =		NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
2453 	opts->num_shared_buffers =	NVMF_VFIO_USER_DEFAULT_NUM_SHARED_BUFFERS;
2454 	opts->buf_cache_size =		NVMF_VFIO_USER_DEFAULT_BUFFER_CACHE_SIZE;
2455 }
2456 
2457 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
2458 	.name = "VFIOUSER",
2459 	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
2460 	.opts_init = nvmf_vfio_user_opts_init,
2461 	.create = nvmf_vfio_user_create,
2462 	.destroy = nvmf_vfio_user_destroy,
2463 
2464 	.listen = nvmf_vfio_user_listen,
2465 	.stop_listen = nvmf_vfio_user_stop_listen,
2466 	.accept = nvmf_vfio_user_accept,
2467 	.cdata_init = nvmf_vfio_user_cdata_init,
2468 	.listen_associate = nvmf_vfio_user_listen_associate,
2469 
2470 	.listener_discover = nvmf_vfio_user_discover,
2471 
2472 	.poll_group_create = nvmf_vfio_user_poll_group_create,
2473 	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
2474 	.poll_group_add = nvmf_vfio_user_poll_group_add,
2475 	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
2476 	.poll_group_poll = nvmf_vfio_user_poll_group_poll,
2477 
2478 	.req_free = nvmf_vfio_user_req_free,
2479 	.req_complete = nvmf_vfio_user_req_complete,
2480 
2481 	.qpair_fini = nvmf_vfio_user_close_qpair,
2482 	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
2483 	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
2484 	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
2485 	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,
2486 };
2487 
2488 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
2489 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
2490