xref: /spdk/lib/nvmf/vfio_user.c (revision 32999ab917f67af61872f868585fd3d78ad6fb8a)
1 /*-
2  *   BSD LICENSE
3  *   Copyright (c) Intel Corporation. All rights reserved.
4  *   Copyright (c) 2019, Nutanix Inc. All rights reserved.
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *     * Redistributions of source code must retain the above copyright
11  *       notice, this list of conditions and the following disclaimer.
12  *     * Redistributions in binary form must reproduce the above copyright
13  *       notice, this list of conditions and the following disclaimer in
14  *       the documentation and/or other materials provided with the
15  *       distribution.
16  *     * Neither the name of Intel Corporation nor the names of its
17  *       contributors may be used to endorse or promote products derived
18  *       from this software without specific prior written permission.
19  *
20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * NVMe over vfio-user transport
35  */
36 
37 #include <vfio-user/libvfio-user.h>
38 #include <vfio-user/pci_defs.h>
39 
40 #include "spdk/barrier.h"
41 #include "spdk/stdinc.h"
42 #include "spdk/assert.h"
43 #include "spdk/thread.h"
44 #include "spdk/nvmf_transport.h"
45 #include "spdk/sock.h"
46 #include "spdk/string.h"
47 #include "spdk/util.h"
48 #include "spdk/log.h"
49 
50 #include "transport.h"
51 
52 #include "nvmf_internal.h"
53 
54 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
55 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
56 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR 64
57 #define NVMF_VFIO_USER_DEFAULT_IN_CAPSULE_DATA_SIZE 0
58 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
59 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE
60 #define NVMF_VFIO_USER_DEFAULT_NUM_SHARED_BUFFERS 512 /* number of internal shared buffers */
61 #define NVMF_VFIO_USER_DEFAULT_BUFFER_CACHE_SIZE 0
62 
63 #define NVMF_VFIO_USER_DOORBELLS_OFFSET	0x1000
64 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000
65 
66 #define NVME_REG_CFG_SIZE       0x1000
67 #define NVME_REG_BAR0_SIZE      0x4000
68 #define NVME_IRQ_INTX_NUM       1
69 #define NVME_IRQ_MSIX_NUM	NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR
70 
71 struct nvmf_vfio_user_req;
72 struct nvmf_vfio_user_qpair;
73 
74 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);
75 
76 /* 1 more for PRP2 list itself */
77 #define NVMF_VFIO_USER_MAX_IOVECS	(NVMF_REQ_MAX_BUFFERS + 1)
78 
79 enum nvmf_vfio_user_req_state {
80 	VFIO_USER_REQUEST_STATE_FREE = 0,
81 	VFIO_USER_REQUEST_STATE_EXECUTING,
82 };
83 
84 struct nvmf_vfio_user_req  {
85 	struct spdk_nvmf_request		req;
86 	struct spdk_nvme_cpl			rsp;
87 	struct spdk_nvme_cmd			cmd;
88 
89 	enum nvmf_vfio_user_req_state		state;
90 	nvmf_vfio_user_req_cb_fn		cb_fn;
91 	void					*cb_arg;
92 
93 	/* placeholder for the gpa_to_vva memory map table; the I/O buffer doesn't use it */
94 	dma_sg_t				sg[NVMF_VFIO_USER_MAX_IOVECS];
95 	struct iovec				iov[NVMF_VFIO_USER_MAX_IOVECS];
96 	uint8_t					iovcnt;
97 
98 	TAILQ_ENTRY(nvmf_vfio_user_req)		link;
99 };
100 
101 /*
102  * An NVMe queue.
103  */
104 struct nvme_q {
105 	bool is_cq;
106 
107 	void *addr;
108 
109 	dma_sg_t sg;
110 	struct iovec iov;
111 
112 	uint32_t size;
113 	uint64_t prp1;
114 
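	/*
	 * SQ-specific fields (head, cqid) and CQ-specific fields (tail, iv,
	 * ien) overlap; is_cq selects which set is valid.
	 */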
115 	union {
116 		struct {
117 			uint32_t head;
118 			/* multiple SQs can be mapped to the same CQ */
119 			uint16_t cqid;
120 		};
121 		struct {
122 			uint32_t tail;
123 			uint16_t iv;
124 			bool ien;
125 		};
126 	};
127 };
128 
129 enum nvmf_vfio_user_qpair_state {
130 	VFIO_USER_QPAIR_UNINITIALIZED = 0,
131 	VFIO_USER_QPAIR_ACTIVE,
132 	VFIO_USER_QPAIR_DELETED,
133 	VFIO_USER_QPAIR_INACTIVE,
134 	VFIO_USER_QPAIR_ERROR,
135 };
136 
137 struct nvmf_vfio_user_qpair {
138 	struct spdk_nvmf_qpair			qpair;
139 	struct spdk_nvmf_transport_poll_group	*group;
140 	struct nvmf_vfio_user_ctrlr		*ctrlr;
141 	struct nvmf_vfio_user_req		*reqs_internal;
142 	uint16_t				qsize;
143 	struct nvme_q				cq;
144 	struct nvme_q				sq;
145 	enum nvmf_vfio_user_qpair_state		state;
146 
147 	TAILQ_HEAD(, nvmf_vfio_user_req)	reqs;
148 	TAILQ_ENTRY(nvmf_vfio_user_qpair)	link;
149 };
150 
151 struct nvmf_vfio_user_poll_group {
152 	struct spdk_nvmf_transport_poll_group	group;
153 	TAILQ_HEAD(, nvmf_vfio_user_qpair)	qps;
154 };
155 
156 struct nvmf_vfio_user_ctrlr {
157 	struct nvmf_vfio_user_endpoint		*endpoint;
158 	struct nvmf_vfio_user_transport		*transport;
159 
160 	/* True when the socket connection is active */
161 	bool					ready;
162 	/* Number of connected queue pairs */
163 	uint32_t				num_connected_qps;
164 
165 	struct spdk_thread			*thread;
166 	struct spdk_poller			*mmio_poller;
167 
168 	uint16_t				cntlid;
169 
170 	struct nvmf_vfio_user_qpair		*qp[NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR];
171 
172 	TAILQ_ENTRY(nvmf_vfio_user_ctrlr)	link;
173 
174 	volatile uint32_t			*doorbells;
175 
176 	/* internal CSTS.CFS register for vfio-user fatal errors */
177 	uint32_t				cfs : 1;
178 };
179 
180 struct nvmf_vfio_user_endpoint {
181 	vfu_ctx_t				*vfu_ctx;
182 	struct msixcap				*msix;
183 	vfu_pci_config_space_t			*pci_config_space;
184 	int					fd;
185 	volatile uint32_t			*doorbells;
186 
187 	struct spdk_nvme_transport_id		trid;
188 	const struct spdk_nvmf_subsystem	*subsystem;
189 
190 	struct nvmf_vfio_user_ctrlr		*ctrlr;
191 	pthread_mutex_t				lock;
192 
193 	TAILQ_ENTRY(nvmf_vfio_user_endpoint)	link;
194 };
195 
196 struct nvmf_vfio_user_transport {
197 	struct spdk_nvmf_transport		transport;
198 	pthread_mutex_t				lock;
199 	TAILQ_HEAD(, nvmf_vfio_user_endpoint)	endpoints;
200 
201 	TAILQ_HEAD(, nvmf_vfio_user_qpair)	new_qps;
202 };
203 
204 /*
205  * function prototypes
206  */
207 static volatile uint32_t *
208 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);
209 
210 static volatile uint32_t *
211 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);
212 
213 static int
214 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);
215 
216 static struct nvmf_vfio_user_req *
217 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair);
218 
219 static int
220 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
221 		struct nvme_q *cq, uint32_t cdw0, uint16_t sc,
222 		uint16_t sct);
223 
224 static char *
225 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint)
226 {
227 	return endpoint->trid.traddr;
228 }
229 
230 static char *
231 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr)
232 {
233 	if (!ctrlr || !ctrlr->endpoint) {
234 		return "Null Ctrlr";
235 	}
236 
237 	return endpoint_id(ctrlr->endpoint);
238 }
239 
240 static uint16_t
241 io_q_id(struct nvme_q *q)
242 {
243 
244 	struct nvmf_vfio_user_qpair *vfio_user_qpair;
245 
246 	assert(q);
247 
248 	if (q->is_cq) {
249 		vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, cq);
250 	} else {
251 		vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, sq);
252 	}
253 	assert(vfio_user_qpair);
254 	return vfio_user_qpair->qpair.qid;
255 }
256 
257 static void
258 fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
259 {
260 	assert(ctrlr != NULL);
261 
262 	if (ctrlr->cfs == 0) {
263 		SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(ctrlr));
264 	}
265 
266 	ctrlr->ready = false;
267 	ctrlr->cfs = 1U;
268 }
269 
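/*
 * Interrupts are deliverable either via INTx (the Interrupt Disable bit in
 * the PCI command register is clear) or via MSI-X (the MSI-X Enable bit is
 * set).
 */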
270 static bool
271 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *ctrlr)
272 {
273 	assert(ctrlr != NULL);
274 	assert(ctrlr->endpoint != NULL);
275 
276 	vfu_pci_config_space_t *pci = ctrlr->endpoint->pci_config_space;
277 
278 	return (!pci->hdr.cmd.id || ctrlr->endpoint->msix->mxc.mxe);
279 }
280 
281 static void
282 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint)
283 {
284 	if (endpoint->doorbells) {
285 		munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
286 	}
287 
288 	if (endpoint->fd > 0) {
289 		close(endpoint->fd);
290 	}
291 
292 	vfu_destroy_ctx(endpoint->vfu_ctx);
293 
294 	pthread_mutex_destroy(&endpoint->lock);
295 	free(endpoint);
296 }
297 
298 /* called when the transport is destroyed, e.g. when the process exits */
299 static int
300 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport,
301 		       spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
302 {
303 	struct nvmf_vfio_user_transport *vu_transport;
304 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
305 
306 	SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n");
307 
308 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
309 					transport);
310 
311 	(void)pthread_mutex_destroy(&vu_transport->lock);
312 
313 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
314 		TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
315 		nvmf_vfio_user_destroy_endpoint(endpoint);
316 	}
317 
318 	free(vu_transport);
319 
320 	if (cb_fn) {
321 		cb_fn(cb_arg);
322 	}
323 
324 	return 0;
325 }
326 
327 static struct spdk_nvmf_transport *
328 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts)
329 {
330 	struct nvmf_vfio_user_transport *vu_transport;
331 	int err;
332 
333 	vu_transport = calloc(1, sizeof(*vu_transport));
334 	if (vu_transport == NULL) {
335 		SPDK_ERRLOG("Transport alloc fail: %m\n");
336 		return NULL;
337 	}
338 
339 	err = pthread_mutex_init(&vu_transport->lock, NULL);
340 	if (err != 0) {
341 		SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
342 		goto err;
343 	}
344 
345 	TAILQ_INIT(&vu_transport->endpoints);
346 	TAILQ_INIT(&vu_transport->new_qps);
347 
348 	return &vu_transport->transport;
349 
350 err:
351 	free(vu_transport);
352 
353 	return NULL;
354 }
355 
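/* CAP.MQES is zero-based, so the largest supported queue has MQES + 1 entries. */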
356 static uint16_t
357 max_queue_size(struct nvmf_vfio_user_ctrlr const *ctrlr)
358 {
359 	assert(ctrlr != NULL);
360 	assert(ctrlr->qp[0] != NULL);
361 	assert(ctrlr->qp[0]->qpair.ctrlr != NULL);
362 
363 	return ctrlr->qp[0]->qpair.ctrlr->vcprop.cap.bits.mqes + 1;
364 }
365 
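/*
 * Translates a single guest address range into a host virtual address using
 * the libvfio-user DMA mapping (vfu_addr_to_sg/vfu_map_sg), recording the
 * result in the caller-provided sg/iov pair so it can be unmapped later.
 */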
366 static void *
367 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov)
368 {
369 	int ret;
370 
371 	assert(ctx != NULL);
372 	assert(sg != NULL);
373 	assert(iov != NULL);
374 
375 	ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, PROT_READ | PROT_WRITE);
376 	if (ret != 1) {
377 		return NULL;
378 	}
379 
380 	ret = vfu_map_sg(ctx, sg, iov, 1);
381 	if (ret != 0) {
382 		return NULL;
383 	}
384 
385 	assert(iov->iov_base != NULL);
386 	return iov->iov_base;
387 }
388 
389 static uint32_t
390 sq_head(struct nvmf_vfio_user_qpair *qpair)
391 {
392 	assert(qpair != NULL);
393 	return qpair->sq.head;
394 }
395 
396 static void
397 sqhd_advance(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair)
398 {
399 	assert(ctrlr != NULL);
400 	assert(qpair != NULL);
401 	qpair->sq.head = (qpair->sq.head + 1) % qpair->sq.size;
402 }
403 
404 static void
405 insert_queue(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q,
406 	     const bool is_cq, const uint16_t id)
407 {
408 	struct nvme_q *_q;
409 	struct nvmf_vfio_user_qpair *qpair;
410 
411 	assert(ctrlr != NULL);
412 	assert(q != NULL);
413 
414 	qpair = ctrlr->qp[id];
415 
416 	q->is_cq = is_cq;
417 	if (is_cq) {
418 		_q = &qpair->cq;
419 		*_q = *q;
420 		*hdbl(ctrlr, _q) = 0;
421 	} else {
422 		_q = &qpair->sq;
423 		*_q = *q;
424 		*tdbl(ctrlr, _q) = 0;
425 	}
426 }
427 
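/*
 * Maps the admin submission queue described by the AQA and ASQ registers and
 * resets its doorbell.
 */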
428 static int
429 asq_map(struct nvmf_vfio_user_ctrlr *ctrlr)
430 {
431 	struct nvme_q q = {};
432 	const struct spdk_nvmf_registers *regs;
433 
434 	assert(ctrlr != NULL);
435 	assert(ctrlr->qp[0] != NULL);
436 	assert(ctrlr->qp[0]->sq.addr == NULL);
437 	/* XXX ctrlr->asq == 0 is a valid memory address */
438 
439 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
440 	q.size = regs->aqa.bits.asqs + 1;
441 	q.head = ctrlr->doorbells[0] = 0;
442 	q.cqid = 0;
443 	q.addr = map_one(ctrlr->endpoint->vfu_ctx, regs->asq,
444 			 q.size * sizeof(struct spdk_nvme_cmd), &q.sg, &q.iov);
445 	if (q.addr == NULL) {
446 		return -1;
447 	}
448 	memset(q.addr, 0, q.size * sizeof(struct spdk_nvme_cmd));
449 	insert_queue(ctrlr, &q, false, 0);
450 
451 	return 0;
452 }
453 
454 static uint16_t
455 cq_next(struct nvme_q *q)
456 {
457 	assert(q != NULL);
458 	assert(q->is_cq);
459 	return (q->tail + 1) % q->size;
460 }
461 
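/*
 * Doorbells are laid out in pairs per queue ID (DSTRD is 0): index 2*qid is
 * the SQ tail doorbell and 2*qid + 1 is the CQ head doorbell.
 */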
462 static int
463 queue_index(uint16_t qid, int is_cq)
464 {
465 	return (qid * 2) + is_cq;
466 }
467 
468 static volatile uint32_t *
469 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
470 {
471 	assert(ctrlr != NULL);
472 	assert(q != NULL);
473 	assert(!q->is_cq);
474 
475 	return &ctrlr->doorbells[queue_index(io_q_id(q), false)];
476 }
477 
478 static volatile uint32_t *
479 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
480 {
481 	assert(ctrlr != NULL);
482 	assert(q != NULL);
483 	assert(q->is_cq);
484 
485 	return &ctrlr->doorbells[queue_index(io_q_id(q), true)];
486 }
487 
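/*
 * The CQ is full when advancing the tail would make it catch up with the
 * head doorbell last written by the host.
 */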
488 static bool
489 cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
490 {
491 	assert(ctrlr != NULL);
492 	assert(q != NULL);
493 	return cq_next(q) == *hdbl(ctrlr, q);
494 }
495 
496 static void
497 cq_tail_advance(struct nvme_q *q)
498 {
499 	assert(q != NULL);
500 	q->tail = cq_next(q);
501 }
502 
503 static int
504 acq_map(struct nvmf_vfio_user_ctrlr *ctrlr)
505 {
506 	struct nvme_q q = {};
507 	const struct spdk_nvmf_registers *regs;
508 
509 	assert(ctrlr != NULL);
510 	assert(ctrlr->qp[0] != NULL);
511 	assert(ctrlr->qp[0]->cq.addr == NULL);
512 
513 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
514 	assert(regs != NULL);
515 
516 	q.size = regs->aqa.bits.acqs + 1;
517 	q.tail = 0;
518 	q.addr = map_one(ctrlr->endpoint->vfu_ctx, regs->acq,
519 			 q.size * sizeof(struct spdk_nvme_cpl), &q.sg, &q.iov);
520 	if (q.addr == NULL) {
521 		return -1;
522 	}
523 	memset(q.addr, 0, q.size * sizeof(struct spdk_nvme_cpl));
524 	q.is_cq = true;
525 	q.ien = true;
526 	insert_queue(ctrlr, &q, true, 0);
527 
528 	return 0;
529 }
530 
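/*
 * Per-PRP-entry callback passed to spdk_nvme_map_cmd() via vfio_user_map_cmd():
 * maps one guest address range and appends it to the request's iovec list.
 */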
531 static void *
532 _map_one(void *prv, uint64_t addr, uint64_t len)
533 {
534 	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
535 	struct spdk_nvmf_qpair *qpair;
536 	struct nvmf_vfio_user_req *vu_req;
537 	struct nvmf_vfio_user_qpair *vu_qpair;
538 	void *ret;
539 
540 	assert(req != NULL);
541 	qpair = req->qpair;
542 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
543 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
544 
545 	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
546 	ret = map_one(vu_qpair->ctrlr->endpoint->vfu_ctx, addr, len,
547 		      &vu_req->sg[vu_req->iovcnt],
548 		      &vu_req->iov[vu_req->iovcnt]);
549 	if (spdk_likely(ret != NULL)) {
550 		vu_req->iovcnt++;
551 	}
552 	return ret;
553 }
554 
555 static int
556 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
557 		  struct iovec *iov, uint32_t length)
558 {
559 	/* Map the PRP list from guest physical memory to
560 	 * host virtual memory addresses.
561 	 */
562 	return spdk_nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
563 				 length, 4096, _map_one);
564 }
565 
566 static struct spdk_nvmf_request *
567 get_nvmf_req(struct nvmf_vfio_user_qpair *qp);
568 
569 static int
570 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
571 	       struct spdk_nvmf_request *req);
572 
573 /*
574  * Posts a CQE in the completion queue.
575  *
576  * @ctrlr: the vfio-user controller
577  * @cmd: the NVMe command for which the completion is posted
578  * @cq: the completion queue
579  * @cdw0: cdw0 as reported by NVMf (only for SPDK_NVME_OPC_GET/SET_FEATURES)
580  * @sc: the NVMe CQE status code
581  * @sct: the NVMe CQE status code type
582  */
583 static int
584 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
585 		struct nvme_q *cq, uint32_t cdw0, uint16_t sc,
586 		uint16_t sct)
587 {
588 	struct spdk_nvme_cpl *cpl;
589 	uint16_t qid;
590 	int err;
591 
592 	assert(ctrlr != NULL);
593 	assert(cmd != NULL);
594 
595 	qid = io_q_id(cq);
596 
597 	if (ctrlr->qp[0]->qpair.ctrlr->vcprop.csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
598 		SPDK_DEBUGLOG(nvmf_vfio,
599 			      "%s: ignore completion SQ%d cid=%d status=%#x\n",
600 			      ctrlr_id(ctrlr), qid, cmd->cid, sc);
601 		return 0;
602 	}
603 
604 	if (cq_is_full(ctrlr, cq)) {
605 		SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
606 			    ctrlr_id(ctrlr), qid, cq->tail, *hdbl(ctrlr, cq));
607 		return -1;
608 	}
609 
610 	cpl = ((struct spdk_nvme_cpl *)cq->addr) + cq->tail;
611 
612 	SPDK_DEBUGLOG(nvmf_vfio,
613 		      "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
614 		      ctrlr_id(ctrlr), qid, cmd->cid, sc, ctrlr->qp[qid]->sq.head,
615 		      cq->tail);
616 
617 	if (qid == 0) {
618 		switch (cmd->opc) {
619 		case SPDK_NVME_OPC_SET_FEATURES:
620 		case SPDK_NVME_OPC_GET_FEATURES:
621 			cpl->cdw0 = cdw0;
622 			break;
623 		}
624 	}
625 
626 
627 	assert(ctrlr->qp[qid] != NULL);
628 
629 	cpl->sqhd = ctrlr->qp[qid]->sq.head;
630 	cpl->cid = cmd->cid;
631 	cpl->status.dnr = 0x0;
632 	cpl->status.m = 0x0;
633 	cpl->status.sct = sct;
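	/* Flip the phase bit of the stale entry in this slot so the host sees a new completion. */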
634 	cpl->status.p = ~cpl->status.p;
635 	cpl->status.sc = sc;
636 
637 	cq_tail_advance(cq);
638 
639 	/*
640 	 * This function now executes in SPDK thread context; since we
641 	 * might be triggering interrupts from vfio-user thread context,
642 	 * check for race conditions.
643 	 */
644 	if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) {
645 		err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
646 		if (err != 0) {
647 			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
648 				    ctrlr_id(ctrlr));
649 			return err;
650 		}
651 	}
652 
653 	return 0;
654 }
655 
656 static struct nvme_q *
657 lookup_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, const uint16_t qid, const bool is_cq)
658 {
659 	struct nvme_q *q;
660 
661 	assert(ctrlr != NULL);
662 
663 	if (qid >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
664 		return NULL;
665 	}
666 
667 	if (ctrlr->qp[qid] == NULL) {
668 		return NULL;
669 	}
670 
671 	if (is_cq) {
672 		q = &ctrlr->qp[qid]->cq;
673 	} else {
674 		q = &ctrlr->qp[qid]->sq;
675 	}
676 
677 	if (q->addr == NULL) {
678 		return NULL;
679 	}
680 
681 	return q;
682 }
683 
684 static void
685 unmap_qp(struct nvmf_vfio_user_qpair *qp)
686 {
687 	struct nvmf_vfio_user_ctrlr *ctrlr;
688 
689 	if (qp->ctrlr == NULL) {
690 		return;
691 	}
692 	ctrlr = qp->ctrlr;
693 
694 	SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy I/O QP%d\n",
695 		      ctrlr_id(ctrlr), qp->qpair.qid);
696 
697 	if (qp->sq.addr != NULL) {
698 		vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, &qp->sq.sg, &qp->sq.iov, 1);
699 		qp->sq.addr = NULL;
700 	}
701 
702 	if (qp->cq.addr != NULL) {
703 		vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, &qp->cq.sg, &qp->cq.iov, 1);
704 		qp->cq.addr = NULL;
705 	}
706 }
707 
708 /*
709  * TODO we can immediately remove the QP from the list because this function
710  * is now executed by the SPDK thread.
711  */
712 static void
713 destroy_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid)
714 {
715 	struct nvmf_vfio_user_qpair *qpair;
716 
717 	if (ctrlr == NULL) {
718 		return;
719 	}
720 
721 	qpair = ctrlr->qp[qid];
722 	if (qpair == NULL) {
723 		return;
724 	}
725 
726 	SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy QP%d=%p\n", ctrlr_id(ctrlr),
727 		      qid, qpair);
728 
729 	unmap_qp(qpair);
730 	free(qpair->reqs_internal);
731 	free(qpair);
732 	ctrlr->qp[qid] = NULL;
733 }
734 
735 /* This function can only fail because of memory allocation errors. */
736 static int
737 init_qp(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport,
738 	const uint16_t qsize, const uint16_t id)
739 {
740 	int err = 0, i;
741 	struct nvmf_vfio_user_qpair *qpair;
742 	struct nvmf_vfio_user_req *vu_req;
743 	struct spdk_nvmf_request *req;
744 
745 	assert(ctrlr != NULL);
746 	assert(transport != NULL);
747 
748 	qpair = calloc(1, sizeof(*qpair));
749 	if (qpair == NULL) {
750 		return -ENOMEM;
751 	}
752 
753 	qpair->qpair.qid = id;
754 	qpair->qpair.transport = transport;
755 	qpair->ctrlr = ctrlr;
756 	qpair->qsize = qsize;
757 
758 	TAILQ_INIT(&qpair->reqs);
759 
760 	qpair->reqs_internal = calloc(qsize, sizeof(struct nvmf_vfio_user_req));
761 	if (qpair->reqs_internal == NULL) {
762 		SPDK_ERRLOG("%s: error allocating reqs: %m\n", ctrlr_id(ctrlr));
763 		err = -ENOMEM;
764 		goto out;
765 	}
766 
767 	for (i = 0; i < qsize; i++) {
768 		vu_req = &qpair->reqs_internal[i];
769 		req = &vu_req->req;
770 
771 		req->qpair = &qpair->qpair;
772 		req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
773 		req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;
774 
775 		TAILQ_INSERT_TAIL(&qpair->reqs, vu_req, link);
776 	}
777 	ctrlr->qp[id] = qpair;
778 out:
779 	if (err != 0) {
780 		free(qpair);
781 	}
782 	return err;
783 }
784 
785 /*
786  * Creates a completion or submission I/O queue. Returns 0 on success, -errno
787  * on error.
788  *
789  * XXX SPDK thread context.
790  */
791 static int
792 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
793 		   struct spdk_nvme_cmd *cmd, const bool is_cq)
794 {
795 	size_t entry_size;
796 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
797 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
798 	int err = 0;
799 	struct nvme_q io_q = {};
800 
801 	assert(ctrlr != NULL);
802 	assert(cmd != NULL);
803 
804 	SPDK_DEBUGLOG(nvmf_vfio,
805 		      "%s: create I/O %cQ%d: QSIZE=%#x\n", ctrlr_id(ctrlr),
806 		      is_cq ? 'C' : 'S', cmd->cdw10_bits.create_io_q.qid,
807 		      cmd->cdw10_bits.create_io_q.qsize);
808 
809 	if (cmd->cdw10_bits.create_io_q.qid >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
810 		SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr),
811 			    cmd->cdw10_bits.create_io_q.qid,
812 			    NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR);
813 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
814 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
815 		goto out;
816 	}
817 
818 	if (lookup_io_q(ctrlr, cmd->cdw10_bits.create_io_q.qid, is_cq)) {
819 		SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr),
820 			    is_cq ? 'C' : 'S', cmd->cdw10_bits.create_io_q.qid);
821 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
822 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
823 		goto out;
824 	}
825 
826 	/* TODO break rest of this function into smaller functions */
827 	if (is_cq) {
828 		entry_size = sizeof(struct spdk_nvme_cpl);
829 		if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
830 			/*
831 			 * TODO CAP.CMBS is currently set to zero, however we
832 			 * should zero it out explicitly when CAP is read.
833 			 * Support for CAP.CMBS is not mentioned in the NVMf
834 			 * spec.
835 			 */
836 			SPDK_ERRLOG("%s: non-PC CQ not supporred\n", ctrlr_id(ctrlr));
837 			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
838 			goto out;
839 		}
840 		io_q.ien = cmd->cdw11_bits.create_io_cq.ien;
841 		io_q.iv = cmd->cdw11_bits.create_io_cq.iv;
842 	} else {
843 		/* CQ must be created before SQ */
844 		if (!lookup_io_q(ctrlr, cmd->cdw11_bits.create_io_sq.cqid, true)) {
845 			SPDK_ERRLOG("%s: CQ%d does not exist\n", ctrlr_id(ctrlr),
846 				    cmd->cdw11_bits.create_io_sq.cqid);
847 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
848 			sc = SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
849 			goto out;
850 		}
851 
852 		entry_size = sizeof(struct spdk_nvme_cmd);
853 		if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
854 			SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
855 			sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
856 			goto out;
857 		}
858 
859 		io_q.cqid = cmd->cdw11_bits.create_io_sq.cqid;
860 		SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
861 			      cmd->cdw10_bits.create_io_q.qid, io_q.cqid);
862 	}
863 
864 	io_q.size = cmd->cdw10_bits.create_io_q.qsize + 1;
865 	if (io_q.size > max_queue_size(ctrlr)) {
866 		SPDK_ERRLOG("%s: queue too big, want=%d, max=%d\n", ctrlr_id(ctrlr),
867 			    io_q.size, max_queue_size(ctrlr));
868 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
869 		sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE;
870 		goto out;
871 	}
872 
873 	io_q.addr = map_one(ctrlr->endpoint->vfu_ctx, cmd->dptr.prp.prp1,
874 			    io_q.size * entry_size, &io_q.sg, &io_q.iov);
875 	if (io_q.addr == NULL) {
876 		sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
877 		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
878 		goto out;
879 	}
880 	io_q.prp1 = cmd->dptr.prp.prp1;
881 	memset(io_q.addr, 0, io_q.size * entry_size);
882 
883 	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped %cQ%d IOVA=%#lx vaddr=%#llx\n",
884 		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
885 		      cmd->cdw10_bits.create_io_q.qid, cmd->dptr.prp.prp1,
886 		      (unsigned long long)io_q.addr);
887 
888 	if (is_cq) {
889 		err = init_qp(ctrlr, ctrlr->qp[0]->qpair.transport, io_q.size,
890 			      cmd->cdw10_bits.create_io_q.qid);
891 		if (err != 0) {
892 			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
893 			goto out;
894 		}
895 	} else {
896 		/*
897 		 * After we return from the nvmf_vfio_user_poll_group_poll thread, once
898 		 * nvmf_vfio_user_accept executes it will pick up this QP and eventually
899 		 * call nvmf_vfio_user_poll_group_add. The rest of the operation needed to
900 		 * complete the addition of the queue is continued in the
901 		 * completion callback.
902 		 */
903 		TAILQ_INSERT_TAIL(&ctrlr->transport->new_qps, ctrlr->qp[cmd->cdw10_bits.create_io_q.qid], link);
904 
905 	}
906 	insert_queue(ctrlr, &io_q, is_cq, cmd->cdw10_bits.create_io_q.qid);
907 
908 out:
909 	return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
910 }
911 
912 /*
913  * Deletes a completion or submission I/O queue.
914  */
915 static int
916 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
917 		struct spdk_nvme_cmd *cmd, const bool is_cq)
918 {
919 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
920 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
921 
922 	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n",
923 		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
924 		      cmd->cdw10_bits.delete_io_q.qid);
925 
926 	if (lookup_io_q(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq) == NULL) {
927 		SPDK_ERRLOG("%s: %cQ%d does not exist\n", ctrlr_id(ctrlr),
928 			    is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid);
929 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
930 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
931 		goto out;
932 	}
933 
934 	if (is_cq) {
935 		/* SQ must have been deleted first */
936 		if (ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state != VFIO_USER_QPAIR_DELETED) {
937 			SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr));
938 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
939 			sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION;
940 			goto out;
941 		}
942 	} else {
943 		/*
944 		 * This doesn't actually delete the I/O queue; we can't
945 		 * do that anyway because NVMf doesn't support it. We're merely
946 		 * telling the poll_group_poll function to skip checking this
947 		 * queue. The only workflow in which this works is when CC.EN is
948 		 * set to 0 and we're stopping the subsystem, so we know that the
949 		 * relevant callbacks to destroy the queues will be called.
950 		 */
951 		assert(ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state == VFIO_USER_QPAIR_ACTIVE);
952 		ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state = VFIO_USER_QPAIR_DELETED;
953 	}
954 
955 out:
956 	return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
957 }
958 
959 /*
960  * Returns 0 on success and -errno on error.
961  *
962  * XXX SPDK thread context
963  */
964 static int
965 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd)
966 {
967 	assert(ctrlr != NULL);
968 	assert(cmd != NULL);
969 
970 	SPDK_DEBUGLOG(nvmf_vfio, "%s: handle admin req opc=%#x cid=%d\n",
971 		      ctrlr_id(ctrlr), cmd->opc, cmd->cid);
972 
973 	switch (cmd->opc) {
974 	case SPDK_NVME_OPC_CREATE_IO_CQ:
975 	case SPDK_NVME_OPC_CREATE_IO_SQ:
976 		return handle_create_io_q(ctrlr, cmd,
977 					  cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ);
978 	case SPDK_NVME_OPC_DELETE_IO_SQ:
979 	case SPDK_NVME_OPC_DELETE_IO_CQ:
980 		return handle_del_io_q(ctrlr, cmd,
981 				       cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ);
982 	default:
983 		return handle_cmd_req(ctrlr, cmd, get_nvmf_req(ctrlr->qp[0]));
984 	}
985 }
986 
987 static int
988 handle_cmd_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
989 {
990 	struct nvmf_vfio_user_qpair *qpair = cb_arg;
991 
992 	assert(qpair != NULL);
993 	assert(req != NULL);
994 
995 	vfu_unmap_sg(qpair->ctrlr->endpoint->vfu_ctx, req->sg, req->iov, req->iovcnt);
996 
997 	return post_completion(qpair->ctrlr, &req->req.cmd->nvme_cmd,
998 			       &qpair->ctrlr->qp[req->req.qpair->qid]->cq,
999 			       req->req.rsp->nvme_cpl.cdw0,
1000 			       req->req.rsp->nvme_cpl.status.sc,
1001 			       req->req.rsp->nvme_cpl.status.sct);
1002 }
1003 
1004 static int
1005 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair,
1006 	    struct spdk_nvme_cmd *cmd)
1007 {
1008 	assert(qpair != NULL);
1009 	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
1010 		return consume_admin_cmd(ctrlr, cmd);
1011 	}
1012 
1013 	return handle_cmd_req(ctrlr, cmd, get_nvmf_req(qpair));
1014 }
1015 
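/*
 * Consumes all submission queue entries between the current SQ head and the
 * new tail value written by the host to the SQ tail doorbell.
 */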
1016 static ssize_t
1017 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail,
1018 		     struct nvmf_vfio_user_qpair *qpair)
1019 {
1020 	struct spdk_nvme_cmd *queue;
1021 
1022 	assert(ctrlr != NULL);
1023 	assert(qpair != NULL);
1024 
1025 	queue = qpair->sq.addr;
1026 	while (sq_head(qpair) != new_tail) {
1027 		int err;
1028 		struct spdk_nvme_cmd *cmd = &queue[sq_head(qpair)];
1029 
1030 		/*
1031 		 * SQHD must contain the new head pointer, so we must increase
1032 		 * it before we generate a completion.
1033 		 */
1034 		sqhd_advance(ctrlr, qpair);
1035 
1036 		err = consume_cmd(ctrlr, qpair, cmd);
1037 		if (err != 0) {
1038 			return err;
1039 		}
1040 	}
1041 
1042 	return 0;
1043 }
1044 
1045 static int
1046 map_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1047 {
1048 	int err;
1049 
1050 	assert(ctrlr != NULL);
1051 
1052 	err = acq_map(ctrlr);
1053 	if (err != 0) {
1054 		return err;
1055 	}
1056 
1057 	err = asq_map(ctrlr);
1058 	if (err != 0) {
1059 		return err;
1060 	}
1061 
1062 	return 0;
1063 }
1064 
1065 static void
1066 unmap_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1067 {
1068 	assert(ctrlr->qp[0] != NULL);
1069 
1070 	unmap_qp(ctrlr->qp[0]);
1071 }
1072 
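/*
 * libvfio-user callback invoked when the client adds a DMA region: registers
 * the memory with SPDK when it is mapped read/write, then tries to remap any
 * queue pairs that were previously marked inactive by a region removal.
 */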
1073 static void
1074 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
1075 {
1076 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1077 	struct nvmf_vfio_user_ctrlr *ctrlr;
1078 	struct nvmf_vfio_user_qpair *qpair;
1079 	int i, ret;
1080 
1081 	if (!info->vaddr || ((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
1082 	    (info->mapping.iov_len & MASK_2MB)) {
1083 		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr,
1084 			      (uintptr_t)info->mapping.iov_base,
1085 			      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1086 		return;
1087 	}
1088 
1089 	assert(endpoint != NULL);
1090 	if (endpoint->ctrlr == NULL) {
1091 		return;
1092 	}
1093 	ctrlr = endpoint->ctrlr;
1094 
1095 	SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %#lx-%#lx\n", ctrlr_id(ctrlr),
1096 		      (uintptr_t)info->mapping.iov_base,
1097 		      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1098 
1099 	/* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO;
1100 	 * here we also check the protection bits before registering the memory with SPDK.
1101 	 */
1102 	if ((info->prot == (PROT_WRITE | PROT_READ)) &&
1103 	    (spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len))) {
1104 		SPDK_ERRLOG("Memory region register %#lx-%#lx failed\n",
1105 			    (uint64_t)(uintptr_t)info->mapping.iov_base,
1106 			    (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1107 	}
1108 
1109 	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1110 		qpair = ctrlr->qp[i];
1111 		if (qpair == NULL) {
1112 			continue;
1113 		}
1114 
1115 		if (qpair->state != VFIO_USER_QPAIR_INACTIVE) {
1116 			continue;
1117 		}
1118 
1119 		if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
1120 			ret = map_admin_queue(ctrlr);
1121 			if (ret) {
1122 				SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap Admin queue\n");
1123 				continue;
1124 			}
1125 			qpair->state = VFIO_USER_QPAIR_ACTIVE;
1126 		} else {
1127 			struct nvme_q *sq = &qpair->sq;
1128 			struct nvme_q *cq = &qpair->cq;
1129 
1130 			sq->addr = map_one(ctrlr->endpoint->vfu_ctx, sq->prp1, sq->size * 64, &sq->sg, &sq->iov);
1131 			if (!sq->addr) {
1132 				SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n",
1133 					      i, sq->prp1, sq->prp1 + sq->size * 64);
1134 				continue;
1135 			}
1136 			cq->addr = map_one(ctrlr->endpoint->vfu_ctx, cq->prp1, cq->size * 16, &cq->sg, &cq->iov);
1137 			if (!cq->addr) {
1138 				SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n",
1139 					      i, cq->prp1, cq->prp1 + cq->size * 16);
1140 				continue;
1141 			}
1142 			qpair->state = VFIO_USER_QPAIR_ACTIVE;
1143 		}
1144 	}
1145 }
1146 
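/*
 * libvfio-user callback invoked when the client removes a DMA region:
 * unregisters the memory from SPDK and unmaps any queue pair whose SQ or CQ
 * falls within the removed range, marking it inactive until it can be
 * remapped by memory_region_add_cb().
 */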
1147 static int
1148 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
1149 {
1150 
1151 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1152 	struct nvmf_vfio_user_ctrlr *ctrlr;
1153 	struct nvmf_vfio_user_qpair *qpair;
1154 	void *map_start, *map_end;
1155 	int i;
1156 
1157 	if (!info->vaddr || ((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
1158 	    (info->mapping.iov_len & MASK_2MB)) {
1159 		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr,
1160 			      (uintptr_t)info->mapping.iov_base,
1161 			      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1162 		return 0;
1163 	}
1164 
1165 	assert(endpoint != NULL);
1166 	if (endpoint->ctrlr == NULL) {
1167 		return 0;
1168 	}
1169 	ctrlr = endpoint->ctrlr;
1170 
1171 	SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %#lx-%#lx\n", ctrlr_id(ctrlr),
1172 		      (uintptr_t)info->mapping.iov_base,
1173 		      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1174 
1175 	if ((info->prot == (PROT_WRITE | PROT_READ)) &&
1176 	    (spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len))) {
1177 		SPDK_ERRLOG("Memory region unregister %#lx-%#lx failed\n",
1178 			    (uint64_t)(uintptr_t)info->mapping.iov_base,
1179 			    (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1180 	}
1181 
1182 	map_start = info->mapping.iov_base;
1183 	map_end = info->mapping.iov_base + info->mapping.iov_len;
1184 	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1185 		qpair = ctrlr->qp[i];
1186 		if (qpair == NULL) {
1187 			continue;
1188 		}
1189 
1190 		if ((qpair->cq.addr >= map_start && qpair->cq.addr < map_end) ||
1191 		    (qpair->sq.addr >= map_start && qpair->sq.addr < map_end)) {
1192 			unmap_qp(qpair);
1193 			qpair->state = VFIO_USER_QPAIR_INACTIVE;
1194 		}
1195 	}
1196 
1197 	return 0;
1198 }
1199 
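/*
 * Completion callback for the fabrics Property Get/Set commands built in
 * access_bar0_fn(): copies the property value back to the caller's buffer on
 * a Get, and maps/unmaps the admin queue on CC.EN and CC.SHN transitions on
 * a Set.
 */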
1200 static int
1201 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
1202 {
1203 	struct nvmf_vfio_user_qpair *qpair = cb_arg;
1204 	int ret;
1205 
1206 	assert(qpair != NULL);
1207 	assert(req != NULL);
1208 
1209 	if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) {
1210 		assert(qpair->ctrlr != NULL);
1211 		assert(req != NULL);
1212 
1213 		memcpy(req->req.data,
1214 		       &req->req.rsp->prop_get_rsp.value.u64,
1215 		       req->req.length);
1216 	} else {
1217 		assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET);
1218 		assert(qpair->ctrlr != NULL);
1219 
1220 		if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) {
1221 			union spdk_nvme_cc_register *cc;
1222 
1223 			cc = (union spdk_nvme_cc_register *)&req->req.cmd->prop_set_cmd.value.u64;
1224 
1225 			if (cc->bits.en == 1 && cc->bits.shn == 0) {
1226 				SPDK_DEBUGLOG(nvmf_vfio,
1227 					      "%s: MAP Admin queue\n",
1228 					      ctrlr_id(qpair->ctrlr));
1229 				ret = map_admin_queue(qpair->ctrlr);
1230 				if (ret) {
1231 					SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(qpair->ctrlr));
1232 					return ret;
1233 				}
1234 				qpair->state = VFIO_USER_QPAIR_ACTIVE;
1235 			} else if ((cc->bits.en == 0 && cc->bits.shn == 0) ||
1236 				   (cc->bits.en == 1 && cc->bits.shn != 0)) {
1237 				SPDK_DEBUGLOG(nvmf_vfio,
1238 					      "%s: UNMAP Admin queue\n",
1239 					      ctrlr_id(qpair->ctrlr));
1240 				unmap_admin_queue(qpair->ctrlr);
1241 				qpair->state = VFIO_USER_QPAIR_INACTIVE;
1242 			}
1243 		}
1244 	}
1245 
1246 	return 0;
1247 }
1248 
1249 /*
1250  * XXX Do NOT remove, see comment in access_bar0_fn.
1251  *
1252  * Handles an access (read or write) to a doorbell register at offset 0x1000 or more.
1253  *
1254  * DSTRD is set to fixed value 0 for NVMf.
1255  *
1256  */
1257 static int
1258 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf,
1259 		  const size_t count, loff_t pos, const bool is_write)
1260 {
1261 	assert(ctrlr != NULL);
1262 	assert(buf != NULL);
1263 
1264 	if (count != sizeof(uint32_t)) {
1265 		SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n",
1266 			    ctrlr_id(ctrlr), count);
1267 		errno = EINVAL;
1268 		return -1;
1269 	}
1270 
1271 	pos -= NVMF_VFIO_USER_DOORBELLS_OFFSET;
1272 
1273 	/* pos must be dword aligned */
1274 	if ((pos & 0x3) != 0) {
1275 		SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos);
1276 		errno = EINVAL;
1277 		return -1;
1278 	}
1279 
1280 	/* convert byte offset to array index */
1281 	pos >>= 2;
1282 
1283 	if (pos >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR * 2) {
1284 		/*
1285 		 * TODO: need to emit a "Write to Invalid Doorbell Register"
1286 		 * asynchronous event
1287 		 */
1288 		SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos);
1289 		errno = EINVAL;
1290 		return -1;
1291 	}
1292 
1293 	if (is_write) {
1294 		ctrlr->doorbells[pos] = *buf;
1295 		spdk_wmb();
1296 	} else {
1297 		spdk_rmb();
1298 		*buf = ctrlr->doorbells[pos];
1299 	}
1300 	return 0;
1301 }
1302 
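/*
 * Handles accesses to the non-mmap'd portions of BAR0: doorbell accesses
 * (offset 0x1000 and above) go to handle_dbl_access(), while NVMe register
 * accesses are translated into fabrics Property Get/Set commands and handed
 * to the NVMf layer; nvmf_vfio_user_prop_req_rsp() completes them.
 */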
1303 static ssize_t
1304 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos,
1305 	       bool is_write)
1306 {
1307 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1308 	struct nvmf_vfio_user_ctrlr *ctrlr;
1309 	struct nvmf_vfio_user_req *req;
1310 	int ret;
1311 
1312 	ctrlr = endpoint->ctrlr;
1313 
1314 	SPDK_DEBUGLOG(nvmf_vfio,
1315 		      "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n",
1316 		      endpoint_id(endpoint), is_write ? "write" : "read",
1317 		      ctrlr, count, pos);
1318 
1319 	if (pos >= NVMF_VFIO_USER_DOORBELLS_OFFSET) {
1320 		/*
1321 		 * XXX The fact that the doorbells can be memory mapped doesn't
1322 		 * mean that the client (VFIO in QEMU) is obliged to memory
1323 		 * map them; it might still elect to access them via regular
1324 		 * read/write.
1325 		 */
1326 		ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count,
1327 					pos, is_write);
1328 		if (ret == 0) {
1329 			return count;
1330 		}
1331 		assert(errno != 0);
1332 		return ret;
1333 	}
1334 
1335 	/* Construct a Fabric Property Get/Set command and send it */
1336 	req = get_nvmf_vfio_user_req(ctrlr->qp[0]);
1337 	if (req == NULL) {
1338 		errno = ENOBUFS;
1339 		return -1;
1340 	}
1341 
1342 	req->cb_fn = nvmf_vfio_user_prop_req_rsp;
1343 	req->cb_arg = ctrlr->qp[0];
1344 	req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC;
1345 	req->req.cmd->prop_set_cmd.cid = 0;
1346 	req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1;
1347 	req->req.cmd->prop_set_cmd.ofst = pos;
1348 	if (is_write) {
1349 		req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET;
1350 		if (req->req.cmd->prop_set_cmd.attrib.size) {
1351 			req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf;
1352 		} else {
1353 			req->req.cmd->prop_set_cmd.value.u32.high = 0;
1354 			req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf;
1355 		}
1356 	} else {
1357 		req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET;
1358 	}
1359 	req->req.length = count;
1360 	req->req.data = buf;
1361 
1362 	spdk_nvmf_request_exec_fabrics(&req->req);
1363 
1364 	return count;
1365 }
1366 
1367 /*
1368  * The NVMe driver reads 4096 bytes, which is the size of the extended PCI
1369  * configuration space available on PCI-X 2.0 and PCI Express buses.
1370  */
1371 static ssize_t
1372 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset,
1373 		  bool is_write)
1374 {
1375 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1376 
1377 	if (is_write) {
1378 		SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n",
1379 			    endpoint_id(endpoint), offset, offset + count);
1380 		errno = EINVAL;
1381 		return -1;
1382 	}
1383 
1384 	if (offset + count > PCI_CFG_SPACE_EXP_SIZE) {
1385 		SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n",
1386 			    endpoint_id(endpoint), offset, count,
1387 			    PCI_CFG_SPACE_EXP_SIZE);
1388 		errno = ERANGE;
1389 		return -1;
1390 	}
1391 
1392 	memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count);
1393 
1394 	return count;
1395 }
1396 
1397 static void
1398 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg)
1399 {
1400 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1401 
1402 	if (level >= LOG_DEBUG) {
1403 		SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
1404 	} else if (level >= LOG_INFO) {
1405 		SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
1406 	} else if (level >= LOG_NOTICE) {
1407 		SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg);
1408 	} else if (level >= LOG_WARNING) {
1409 		SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg);
1410 	} else {
1411 		SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg);
1412 	}
1413 }
1414 
1415 static void
1416 init_pci_config_space(vfu_pci_config_space_t *p)
1417 {
1418 	/* MLBAR */
1419 	p->hdr.bars[0].raw = 0x0;
1420 	/* MUBAR */
1421 	p->hdr.bars[1].raw = 0x0;
1422 
1423 	/* vendor specific, let's set them to zero for now */
1424 	p->hdr.bars[3].raw = 0x0;
1425 	p->hdr.bars[4].raw = 0x0;
1426 	p->hdr.bars[5].raw = 0x0;
1427 
1428 	/* enable INTx */
1429 	p->hdr.intr.ipin = 0x1;
1430 }
1431 
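/*
 * Sets up the emulated PCI device: PCI IDs and the NVMe class code, the
 * PM/PCIe/MSI-X capabilities, the config space and BAR regions (with the
 * doorbell page of BAR0 exposed via sparse mmap), the DMA and IRQ callbacks,
 * and finally realizes the libvfio-user context.
 */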
1432 static int
1433 vfio_user_dev_info_fill(struct nvmf_vfio_user_endpoint *endpoint)
1434 {
1435 	int ret;
1436 	ssize_t cap_offset;
1437 	vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx;
1438 
1439 	struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 };
1440 	struct pxcap pxcap = {
1441 		.hdr.id = PCI_CAP_ID_EXP,
1442 		.pxcaps.ver = 0x2,
1443 		.pxdcap = {.per = 0x1, .flrc = 0x1},
1444 		.pxdcap2.ctds = 0x1
1445 	};
1446 
1447 	struct msixcap msixcap = {
1448 		.hdr.id = PCI_CAP_ID_MSIX,
1449 		.mxc.ts = NVME_IRQ_MSIX_NUM - 1,
1450 		.mtab = {.tbir = 0x4, .to = 0x0},
1451 		.mpba = {.pbir = 0x5, .pbao = 0x0}
1452 	};
1453 
1454 	static struct iovec sparse_mmap[] = {
1455 		{
1456 			.iov_base = (void *)NVMF_VFIO_USER_DOORBELLS_OFFSET,
1457 			.iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE,
1458 		},
1459 	};
1460 
1461 	ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0);
1462 	if (ret < 0) {
1463 		SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx);
1464 		return ret;
1465 	}
1466 	vfu_pci_set_id(vfu_ctx, 0x4e58, 0x0001, 0, 0);
1467 	/*
1468 	 * 0x01, mass storage controller (base class)
1469 	 * 0x08, non-volatile memory controller (sub-class)
1470 	 * 0x02, controller uses the NVM Express programming interface
1471 	 */
1472 	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);
1473 
1474 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
1475 	if (cap_offset < 0) {
1476 		SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx);
1477 		return cap_offset;
1478 	}
1479 
1480 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
1481 	if (cap_offset < 0) {
1482 		SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx);
1483 		return cap_offset;
1484 	}
1485 
1486 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
1487 	if (cap_offset < 0) {
1488 		SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx);
1489 		return cap_offset;
1490 	}
1491 
1492 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
1493 			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1);
1494 	if (ret < 0) {
1495 		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
1496 		return ret;
1497 	}
1498 
1499 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
1500 			       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
1501 			       sparse_mmap, 1, endpoint->fd);
1502 	if (ret < 0) {
1503 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
1504 		return ret;
1505 	}
1506 
1507 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, PAGE_SIZE,
1508 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1);
1509 	if (ret < 0) {
1510 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
1511 		return ret;
1512 	}
1513 
1514 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, PAGE_SIZE,
1515 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1);
1516 	if (ret < 0) {
1517 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
1518 		return ret;
1519 	}
1520 
1521 	ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
1522 	if (ret < 0) {
1523 		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
1524 		return ret;
1525 	}
1526 
1527 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
1528 	if (ret < 0) {
1529 		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
1530 		return ret;
1531 	}
1532 
1533 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
1534 	if (ret < 0) {
1535 		SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
1536 		return ret;
1537 	}
1538 
1539 	ret = vfu_realize_ctx(vfu_ctx);
1540 	if (ret < 0) {
1541 		SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx);
1542 		return ret;
1543 	}
1544 
1545 	endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx);
1546 	assert(endpoint->pci_config_space != NULL);
1547 	init_pci_config_space(endpoint->pci_config_space);
1548 
1549 	assert(cap_offset != 0);
1550 	endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset);
1551 
1552 	return 0;
1553 }
1554 
1555 static void
1556 _destroy_ctrlr(void *ctx)
1557 {
1558 	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
1559 	int i;
1560 
1561 	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1562 		destroy_qp(ctrlr, i);
1563 	}
1564 
1565 	if (ctrlr->endpoint) {
1566 		ctrlr->endpoint->ctrlr = NULL;
1567 	}
1568 
1569 	spdk_poller_unregister(&ctrlr->mmio_poller);
1570 	free(ctrlr);
1571 }
1572 
1573 static int
1574 destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
1575 {
1576 	assert(ctrlr != NULL);
1577 
1578 	SPDK_DEBUGLOG(nvmf_vfio, "destroy %s\n", ctrlr_id(ctrlr));
1579 
1580 	if (ctrlr->thread == spdk_get_thread()) {
1581 		_destroy_ctrlr(ctrlr);
1582 	} else {
1583 		spdk_thread_send_msg(ctrlr->thread, _destroy_ctrlr, ctrlr);
1584 	}
1585 
1586 	return 0;
1587 }
1588 
1589 static void
1590 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport,
1591 			    struct nvmf_vfio_user_endpoint *endpoint)
1592 {
1593 	struct nvmf_vfio_user_ctrlr *ctrlr;
1594 	int err;
1595 
1596 	/* First, construct a vfio-user CUSTOM transport controller */
1597 	ctrlr = calloc(1, sizeof(*ctrlr));
1598 	if (ctrlr == NULL) {
1599 		err = -ENOMEM;
1600 		goto out;
1601 	}
1602 	ctrlr->cntlid = 0xffff;
1603 	ctrlr->transport = transport;
1604 	ctrlr->endpoint = endpoint;
1605 	ctrlr->doorbells = endpoint->doorbells;
1606 
1607 	/* Then, construct an admin queue pair */
1608 	err = init_qp(ctrlr, &transport->transport, NVMF_VFIO_USER_DEFAULT_AQ_DEPTH, 0);
1609 	if (err != 0) {
1610 		goto out;
1611 	}
1612 	endpoint->ctrlr = ctrlr;
1613 	ctrlr->ready = true;
1614 
1615 	/* Notify the generic layer about the new admin queue pair */
1616 	TAILQ_INSERT_TAIL(&ctrlr->transport->new_qps, ctrlr->qp[0], link);
1617 
1618 out:
1619 	if (err != 0) {
1620 		SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n",
1621 			    endpoint_id(endpoint), strerror(-err));
1622 		if (destroy_ctrlr(ctrlr) != 0) {
1623 			SPDK_ERRLOG("%s: failed to clean up\n",
1624 				    endpoint_id(endpoint));
1625 		}
1626 	}
1627 }
1628 
1629 static int
1630 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport,
1631 		      const struct spdk_nvme_transport_id *trid,
1632 		      struct spdk_nvmf_listen_opts *listen_opts)
1633 {
1634 	struct nvmf_vfio_user_transport *vu_transport;
1635 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
1636 	char *path = NULL;
1637 	char uuid[PATH_MAX] = {};
1638 	int fd;
1639 	int err;
1640 
1641 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
1642 					transport);
1643 
1644 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
1645 		/* Only compare traddr */
1646 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
1647 			return -EEXIST;
1648 		}
1649 	}
1650 
1651 	endpoint = calloc(1, sizeof(*endpoint));
1652 	if (!endpoint) {
1653 		return -ENOMEM;
1654 	}
1655 
1656 	endpoint->fd = -1;
1657 	memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));
1658 
1659 	err = asprintf(&path, "%s/bar0", endpoint_id(endpoint));
1660 	if (err == -1) {
1661 		goto out;
1662 	}
1663 
1664 	fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
1665 	if (fd == -1) {
1666 		SPDK_ERRLOG("%s: failed to open device memory at %s: %m\n",
1667 			    endpoint_id(endpoint), path);
1668 		err = fd;
1669 		free(path);
1670 		goto out;
1671 	}
1672 	free(path);
1673 
1674 	err = ftruncate(fd, NVMF_VFIO_USER_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE);
1675 	if (err != 0) {
1676 		goto out;
1677 	}
1678 
1679 	endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
1680 				   PROT_READ | PROT_WRITE, MAP_SHARED, fd, NVMF_VFIO_USER_DOORBELLS_OFFSET);
1681 	if (endpoint->doorbells == MAP_FAILED) {
1682 		endpoint->doorbells = NULL;
1683 		err = -errno;
1684 		goto out;
1685 	}
1686 
1687 	endpoint->fd = fd;
1688 
1689 	snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
1690 
1691 	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
1692 					   endpoint, VFU_DEV_TYPE_PCI);
1693 	if (endpoint->vfu_ctx == NULL) {
1694 		SPDK_ERRLOG("%s: error creating libmuser context: %m\n",
1695 			    endpoint_id(endpoint));
1696 		err = -1;
1697 		goto out;
1698 	}
1699 	vfu_setup_log(endpoint->vfu_ctx, vfio_user_log,
1700 		      SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio") ? LOG_DEBUG : LOG_ERR);
1701 
1702 	err = vfio_user_dev_info_fill(endpoint);
1703 	if (err < 0) {
1704 		goto out;
1705 	}
1706 
1707 	pthread_mutex_init(&endpoint->lock, NULL);
1708 	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
1709 	SPDK_DEBUGLOG(nvmf_vfio, "%s: doorbells %p\n", uuid, endpoint->doorbells);
1710 
1711 out:
1712 	if (err != 0) {
1713 		nvmf_vfio_user_destroy_endpoint(endpoint);
1714 	}
1715 
1716 	return err;
1717 }
1718 
1719 static void
1720 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
1721 			   const struct spdk_nvme_transport_id *trid)
1722 {
1723 	struct nvmf_vfio_user_transport *vu_transport;
1724 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
1725 	int err;
1726 
1727 	assert(trid != NULL);
1728 	assert(trid->traddr != NULL);
1729 
1730 	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
1731 
1732 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
1733 					transport);
1734 
1735 	pthread_mutex_lock(&vu_transport->lock);
1736 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
1737 		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
1738 			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
1739 			if (endpoint->ctrlr) {
1740 				err = destroy_ctrlr(endpoint->ctrlr);
1741 				if (err != 0) {
1742 					SPDK_ERRLOG("%s: failed destroy controller: %s\n",
1743 						    endpoint_id(endpoint), strerror(-err));
1744 				}
1745 			}
1746 			nvmf_vfio_user_destroy_endpoint(endpoint);
1747 			pthread_mutex_unlock(&vu_transport->lock);
1748 
1749 			return;
1750 		}
1751 	}
1752 	pthread_mutex_unlock(&vu_transport->lock);
1753 
1754 	SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr);
1755 }
1756 
1757 static void
1758 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport,
1759 			  struct spdk_nvmf_subsystem *subsystem,
1760 			  struct spdk_nvmf_ctrlr_data *cdata)
1761 {
1762 	memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls));
1763 	cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED;
1764 }
1765 
1766 static int
1767 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport,
1768 				const struct spdk_nvmf_subsystem *subsystem,
1769 				const struct spdk_nvme_transport_id *trid)
1770 {
1771 	struct nvmf_vfio_user_transport *vu_transport;
1772 	struct nvmf_vfio_user_endpoint *endpoint;
1773 
1774 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);
1775 
1776 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
1777 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
1778 			break;
1779 		}
1780 	}
1781 
1782 	if (endpoint == NULL) {
1783 		return -ENOENT;
1784 	}
1785 
1786 	endpoint->subsystem = subsystem;
1787 
1788 	return 0;
1789 }
1790 
1791 /*
1792  * Executed periodically.
1793  *
1794  * XXX SPDK thread context.
1795  */
1796 static uint32_t
1797 nvmf_vfio_user_accept(struct spdk_nvmf_transport *transport)
1798 {
1799 	int err;
1800 	struct nvmf_vfio_user_transport *vu_transport;
1801 	struct nvmf_vfio_user_qpair *qp, *tmp_qp;
1802 	struct nvmf_vfio_user_endpoint *endpoint;
1803 
1804 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
1805 					transport);
1806 
1807 	pthread_mutex_lock(&vu_transport->lock);
1808 
1809 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
1810 		/* we need to try to attach the controller again after a reset or shutdown */
1811 		if (endpoint->ctrlr != NULL && endpoint->ctrlr->ready) {
1812 			continue;
1813 		}
1814 
1815 		err = vfu_attach_ctx(endpoint->vfu_ctx);
1816 		if (err != 0) {
1817 			if (errno == EAGAIN || errno == EWOULDBLOCK) {
1818 				continue;
1819 			}
1820 
1821 			pthread_mutex_unlock(&vu_transport->lock);
1822 			return -EFAULT;
1823 		}
1824 
1825 		/* Construct a controller */
1826 		nvmf_vfio_user_create_ctrlr(vu_transport, endpoint);
1827 	}
1828 
1829 	TAILQ_FOREACH_SAFE(qp, &vu_transport->new_qps, link, tmp_qp) {
1830 		TAILQ_REMOVE(&vu_transport->new_qps, qp, link);
1831 		spdk_nvmf_tgt_new_qpair(transport->tgt, &qp->qpair);
1832 	}
1833 
1834 	pthread_mutex_unlock(&vu_transport->lock);
1835 
1836 	return 0;
1837 }
1838 
1839 static void
1840 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport,
1841 			struct spdk_nvme_transport_id *trid,
1842 			struct spdk_nvmf_discovery_log_page_entry *entry)
1843 { }
1844 
1845 static struct spdk_nvmf_transport_poll_group *
1846 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport)
1847 {
1848 	struct nvmf_vfio_user_poll_group *vu_group;
1849 
1850 	SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n");
1851 
1852 	vu_group = calloc(1, sizeof(*vu_group));
1853 	if (vu_group == NULL) {
1854 		SPDK_ERRLOG("Error allocating poll group: %m");
1855 		return NULL;
1856 	}
1857 
1858 	TAILQ_INIT(&vu_group->qps);
1859 
1860 	return &vu_group->group;
1861 }
1862 
1863 /* called when process exits */
1864 static void
1865 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
1866 {
1867 	struct nvmf_vfio_user_poll_group *vu_group;
1868 
1869 	SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n");
1870 
1871 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
1872 
1873 	free(vu_group);
1874 }
1875 
1876 static void
1877 vfio_user_qpair_disconnect_cb(void *ctx)
1878 {
1879 	struct nvmf_vfio_user_endpoint *endpoint = ctx;
1880 	struct nvmf_vfio_user_ctrlr *ctrlr;
1881 
1882 	pthread_mutex_lock(&endpoint->lock);
1883 	ctrlr = endpoint->ctrlr;
1884 	if (!ctrlr) {
1885 		pthread_mutex_unlock(&endpoint->lock);
1886 		return;
1887 	}
1888 
1889 	if (!ctrlr->num_connected_qps) {
1890 		destroy_ctrlr(ctrlr);
1891 		pthread_mutex_unlock(&endpoint->lock);
1892 		return;
1893 	}
1894 	pthread_mutex_unlock(&endpoint->lock);
1895 }
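/*
 * Controller teardown is driven by the connected queue pair count:
 * vfio_user_stop_ctrlr() below disconnects every queue pair,
 * nvmf_vfio_user_poll_group_remove() drops num_connected_qps as each one goes
 * away, and the disconnect callback above destroys the controller once no
 * connected queue pairs remain.
 */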
1896 
1897 static int
1898 vfio_user_stop_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
1899 {
1900 	uint32_t i;
1901 	struct nvmf_vfio_user_qpair *qpair;
1902 	struct nvmf_vfio_user_endpoint *endpoint;
1903 
1904 	SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr));
1905 
1906 	ctrlr->ready = false;
1907 	endpoint = ctrlr->endpoint;
1908 	assert(endpoint != NULL);
1909 
1910 	for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1911 		qpair = ctrlr->qp[i];
1912 		if (qpair == NULL) {
1913 			continue;
1914 		}
1915 		spdk_nvmf_qpair_disconnect(&qpair->qpair, vfio_user_qpair_disconnect_cb, endpoint);
1916 	}
1917 
1918 	return 0;
1919 }
1920 
1921 static int
1922 vfio_user_poll_mmio(void *ctx)
1923 {
1924 	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
1925 	int ret;
1926 
1927 	assert(ctrlr != NULL);
1928 
1929 	/* This will call access_bar0_fn() if there are any writes
1930 	 * to the portion of the BAR that is not mmap'd */
1931 	ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
1932 	if (spdk_unlikely(ret != 0)) {
1933 		spdk_poller_unregister(&ctrlr->mmio_poller);
1934 
1935 		/* the initiator has shut down or reset; wait for it to reconnect */
1936 		if (errno == ENOTCONN) {
1937 			vfio_user_stop_ctrlr(ctrlr);
1938 			return SPDK_POLLER_BUSY;
1939 		}
1940 
1941 		fail_ctrlr(ctrlr);
1942 	}
1943 
1944 	return SPDK_POLLER_BUSY;
1945 }
1946 
1947 static int
1948 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
1949 {
1950 	struct nvmf_vfio_user_poll_group *vu_group;
1951 	struct nvmf_vfio_user_qpair *qpair = cb_arg;
1952 	struct nvmf_vfio_user_ctrlr *ctrlr;
1953 	struct nvmf_vfio_user_endpoint *endpoint;
1954 
1955 	assert(qpair != NULL);
1956 	assert(req != NULL);
1957 
1958 	ctrlr = qpair->ctrlr;
1959 	assert(ctrlr != NULL);
1960 	endpoint = ctrlr->endpoint;
1961 	assert(endpoint != NULL);
1962 
1963 	if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) {
1964 		SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct);
1965 		destroy_qp(ctrlr, qpair->qpair.qid);
1966 		destroy_ctrlr(ctrlr);
1967 		return -1;
1968 	}
1969 
1970 	vu_group = SPDK_CONTAINEROF(qpair->group, struct nvmf_vfio_user_poll_group, group);
1971 	TAILQ_INSERT_TAIL(&vu_group->qps, qpair, link);
1972 	qpair->state = VFIO_USER_QPAIR_ACTIVE;
1973 
1974 	pthread_mutex_lock(&endpoint->lock);
1975 	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
1976 		ctrlr->cntlid = qpair->qpair.ctrlr->cntlid;
1977 		ctrlr->thread = spdk_get_thread();
1978 		ctrlr->mmio_poller = SPDK_POLLER_REGISTER(vfio_user_poll_mmio, ctrlr, 0);
1979 	}
1980 	ctrlr->num_connected_qps++;
1981 	pthread_mutex_unlock(&endpoint->lock);
1982 
1983 	free(req->req.data);
1984 	req->req.data = NULL;
1985 
1986 	return 0;
1987 }
1988 
1989 /*
1990  * Called by spdk_nvmf_transport_poll_group_add.
1991  */
1992 static int
1993 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
1994 			      struct spdk_nvmf_qpair *qpair)
1995 {
1996 	struct nvmf_vfio_user_qpair *vu_qpair;
1997 	struct nvmf_vfio_user_req *vu_req;
1998 	struct nvmf_vfio_user_ctrlr *ctrlr;
1999 	struct spdk_nvmf_request *req;
2000 	struct spdk_nvmf_fabric_connect_data *data;
2001 	bool admin;
2002 
2003 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2004 	vu_qpair->group = group;
2005 	ctrlr = vu_qpair->ctrlr;
2006 
2007 	SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n",
2008 		      ctrlr_id(ctrlr), vu_qpair->qpair.qid,
2009 		      vu_qpair, qpair, group);
2010 
2011 	admin = nvmf_qpair_is_admin_queue(&vu_qpair->qpair);
2012 
2013 	vu_req = get_nvmf_vfio_user_req(vu_qpair);
2014 	if (vu_req == NULL) {
2015 		return -1;
2016 	}
2017 
2018 	req = &vu_req->req;
2019 	req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
2020 	req->cmd->connect_cmd.cid = 0;
2021 	req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
2022 	req->cmd->connect_cmd.recfmt = 0;
2023 	req->cmd->connect_cmd.sqsize = vu_qpair->qsize - 1;
2024 	req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;
2025 
2026 	req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
2027 	req->data = calloc(1, req->length);
2028 	if (req->data == NULL) {
2029 		nvmf_vfio_user_req_free(req);
2030 		return -ENOMEM;
2031 	}
2032 
2033 	data = (struct spdk_nvmf_fabric_connect_data *)req->data;
2034 	data->cntlid = admin ? 0xFFFF : ctrlr->cntlid;
2035 	snprintf(data->subnqn, sizeof(data->subnqn), "%s",
2036 		 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));
2037 
2038 	vu_req->cb_fn = handle_queue_connect_rsp;
2039 	vu_req->cb_arg = vu_qpair;
2040 
2041 	SPDK_DEBUGLOG(nvmf_vfio,
2042 		      "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n",
2043 		      ctrlr_id(ctrlr), qpair->qid, data->cntlid);
2044 
2045 	spdk_nvmf_request_exec_fabrics(req);
2046 	return 0;
2047 }
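/*
 * The CONNECT assembled above mirrors what an NVMe-oF host would send on the
 * wire: a Fabrics (0x7f) command with fctype CONNECT, a zero-based SQSIZE, and,
 * for the admin queue, QID 0 with CNTLID 0xFFFF in the connect data (asking for
 * dynamic controller allocation); I/O queues connect with the CNTLID learned
 * from the admin connect.
 */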
2048 
2049 static int
2050 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
2051 				 struct spdk_nvmf_qpair *qpair)
2052 {
2053 	struct nvmf_vfio_user_qpair *vu_qpair;
2054 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
2055 	struct nvmf_vfio_user_endpoint *endpoint;
2056 	struct nvmf_vfio_user_poll_group *vu_group;
2057 
2058 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2059 	vu_ctrlr = vu_qpair->ctrlr;
2060 	endpoint = vu_ctrlr->endpoint;
2061 
2062 	SPDK_DEBUGLOG(nvmf_vfio,
2063 		      "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n",
2064 		      ctrlr_id(vu_qpair->ctrlr), qpair->qid, qpair, group);
2065 
2066 
2067 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
2068 	TAILQ_REMOVE(&vu_group->qps, vu_qpair, link);
2069 
2070 	pthread_mutex_lock(&endpoint->lock);
2071 	assert(vu_ctrlr->num_connected_qps);
2072 	vu_ctrlr->num_connected_qps--;
2073 	pthread_mutex_unlock(&endpoint->lock);
2074 
2075 	return 0;
2076 }
2077 
2078 static void
2079 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_qpair *vu_qpair, struct nvmf_vfio_user_req *vu_req)
2080 {
2081 	memset(&vu_req->cmd, 0, sizeof(vu_req->cmd));
2082 	memset(&vu_req->rsp, 0, sizeof(vu_req->rsp));
2083 	vu_req->iovcnt = 0;
2084 	vu_req->state = VFIO_USER_REQUEST_STATE_FREE;
2085 
2086 	TAILQ_INSERT_TAIL(&vu_qpair->reqs, vu_req, link);
2087 }
2088 
2089 static int
2090 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req)
2091 {
2092 	struct nvmf_vfio_user_qpair *vu_qpair;
2093 	struct nvmf_vfio_user_req *vu_req;
2094 
2095 	assert(req != NULL);
2096 
2097 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
2098 	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
2099 
2100 	_nvmf_vfio_user_req_free(vu_qpair, vu_req);
2101 
2102 	return 0;
2103 }
2104 
2105 static int
2106 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req)
2107 {
2108 	struct nvmf_vfio_user_qpair *vu_qpair;
2109 	struct nvmf_vfio_user_req *vu_req;
2110 
2111 	assert(req != NULL);
2112 
2113 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
2114 	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
2115 
2116 	if (vu_req->cb_fn != NULL) {
2117 		if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) {
2118 			fail_ctrlr(vu_qpair->ctrlr);
2119 		}
2120 	}
2121 
2122 	_nvmf_vfio_user_req_free(vu_qpair, vu_req);
2123 
2124 	return 0;
2125 }
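/*
 * Completions flow back through nvmf_vfio_user_req_complete() above: the cb_fn
 * set when the request was submitted (handle_queue_connect_rsp for the internal
 * CONNECT, handle_cmd_rsp for guest commands) consumes the result, and the
 * request is then returned to the queue pair's free list.
 */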
2126 
2127 static void
2128 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
2129 			   spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
2130 {
2131 	struct nvmf_vfio_user_qpair *vu_qpair;
2132 
2133 	assert(qpair != NULL);
2134 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2135 	destroy_qp(vu_qpair->ctrlr, qpair->qid);
2136 
2137 	if (cb_fn) {
2138 		cb_fn(cb_arg);
2139 	}
2140 }
2141 
2142 /**
2143  * Returns a preallocated nvmf_vfio_user_req, or NULL if there isn't one available.
2144  */
2145 static struct nvmf_vfio_user_req *
2146 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair)
2147 {
2148 	struct nvmf_vfio_user_req *req;
2149 
2150 	assert(qpair != NULL);
2151 
2152 	if (TAILQ_EMPTY(&qpair->reqs)) {
2153 		return NULL;
2154 	}
2155 
2156 	req = TAILQ_FIRST(&qpair->reqs);
2157 	TAILQ_REMOVE(&qpair->reqs, req, link);
2158 
2159 	return req;
2160 }
2161 
2162 static struct spdk_nvmf_request *
2163 get_nvmf_req(struct nvmf_vfio_user_qpair *qpair)
2164 {
2165 	struct nvmf_vfio_user_req *req = get_nvmf_vfio_user_req(qpair);
2166 
2167 	if (req == NULL) {
2168 		return NULL;
2169 	}
2170 	return &req->req;
2171 }
2172 
2173 static int
2174 get_nvmf_io_req_length(struct spdk_nvmf_request *req)
2175 {
2176 	uint16_t nlb, nr;
2177 	uint32_t nsid;
2178 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
2179 	struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
2180 	struct spdk_nvmf_ns *ns;
2181 
2182 	nsid = cmd->nsid;
2183 	ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
2184 	if (ns == NULL || ns->bdev == NULL) {
2185 		SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid);
2186 		return -EINVAL;
2187 	}
2188 
2189 	if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
2190 		nr = cmd->cdw10_bits.dsm.nr + 1;
2191 		return nr * sizeof(struct spdk_nvme_dsm_range);
2192 	}
2193 
2194 	nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
2195 	return nlb * spdk_bdev_get_block_size(ns->bdev);
2196 }
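/*
 * For example (illustrative numbers): a Read whose CDW12 low 16 bits are 7 on
 * a 512-byte-block namespace transfers (7 + 1) * 512 = 4096 bytes, while a
 * Dataset Management command with NR == 3 transfers
 * (3 + 1) * sizeof(struct spdk_nvme_dsm_range) = 64 bytes.
 */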
2197 
2198 static int
2199 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
2200 {
2201 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
2202 	uint32_t len = 0;
2203 	int iovcnt;
2204 
2205 	req->xfer = cmd->opc & 0x3;	/* bits 1:0 of the opcode encode the data transfer direction */
2206 	req->length = 0;
2207 	req->data = NULL;
2208 
2209 	switch (cmd->opc) {
2210 	case SPDK_NVME_OPC_IDENTIFY:
2211 		len = 4096; /* TODO: there should be a define somewhere for this */
2212 		break;
2213 	case SPDK_NVME_OPC_GET_LOG_PAGE:
2214 		len = (cmd->cdw10_bits.get_log_page.numdl + 1) * 4;
2215 		break;
2216 	}
2217 
2218 	if (!cmd->dptr.prp.prp1 || !len) {
2219 		return 0;
2220 	}
2221 	/* Admin commands do not use SGLs */
2222 	assert(req->cmd->nvme_cmd.psdt == 0);
2223 	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len);
2224 	if (iovcnt < 0) {
2225 		SPDK_ERRLOG("%s: failed to map admin command opc 0x%x\n",
2226 			    ctrlr_id(ctrlr), cmd->opc);
2227 		return -1;
2228 	}
2229 
2230 	req->length = len;
2231 	req->data = req->iov[0].iov_base;
2232 
2233 	return 0;
2234 }
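/*
 * For example (illustrative numbers): a Get Log Page command with NUMDL 0x3ff
 * maps (0x3ff + 1) * 4 = 4096 bytes from the buffer described by PRP1/PRP2,
 * the same length used unconditionally for Identify above.
 */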
2235 
2236 /*
2237  * Handles an I/O command.
2238  *
2239  * Returns 0 on success and -errno on failure. On success, the command's data
2240  * buffers have been mapped and the request can be forwarded to NVMf.
2241  */
2242 static int
2243 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
2244 {
2245 	int err = 0;
2246 	struct spdk_nvme_cmd *cmd;
2247 
2248 	assert(ctrlr != NULL);
2249 	assert(req != NULL);
2250 
2251 	cmd = &req->cmd->nvme_cmd;
2252 	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
2253 
2254 	if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
2255 		return 0;
2256 	}
2257 
2258 	err = get_nvmf_io_req_length(req);
2259 	if (err < 0) {
2260 		return -EINVAL;
2261 	}
2262 
2263 	req->length = err;
2264 	err = vfio_user_map_cmd(ctrlr, req, req->iov, req->length);
2265 	if (err < 0) {
2266 		SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc);
2267 		return -EFAULT;
2268 	}
2269 
2270 	req->data = req->iov[0].iov_base;
2271 	req->iovcnt = err;
2272 
2273 	return 0;
2274 }
2275 
2276 static int
2277 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
2278 	       struct spdk_nvmf_request *req)
2279 {
2280 	int err;
2281 	struct nvmf_vfio_user_req *vu_req;
2282 
2283 	assert(ctrlr != NULL);
2284 	assert(cmd != NULL);
2285 
2286 	/*
2287 	 * TODO: this means that there are no free requests available,
2288 	 * returning -1 will fail the controller. Theoretically this error can
2289 	 * be avoided completely by ensuring we have as many requests as slots
2290 	 * in the SQ, plus one for the the property request.
2291 	 * in the SQ, plus one for the property request.
2292 	if (spdk_unlikely(req == NULL)) {
2293 		return -1;
2294 	}
2295 
2296 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
2297 	vu_req->cb_fn = handle_cmd_rsp;
2298 	vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
2299 	req->cmd->nvme_cmd = *cmd;
2300 	if (nvmf_qpair_is_admin_queue(req->qpair)) {
2301 		err = map_admin_cmd_req(ctrlr, req);
2302 	} else {
2303 		err = map_io_cmd_req(ctrlr, req);
2304 	}
2305 
2306 	if (spdk_unlikely(err < 0)) {
2307 		SPDK_ERRLOG("%s: failed to map NVMe command opc 0x%x\n",
2308 			    ctrlr_id(ctrlr), cmd->opc);
2309 		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
2310 		req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2311 		return handle_cmd_rsp(vu_req, vu_req->cb_arg);
2312 	}
2313 
2314 	vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING;
2315 	spdk_nvmf_request_exec(req);
2316 
2317 	return 0;
2318 }
2319 
2320 static void
2321 nvmf_vfio_user_qpair_poll(struct nvmf_vfio_user_qpair *qpair)
2322 {
2323 	struct nvmf_vfio_user_ctrlr *ctrlr;
2324 	uint32_t new_tail;
2325 
2326 	assert(qpair != NULL);
2327 
2328 	ctrlr = qpair->ctrlr;
2329 
2330 	new_tail = *tdbl(ctrlr, &qpair->sq);
2331 	if (sq_head(qpair) != new_tail) {
2332 		int err = handle_sq_tdbl_write(ctrlr, new_tail, qpair);
2333 		if (err != 0) {
2334 			fail_ctrlr(ctrlr);
2335 			return;
2336 		}
2337 	}
2338 }
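/*
 * The guest submits commands by writing a new tail value into the doorbell
 * region of BAR0, which is shared with it via mmap; this poller notices the
 * write by comparing that value against the locally tracked head, and the
 * entries in between are handled as newly submitted commands by
 * handle_sq_tdbl_write().
 */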
2339 
2340 /*
2341  * Called unconditionally, periodically, and very frequently from SPDK to ask
2342  * whether there's work to be done.  This function consumes requests generated
2343  * by read_bar0/write_bar0, which set ctrlr->prop_req.dir and then wait
2344  * synchronously (read_bar0 always, write_bar0 only occasionally, though this
2345  * may change). It also consumes requests by looking at the doorbells.
2346  */
2347 static int
2348 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
2349 {
2350 	struct nvmf_vfio_user_poll_group *vu_group;
2351 	struct nvmf_vfio_user_qpair *vu_qpair, *tmp;
2352 
2353 	assert(group != NULL);
2354 
2355 	spdk_rmb();
2356 
2357 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
2358 
2359 	TAILQ_FOREACH_SAFE(vu_qpair, &vu_group->qps, link, tmp) {
2360 		if (spdk_unlikely(vu_qpair->state != VFIO_USER_QPAIR_ACTIVE || !vu_qpair->sq.size)) {
2361 			continue;
2362 		}
2363 		nvmf_vfio_user_qpair_poll(vu_qpair);
2364 	}
2365 
2366 	return 0;
2367 }
2368 
2369 static int
2370 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
2371 				    struct spdk_nvme_transport_id *trid)
2372 {
2373 	struct nvmf_vfio_user_qpair *vu_qpair;
2374 	struct nvmf_vfio_user_ctrlr *ctrlr;
2375 
2376 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2377 	ctrlr = vu_qpair->ctrlr;
2378 
2379 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
2380 	return 0;
2381 }
2382 
2383 static int
2384 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
2385 				   struct spdk_nvme_transport_id *trid)
2386 {
2387 	return 0;
2388 }
2389 
2390 static int
2391 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
2392 				     struct spdk_nvme_transport_id *trid)
2393 {
2394 	struct nvmf_vfio_user_qpair *vu_qpair;
2395 	struct nvmf_vfio_user_ctrlr *ctrlr;
2396 
2397 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2398 	ctrlr = vu_qpair->ctrlr;
2399 
2400 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
2401 	return 0;
2402 }
2403 
2404 static void
2405 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
2406 				   struct spdk_nvmf_request *req)
2407 {
2408 	struct nvmf_vfio_user_qpair *vu_qpair;
2409 	struct nvmf_vfio_user_req *vu_req, *vu_req_to_abort = NULL;
2410 	uint16_t i, cid;
2411 
2412 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2413 
2414 	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
2415 	for (i = 0; i < vu_qpair->qsize; i++) {
2416 		vu_req = &vu_qpair->reqs_internal[i];
2417 		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
2418 			vu_req_to_abort = vu_req;
2419 			break;
2420 		}
2421 	}
2422 
2423 	if (vu_req_to_abort == NULL) {
2424 		spdk_nvmf_request_complete(req);
2425 		return;
2426 	}
2427 
2428 	req->req_to_abort = &vu_req_to_abort->req;
2429 	nvmf_ctrlr_abort_request(req);
2430 }
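/*
 * Only requests still in VFIO_USER_REQUEST_STATE_EXECUTING can be matched by
 * the scan above; if the CID taken from the Abort command's CDW10 no longer
 * matches anything in flight, the Abort is simply completed without aborting
 * anything.
 */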
2431 
2432 static void
2433 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
2434 {
2435 	opts->max_queue_depth =		NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
2436 	opts->max_qpairs_per_ctrlr =	NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
2437 	opts->in_capsule_data_size =	NVMF_VFIO_USER_DEFAULT_IN_CAPSULE_DATA_SIZE;
2438 	opts->max_io_size =		NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
2439 	opts->io_unit_size =		NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
2440 	opts->max_aq_depth =		NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
2441 	opts->num_shared_buffers =	NVMF_VFIO_USER_DEFAULT_NUM_SHARED_BUFFERS;
2442 	opts->buf_cache_size =		NVMF_VFIO_USER_DEFAULT_BUFFER_CACHE_SIZE;
2443 }
2444 
2445 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
2446 	.name = "VFIOUSER",
2447 	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
2448 	.opts_init = nvmf_vfio_user_opts_init,
2449 	.create = nvmf_vfio_user_create,
2450 	.destroy = nvmf_vfio_user_destroy,
2451 
2452 	.listen = nvmf_vfio_user_listen,
2453 	.stop_listen = nvmf_vfio_user_stop_listen,
2454 	.accept = nvmf_vfio_user_accept,
2455 	.cdata_init = nvmf_vfio_user_cdata_init,
2456 	.listen_associate = nvmf_vfio_user_listen_associate,
2457 
2458 	.listener_discover = nvmf_vfio_user_discover,
2459 
2460 	.poll_group_create = nvmf_vfio_user_poll_group_create,
2461 	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
2462 	.poll_group_add = nvmf_vfio_user_poll_group_add,
2463 	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
2464 	.poll_group_poll = nvmf_vfio_user_poll_group_poll,
2465 
2466 	.req_free = nvmf_vfio_user_req_free,
2467 	.req_complete = nvmf_vfio_user_req_complete,
2468 
2469 	.qpair_fini = nvmf_vfio_user_close_qpair,
2470 	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
2471 	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
2472 	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
2473 	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,
2474 };
2475 
2476 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
2477 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
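/*
 * Typical use of this transport, for illustration (RPC names as in the SPDK
 * vfio-user documentation of this era; paths, sizes and NQN are examples):
 *
 *   rpc.py nvmf_create_transport -t VFIOUSER
 *   rpc.py bdev_malloc_create 64 512 -b Malloc0
 *   rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a -s SPDK0
 *   rpc.py nvmf_subsystem_add_ns nqn.2019-07.io.spdk:cnode0 Malloc0
 *   rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 -t VFIOUSER -a /var/run/vfio-user
 *
 * after which a vfio-user client (e.g. QEMU's vfio-user-pci device) can attach
 * to the socket created under the listener's traddr.
 */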
2478