xref: /spdk/lib/nvmf/vfio_user.c (revision 8f633fa1c331383d74e6e529481c7bb6bae4c8aa)
1 /*-
2  *   BSD LICENSE
3  *   Copyright (c) Intel Corporation. All rights reserved.
4  *   Copyright (c) 2019, Nutanix Inc. All rights reserved.
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *     * Redistributions of source code must retain the above copyright
11  *       notice, this list of conditions and the following disclaimer.
12  *     * Redistributions in binary form must reproduce the above copyright
13  *       notice, this list of conditions and the following disclaimer in
14  *       the documentation and/or other materials provided with the
15  *       distribution.
16  *     * Neither the name of Intel Corporation nor the names of its
17  *       contributors may be used to endorse or promote products derived
18  *       from this software without specific prior written permission.
19  *
20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * NVMe over vfio-user transport
35  */
36 
37 #include <vfio-user/libvfio-user.h>
38 #include <vfio-user/pci_defs.h>
39 
40 #include "spdk/barrier.h"
41 #include "spdk/stdinc.h"
42 #include "spdk/assert.h"
43 #include "spdk/thread.h"
44 #include "spdk/nvmf_transport.h"
45 #include "spdk/sock.h"
46 #include "spdk/string.h"
47 #include "spdk/util.h"
48 #include "spdk/log.h"
49 
50 #include "transport.h"
51 
52 #include "nvmf_internal.h"
53 
54 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
55 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
56 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR 64
57 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
58 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE
59 
60 #define NVMF_VFIO_USER_DOORBELLS_OFFSET	0x1000
61 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000
62 
63 #define NVME_REG_CFG_SIZE       0x1000
64 #define NVME_REG_BAR0_SIZE      0x4000
65 #define NVME_IRQ_MSIX_NUM	NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR
66 
67 struct nvmf_vfio_user_req;
68 struct nvmf_vfio_user_qpair;
69 
70 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);
71 
72 /* 1 more for PRP2 list itself */
73 #define NVMF_VFIO_USER_MAX_IOVECS	(NVMF_REQ_MAX_BUFFERS + 1)
74 
75 enum nvmf_vfio_user_req_state {
76 	VFIO_USER_REQUEST_STATE_FREE = 0,
77 	VFIO_USER_REQUEST_STATE_EXECUTING,
78 };
79 
80 struct nvmf_vfio_user_req  {
81 	struct spdk_nvmf_request		req;
82 	struct spdk_nvme_cpl			rsp;
83 	struct spdk_nvme_cmd			cmd;
84 
85 	enum nvmf_vfio_user_req_state		state;
86 	nvmf_vfio_user_req_cb_fn		cb_fn;
87 	void					*cb_arg;
88 
89 	/* old CC before prop_set_cc fabric command */
90 	union spdk_nvme_cc_register		cc;
91 
92 	/* Scatter-gather table used for gpa_to_vva mappings; the I/O buffer itself doesn't use it. */
93 	dma_sg_t				*sg;
94 	struct iovec				iov[NVMF_VFIO_USER_MAX_IOVECS];
95 	uint8_t					iovcnt;
96 
97 	TAILQ_ENTRY(nvmf_vfio_user_req)		link;
98 };
99 
100 /*
101  * An NVMe queue.
102  */
103 struct nvme_q {
104 	bool is_cq;
105 
106 	void *addr;
107 
108 	dma_sg_t *sg;
109 	struct iovec iov;
110 
111 	uint32_t size;
112 	uint64_t prp1;
113 
114 	union {
115 		struct {
116 			uint32_t head;
117 			/* multiple SQs can be mapped to the same CQ */
118 			uint16_t cqid;
119 		};
120 		struct {
121 			uint32_t tail;
122 			uint16_t iv;
123 			bool ien;
124 			bool phase;
125 		};
126 	};
127 };
128 
129 enum nvmf_vfio_user_qpair_state {
130 	VFIO_USER_QPAIR_UNINITIALIZED = 0,
131 	VFIO_USER_QPAIR_ACTIVE,
132 	VFIO_USER_QPAIR_SQ_DELETED,
133 	VFIO_USER_QPAIR_INACTIVE,
134 	VFIO_USER_QPAIR_ERROR,
135 };
136 
137 struct nvmf_vfio_user_qpair {
138 	struct spdk_nvmf_qpair			qpair;
139 	struct spdk_nvmf_transport_poll_group	*group;
140 	struct nvmf_vfio_user_ctrlr		*ctrlr;
141 	struct nvmf_vfio_user_req		*reqs_internal;
142 	uint32_t				qsize;
143 	struct nvme_q				cq;
144 	struct nvme_q				sq;
145 	enum nvmf_vfio_user_qpair_state		state;
146 
147 	/* Copy of Create IO SQ command */
148 	struct spdk_nvme_cmd			create_io_sq_cmd;
149 
150 	TAILQ_HEAD(, nvmf_vfio_user_req)	reqs;
151 	/* Poll group entry */
152 	TAILQ_ENTRY(nvmf_vfio_user_qpair)	link;
153 	/* Connected queue pair entry */
154 	TAILQ_ENTRY(nvmf_vfio_user_qpair)	tailq;
155 };
156 
157 struct nvmf_vfio_user_poll_group {
158 	struct spdk_nvmf_transport_poll_group	group;
159 	TAILQ_HEAD(, nvmf_vfio_user_qpair)	qps;
160 };
161 
162 struct nvmf_vfio_user_ctrlr {
163 	struct nvmf_vfio_user_endpoint		*endpoint;
164 	struct nvmf_vfio_user_transport		*transport;
165 
166 	/* Connected queue pairs list */
167 	TAILQ_HEAD(, nvmf_vfio_user_qpair)	connected_qps;
168 
169 	struct spdk_thread			*thread;
170 	struct spdk_poller			*vfu_ctx_poller;
171 
172 	uint16_t				cntlid;
173 
174 	struct nvmf_vfio_user_qpair		*qp[NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR];
175 
176 	TAILQ_ENTRY(nvmf_vfio_user_ctrlr)	link;
177 
178 	volatile uint32_t			*doorbells;
179 
180 	/* internal CSTS.CFS register for vfio-user fatal errors */
181 	uint32_t				cfs : 1;
182 };
183 
184 struct nvmf_vfio_user_endpoint {
185 	vfu_ctx_t				*vfu_ctx;
186 	struct msixcap				*msix;
187 	vfu_pci_config_space_t			*pci_config_space;
188 	int					devmem_fd;
189 	volatile uint32_t			*doorbells;
190 
191 	struct spdk_nvme_transport_id		trid;
192 	const struct spdk_nvmf_subsystem	*subsystem;
193 
194 	struct nvmf_vfio_user_ctrlr		*ctrlr;
195 	pthread_mutex_t				lock;
196 
197 	TAILQ_ENTRY(nvmf_vfio_user_endpoint)	link;
198 };
199 
200 struct nvmf_vfio_user_transport_opts {
201 	bool					disable_mappable_bar0;
202 };
203 
204 struct nvmf_vfio_user_transport {
205 	struct spdk_nvmf_transport		transport;
206 	struct nvmf_vfio_user_transport_opts    transport_opts;
207 	pthread_mutex_t				lock;
208 	TAILQ_HEAD(, nvmf_vfio_user_endpoint)	endpoints;
209 };
210 
211 /*
212  * function prototypes
213  */
214 static volatile uint32_t *
215 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);
216 
217 static volatile uint32_t *
218 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);
219 
220 static int
221 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);
222 
223 static struct nvmf_vfio_user_req *
224 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair);
225 
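/*
 * Maps a command's PRP entries to iovecs. PRP1 may point at an offset within a
 * page; any remaining data is covered either directly by PRP2 (when it fits in
 * one more page) or by the PRP list that PRP2 points to. Returns the number of
 * iovecs used, or a negative errno on failure.
 */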
226 static int
227 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
228 		  uint32_t max_iovcnt, uint32_t len, size_t mps,
229 		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
230 {
231 	uint64_t prp1, prp2;
232 	void *vva;
233 	uint32_t i;
234 	uint32_t residue_len, nents;
235 	uint64_t *prp_list;
236 	uint32_t iovcnt;
237 
238 	assert(max_iovcnt > 0);
239 
240 	prp1 = cmd->dptr.prp.prp1;
241 	prp2 = cmd->dptr.prp.prp2;
242 
243 	/* PRP1 may start at an offset within a page */
244 	residue_len = mps - (prp1 % mps);
245 	residue_len = spdk_min(len, residue_len);
246 
247 	vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE);
248 	if (spdk_unlikely(vva == NULL)) {
249 		SPDK_ERRLOG("GPA to VVA failed\n");
250 		return -EINVAL;
251 	}
252 	len -= residue_len;
253 	if (len && max_iovcnt < 2) {
254 		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
255 		return -ERANGE;
256 	}
257 	iovs[0].iov_base = vva;
258 	iovs[0].iov_len = residue_len;
259 
260 	if (len) {
261 		if (spdk_unlikely(prp2 == 0)) {
262 			SPDK_ERRLOG("no PRP2, %u remaining\n", len);
263 			return -EINVAL;
264 		}
265 
266 		if (len <= mps) {
267 			/* 2 PRP used */
268 			iovcnt = 2;
269 			vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE);
270 			if (spdk_unlikely(vva == NULL)) {
271 				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len=%#x\n",
272 					    prp2, len);
273 				return -EINVAL;
274 			}
275 			iovs[1].iov_base = vva;
276 			iovs[1].iov_len = len;
277 		} else {
278 			/* PRP list used */
279 			nents = (len + mps - 1) / mps;
280 			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
281 				SPDK_ERRLOG("Too many page entries\n");
282 				return -ERANGE;
283 			}
284 
285 			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ);
286 			if (spdk_unlikely(vva == NULL)) {
287 				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
288 					    prp2, nents);
289 				return -EINVAL;
290 			}
291 			prp_list = vva;
292 			i = 0;
293 			while (len != 0) {
294 				residue_len = spdk_min(len, mps);
295 				vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE);
296 				if (spdk_unlikely(vva == NULL)) {
297 					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
298 						    prp_list[i], residue_len);
299 					return -EINVAL;
300 				}
301 				iovs[i + 1].iov_base = vva;
302 				iovs[i + 1].iov_len = residue_len;
303 				len -= residue_len;
304 				i++;
305 			}
306 			iovcnt = i + 1;
307 		}
308 	} else {
309 		/* 1 PRP used */
310 		iovcnt = 1;
311 	}
312 
313 	assert(iovcnt <= max_iovcnt);
314 	return iovcnt;
315 }
316 
317 static int
318 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
319 		       struct iovec *iovs, uint32_t max_iovcnt,
320 		       void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
321 {
322 	uint32_t i;
323 	void *vva;
324 
325 	if (spdk_unlikely(max_iovcnt < num_sgls)) {
326 		return -ERANGE;
327 	}
328 
329 	for (i = 0; i < num_sgls; i++) {
330 		if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) {
331 			SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type);
332 			return -EINVAL;
333 		}
334 		vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE);
335 		if (spdk_unlikely(vva == NULL)) {
336 			SPDK_ERRLOG("GPA to VVA failed\n");
337 			return -EINVAL;
338 		}
339 		iovs[i].iov_base = vva;
340 		iovs[i].iov_len = sgls[i].unkeyed.length;
341 	}
342 
343 	return num_sgls;
344 }
345 
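/*
 * Maps a command's SGL to iovecs. SGL1 is either a single data block
 * descriptor, or a (last) segment descriptor pointing to a chain of descriptor
 * segments; data block descriptors in each segment are mapped via
 * nvme_cmd_map_sgls_data(), and the final descriptor of a non-last segment
 * points to the next segment.
 */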
346 static int
347 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
348 		  uint32_t len, size_t mps,
349 		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
350 {
351 	struct spdk_nvme_sgl_descriptor *sgl, *last_sgl;
352 	uint32_t num_sgls, seg_len;
353 	void *vva;
354 	int ret;
355 	uint32_t total_iovcnt = 0;
356 
357 	/* SGL cases */
358 	sgl = &cmd->dptr.sgl1;
359 
360 	/* only one SGL segment */
361 	if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
362 		assert(max_iovcnt > 0);
363 		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE);
364 		if (spdk_unlikely(vva == NULL)) {
365 			SPDK_ERRLOG("GPA to VVA failed\n");
366 			return -EINVAL;
367 		}
368 		iovs[0].iov_base = vva;
369 		iovs[0].iov_len = sgl->unkeyed.length;
370 		assert(sgl->unkeyed.length == len);
371 
372 		return 1;
373 	}
374 
375 	for (;;) {
376 		if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) &&
377 				  (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) {
378 			SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type);
379 			return -EINVAL;
380 		}
381 
382 		seg_len = sgl->unkeyed.length;
383 		if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) {
384 			SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len);
385 			return -EINVAL;
386 		}
387 
388 		num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor);
389 		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ);
390 		if (spdk_unlikely(vva == NULL)) {
391 			SPDK_ERRLOG("GPA to VVA failed\n");
392 			return -EINVAL;
393 		}
394 
395 		/* sgl now points to the first descriptor of the mapped segment */
396 		sgl = (struct spdk_nvme_sgl_descriptor *)vva;
397 		last_sgl = &sgl[num_sgls - 1];
398 
399 		/* we are done */
400 		if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
401 			/* map whole sgl list */
402 			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt],
403 						     max_iovcnt - total_iovcnt, gpa_to_vva);
404 			if (spdk_unlikely(ret < 0)) {
405 				return ret;
406 			}
407 			total_iovcnt += ret;
408 
409 			return total_iovcnt;
410 		}
411 
412 		if (num_sgls > 1) {
413 			/* map all descriptors except last_sgl, which points to the next segment */
414 			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt],
415 						     max_iovcnt - total_iovcnt, gpa_to_vva);
416 			if (spdk_unlikely(ret < 0)) {
417 				return ret;
418 			}
419 			total_iovcnt += ret;
420 		}
421 
422 		/* move on to the next segment */
423 		sgl = last_sgl;
424 	}
425 
426 	return 0;
427 }
428 
429 static int
430 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
431 	     uint32_t len, size_t mps,
432 	     void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
433 {
434 	if (cmd->psdt == SPDK_NVME_PSDT_PRP) {
435 		return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
436 	}
437 
438 	return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
439 }
440 
441 static char *
442 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint)
443 {
444 	return endpoint->trid.traddr;
445 }
446 
447 static char *
448 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr)
449 {
450 	if (!ctrlr || !ctrlr->endpoint) {
451 		return "Null Ctrlr";
452 	}
453 
454 	return endpoint_id(ctrlr->endpoint);
455 }
456 
457 static inline uint16_t
458 io_q_id(struct nvme_q *q)
459 {
461 	struct nvmf_vfio_user_qpair *vu_qpair;
462 
463 	assert(q);
464 
465 	if (q->is_cq) {
466 		vu_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, cq);
467 	} else {
468 		vu_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, sq);
469 	}
470 	assert(vu_qpair);
471 	return vu_qpair->qpair.qid;
472 }
473 
474 static void
475 fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
476 {
477 	assert(ctrlr != NULL);
478 
479 	if (ctrlr->cfs == 0) {
480 		SPDK_ERRLOG("%s: failing controller\n", ctrlr_id(ctrlr));
481 	}
482 
483 	ctrlr->cfs = 1U;
484 }
485 
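/*
 * Interrupts can be delivered either via INTx (when the Interrupt Disable bit
 * in the PCI command register is clear) or via MSI-X (when the MSI-X Enable
 * bit in the MSI-X capability is set).
 */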
486 static inline bool
487 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
488 {
489 	assert(vu_ctrlr != NULL);
490 	assert(vu_ctrlr->endpoint != NULL);
491 
492 	vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space;
493 
494 	return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe);
495 }
496 
497 static void
498 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint)
499 {
500 	if (endpoint->doorbells) {
501 		munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
502 	}
503 
504 	if (endpoint->devmem_fd > 0) {
505 		close(endpoint->devmem_fd);
506 	}
507 
508 	vfu_destroy_ctx(endpoint->vfu_ctx);
509 
510 	pthread_mutex_destroy(&endpoint->lock);
511 	free(endpoint);
512 }
513 
514 /* Called when the process exits. */
515 static int
516 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport,
517 		       spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
518 {
519 	struct nvmf_vfio_user_transport *vu_transport;
520 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
521 
522 	SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n");
523 
524 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
525 					transport);
526 
527 	(void)pthread_mutex_destroy(&vu_transport->lock);
528 
529 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
530 		TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
531 		nvmf_vfio_user_destroy_endpoint(endpoint);
532 	}
533 
534 	free(vu_transport);
535 
536 	if (cb_fn) {
537 		cb_fn(cb_arg);
538 	}
539 
540 	return 0;
541 }
542 
543 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = {
544 	{
545 		"disable_mappable_bar0",
546 		offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0),
547 		spdk_json_decode_bool, true
548 	},
549 };
550 
551 static struct spdk_nvmf_transport *
552 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts)
553 {
554 	struct nvmf_vfio_user_transport *vu_transport;
555 	int err;
556 
557 	if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
558 		SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n",
559 			    opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR);
560 		return NULL;
561 	}
562 
563 	vu_transport = calloc(1, sizeof(*vu_transport));
564 	if (vu_transport == NULL) {
565 		SPDK_ERRLOG("Transport alloc fail: %m\n");
566 		return NULL;
567 	}
568 
569 	err = pthread_mutex_init(&vu_transport->lock, NULL);
570 	if (err != 0) {
571 		SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
572 		goto err;
573 	}
574 
575 	TAILQ_INIT(&vu_transport->endpoints);
576 
577 	if (opts->transport_specific != NULL &&
578 	    spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder,
579 					    SPDK_COUNTOF(vfio_user_transport_opts_decoder),
580 					    vu_transport)) {
581 		SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n");
582 		free(vu_transport);
583 		return NULL;
584 	}
585 
586 	SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n",
587 		      vu_transport->transport_opts.disable_mappable_bar0);
588 
589 	return &vu_transport->transport;
590 
591 err:
592 	free(vu_transport);
593 
594 	return NULL;
595 }
596 
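/* CAP.MQES is zero-based, hence the +1. */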
597 static uint32_t
598 max_queue_size(struct nvmf_vfio_user_ctrlr const *ctrlr)
599 {
600 	assert(ctrlr != NULL);
601 	assert(ctrlr->qp[0] != NULL);
602 	assert(ctrlr->qp[0]->qpair.ctrlr != NULL);
603 
604 	return ctrlr->qp[0]->qpair.ctrlr->vcprop.cap.bits.mqes + 1;
605 }
606 
607 static void *
608 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov, int prot)
609 {
610 	int ret;
611 
612 	assert(ctx != NULL);
613 	assert(sg != NULL);
614 	assert(iov != NULL);
615 
616 	ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot);
617 	if (ret < 0) {
618 		return NULL;
619 	}
620 
621 	ret = vfu_map_sg(ctx, sg, iov, 1, 0);
622 	if (ret != 0) {
623 		return NULL;
624 	}
625 
626 	assert(iov->iov_base != NULL);
627 	return iov->iov_base;
628 }
629 
630 static inline uint32_t
631 sq_head(struct nvmf_vfio_user_qpair *qpair)
632 {
633 	assert(qpair != NULL);
634 	return qpair->sq.head;
635 }
636 
637 static inline void
638 sqhd_advance(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair)
639 {
640 	assert(ctrlr != NULL);
641 	assert(qpair != NULL);
642 	qpair->sq.head = (qpair->sq.head + 1) % qpair->sq.size;
643 }
644 
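/*
 * Maps a queue's guest-physical memory (prp1) into our address space using a
 * single scatter-gather entry. Despite its name, the 'unmap' flag only controls
 * whether the newly mapped queue memory is zeroed: callers pass true when a
 * queue is first created and false when remapping an existing queue in
 * remap_qp().
 */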
645 static int
646 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q *q, bool is_cq, bool unmap)
647 {
648 	uint64_t len;
649 
650 	assert(q->size);
651 	assert(q->addr == NULL);
652 
653 	if (is_cq) {
654 		len = q->size * sizeof(struct spdk_nvme_cpl);
655 	} else {
656 		len = q->size * sizeof(struct spdk_nvme_cmd);
657 	}
658 
659 	q->addr = map_one(vu_ctrlr->endpoint->vfu_ctx, q->prp1, len, q->sg,
660 			  &q->iov, is_cq ? PROT_READ | PROT_WRITE : PROT_READ);
661 	if (q->addr == NULL) {
662 		return -EFAULT;
663 	}
664 
665 	if (unmap) {
666 		memset(q->addr, 0, len);
667 	}
668 
669 	return 0;
670 }
671 
672 static int
673 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr)
674 {
675 	struct nvme_q *sq;
676 	const struct spdk_nvmf_registers *regs;
677 	int ret;
678 
679 	assert(ctrlr != NULL);
680 	assert(ctrlr->qp[0] != NULL);
681 	assert(ctrlr->qp[0]->sq.addr == NULL);
682 	/* XXX ctrlr->asq == 0 is a valid memory address */
683 
684 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
685 	sq = &ctrlr->qp[0]->sq;
686 	sq->size = regs->aqa.bits.asqs + 1;
687 	sq->prp1 = regs->asq;
688 	sq->head = 0;
689 	sq->cqid = 0;
690 	sq->is_cq = false;
691 
692 	ret = map_q(ctrlr, sq, false, true);
693 	if (ret) {
694 		return ret;
695 	}
696 
697 	*tdbl(ctrlr, sq) = 0;
698 
699 	return 0;
700 }
701 
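/*
 * With DSTRD fixed at 0, the BAR0 doorbells are packed as consecutive 32-bit
 * values: the SQ tail doorbell for queue qid lives at index 2 * qid and the
 * corresponding CQ head doorbell at 2 * qid + 1.
 */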
702 static inline int
703 queue_index(uint16_t qid, int is_cq)
704 {
705 	return (qid * 2) + is_cq;
706 }
707 
708 static volatile uint32_t *
709 tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
710 {
711 	assert(ctrlr != NULL);
712 	assert(q != NULL);
713 	assert(!q->is_cq);
714 
715 	return &ctrlr->doorbells[queue_index(io_q_id(q), false)];
716 }
717 
718 static volatile uint32_t *
719 hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
720 {
721 	assert(ctrlr != NULL);
722 	assert(q != NULL);
723 	assert(q->is_cq);
724 
725 	return &ctrlr->doorbells[queue_index(io_q_id(q), true)];
726 }
727 
728 static inline bool
729 cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
730 {
731 	assert(ctrlr != NULL);
732 	assert(q != NULL);
733 	assert(q->is_cq);
734 
735 	return ((q->tail + 1) % q->size) == *hdbl(ctrlr, q);
736 }
737 
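/*
 * Advances the CQ tail; when the tail wraps to 0 the phase tag is inverted,
 * which is how the guest detects newly posted completions.
 */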
738 static inline void
739 cq_tail_advance(struct nvme_q *q)
740 {
741 	assert(q != NULL);
742 	assert(q->is_cq);
743 
744 	assert(q->tail < q->size);
745 	q->tail++;
746 
747 	if (spdk_unlikely(q->tail == q->size)) {
748 		q->tail = 0;
749 		q->phase = !q->phase;
750 	}
751 }
752 
753 static int
754 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr)
755 {
756 	struct nvme_q *cq;
757 	const struct spdk_nvmf_registers *regs;
758 	int ret;
759 
760 	assert(ctrlr != NULL);
761 	assert(ctrlr->qp[0] != NULL);
762 	assert(ctrlr->qp[0]->cq.addr == NULL);
763 
764 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
765 	assert(regs != NULL);
766 	cq = &ctrlr->qp[0]->cq;
767 	cq->size = regs->aqa.bits.acqs + 1;
768 	cq->prp1 = regs->acq;
769 	cq->tail = 0;
770 	cq->is_cq = true;
771 	cq->ien = true;
772 	cq->phase = true;
773 
774 	ret = map_q(ctrlr, cq, true, true);
775 	if (ret) {
776 		return ret;
777 	}
778 	*hdbl(ctrlr, cq) = 0;
779 
780 	return 0;
781 }
782 
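/*
 * dma_sg_t is an opaque, variable-sized type in libvfio-user, so the
 * per-request scatter-gather array is indexed using dma_sg_size() rather than
 * ordinary pointer arithmetic.
 */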
783 static inline dma_sg_t *
784 vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt)
785 {
786 	return (dma_sg_t *)((uintptr_t)vu_req->sg + iovcnt * dma_sg_size());
787 }
788 
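/*
 * gpa_to_vva callback passed to nvme_map_cmd(); each successful call consumes
 * one scatter-gather entry and one iovec slot of the request.
 */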
789 static void *
790 _map_one(void *prv, uint64_t addr, uint64_t len, int prot)
791 {
792 	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
793 	struct spdk_nvmf_qpair *qpair;
794 	struct nvmf_vfio_user_req *vu_req;
795 	struct nvmf_vfio_user_qpair *vu_qpair;
796 	void *ret;
797 
798 	assert(req != NULL);
799 	qpair = req->qpair;
800 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
801 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
802 
803 	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
804 	ret = map_one(vu_qpair->ctrlr->endpoint->vfu_ctx, addr, len,
805 		      vu_req_to_sg_t(vu_req, vu_req->iovcnt),
806 		      &vu_req->iov[vu_req->iovcnt], prot);
807 	if (spdk_likely(ret != NULL)) {
808 		vu_req->iovcnt++;
809 	}
810 	return ret;
811 }
812 
813 static int
814 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
815 		  struct iovec *iov, uint32_t length)
816 {
817 	/* Map the command's PRPs or SGLs from guest physical memory
818 	 * to virtual memory addresses.
819 	 */
820 	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
821 			    length, 4096, _map_one);
822 }
823 
824 static int
825 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
826 	       struct nvmf_vfio_user_qpair *vu_qpair);
827 
828 /*
829  * Posts a CQE in the completion queue.
830  *
831  * @ctrlr: the vfio-user controller
832  * @cq: the completion queue
833  * @cdw0: cdw0 as reported by NVMf
834  * @sqid: submission queue ID
835  * @cid: command identifier in NVMe command
836  * @sc: the NVMe CQE status code
837  * @sct: the NVMe CQE status code type
838  */
839 static int
840 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *cq,
841 		uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct)
842 {
843 	struct spdk_nvme_cpl *cpl;
844 	const struct spdk_nvmf_registers *regs;
845 	int err;
846 
847 	assert(ctrlr != NULL);
848 
849 	if (spdk_unlikely(cq == NULL || cq->addr == NULL)) {
850 		return 0;
851 	}
852 
853 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
854 	if (regs->csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
855 		SPDK_DEBUGLOG(nvmf_vfio,
856 			      "%s: ignore completion SQ%d cid=%d status=%#x\n",
857 			      ctrlr_id(ctrlr), sqid, cid, sc);
858 		return 0;
859 	}
860 
861 	if (cq_is_full(ctrlr, cq)) {
862 		SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
863 			    ctrlr_id(ctrlr), io_q_id(cq), cq->tail, *hdbl(ctrlr, cq));
864 		return -1;
865 	}
866 
867 	cpl = ((struct spdk_nvme_cpl *)cq->addr) + cq->tail;
868 
869 	assert(ctrlr->qp[sqid] != NULL);
870 	SPDK_DEBUGLOG(nvmf_vfio,
871 		      "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
872 		      ctrlr_id(ctrlr), sqid, cid, sc, sq_head(ctrlr->qp[sqid]),
873 		      cq->tail);
874 
875 	cpl->sqhd = sq_head(ctrlr->qp[sqid]);
876 	cpl->sqid = sqid;
877 	cpl->cid = cid;
878 	cpl->cdw0 = cdw0;
879 	cpl->status.dnr = 0x0;
880 	cpl->status.m = 0x0;
881 	cpl->status.sct = sct;
882 	cpl->status.p = cq->phase;
883 	cpl->status.sc = sc;
884 
885 	cq_tail_advance(cq);
886 
887 	/*
888 	 * This function executes in SPDK thread context, but interrupts may
889 	 * also be triggered from vfio-user thread context, so check for
890 	 * race conditions.
891 	 */
892 	if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) {
893 		err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
894 		if (err != 0) {
895 			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
896 				    ctrlr_id(ctrlr));
897 			return err;
898 		}
899 	}
900 
901 	return 0;
902 }
903 
904 static bool
905 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq)
906 {
907 	assert(vu_ctrlr != NULL);
908 
909 	if (qid == 0 || qid >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
910 		return false;
911 	}
912 
913 	if (vu_ctrlr->qp[qid] == NULL) {
914 		return false;
915 	}
916 
917 	if (!is_cq) {
918 		if (vu_ctrlr->qp[qid]->state == VFIO_USER_QPAIR_SQ_DELETED ||
919 		    vu_ctrlr->qp[qid]->state == VFIO_USER_QPAIR_UNINITIALIZED) {
920 			return false;
921 		}
922 	}
923 
924 	return true;
925 }
926 
927 static void
928 unmap_qp(struct nvmf_vfio_user_qpair *qp)
929 {
930 	struct nvmf_vfio_user_ctrlr *ctrlr;
931 
932 	if (qp->ctrlr == NULL) {
933 		return;
934 	}
935 	ctrlr = qp->ctrlr;
936 
937 	SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap QP%d\n",
938 		      ctrlr_id(ctrlr), qp->qpair.qid);
939 
940 	if (qp->sq.addr != NULL) {
941 		vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->sq.sg, &qp->sq.iov, 1);
942 		qp->sq.addr = NULL;
943 	}
944 
945 	if (qp->cq.addr != NULL) {
946 		vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, qp->cq.sg, &qp->cq.iov, 1);
947 		qp->cq.addr = NULL;
948 	}
949 }
950 
951 static int
952 remap_qp(struct nvmf_vfio_user_qpair *vu_qpair)
953 {
954 	struct nvme_q *sq, *cq;
955 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
956 	int ret;
957 
958 	vu_ctrlr = vu_qpair->ctrlr;
959 	sq = &vu_qpair->sq;
960 	cq = &vu_qpair->cq;
961 
962 	if (sq->size) {
963 		ret = map_q(vu_ctrlr, sq, false, false);
964 		if (ret) {
965 			SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n",
966 				      io_q_id(sq), sq->prp1, sq->prp1 + sq->size * sizeof(struct spdk_nvme_cmd));
967 			return -EFAULT;
968 		}
969 	}
970 
971 	if (cq->size) {
972 		ret = map_q(vu_ctrlr, cq, true, false);
973 		if (ret) {
974 			SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n",
975 				      io_q_id(cq), cq->prp1, cq->prp1 + cq->size * sizeof(struct spdk_nvme_cpl));
976 			return -EFAULT;
977 		}
978 
979 	}
980 
981 	return 0;
982 }
983 
984 static void
985 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid)
986 {
987 	struct nvmf_vfio_user_qpair *qpair;
988 	struct nvmf_vfio_user_req *vu_req;
989 	uint32_t i;
990 
991 	if (ctrlr == NULL) {
992 		return;
993 	}
994 
995 	qpair = ctrlr->qp[qid];
996 	if (qpair == NULL) {
997 		return;
998 	}
999 
1000 	SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy QP%d=%p\n", ctrlr_id(ctrlr),
1001 		      qid, qpair);
1002 
1003 	unmap_qp(qpair);
1004 
1005 	for (i = 0; i < qpair->qsize; i++) {
1006 		vu_req = &qpair->reqs_internal[i];
1007 		free(vu_req->sg);
1008 	}
1009 	free(qpair->reqs_internal);
1010 
1011 	free(qpair->sq.sg);
1012 	free(qpair->cq.sg);
1013 	free(qpair);
1014 
1015 	ctrlr->qp[qid] = NULL;
1016 }
1017 
1018 /* This function can only fail because of memory allocation errors. */
1019 static int
1020 init_qp(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport,
1021 	const uint32_t qsize, const uint16_t id)
1022 {
1023 	uint32_t i;
1024 	struct nvmf_vfio_user_qpair *qpair;
1025 	struct nvmf_vfio_user_req *vu_req, *tmp;
1026 	struct spdk_nvmf_request *req;
1027 
1028 	assert(ctrlr != NULL);
1029 	assert(transport != NULL);
1030 
1031 	qpair = calloc(1, sizeof(*qpair));
1032 	if (qpair == NULL) {
1033 		return -ENOMEM;
1034 	}
1035 	qpair->sq.sg = calloc(1, dma_sg_size());
1036 	if (qpair->sq.sg == NULL) {
1037 		free(qpair);
1038 		return -ENOMEM;
1039 	}
1040 	qpair->cq.sg = calloc(1, dma_sg_size());
1041 	if (qpair->cq.sg == NULL) {
1042 		free(qpair->sq.sg);
1043 		free(qpair);
1044 		return -ENOMEM;
1045 	}
1046 
1047 	qpair->qpair.qid = id;
1048 	qpair->qpair.transport = transport;
1049 	qpair->ctrlr = ctrlr;
1050 	qpair->qsize = qsize;
1051 
1052 	TAILQ_INIT(&qpair->reqs);
1053 
1054 	qpair->reqs_internal = calloc(qsize, sizeof(struct nvmf_vfio_user_req));
1055 	if (qpair->reqs_internal == NULL) {
1056 		SPDK_ERRLOG("%s: error allocating reqs: %m\n", ctrlr_id(ctrlr));
1057 		goto reqs_err;
1058 	}
1059 
1060 	for (i = 0; i < qsize; i++) {
1061 		vu_req = &qpair->reqs_internal[i];
1062 		vu_req->sg = calloc(NVMF_VFIO_USER_MAX_IOVECS, dma_sg_size());
1063 		if (vu_req->sg == NULL) {
1064 			goto sg_err;
1065 		}
1066 
1067 		req = &vu_req->req;
1068 		req->qpair = &qpair->qpair;
1069 		req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
1070 		req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;
1071 
1072 		TAILQ_INSERT_TAIL(&qpair->reqs, vu_req, link);
1073 	}
1074 
1075 	ctrlr->qp[id] = qpair;
1076 	return 0;
1077 
1078 sg_err:
1079 	TAILQ_FOREACH_SAFE(vu_req, &qpair->reqs, link, tmp) {
1080 		free(vu_req->sg);
1081 	}
1082 	free(qpair->reqs_internal);
1083 
1084 reqs_err:
1085 	free(qpair->sq.sg);
1086 	free(qpair->cq.sg);
1087 	free(qpair);
1088 	return -ENOMEM;
1089 }
1090 
1091 /*
1092  * Creates a completion or submission I/O queue. Returns 0 on success, -errno
1093  * on error.
1094  */
1095 static int
1096 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
1097 		   struct spdk_nvme_cmd *cmd, const bool is_cq)
1098 {
1099 	uint16_t qid;
1100 	uint32_t qsize;
1101 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
1102 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
1103 	int err = 0;
1104 	struct nvmf_vfio_user_qpair *vu_qpair;
1105 	struct nvme_q *io_q;
1106 	struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport;
1107 
1108 	assert(ctrlr != NULL);
1109 	assert(cmd != NULL);
1110 
1111 	qid = cmd->cdw10_bits.create_io_q.qid;
1112 	if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) {
1113 		SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr),
1114 			    qid, vu_transport->transport.opts.max_qpairs_per_ctrlr);
1115 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1116 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1117 		goto out;
1118 	}
1119 
1120 	if (io_q_exists(ctrlr, qid, is_cq)) {
1121 		SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr),
1122 			    is_cq ? 'C' : 'S', qid);
1123 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1124 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1125 		goto out;
1126 	}
1127 
1128 	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
1129 	if (qsize == 1 || qsize > max_queue_size(ctrlr)) {
1130 		SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize);
1131 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1132 		sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE;
1133 		goto out;
1134 	}
1135 
1136 	SPDK_DEBUGLOG(nvmf_vfio,
1137 		      "%s: create I/O %cQ%d: QSIZE=%#x\n", ctrlr_id(ctrlr),
1138 		      is_cq ? 'C' : 'S', qid, qsize);
1139 
1140 	if (is_cq) {
1141 		if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
1142 			SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr));
1143 			sc = SPDK_NVME_SC_INVALID_FIELD;
1144 			goto out;
1145 		}
1146 		if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) {
1147 			SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr));
1148 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1149 			sc = SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR;
1150 			goto out;
1151 		}
1152 
1153 		err = init_qp(ctrlr, ctrlr->qp[0]->qpair.transport, qsize, qid);
1154 		if (err != 0) {
1155 			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1156 			goto out;
1157 		}
1158 
1159 		io_q = &ctrlr->qp[qid]->cq;
1160 		io_q->ien = cmd->cdw11_bits.create_io_cq.ien;
1161 		io_q->iv = cmd->cdw11_bits.create_io_cq.iv;
1162 		io_q->phase = true;
1163 	} else {
1164 		if (cmd->cdw11_bits.create_io_sq.cqid == 0) {
1165 			SPDK_ERRLOG("%s: invalid CQID 0\n", ctrlr_id(ctrlr));
1166 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1167 			sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1168 			goto out;
1169 
1170 		}
1171 		/* CQ must be created before SQ */
1172 		if (!io_q_exists(ctrlr, cmd->cdw11_bits.create_io_sq.cqid, true)) {
1173 			SPDK_ERRLOG("%s: CQ%d does not exist\n", ctrlr_id(ctrlr),
1174 				    cmd->cdw11_bits.create_io_sq.cqid);
1175 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1176 			sc = SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
1177 			goto out;
1178 		}
1179 
1180 		if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
1181 			SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
1182 			sc = SPDK_NVME_SC_INVALID_FIELD;
1183 			goto out;
1184 		}
1185 		/* TODO: support shared IO CQ */
1186 		if (qid != cmd->cdw11_bits.create_io_sq.cqid) {
1187 			SPDK_ERRLOG("%s: shared CQs are not currently supported\n", ctrlr_id(ctrlr));
1188 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1189 			sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
			goto out;
1190 		}
1191 
1192 		io_q = &ctrlr->qp[qid]->sq;
1193 		io_q->cqid = cmd->cdw11_bits.create_io_sq.cqid;
1194 		SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
1195 			      qid, io_q->cqid);
1196 	}
1197 
1198 	io_q->is_cq = is_cq;
1199 	io_q->size = qsize;
1200 	io_q->prp1 = cmd->dptr.prp.prp1;
1201 
1202 	err = map_q(ctrlr, io_q, is_cq, true);
1203 	if (err) {
1204 		sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1205 		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
1206 		goto out;
1207 	}
1208 
1209 	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped %cQ%d IOVA=%#lx vaddr=%#llx\n",
1210 		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
1211 		      qid, cmd->dptr.prp.prp1, (unsigned long long)io_q->addr);
1212 
1213 	if (is_cq) {
1214 		*hdbl(ctrlr, io_q) = 0;
1215 	} else {
1216 		vu_qpair = ctrlr->qp[qid];
1217 		*tdbl(ctrlr, io_q) = 0;
1218 		vu_qpair->sq.head = 0;
1219 
1220 		if (vu_qpair->state == VFIO_USER_QPAIR_SQ_DELETED) {
1221 			vu_qpair->state = VFIO_USER_QPAIR_ACTIVE;
1222 		} else {
1223 			/*
1224 			 * Create our new I/O qpair. This asynchronously invokes, on a
1225 			 * suitable poll group, the nvmf_vfio_user_poll_group_add()
1226 			 * callback, which will call spdk_nvmf_request_exec_fabrics()
1227 			 * with a generated fabrics connect command. This command is
1228 			 * then eventually completed via handle_queue_connect_rsp().
1229 			 */
1230 			vu_qpair->create_io_sq_cmd = *cmd;
1231 			spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt,
1232 						&vu_qpair->qpair);
1233 			return 0;
1234 		}
1235 	}
1236 
1237 out:
1238 	return post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0, cmd->cid, sc, sct);
1239 }
1240 
1241 /* For the admin Delete I/O Completion Queue command, the NVMf library will disconnect
1242  * and free the queue pair, so save the command in a context for the completion callback.
1243  */
1244 struct vfio_user_delete_cq_ctx {
1245 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
1246 	struct spdk_nvme_cmd delete_io_cq_cmd;
1247 };
1248 
1249 static void
1250 vfio_user_qpair_delete_cb(void *cb_arg)
1251 {
1252 	struct vfio_user_delete_cq_ctx *ctx = cb_arg;
1253 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr;
1254 
1255 	post_completion(vu_ctrlr, &vu_ctrlr->qp[0]->cq, 0, 0, ctx->delete_io_cq_cmd.cid,
1256 			SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
1257 	free(ctx);
1258 }
1259 
1260 /*
1261  * Deletes a completion or submission I/O queue.
1262  */
1263 static int
1264 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
1265 		struct spdk_nvme_cmd *cmd, const bool is_cq)
1266 {
1267 	uint16_t sct = SPDK_NVME_SCT_GENERIC;
1268 	uint16_t sc = SPDK_NVME_SC_SUCCESS;
1269 	struct nvmf_vfio_user_qpair *vu_qpair;
1270 	struct vfio_user_delete_cq_ctx *ctx;
1271 
1272 	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n",
1273 		      ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
1274 		      cmd->cdw10_bits.delete_io_q.qid);
1275 
1276 	if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) {
1277 		SPDK_ERRLOG("%s: I/O %cQ%d does not exist\n", ctrlr_id(ctrlr),
1278 			    is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid);
1279 		sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1280 		sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1281 		goto out;
1282 	}
1283 
1284 	vu_qpair = ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid];
1285 	if (is_cq) {
1286 		if (vu_qpair->state == VFIO_USER_QPAIR_UNINITIALIZED) {
1287 			free_qp(ctrlr, cmd->cdw10_bits.delete_io_q.qid);
1288 			goto out;
1289 		}
1290 
1291 		/* SQ must have been deleted first */
1292 		if (vu_qpair->state != VFIO_USER_QPAIR_SQ_DELETED) {
1293 			SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr));
1294 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1295 			sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION;
1296 			goto out;
1297 		}
1298 		ctx = calloc(1, sizeof(*ctx));
1299 		if (!ctx) {
1300 			sct = SPDK_NVME_SCT_GENERIC;
1301 			sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
1302 			goto out;
1303 		}
1304 		ctx->vu_ctrlr = ctrlr;
1305 		ctx->delete_io_cq_cmd = *cmd;
1306 		spdk_nvmf_qpair_disconnect(&vu_qpair->qpair, vfio_user_qpair_delete_cb, ctx);
1307 		return 0;
1308 	} else {
1309 		if (vu_qpair->state == VFIO_USER_QPAIR_SQ_DELETED) {
1310 			SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%u is already deleted\n", ctrlr_id(ctrlr),
1311 				      cmd->cdw10_bits.delete_io_q.qid);
1312 			sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
1313 			sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
1314 			goto out;
1315 		}
1316 
1317 		/*
1318 		 * This doesn't actually delete the SQ; we're merely telling the poll_group_poll
1319 		 * function to skip checking this SQ. The queue pair will be disconnected when the
1320 		 * Delete I/O CQ command is handled.
1321 		 */
1322 		vu_qpair->state = VFIO_USER_QPAIR_SQ_DELETED;
1323 		vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, vu_qpair->sq.sg, &vu_qpair->sq.iov, 1);
1324 		vu_qpair->sq.addr = NULL;
1325 	}
1326 
1327 out:
1328 	return post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0, cmd->cid, sc, sct);
1329 }
1330 
1331 /*
1332  * Returns 0 on success and -errno on error.
1333  */
1334 static int
1335 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd)
1336 {
1337 	assert(ctrlr != NULL);
1338 	assert(cmd != NULL);
1339 
1340 	if (cmd->fuse != 0) {
1341 		/* Fused admin commands are not supported. */
1342 		return post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0, cmd->cid,
1343 				       SPDK_NVME_SC_INVALID_FIELD,
1344 				       SPDK_NVME_SCT_GENERIC);
1345 	}
1346 
1347 	switch (cmd->opc) {
1348 	case SPDK_NVME_OPC_CREATE_IO_CQ:
1349 	case SPDK_NVME_OPC_CREATE_IO_SQ:
1350 		return handle_create_io_q(ctrlr, cmd,
1351 					  cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ);
1352 	case SPDK_NVME_OPC_DELETE_IO_SQ:
1353 	case SPDK_NVME_OPC_DELETE_IO_CQ:
1354 		return handle_del_io_q(ctrlr, cmd,
1355 				       cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ);
1356 	default:
1357 		return handle_cmd_req(ctrlr, cmd, ctrlr->qp[0]);
1358 	}
1359 }
1360 
1361 static int
1362 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg)
1363 {
1364 	struct nvmf_vfio_user_qpair *vu_qpair = cb_arg;
1365 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = vu_qpair->ctrlr;
1366 	uint16_t sqid, cqid;
1367 
1368 	assert(vu_qpair != NULL);
1369 	assert(vu_req != NULL);
1370 	assert(vu_ctrlr != NULL);
1371 
1372 	if (spdk_likely(vu_req->iovcnt)) {
1373 		vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, vu_req->sg, vu_req->iov, vu_req->iovcnt);
1374 	}
1375 	sqid = vu_qpair->qpair.qid;
1376 	cqid = vu_ctrlr->qp[sqid]->sq.cqid;
1377 
1378 	return post_completion(vu_ctrlr, &vu_ctrlr->qp[cqid]->cq,
1379 			       vu_req->req.rsp->nvme_cpl.cdw0,
1380 			       sqid,
1381 			       vu_req->req.cmd->nvme_cmd.cid,
1382 			       vu_req->req.rsp->nvme_cpl.status.sc,
1383 			       vu_req->req.rsp->nvme_cpl.status.sct);
1384 }
1385 
1386 static int
1387 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair,
1388 	    struct spdk_nvme_cmd *cmd)
1389 {
1390 	assert(qpair != NULL);
1391 	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
1392 		return consume_admin_cmd(ctrlr, cmd);
1393 	}
1394 
1395 	return handle_cmd_req(ctrlr, cmd, qpair);
1396 }
1397 
1398 /* Returns the number of commands processed, or a negative value on error. */
1399 static int
1400 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail,
1401 		     struct nvmf_vfio_user_qpair *qpair)
1402 {
1403 	struct spdk_nvme_cmd *queue;
1404 	int count = 0;
1405 
1406 	assert(ctrlr != NULL);
1407 	assert(qpair != NULL);
1408 
1409 	queue = qpair->sq.addr;
1410 	while (sq_head(qpair) != new_tail) {
1411 		int err;
1412 		struct spdk_nvme_cmd *cmd = &queue[sq_head(qpair)];
1413 
1414 		count++;
1415 
1416 		/*
1417 		 * SQHD must contain the new head pointer, so we must increase
1418 		 * it before we generate a completion.
1419 		 */
1420 		sqhd_advance(ctrlr, qpair);
1421 
1422 		err = consume_cmd(ctrlr, qpair, cmd);
1423 		if (err != 0) {
1424 			return err;
1425 		}
1426 	}
1427 
1428 	return count;
1429 }
1430 
1431 static int
1432 enable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1433 {
1434 	int err;
1435 
1436 	assert(ctrlr != NULL);
1437 
1438 	err = acq_setup(ctrlr);
1439 	if (err != 0) {
1440 		return err;
1441 	}
1442 
1443 	err = asq_setup(ctrlr);
1444 	if (err != 0) {
1445 		return err;
1446 	}
1447 
1448 	return 0;
1449 }
1450 
1451 static void
1452 disable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
1453 {
1454 	assert(ctrlr->qp[0] != NULL);
1455 
1456 	unmap_qp(ctrlr->qp[0]);
1457 }
1458 
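/*
 * Called by libvfio-user when the client adds a DMA region. Registers the
 * memory with SPDK when it is both readable and writable, then retries mapping
 * any queue pairs that went inactive because their queues lived in a
 * previously removed region.
 */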
1459 static void
1460 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
1461 {
1462 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1463 	struct nvmf_vfio_user_ctrlr *ctrlr;
1464 	struct nvmf_vfio_user_qpair *qpair;
1465 	int ret;
1466 
1467 	/*
1468 	 * We're not interested in any DMA regions that aren't mappable (we don't
1469 	 * support clients that don't share their memory).
1470 	 */
1471 	if (!info->vaddr) {
1472 		return;
1473 	}
1474 
1475 	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
1476 	    (info->mapping.iov_len & MASK_2MB)) {
1477 		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr,
1478 			      (uintptr_t)info->mapping.iov_base,
1479 			      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1480 		return;
1481 	}
1482 
1483 	assert(endpoint != NULL);
1484 	if (endpoint->ctrlr == NULL) {
1485 		return;
1486 	}
1487 	ctrlr = endpoint->ctrlr;
1488 
1489 	SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %#lx-%#lx\n", ctrlr_id(ctrlr),
1490 		      (uintptr_t)info->mapping.iov_base,
1491 		      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1492 
1493 	/* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering with
1494 	 * VFIO, so also check the protection bits before registering the memory with SPDK.
1495 	 */
1496 	if (info->prot == (PROT_WRITE | PROT_READ)) {
1497 		ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len);
1498 		if (ret) {
1499 			SPDK_ERRLOG("Memory region register %#lx-%#lx failed, ret=%d\n",
1500 				    (uint64_t)(uintptr_t)info->mapping.iov_base,
1501 				    (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len,
1502 				    ret);
1503 		}
1504 	}
1505 
1506 	pthread_mutex_lock(&endpoint->lock);
1507 	TAILQ_FOREACH(qpair, &ctrlr->connected_qps, tailq) {
1508 		if (qpair->state != VFIO_USER_QPAIR_INACTIVE) {
1509 			continue;
1510 		}
1511 
1512 		ret = remap_qp(qpair);
1513 		if (ret) {
1514 			continue;
1515 		}
1516 		qpair->state = VFIO_USER_QPAIR_ACTIVE;
1517 		SPDK_DEBUGLOG(nvmf_vfio, "Remapped QP %u successfully\n", qpair->qpair.qid);
1518 	}
1519 	pthread_mutex_unlock(&endpoint->lock);
1520 }
1521 
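/*
 * Counterpart of memory_region_add_cb(): any queue pair whose SQ or CQ falls
 * within the region being removed is unmapped and marked inactive so that it
 * can be remapped later, and the memory is unregistered from SPDK.
 */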
1522 static int
1523 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
1524 {
1525 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1526 	struct nvmf_vfio_user_ctrlr *ctrlr;
1527 	struct nvmf_vfio_user_qpair *qpair;
1528 	void *map_start, *map_end;
1529 	int ret = 0;
1530 
1531 	if (!info->vaddr) {
1532 		return 0;
1533 	}
1534 
1535 	if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
1536 	    (info->mapping.iov_len & MASK_2MB)) {
1537 		SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %#lx-%#lx\n", info->vaddr,
1538 			      (uintptr_t)info->mapping.iov_base,
1539 			      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1540 		return 0;
1541 	}
1542 
1543 	assert(endpoint != NULL);
1544 	if (endpoint->ctrlr == NULL) {
1545 		return 0;
1546 	}
1547 	ctrlr = endpoint->ctrlr;
1548 
1549 	SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %#lx-%#lx\n", ctrlr_id(ctrlr),
1550 		      (uintptr_t)info->mapping.iov_base,
1551 		      (uintptr_t)info->mapping.iov_base + info->mapping.iov_len);
1552 
1553 	map_start = info->mapping.iov_base;
1554 	map_end = info->mapping.iov_base + info->mapping.iov_len;
1555 
1556 	pthread_mutex_lock(&endpoint->lock);
1557 	TAILQ_FOREACH(qpair, &ctrlr->connected_qps, tailq) {
1558 		if ((qpair->cq.addr >= map_start && qpair->cq.addr <= map_end) ||
1559 		    (qpair->sq.addr >= map_start && qpair->sq.addr <= map_end)) {
1560 			/* TODO: Ideally we should disconnect this queue pair
1561 			 * before returning to the caller.
1562 			 */
1563 			unmap_qp(qpair);
1564 			qpair->state = VFIO_USER_QPAIR_INACTIVE;
1565 		}
1566 	}
1567 	pthread_mutex_unlock(&endpoint->lock);
1568 
1569 	if (info->prot == (PROT_WRITE | PROT_READ)) {
1570 		ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len);
1571 		if (ret) {
1572 			SPDK_ERRLOG("Memory region unregister %#lx-%#lx failed, ret=%d\n",
1573 				    (uint64_t)(uintptr_t)info->mapping.iov_base,
1574 				    (uint64_t)(uintptr_t)info->mapping.iov_base + info->mapping.iov_len,
1575 				    ret);
1576 		}
1577 	}
1578 
1579 	return 0;
1580 }
1581 
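/*
 * Completion callback for the fabrics Property Get/Set commands generated in
 * access_bar0_fn(). For CC writes, the new value is compared against the saved
 * old CC to decide whether the admin queue needs to be mapped or unmapped.
 */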
1582 static int
1583 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
1584 {
1585 	struct nvmf_vfio_user_qpair *vu_qpair = cb_arg;
1586 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
1587 	bool disable_admin = false;
1588 	int ret;
1589 
1590 	assert(vu_qpair != NULL);
1591 	assert(req != NULL);
1592 
1593 	if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) {
1594 		assert(vu_qpair->ctrlr != NULL);
1595 		assert(req != NULL);
1596 
1597 		memcpy(req->req.data,
1598 		       &req->req.rsp->prop_get_rsp.value.u64,
1599 		       req->req.length);
1600 	} else {
1601 		assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET);
1602 		assert(vu_qpair->ctrlr != NULL);
1603 		vu_ctrlr = vu_qpair->ctrlr;
1604 
1605 		if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) {
1606 			union spdk_nvme_cc_register cc, diff;
1607 
1608 			cc.raw = req->req.cmd->prop_set_cmd.value.u64;
1609 			diff.raw = cc.raw ^ req->cc.raw;
1610 
1611 			if (diff.bits.en) {
1612 				if (cc.bits.en) {
1613 					SPDK_DEBUGLOG(nvmf_vfio, "%s: MAP Admin queue\n", ctrlr_id(vu_ctrlr));
1614 					ret = enable_admin_queue(vu_ctrlr);
1615 					if (ret) {
1616 						SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(vu_ctrlr));
1617 						return ret;
1618 					}
1619 					vu_qpair->state = VFIO_USER_QPAIR_ACTIVE;
1620 				} else {
1621 					disable_admin = true;
1622 				}
1623 			}
1624 
1625 			if (diff.bits.shn) {
1626 				if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) {
1627 					disable_admin = true;
1628 				}
1629 			}
1630 
1631 			if (disable_admin) {
1632 				SPDK_DEBUGLOG(nvmf_vfio,
1633 					      "%s: UNMAP Admin queue\n",
1634 					      ctrlr_id(vu_ctrlr));
1635 				vu_qpair->state = VFIO_USER_QPAIR_INACTIVE;
1636 				disable_admin_queue(vu_ctrlr);
1637 				/* For PCIe controller reset or shutdown, we will drop all AER responses */
1638 				nvmf_ctrlr_abort_aer(vu_qpair->qpair.ctrlr);
1639 			}
1640 		}
1641 	}
1642 
1643 	return 0;
1644 }
1645 
1646 /*
1647  * Handles an access at offset 0x1000 or more; this is the non-mapped path when a
1648  * doorbell is read or written via access_bar0_fn().
1649  *
1650  * DSTRD is set to a fixed value of 0 for NVMf.
1651  */
1653 static int
1654 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf,
1655 		  const size_t count, loff_t pos, const bool is_write)
1656 {
1657 	assert(ctrlr != NULL);
1658 	assert(buf != NULL);
1659 
1660 	if (count != sizeof(uint32_t)) {
1661 		SPDK_ERRLOG("%s: bad doorbell buffer size %zu\n",
1662 			    ctrlr_id(ctrlr), count);
1663 		errno = EINVAL;
1664 		return -1;
1665 	}
1666 
1667 	pos -= NVMF_VFIO_USER_DOORBELLS_OFFSET;
1668 
1669 	/* pos must be dword aligned */
1670 	if ((pos & 0x3) != 0) {
1671 		SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos);
1672 		errno = EINVAL;
1673 		return -1;
1674 	}
1675 
1676 	/* convert byte offset to array index */
1677 	pos >>= 2;
1678 
1679 	if (pos >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR * 2) {
1680 		SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos);
1681 		errno = EINVAL;
1682 		return -1;
1683 	}
1684 
1685 	if (is_write) {
1686 		ctrlr->doorbells[pos] = *buf;
1687 		spdk_wmb();
1688 	} else {
1689 		spdk_rmb();
1690 		*buf = ctrlr->doorbells[pos];
1691 	}
1692 	return 0;
1693 }
1694 
1695 static ssize_t
1696 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos,
1697 	       bool is_write)
1698 {
1699 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1700 	struct nvmf_vfio_user_ctrlr *ctrlr;
1701 	struct nvmf_vfio_user_req *req;
1702 	const struct spdk_nvmf_registers *regs;
1703 	int ret;
1704 
1705 	ctrlr = endpoint->ctrlr;
1706 
1707 	SPDK_DEBUGLOG(nvmf_vfio,
1708 		      "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n",
1709 		      endpoint_id(endpoint), is_write ? "write" : "read",
1710 		      ctrlr, count, pos);
1711 
1712 	if (pos >= NVMF_VFIO_USER_DOORBELLS_OFFSET) {
1713 		/*
1714 		 * The fact that the doorbells can be memory mapped doesn't mean
1715 		 * that the client (VFIO in QEMU) is obliged to memory map them;
1716 		 * it might still elect to access them via regular read/write.
1717 		 * We might also have had disable_mappable_bar0 set.
1718 		 */
1719 		ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count,
1720 					pos, is_write);
1721 		if (ret == 0) {
1722 			return count;
1723 		}
1724 		return ret;
1725 	}
1726 
1727 	/* Construct a Fabric Property Get/Set command and send it */
1728 	req = get_nvmf_vfio_user_req(ctrlr->qp[0]);
1729 	if (req == NULL) {
1730 		errno = ENOBUFS;
1731 		return -1;
1732 	}
1733 	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
1734 	req->cc.raw = regs->cc.raw;
1735 
1736 	req->cb_fn = nvmf_vfio_user_prop_req_rsp;
1737 	req->cb_arg = ctrlr->qp[0];
1738 	req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC;
1739 	req->req.cmd->prop_set_cmd.cid = 0;
1740 	req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1;
1741 	req->req.cmd->prop_set_cmd.ofst = pos;
1742 	if (is_write) {
1743 		req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET;
1744 		if (req->req.cmd->prop_set_cmd.attrib.size) {
1745 			req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf;
1746 		} else {
1747 			req->req.cmd->prop_set_cmd.value.u32.high = 0;
1748 			req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf;
1749 		}
1750 	} else {
1751 		req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET;
1752 	}
1753 	req->req.length = count;
1754 	req->req.data = buf;
1755 
1756 	spdk_nvmf_request_exec_fabrics(&req->req);
1757 
1758 	return count;
1759 }
1760 
1761 /*
1762  * The NVMe driver reads 4096 bytes, which is the size of the extended PCI
1763  * configuration space available on PCI-X 2.0 and PCI Express buses.
1764  */
1765 static ssize_t
1766 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset,
1767 		  bool is_write)
1768 {
1769 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1770 
1771 	if (is_write) {
1772 		SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n",
1773 			    endpoint_id(endpoint), offset, offset + count);
1774 		errno = EINVAL;
1775 		return -1;
1776 	}
1777 
1778 	if (offset + count > PCI_CFG_SPACE_EXP_SIZE) {
1779 		SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n",
1780 			    endpoint_id(endpoint), offset, count,
1781 			    PCI_CFG_SPACE_EXP_SIZE);
1782 		errno = ERANGE;
1783 		return -1;
1784 	}
1785 
1786 	memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count);
1787 
1788 	return count;
1789 }
1790 
1791 static void
1792 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg)
1793 {
1794 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
1795 
1796 	if (level >= LOG_DEBUG) {
1797 		SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
1798 	} else if (level >= LOG_INFO) {
1799 		SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
1800 	} else if (level >= LOG_NOTICE) {
1801 		SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg);
1802 	} else if (level >= LOG_WARNING) {
1803 		SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg);
1804 	} else {
1805 		SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg);
1806 	}
1807 }
1808 
1809 static int
1810 vfio_user_get_log_level(void)
1811 {
1812 	int level;
1813 
1814 	if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) {
1815 		return LOG_DEBUG;
1816 	}
1817 
1818 	level = spdk_log_to_syslog_level(spdk_log_get_level());
1819 	if (level < 0) {
1820 		return LOG_ERR;
1821 	}
1822 
1823 	return level;
1824 }
1825 
1826 static void
1827 init_pci_config_space(vfu_pci_config_space_t *p)
1828 {
1829 	/* MLBAR */
1830 	p->hdr.bars[0].raw = 0x0;
1831 	/* MUBAR */
1832 	p->hdr.bars[1].raw = 0x0;
1833 
1834 	/* vendor specific, let's set them to zero for now */
1835 	p->hdr.bars[3].raw = 0x0;
1836 	p->hdr.bars[4].raw = 0x0;
1837 	p->hdr.bars[5].raw = 0x0;
1838 
1839 	/* enable INTx */
1840 	p->hdr.intr.ipin = 0x1;
1841 }
1842 
1843 static int
1844 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport,
1845 			struct nvmf_vfio_user_endpoint *endpoint)
1846 {
1847 	int ret;
1848 	ssize_t cap_offset;
1849 	vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx;
1850 
1851 	struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 };
1852 	struct pxcap pxcap = {
1853 		.hdr.id = PCI_CAP_ID_EXP,
1854 		.pxcaps.ver = 0x2,
1855 		.pxdcap = {.rer = 0x1, .flrc = 0x1},
1856 		.pxdcap2.ctds = 0x1
1857 	};
1858 
1859 	struct msixcap msixcap = {
1860 		.hdr.id = PCI_CAP_ID_MSIX,
1861 		.mxc.ts = NVME_IRQ_MSIX_NUM - 1,
1862 		.mtab = {.tbir = 0x4, .to = 0x0},
1863 		.mpba = {.pbir = 0x5, .pbao = 0x0}
1864 	};
1865 
1866 	static struct iovec sparse_mmap[] = {
1867 		{
1868 			.iov_base = (void *)NVMF_VFIO_USER_DOORBELLS_OFFSET,
1869 			.iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE,
1870 		},
1871 	};
1872 
1873 	ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0);
1874 	if (ret < 0) {
1875 		SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx);
1876 		return ret;
1877 	}
1878 	vfu_pci_set_id(vfu_ctx, 0x4e58, 0x0001, 0, 0);
1879 	/*
1880 	 * 0x02, controller uses the NVM Express programming interface
1881 	 * 0x08, non-volatile memory controller
1882 	 * 0x01, mass storage controller
1883 	 */
1884 	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);
1885 
1886 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
1887 	if (cap_offset < 0) {
1888 		SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx);
1889 		return cap_offset;
1890 	}
1891 
1892 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
1893 	if (cap_offset < 0) {
1894 		SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx);
1895 		return cap_offset;
1896 	}
1897 
1898 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
1899 	if (cap_offset < 0) {
1900 		SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx);
1901 		return cap_offset;
1902 	}
1903 
1904 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
1905 			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
1906 	if (ret < 0) {
1907 		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
1908 		return ret;
1909 	}
1910 
1911 	if (vu_transport->transport_opts.disable_mappable_bar0) {
1912 		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
1913 				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
1914 				       NULL, 0, -1, 0);
1915 	} else {
1916 		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
1917 				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
1918 				       sparse_mmap, 1, endpoint->devmem_fd, 0);
1919 	}
1920 
1921 	if (ret < 0) {
1922 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
1923 		return ret;
1924 	}
1925 
1926 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, PAGE_SIZE,
1927 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
1928 	if (ret < 0) {
1929 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
1930 		return ret;
1931 	}
1932 
1933 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, PAGE_SIZE,
1934 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
1935 	if (ret < 0) {
1936 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
1937 		return ret;
1938 	}
1939 
1940 	ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
1941 	if (ret < 0) {
1942 		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
1943 		return ret;
1944 	}
1945 
1946 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
1947 	if (ret < 0) {
1948 		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
1949 		return ret;
1950 	}
1951 
1952 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
1953 	if (ret < 0) {
1954 		SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
1955 		return ret;
1956 	}
1957 
1958 	ret = vfu_realize_ctx(vfu_ctx);
1959 	if (ret < 0) {
1960 		SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx);
1961 		return ret;
1962 	}
1963 
1964 	endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx);
1965 	assert(endpoint->pci_config_space != NULL);
1966 	init_pci_config_space(endpoint->pci_config_space);
1967 
1968 	assert(cap_offset != 0);
1969 	endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset);
1970 
1971 	return 0;
1972 }
1973 
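/* Final controller teardown; runs on the thread that owns the controller. */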
1974 static void
1975 _free_ctrlr(void *ctx)
1976 {
1977 	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
1978 
1979 	spdk_poller_unregister(&ctrlr->vfu_ctx_poller);
1980 	free(ctrlr);
1981 }
1982 
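/*
 * Free a controller, optionally destroying all of its queue pairs first. The
 * final free is deferred to the controller's thread if we are not already
 * running on it.
 */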
1983 static void
1984 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr, bool free_qps)
1985 {
1986 	int i;
1987 	assert(ctrlr != NULL);
1988 
1989 	SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr));
1990 
1991 	if (free_qps) {
1992 		for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
1993 			free_qp(ctrlr, i);
1994 		}
1995 	}
1996 
1997 	if (ctrlr->thread == spdk_get_thread()) {
1998 		_free_ctrlr(ctrlr);
1999 	} else {
2000 		spdk_thread_send_msg(ctrlr->thread, _free_ctrlr, ctrlr);
2001 	}
2002 }
2003 
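/*
 * Called from the accept poller once a client has attached to an endpoint:
 * allocate the vfio-user controller, create its admin queue pair and hand
 * that queue pair to the generic NVMe-oF layer.
 */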
2004 static void
2005 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport,
2006 			    struct nvmf_vfio_user_endpoint *endpoint)
2007 {
2008 	struct nvmf_vfio_user_ctrlr *ctrlr;
2009 	int err = 0;
2010 
2011 	/* First, construct a vfio-user CUSTOM transport controller */
2012 	ctrlr = calloc(1, sizeof(*ctrlr));
2013 	if (ctrlr == NULL) {
2014 		err = -ENOMEM;
2015 		goto out;
2016 	}
2017 	ctrlr->cntlid = 0xffff;
2018 	ctrlr->transport = transport;
2019 	ctrlr->endpoint = endpoint;
2020 	ctrlr->doorbells = endpoint->doorbells;
2021 	TAILQ_INIT(&ctrlr->connected_qps);
2022 
2023 	/* Then, construct an admin queue pair */
2024 	err = init_qp(ctrlr, &transport->transport, NVMF_VFIO_USER_DEFAULT_AQ_DEPTH, 0);
2025 	if (err != 0) {
2026 		free(ctrlr);
2027 		goto out;
2028 	}
2029 	endpoint->ctrlr = ctrlr;
2030 
2031 	/* Notify the generic layer about the new admin queue pair */
2032 	spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->qp[0]->qpair);
2033 
2034 out:
2035 	if (err != 0) {
2036 		SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n",
2037 			    endpoint_id(endpoint), strerror(-err));
2038 	}
2039 }
2040 
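/*
 * Start listening on the given address: create the per-endpoint "bar0" file
 * that backs the mmap'd doorbells and the "cntrl" vfio-user socket, then set
 * up and realize the libvfio-user context.
 */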
2041 static int
2042 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport,
2043 		      const struct spdk_nvme_transport_id *trid,
2044 		      struct spdk_nvmf_listen_opts *listen_opts)
2045 {
2046 	struct nvmf_vfio_user_transport *vu_transport;
2047 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
2048 	char *path = NULL;
2049 	char uuid[PATH_MAX] = {};
2050 	int fd;
2051 	int err;
2052 
2053 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
2054 					transport);
2055 
2056 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
2057 		/* Only compare traddr */
2058 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
2059 			return -EEXIST;
2060 		}
2061 	}
2062 
2063 	endpoint = calloc(1, sizeof(*endpoint));
2064 	if (!endpoint) {
2065 		return -ENOMEM;
2066 	}
2067 
2068 	endpoint->devmem_fd = -1;
2069 	memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));
2070 
2071 	err = asprintf(&path, "%s/bar0", endpoint_id(endpoint));
2072 	if (err == -1) {
2073 		goto out;
2074 	}
2075 
2076 	fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
2077 	if (fd == -1) {
2078 		err = -errno;
2079 		SPDK_ERRLOG("%s: failed to open device memory at %s: %m\n",
2080 			    endpoint_id(endpoint), path);
2081 		free(path);
2082 		goto out;
2083 	}
2084 	free(path);
2085 
2086 	endpoint->devmem_fd = fd;
2087 	err = ftruncate(fd, NVMF_VFIO_USER_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE);
2088 	if (err != 0) {
2089 		goto out;
2090 	}
2091 
2092 	endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
2093 				   PROT_READ | PROT_WRITE, MAP_SHARED, fd, NVMF_VFIO_USER_DOORBELLS_OFFSET);
2094 	if (endpoint->doorbells == MAP_FAILED) {
2095 		endpoint->doorbells = NULL;
2096 		err = -errno;
2097 		goto out;
2098 	}
2099 
2100 	snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
2101 
2102 	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
2103 					   endpoint, VFU_DEV_TYPE_PCI);
2104 	if (endpoint->vfu_ctx == NULL) {
2105 		SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
2106 			    endpoint_id(endpoint));
2107 		err = -1;
2108 		goto out;
2109 	}
2110 	vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, vfio_user_get_log_level());
2111 
2112 	err = vfio_user_dev_info_fill(vu_transport, endpoint);
2113 	if (err < 0) {
2114 		goto out;
2115 	}
2116 
2117 	pthread_mutex_init(&endpoint->lock, NULL);
2118 	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
2119 	SPDK_DEBUGLOG(nvmf_vfio, "%s: doorbells %p\n", uuid, endpoint->doorbells);
2120 
2121 out:
2122 	if (err != 0) {
2123 		nvmf_vfio_user_destroy_endpoint(endpoint);
2124 	}
2125 
2126 	return err;
2127 }
2128 
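/*
 * Stop listening on the given address: remove the matching endpoint, freeing
 * its controller (and all queue pairs) if a client is still connected.
 */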
2129 static void
2130 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
2131 			   const struct spdk_nvme_transport_id *trid)
2132 {
2133 	struct nvmf_vfio_user_transport *vu_transport;
2134 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
2135 
2136 	assert(trid != NULL);
2137 	assert(trid->traddr != NULL);
2138 
2139 	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
2140 
2141 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
2142 					transport);
2143 
2144 	pthread_mutex_lock(&vu_transport->lock);
2145 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
2146 		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
2147 			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
2148 			if (endpoint->ctrlr) {
2149 				/* The user may stop the NVMe-oF target while the
2150 				 * VM is still connected; free all resources.
2151 				 */
2152 				free_ctrlr(endpoint->ctrlr, true);
2153 			}
2154 			nvmf_vfio_user_destroy_endpoint(endpoint);
2155 			pthread_mutex_unlock(&vu_transport->lock);
2156 
2157 			return;
2158 		}
2159 	}
2160 	pthread_mutex_unlock(&vu_transport->lock);
2161 
2162 	SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr);
2163 }
2164 
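/* Adjust the controller data reported to the host for this transport. */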
2165 static void
2166 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport,
2167 			  struct spdk_nvmf_subsystem *subsystem,
2168 			  struct spdk_nvmf_ctrlr_data *cdata)
2169 {
2170 	memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls));
2171 	cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED;
2172 	/* libvfio-user can only support 1 connection for now, so reservations are not needed */
2173 	cdata->oncs.reservations = 0;
2174 }
2175 
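/* Associate the endpoint matching the given listen address with a subsystem. */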
2176 static int
2177 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport,
2178 				const struct spdk_nvmf_subsystem *subsystem,
2179 				const struct spdk_nvme_transport_id *trid)
2180 {
2181 	struct nvmf_vfio_user_transport *vu_transport;
2182 	struct nvmf_vfio_user_endpoint *endpoint;
2183 
2184 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);
2185 
2186 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
2187 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
2188 			break;
2189 		}
2190 	}
2191 
2192 	if (endpoint == NULL) {
2193 		return -ENOENT;
2194 	}
2195 
2196 	endpoint->subsystem = subsystem;
2197 
2198 	return 0;
2199 }
2200 
2201 /*
2202  * Executed periodically, by default every
2203  * SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US microseconds.
2204  *
2205  * For each transport endpoint (which at the libvfio-user level corresponds to
2206  * a socket), if we don't currently have a controller set up, peek to see if the
2207  * socket is able to accept a new connection.
2208  *
2209  * This poller also takes care of handling the creation of any pending new
2210  * qpairs.
2211  *
2212  * Returns the number of events handled.
2213  */
2214 static uint32_t
2215 nvmf_vfio_user_accept(struct spdk_nvmf_transport *transport)
2216 {
2217 	struct nvmf_vfio_user_transport *vu_transport;
2218 	struct nvmf_vfio_user_endpoint *endpoint;
2219 	uint32_t count = 0;
2220 	int err;
2221 
2222 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
2223 					transport);
2224 
2225 	pthread_mutex_lock(&vu_transport->lock);
2226 
2227 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
2228 		if (endpoint->ctrlr != NULL) {
2229 			continue;
2230 		}
2231 
2232 		err = vfu_attach_ctx(endpoint->vfu_ctx);
2233 		if (err != 0) {
2234 			if (errno == EAGAIN || errno == EWOULDBLOCK) {
2235 				continue;
2236 			}
2237 
2238 			pthread_mutex_unlock(&vu_transport->lock);
2239 			return 1;
2240 		}
2241 
2242 		count++;
2243 
2244 		/* Construct a controller */
2245 		nvmf_vfio_user_create_ctrlr(vu_transport, endpoint);
2246 	}
2247 
2248 	pthread_mutex_unlock(&vu_transport->lock);
2249 
2250 	return count;
2251 }
2252 
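/* vfio-user has nothing to add to discovery log page entries. */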
2253 static void
2254 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport,
2255 			struct spdk_nvme_transport_id *trid,
2256 			struct spdk_nvmf_discovery_log_page_entry *entry)
2257 { }
2258 
2259 static struct spdk_nvmf_transport_poll_group *
2260 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport)
2261 {
2262 	struct nvmf_vfio_user_poll_group *vu_group;
2263 
2264 	SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n");
2265 
2266 	vu_group = calloc(1, sizeof(*vu_group));
2267 	if (vu_group == NULL) {
2268 		SPDK_ERRLOG("Error allocating poll group: %m\n");
2269 		return NULL;
2270 	}
2271 
2272 	TAILQ_INIT(&vu_group->qps);
2273 
2274 	return &vu_group->group;
2275 }
2276 
2277 /* called when process exits */
2278 static void
2279 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
2280 {
2281 	struct nvmf_vfio_user_poll_group *vu_group;
2282 
2283 	SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n");
2284 
2285 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
2286 
2287 	free(vu_group);
2288 }
2289 
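/*
 * Completion callback for spdk_nvmf_qpair_disconnect(): once the last
 * connected queue pair is gone, free the controller itself.
 */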
2290 static void
2291 vfio_user_qpair_disconnect_cb(void *ctx)
2292 {
2293 	struct nvmf_vfio_user_endpoint *endpoint = ctx;
2294 	struct nvmf_vfio_user_ctrlr *ctrlr;
2295 
2296 	pthread_mutex_lock(&endpoint->lock);
2297 	ctrlr = endpoint->ctrlr;
2298 	if (!ctrlr) {
2299 		pthread_mutex_unlock(&endpoint->lock);
2300 		return;
2301 	}
2302 
2303 	if (TAILQ_EMPTY(&ctrlr->connected_qps)) {
2304 		endpoint->ctrlr = NULL;
2305 		free_ctrlr(ctrlr, false);
2306 		pthread_mutex_unlock(&endpoint->lock);
2307 		return;
2308 	}
2309 	pthread_mutex_unlock(&endpoint->lock);
2310 }
2311 
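/*
 * The initiator has gone away: disconnect every connected queue pair; the
 * controller is freed once the last disconnect completes, or immediately if
 * no queue pairs are connected.
 */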
2312 static int
2313 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
2314 {
2315 	struct nvmf_vfio_user_qpair *qpair;
2316 	struct nvmf_vfio_user_endpoint *endpoint;
2317 
2318 	SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr));
2319 
2320 	endpoint = ctrlr->endpoint;
2321 	assert(endpoint != NULL);
2322 
2323 	pthread_mutex_lock(&endpoint->lock);
2324 	if (TAILQ_EMPTY(&ctrlr->connected_qps)) {
2325 		endpoint->ctrlr = NULL;
2326 		free_ctrlr(ctrlr, false);
2327 		pthread_mutex_unlock(&endpoint->lock);
2328 		return 0;
2329 	}
2330 
2331 	TAILQ_FOREACH(qpair, &ctrlr->connected_qps, tailq) {
2332 		spdk_nvmf_qpair_disconnect(&qpair->qpair, vfio_user_qpair_disconnect_cb, endpoint);
2333 	}
2334 	pthread_mutex_unlock(&endpoint->lock);
2335 
2336 	return 0;
2337 }
2338 
2339 /*
2340  * Poll for and process any incoming vfio-user messages.
2341  */
2342 static int
2343 vfio_user_poll_vfu_ctx(void *ctx)
2344 {
2345 	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
2346 	int ret;
2347 
2348 	assert(ctrlr != NULL);
2349 
2350 	/* This will call access_bar0_fn() if there are any accesses
2351 	 * to the portion of BAR0 that is not mmap'd. */
2352 	ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
2353 	if (spdk_unlikely(ret == -1)) {
2354 		spdk_poller_unregister(&ctrlr->vfu_ctx_poller);
2355 
2356 		/* The initiator shut down or reset; wait for it to re-connect. */
2357 		if (errno == ENOTCONN) {
2358 			vfio_user_destroy_ctrlr(ctrlr);
2359 			return SPDK_POLLER_BUSY;
2360 		}
2361 
2362 		fail_ctrlr(ctrlr);
2363 	}
2364 
2365 	return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
2366 }
2367 
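/*
 * Completion callback for the internal CONNECT command issued from
 * nvmf_vfio_user_poll_group_add(): mark the queue pair active, start the
 * vfu context poller for the admin queue, and for I/O queues complete the
 * originating CREATE I/O SQ command.
 */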
2368 static int
2369 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
2370 {
2371 	struct nvmf_vfio_user_poll_group *vu_group;
2372 	struct nvmf_vfio_user_qpair *qpair = cb_arg;
2373 	struct nvmf_vfio_user_ctrlr *ctrlr;
2374 	struct nvmf_vfio_user_endpoint *endpoint;
2375 
2376 	assert(qpair != NULL);
2377 	assert(req != NULL);
2378 
2379 	ctrlr = qpair->ctrlr;
2380 	assert(ctrlr != NULL);
2381 	endpoint = ctrlr->endpoint;
2382 	assert(endpoint != NULL);
2383 
2384 	if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) {
2385 		SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct);
2386 		endpoint->ctrlr = NULL;
2387 		free_ctrlr(ctrlr, true);
2388 		return -1;
2389 	}
2390 
2391 	vu_group = SPDK_CONTAINEROF(qpair->group, struct nvmf_vfio_user_poll_group, group);
2392 	TAILQ_INSERT_TAIL(&vu_group->qps, qpair, link);
2393 	qpair->state = VFIO_USER_QPAIR_ACTIVE;
2394 
2395 	pthread_mutex_lock(&endpoint->lock);
2396 	if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
2397 		ctrlr->cntlid = qpair->qpair.ctrlr->cntlid;
2398 		ctrlr->thread = spdk_get_thread();
2399 		ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, ctrlr, 0);
2400 	} else {
2401 		/* For I/O queues this command was generated in response to an
2402 		 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet
2403 		 * been completed. Complete it now.
2404 		 */
2405 		post_completion(ctrlr, &ctrlr->qp[0]->cq, 0, 0,
2406 				qpair->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
2407 	}
2408 	TAILQ_INSERT_TAIL(&ctrlr->connected_qps, qpair, tailq);
2409 	pthread_mutex_unlock(&endpoint->lock);
2410 
2411 	free(req->req.data);
2412 	req->req.data = NULL;
2413 
2414 	return 0;
2415 }
2416 
2417 /*
2418  * Add the given qpair to the given poll group. New qpairs are added via
2419  * spdk_nvmf_tgt_new_qpair(), which picks a poll group, then calls back
2420  * here via nvmf_transport_poll_group_add().
2421  */
2422 static int
2423 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
2424 			      struct spdk_nvmf_qpair *qpair)
2425 {
2426 	struct nvmf_vfio_user_qpair *vu_qpair;
2427 	struct nvmf_vfio_user_req *vu_req;
2428 	struct nvmf_vfio_user_ctrlr *ctrlr;
2429 	struct spdk_nvmf_request *req;
2430 	struct spdk_nvmf_fabric_connect_data *data;
2431 	bool admin;
2432 
2433 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2434 	vu_qpair->group = group;
2435 	ctrlr = vu_qpair->ctrlr;
2436 
2437 	SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n",
2438 		      ctrlr_id(ctrlr), vu_qpair->qpair.qid,
2439 		      vu_qpair, qpair, group);
2440 
2441 	admin = nvmf_qpair_is_admin_queue(&vu_qpair->qpair);
2442 
2443 	vu_req = get_nvmf_vfio_user_req(vu_qpair);
2444 	if (vu_req == NULL) {
2445 		return -1;
2446 	}
2447 
2448 	req = &vu_req->req;
2449 	req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
2450 	req->cmd->connect_cmd.cid = 0;
2451 	req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
2452 	req->cmd->connect_cmd.recfmt = 0;
2453 	req->cmd->connect_cmd.sqsize = vu_qpair->qsize - 1;
2454 	req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;
2455 
2456 	req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
2457 	req->data = calloc(1, req->length);
2458 	if (req->data == NULL) {
2459 		nvmf_vfio_user_req_free(req);
2460 		return -ENOMEM;
2461 	}
2462 
2463 	data = (struct spdk_nvmf_fabric_connect_data *)req->data;
2464 	data->cntlid = admin ? 0xFFFF : ctrlr->cntlid;
2465 	snprintf(data->subnqn, sizeof(data->subnqn), "%s",
2466 		 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));
2467 
2468 	vu_req->cb_fn = handle_queue_connect_rsp;
2469 	vu_req->cb_arg = vu_qpair;
2470 
2471 	SPDK_DEBUGLOG(nvmf_vfio,
2472 		      "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n",
2473 		      ctrlr_id(ctrlr), qpair->qid, data->cntlid);
2474 
2475 	spdk_nvmf_request_exec_fabrics(req);
2476 	return 0;
2477 }
2478 
2479 static int
2480 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
2481 				 struct spdk_nvmf_qpair *qpair)
2482 {
2483 	struct nvmf_vfio_user_qpair *vu_qpair;
2484 	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
2485 	struct nvmf_vfio_user_endpoint *endpoint;
2486 	struct nvmf_vfio_user_poll_group *vu_group;
2487 
2488 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2489 	vu_ctrlr = vu_qpair->ctrlr;
2490 	endpoint = vu_ctrlr->endpoint;
2491 
2492 	SPDK_DEBUGLOG(nvmf_vfio,
2493 		      "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n",
2494 		      ctrlr_id(vu_qpair->ctrlr), qpair->qid, qpair, group);
2495 
2497 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
2498 	TAILQ_REMOVE(&vu_group->qps, vu_qpair, link);
2499 
2500 	pthread_mutex_lock(&endpoint->lock);
2501 	TAILQ_REMOVE(&vu_ctrlr->connected_qps, vu_qpair, tailq);
2502 	pthread_mutex_unlock(&endpoint->lock);
2503 
2504 	return 0;
2505 }
2506 
2507 static void
2508 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_qpair *vu_qpair, struct nvmf_vfio_user_req *vu_req)
2509 {
2510 	memset(&vu_req->cmd, 0, sizeof(vu_req->cmd));
2511 	memset(&vu_req->rsp, 0, sizeof(vu_req->rsp));
2512 	vu_req->iovcnt = 0;
2513 	vu_req->state = VFIO_USER_REQUEST_STATE_FREE;
2514 
2515 	TAILQ_INSERT_TAIL(&vu_qpair->reqs, vu_req, link);
2516 }
2517 
2518 static int
2519 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req)
2520 {
2521 	struct nvmf_vfio_user_qpair *vu_qpair;
2522 	struct nvmf_vfio_user_req *vu_req;
2523 
2524 	assert(req != NULL);
2525 
2526 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
2527 	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
2528 
2529 	_nvmf_vfio_user_req_free(vu_qpair, vu_req);
2530 
2531 	return 0;
2532 }
2533 
2534 static int
2535 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req)
2536 {
2537 	struct nvmf_vfio_user_qpair *vu_qpair;
2538 	struct nvmf_vfio_user_req *vu_req;
2539 
2540 	assert(req != NULL);
2541 
2542 	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
2543 	vu_qpair = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
2544 
2545 	if (vu_req->cb_fn != NULL) {
2546 		if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) {
2547 			fail_ctrlr(vu_qpair->ctrlr);
2548 		}
2549 	}
2550 
2551 	_nvmf_vfio_user_req_free(vu_qpair, vu_req);
2552 
2553 	return 0;
2554 }
2555 
2556 static void
2557 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
2558 			   spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
2559 {
2560 	struct nvmf_vfio_user_qpair *vu_qpair;
2561 
2562 	assert(qpair != NULL);
2563 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2564 	free_qp(vu_qpair->ctrlr, qpair->qid);
2565 
2566 	if (cb_fn) {
2567 		cb_fn(cb_arg);
2568 	}
2569 }
2570 
2571 /**
2572  * Returns a preallocated spdk_nvmf_request or NULL if there isn't one available.
2573  */
2574 static struct nvmf_vfio_user_req *
2575 get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair)
2576 {
2577 	struct nvmf_vfio_user_req *req;
2578 
2579 	assert(qpair != NULL);
2580 
2581 	if (TAILQ_EMPTY(&qpair->reqs)) {
2582 		return NULL;
2583 	}
2584 
2585 	req = TAILQ_FIRST(&qpair->reqs);
2586 	TAILQ_REMOVE(&qpair->reqs, req, link);
2587 
2588 	return req;
2589 }
2590 
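/*
 * Compute an I/O command's transfer length: the number of Dataset Management
 * ranges, or NLB (CDW12[15:0] + 1) times the namespace block size.
 */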
2591 static int
2592 get_nvmf_io_req_length(struct spdk_nvmf_request *req)
2593 {
2594 	uint16_t nr;
2595 	uint32_t nlb, nsid;
2596 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
2597 	struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
2598 	struct spdk_nvmf_ns *ns;
2599 
2600 	nsid = cmd->nsid;
2601 	ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
2602 	if (ns == NULL || ns->bdev == NULL) {
2603 		SPDK_ERRLOG("failed to find namespace (or its bdev) for nsid %u\n", cmd->nsid);
2604 		return -EINVAL;
2605 	}
2606 
2607 	if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
2608 		nr = cmd->cdw10_bits.dsm.nr + 1;
2609 		return nr * sizeof(struct spdk_nvme_dsm_range);
2610 	}
2611 
2612 	nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
2613 	return nlb * spdk_bdev_get_block_size(ns->bdev);
2614 }
2615 
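/*
 * Map an admin command's data buffer into the target's address space. The
 * expected transfer length depends on the opcode (and, for Get/Set Features,
 * on the feature ID).
 */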
2616 static int
2617 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
2618 {
2619 	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
2620 	uint32_t len = 0;
2621 	uint8_t fid;
2622 	int iovcnt;
2623 
2624 	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
2625 	req->length = 0;
2626 	req->data = NULL;
2627 
2628 	if (req->xfer == SPDK_NVME_DATA_NONE) {
2629 		return 0;
2630 	}
2631 
2632 	switch (cmd->opc) {
2633 	case SPDK_NVME_OPC_IDENTIFY:
2634 		len = 4096;
2635 		break;
2636 	case SPDK_NVME_OPC_GET_LOG_PAGE:
2637 		len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4;
2638 		break;
2639 	case SPDK_NVME_OPC_GET_FEATURES:
2640 	case SPDK_NVME_OPC_SET_FEATURES:
2641 		fid = cmd->cdw10_bits.set_features.fid;
2642 		switch (fid) {
2643 		case SPDK_NVME_FEAT_LBA_RANGE_TYPE:
2644 			len = 4096;
2645 			break;
2646 		case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
2647 			len = 256;
2648 			break;
2649 		case SPDK_NVME_FEAT_TIMESTAMP:
2650 			len = 8;
2651 			break;
2652 		case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
2653 			len = 512;
2654 			break;
2655 		case SPDK_NVME_FEAT_HOST_IDENTIFIER:
2656 			if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) {
2657 				len = 16;
2658 			} else {
2659 				len = 8;
2660 			}
2661 			break;
2662 		default:
2663 			return 0;
2664 		}
2665 		break;
2666 	default:
2667 		return 0;
2668 	}
2669 
2670 	/* Admin commands are expected to use PRPs; reject SGL transfers (PSDT != 0). */
2671 	if (cmd->psdt != 0) {
2672 		return -EINVAL;
2673 	}
2674 
2675 	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len);
2676 	if (iovcnt < 0) {
2677 		SPDK_ERRLOG("%s: map Admin Opc %x failed\n",
2678 			    ctrlr_id(ctrlr), cmd->opc);
2679 		return -1;
2680 	}
2681 	req->length = len;
2682 	req->data = req->iov[0].iov_base;
2683 	req->iovcnt = iovcnt;
2684 
2685 	return 0;
2686 }
2687 
2688 /*
2689  * Map an I/O command's buffers.
2690  *
2691  * Returns 0 on success and -errno on failure.
2692  */
2693 static int
2694 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
2695 {
2696 	int len, iovcnt;
2697 	struct spdk_nvme_cmd *cmd;
2698 
2699 	assert(ctrlr != NULL);
2700 	assert(req != NULL);
2701 
2702 	cmd = &req->cmd->nvme_cmd;
2703 	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
2704 	req->length = 0;
2705 	req->data = NULL;
2706 
2707 	if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
2708 		return 0;
2709 	}
2710 
2711 	len = get_nvmf_io_req_length(req);
2712 	if (len < 0) {
2713 		return -EINVAL;
2714 	}
2715 	req->length = len;
2716 
2717 	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length);
2718 	if (iovcnt < 0) {
2719 		SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc);
2720 		return -EFAULT;
2721 	}
2722 	req->data = req->iov[0].iov_base;
2723 	req->iovcnt = iovcnt;
2724 
2725 	return 0;
2726 }
2727 
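/*
 * Handle a single submission queue entry: take a free request, map its data
 * buffers and hand it to the generic NVMe-oF layer for execution; on mapping
 * failure, complete the command with an internal device error.
 */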
2728 static int
2729 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
2730 	       struct nvmf_vfio_user_qpair *vu_qpair)
2731 {
2732 	int err;
2733 	struct nvmf_vfio_user_req *vu_req;
2734 	struct spdk_nvmf_request *req;
2735 
2736 	assert(ctrlr != NULL);
2737 	assert(cmd != NULL);
2738 
2739 	vu_req = get_nvmf_vfio_user_req(vu_qpair);
2740 	if (spdk_unlikely(vu_req == NULL)) {
2741 		SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc);
2742 		return post_completion(ctrlr, &vu_qpair->cq, 0, 0, cmd->cid,
2743 				       SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC);
2744 
2745 	}
2746 	req = &vu_req->req;
2747 
2748 	assert(req->qpair != NULL);
2749 	SPDK_DEBUGLOG(nvmf_vfio, "%s: handle qid%u, req opc=%#x cid=%d\n",
2750 		      ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid);
2751 
2752 	vu_req->cb_fn = handle_cmd_rsp;
2753 	vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
2754 	req->cmd->nvme_cmd = *cmd;
2755 
2756 	if (nvmf_qpair_is_admin_queue(req->qpair)) {
2757 		err = map_admin_cmd_req(ctrlr, req);
2758 	} else {
2759 		switch (cmd->opc) {
2760 		case SPDK_NVME_OPC_RESERVATION_REGISTER:
2761 		case SPDK_NVME_OPC_RESERVATION_REPORT:
2762 		case SPDK_NVME_OPC_RESERVATION_ACQUIRE:
2763 		case SPDK_NVME_OPC_RESERVATION_RELEASE:
2764 			err = -ENOTSUP;
2765 			break;
2766 		default:
2767 			err = map_io_cmd_req(ctrlr, req);
2768 			break;
2769 		}
2770 	}
2771 
2772 	if (spdk_unlikely(err < 0)) {
2773 		SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n",
2774 			    ctrlr_id(ctrlr), cmd->opc);
2775 		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
2776 		req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
2777 		err = handle_cmd_rsp(vu_req, vu_req->cb_arg);
2778 		_nvmf_vfio_user_req_free(vu_qpair, vu_req);
2779 		return err;
2780 	}
2781 
2782 	vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING;
2783 	spdk_nvmf_request_exec(req);
2784 
2785 	return 0;
2786 }
2787 
2788 /* Returns the number of commands processed, or a negative value on error. */
2789 static int
2790 nvmf_vfio_user_qpair_poll(struct nvmf_vfio_user_qpair *qpair)
2791 {
2792 	struct nvmf_vfio_user_ctrlr *ctrlr;
2793 	uint32_t new_tail;
2794 	int count = 0;
2795 
2796 	assert(qpair != NULL);
2797 
2798 	ctrlr = qpair->ctrlr;
2799 
2800 	/* Load-Acquire. */
2801 	new_tail = *tdbl(ctrlr, &qpair->sq);
2802 
2803 	/*
2804 	 * Ensure that changes to the queue are visible to us.
2805 	 * The host driver should write the queue first, do a wmb(), and then
2806 	 * update the SQ tail doorbell (their Store-Release).
2807 	 */
2808 	spdk_rmb();
2809 
2810 	new_tail = new_tail & 0xffffu;
2811 	if (spdk_unlikely(new_tail >= qpair->sq.size)) {
2812 		union spdk_nvme_async_event_completion event = {};
2813 
2814 		SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid SQ%u doorbell value %u\n", ctrlr_id(ctrlr), qpair->qpair.qid,
2815 			      new_tail);
2816 		event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR;
2817 		event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE;
2818 		nvmf_ctrlr_async_event_error_event(qpair->qpair.ctrlr, event);
2819 
2820 		return 0;
2821 	}
2822 
2823 	if (sq_head(qpair) == new_tail) {
2824 		return 0;
2825 	}
2826 
2827 	count = handle_sq_tdbl_write(ctrlr, new_tail, qpair);
2828 	if (count < 0) {
2829 		fail_ctrlr(ctrlr);
2830 	}
2831 
2832 	return count;
2833 }
2834 
2835 /*
2836  * vfio-user transport poll handler. Note that the library context is polled in
2837  * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
2838  * active qpairs.
2839  *
2840  * Returns the number of commands processed, or a negative value on error.
2841  */
2842 static int
2843 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
2844 {
2845 	struct nvmf_vfio_user_poll_group *vu_group;
2846 	struct nvmf_vfio_user_qpair *vu_qpair, *tmp;
2847 	int count = 0;
2848 
2849 	assert(group != NULL);
2850 
2851 	spdk_rmb();
2852 
2853 	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
2854 
2855 	TAILQ_FOREACH_SAFE(vu_qpair, &vu_group->qps, link, tmp) {
2856 		int ret;
2857 
2858 		if (spdk_unlikely(vu_qpair->state != VFIO_USER_QPAIR_ACTIVE || !vu_qpair->sq.size)) {
2859 			continue;
2860 		}
2861 
2862 		ret = nvmf_vfio_user_qpair_poll(vu_qpair);
2863 
2864 		if (ret < 0) {
2865 			return ret;
2866 		}
2867 
2868 		count += ret;
2869 	}
2870 
2871 	return count;
2872 }
2873 
2874 static int
2875 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
2876 				    struct spdk_nvme_transport_id *trid)
2877 {
2878 	struct nvmf_vfio_user_qpair *vu_qpair;
2879 	struct nvmf_vfio_user_ctrlr *ctrlr;
2880 
2881 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2882 	ctrlr = vu_qpair->ctrlr;
2883 
2884 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
2885 	return 0;
2886 }
2887 
2888 static int
2889 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
2890 				   struct spdk_nvme_transport_id *trid)
2891 {
2892 	return 0;
2893 }
2894 
2895 static int
2896 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
2897 				     struct spdk_nvme_transport_id *trid)
2898 {
2899 	struct nvmf_vfio_user_qpair *vu_qpair;
2900 	struct nvmf_vfio_user_ctrlr *ctrlr;
2901 
2902 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2903 	ctrlr = vu_qpair->ctrlr;
2904 
2905 	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
2906 	return 0;
2907 }
2908 
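/*
 * Find the executing request whose CID matches the one in the ABORT command
 * and ask the generic layer to abort it; if none is found, simply complete
 * the ABORT command itself.
 */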
2909 static void
2910 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
2911 				   struct spdk_nvmf_request *req)
2912 {
2913 	struct nvmf_vfio_user_qpair *vu_qpair;
2914 	struct nvmf_vfio_user_req *vu_req, *vu_req_to_abort = NULL;
2915 	uint32_t i;
2916 	uint16_t cid;
2917 
2918 	vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
2919 
2920 	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
2921 	for (i = 0; i < vu_qpair->qsize; i++) {
2922 		vu_req = &vu_qpair->reqs_internal[i];
2923 		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
2924 			vu_req_to_abort = vu_req;
2925 			break;
2926 		}
2927 	}
2928 
2929 	if (vu_req_to_abort == NULL) {
2930 		spdk_nvmf_request_complete(req);
2931 		return;
2932 	}
2933 
2934 	req->req_to_abort = &vu_req_to_abort->req;
2935 	nvmf_ctrlr_abort_request(req);
2936 }
2937 
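/* Default transport options for the vfio-user transport. */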
2938 static void
2939 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
2940 {
2941 	opts->max_queue_depth =		NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
2942 	opts->max_qpairs_per_ctrlr =	NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
2943 	opts->in_capsule_data_size =	0;
2944 	opts->max_io_size =		NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
2945 	opts->io_unit_size =		NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
2946 	opts->max_aq_depth =		NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
2947 	opts->num_shared_buffers =	0;
2948 	opts->buf_cache_size =		0;
2949 	opts->association_timeout =	0;
2950 	opts->transport_specific =      NULL;
2951 }
2952 
2953 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
2954 	.name = "VFIOUSER",
2955 	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
2956 	.opts_init = nvmf_vfio_user_opts_init,
2957 	.create = nvmf_vfio_user_create,
2958 	.destroy = nvmf_vfio_user_destroy,
2959 
2960 	.listen = nvmf_vfio_user_listen,
2961 	.stop_listen = nvmf_vfio_user_stop_listen,
2962 	.accept = nvmf_vfio_user_accept,
2963 	.cdata_init = nvmf_vfio_user_cdata_init,
2964 	.listen_associate = nvmf_vfio_user_listen_associate,
2965 
2966 	.listener_discover = nvmf_vfio_user_discover,
2967 
2968 	.poll_group_create = nvmf_vfio_user_poll_group_create,
2969 	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
2970 	.poll_group_add = nvmf_vfio_user_poll_group_add,
2971 	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
2972 	.poll_group_poll = nvmf_vfio_user_poll_group_poll,
2973 
2974 	.req_free = nvmf_vfio_user_req_free,
2975 	.req_complete = nvmf_vfio_user_req_complete,
2976 
2977 	.qpair_fini = nvmf_vfio_user_close_qpair,
2978 	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
2979 	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
2980 	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
2981 	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,
2982 };
2983 
2984 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
2985 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
2986