1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Function crc16 Copyright (c) 2017, Fedor Uporov
8  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  * bhyve PCIe-NVMe device emulation.
34  *
35  * options:
36  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
37  *
38  *  accepted devpath:
39  *    /dev/blockdev
40  *    /path/to/image
41  *    ram=size_in_MiB
42  *
43  *  maxq    = max number of queues
44  *  qsz     = max elements in each queue
45  *  ioslots = max number of concurrent io requests
46  *  sectsz  = sector size (defaults to blockif sector size)
47  *  ser     = serial number (20-chars max)
48  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
49  *
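 *  Example (illustrative only; the slot number, device path, and option
 *  values below are placeholders, not defaults):
 *    -s 4,nvme,/dev/zvol/tank/nvmedisk,maxq=4,qsz=512,ioslots=16,sectsz=512,ser=NVME0001
 *    -s 4,nvme,ram=1024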
50  */
51 
52 /* TODO:
53     - create async event for smart and log
54     - intr coalesce
55  */
56 
57 #include <sys/cdefs.h>
58 __FBSDID("$FreeBSD$");
59 
60 #include <sys/types.h>
61 #include <net/ieee_oui.h>
62 
63 #include <assert.h>
64 #include <pthread.h>
65 #include <semaphore.h>
66 #include <stdbool.h>
67 #include <stddef.h>
68 #include <stdint.h>
69 #include <stdio.h>
70 #include <stdlib.h>
71 #include <string.h>
72 
73 #include <machine/atomic.h>
74 #include <machine/vmm.h>
75 #include <vmmapi.h>
76 
77 #include <dev/nvme/nvme.h>
78 
79 #include "bhyverun.h"
80 #include "block_if.h"
81 #include "pci_emul.h"
82 
83 
84 static int nvme_debug = 0;
85 #define	DPRINTF(params) if (nvme_debug) printf params
86 #define	WPRINTF(params) printf params
87 
88 /* defaults; can be overridden */
89 #define	NVME_MSIX_BAR		4
90 
91 #define	NVME_IOSLOTS		8
92 
93 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
94 #define NVME_MMIO_SPACE_MIN	(1 << 14)
95 
96 #define	NVME_QUEUES		16
97 #define	NVME_MAX_QENTRIES	2048
98 
99 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
100 #define	NVME_MAX_BLOCKIOVS	512
101 
102 /* helpers */
103 
104 /* Convert a zero-based value into a one-based value */
105 #define ONE_BASED(zero)		((zero) + 1)
106 /* Convert a one-based value into a zero-based value */
107 #define ZERO_BASED(one)		((one)  - 1)
108 
109 /* Encode number of SQ's and CQ's for Set/Get Features */
110 #define NVME_FEATURE_NUM_QUEUES(sc) \
111 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
112 	 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
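/*
 * Illustrative example: with num_squeues = num_cqueues = 4, the encoding
 * above yields 0x00030003 (both counts are reported as 0's based values).
 */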
113 
114 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
115 
116 enum nvme_controller_register_offsets {
117 	NVME_CR_CAP_LOW = 0x00,
118 	NVME_CR_CAP_HI  = 0x04,
119 	NVME_CR_VS      = 0x08,
120 	NVME_CR_INTMS   = 0x0c,
121 	NVME_CR_INTMC   = 0x10,
122 	NVME_CR_CC      = 0x14,
123 	NVME_CR_CSTS    = 0x1c,
124 	NVME_CR_NSSR    = 0x20,
125 	NVME_CR_AQA     = 0x24,
126 	NVME_CR_ASQ_LOW = 0x28,
127 	NVME_CR_ASQ_HI  = 0x2c,
128 	NVME_CR_ACQ_LOW = 0x30,
129 	NVME_CR_ACQ_HI  = 0x34,
130 };
131 
132 enum nvme_cmd_cdw11 {
133 	NVME_CMD_CDW11_PC  = 0x0001,
134 	NVME_CMD_CDW11_IEN = 0x0002,
135 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
136 };
137 
138 #define	NVME_CQ_INTEN	0x01
139 #define	NVME_CQ_INTCOAL	0x02
140 
141 struct nvme_completion_queue {
142 	struct nvme_completion *qbase;
143 	uint32_t	size;
144 	uint16_t	tail; /* nvme progress */
145 	uint16_t	head; /* guest progress */
146 	uint16_t	intr_vec;
147 	uint32_t	intr_en;
148 	pthread_mutex_t	mtx;
149 };
150 
151 struct nvme_submission_queue {
152 	struct nvme_command *qbase;
153 	uint32_t	size;
154 	uint16_t	head; /* nvme progress */
155 	uint16_t	tail; /* guest progress */
156 	uint16_t	cqid; /* completion queue id */
157 	int		busy; /* queue is being processed */
158 	int		qpriority;
159 };
160 
161 enum nvme_storage_type {
162 	NVME_STOR_BLOCKIF = 0,
163 	NVME_STOR_RAM = 1,
164 };
165 
166 struct pci_nvme_blockstore {
167 	enum nvme_storage_type type;
168 	void		*ctx;
169 	uint64_t	size;
170 	uint32_t	sectsz;
171 	uint32_t	sectsz_bits;
172 	uint64_t	eui64;
173 };
174 
175 struct pci_nvme_ioreq {
176 	struct pci_nvme_softc *sc;
177 	struct pci_nvme_ioreq *next;
178 	struct nvme_submission_queue *nvme_sq;
179 	uint16_t	sqid;
180 
181 	/* command information */
182 	uint16_t	opc;
183 	uint16_t	cid;
184 	uint32_t	nsid;
185 
186 	uint64_t	prev_gpaddr;
187 	size_t		prev_size;
188 
189 	/*
190 	 * lock if all iovs consumed (big IO);
191 	 * complete transaction before continuing
192 	 */
193 	pthread_mutex_t	mtx;
194 	pthread_cond_t	cv;
195 
196 	struct blockif_req io_req;
197 
198 	/* pad to fit up to 512 page descriptors from guest IO request */
199 	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
200 };
201 
202 struct pci_nvme_softc {
203 	struct pci_devinst *nsc_pi;
204 
205 	pthread_mutex_t	mtx;
206 
207 	struct nvme_registers regs;
208 
209 	struct nvme_namespace_data  nsdata;
210 	struct nvme_controller_data ctrldata;
211 	struct nvme_error_information_entry err_log;
212 	struct nvme_health_information_page health_log;
213 	struct nvme_firmware_page fw_log;
214 
215 	struct pci_nvme_blockstore nvstore;
216 
217 	uint16_t	max_qentries;	/* max entries per queue */
218 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
219 	uint32_t	num_cqueues;
220 	uint32_t	num_squeues;
221 
222 	struct pci_nvme_ioreq *ioreqs;
223 	struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
224 	uint32_t	pending_ios;
225 	uint32_t	ioslots;
226 	sem_t		iosemlock;
227 
228 	/*
229 	 * Memory mapped Submission and Completion queues
230 	 * Each array includes both Admin and IO queues
231 	 */
232 	struct nvme_completion_queue *compl_queues;
233 	struct nvme_submission_queue *submit_queues;
234 
235 	/* controller features */
236 	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
237 	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
238 	uint32_t	async_ev_config;         /* 0x0B: async event config */
239 };
240 
241 
242 static void pci_nvme_io_partial(struct blockif_req *br, int err);
243 
244 /* Controller Configuration utils */
245 #define	NVME_CC_GET_EN(cc) \
246 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
247 #define	NVME_CC_GET_CSS(cc) \
248 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
249 #define	NVME_CC_GET_SHN(cc) \
250 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
251 #define	NVME_CC_GET_IOSQES(cc) \
252 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
253 #define	NVME_CC_GET_IOCQES(cc) \
254 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
255 
256 #define	NVME_CC_WRITE_MASK \
257 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
258 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
259 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
260 
261 #define	NVME_CC_NEN_WRITE_MASK \
262 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
263 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
264 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
265 
266 /* Controller Status utils */
267 #define	NVME_CSTS_GET_RDY(sts) \
268 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
269 
270 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
271 
272 /* Completion Queue status word utils */
273 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
274 #define	NVME_STATUS_MASK \
275 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
276 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
277 
278 static __inline void
279 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
280 {
281 	size_t len;
282 
283 	len = strnlen(src, dst_size);
284 	memset(dst, pad, dst_size);
285 	memcpy(dst, src, len);
286 }
287 
288 static __inline void
289 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
290 {
291 
292 	*status &= ~NVME_STATUS_MASK;
293 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
294 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
295 }
296 
297 static __inline void
298 pci_nvme_status_genc(uint16_t *status, uint16_t code)
299 {
300 
301 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
302 }
303 
304 static __inline void
305 pci_nvme_toggle_phase(uint16_t *status, int prev)
306 {
307 
308 	if (prev)
309 		*status &= ~NVME_STATUS_P;
310 	else
311 		*status |= NVME_STATUS_P;
312 }
313 
314 static void
315 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
316 {
317 	struct nvme_controller_data *cd = &sc->ctrldata;
318 
319 	cd->vid = 0xFB5D;
320 	cd->ssvid = 0x0000;
321 
322 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
323 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
324 
325 	/* Recommended Arbitration Burst: 2^rab commands fetched at a time */
326 	cd->rab   = 4;
327 
328 	/* FreeBSD OUI */
329 	cd->ieee[0] = 0x58;
330 	cd->ieee[1] = 0x9c;
331 	cd->ieee[2] = 0xfc;
332 
333 	cd->mic = 0;
334 
335 	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
336 
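	/*
	 * Illustrative: CAP.MPSMIN is 0 here (4 KiB pages), so mdts = 9
	 * advertises a maximum transfer size of 2^9 * 4 KiB = 2 MiB per command.
	 */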
337 	cd->ver = 0x00010300;
338 
339 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
340 	cd->acl = 2;
341 	cd->aerl = 4;
342 
343 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
344 	cd->elpe = 0;	/* max error log page entries */
345 	cd->npss = 1;	/* number of power states supported (0's based) */
346 
347 	/* Warning Composite Temperature Threshold */
348 	cd->wctemp = 0x0157;
349 
350 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
351 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
352 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
353 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
354 	cd->nn = 1;	/* number of namespaces */
355 
356 	cd->fna = 0x03;
357 
358 	cd->power_state[0].mp = 10;
359 }
360 
361 /*
362  * Calculate the CRC-16 of the given buffer
363  * See copyright attribution at top of file
364  */
365 static uint16_t
366 crc16(uint16_t crc, const void *buffer, unsigned int len)
367 {
368 	const unsigned char *cp = buffer;
369 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
370 	static uint16_t const crc16_table[256] = {
371 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
372 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
373 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
374 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
375 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
376 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
377 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
378 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
379 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
380 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
381 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
382 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
383 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
384 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
385 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
386 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
387 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
388 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
389 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
390 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
391 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
392 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
393 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
394 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
395 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
396 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
397 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
398 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
399 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
400 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
401 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
402 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
403 	};
404 
405 	while (len--)
406 		crc = (((crc >> 8) & 0xffU) ^
407 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
408 	return crc;
409 }
410 
411 static void
412 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
413     struct nvme_namespace_data *nd, uint32_t nsid,
414     uint64_t eui64)
415 {
416 
417 	nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
418 	nd->ncap = nd->nsze;
419 	nd->nuse = nd->nsze;
420 
421 	/* LBA format information comes from the backing store */
422 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
423 	nd->flbas = 0;
424 
425 	/* Create an EUI-64 if user did not provide one */
426 	if (eui64 == 0) {
427 		char *data = NULL;
428 
429 		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
430 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
431 
432 		if (data != NULL) {
433 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
434 			free(data);
435 		}
436 		eui64 = (eui64 << 16) | (nsid & 0xffff);
437 	}
438 	be64enc(nd->eui64, eui64);
439 
440 	/* LBA data-sz = 2^lbads */
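	/*
	 * Illustrative: a 512-byte sector backing store has sectsz_bits = 9,
	 * so LBADS = 9 and the formatted LBA data size is 2^9 = 512 bytes.
	 */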
441 	nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
442 }
443 
444 static void
445 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
446 {
447 
448 	memset(&sc->err_log, 0, sizeof(sc->err_log));
449 	memset(&sc->health_log, 0, sizeof(sc->health_log));
450 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
451 }
452 
453 static void
454 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
455 {
456 	DPRINTF(("%s\r\n", __func__));
457 
458 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
459 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
460 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
461 
462 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
463 
464 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
465 
466 	sc->regs.cc = 0;
467 	sc->regs.csts = 0;
468 
469 	sc->num_cqueues = sc->num_squeues = sc->max_queues;
470 	if (sc->submit_queues != NULL) {
471 		for (int i = 0; i < sc->num_squeues + 1; i++) {
472 			/*
473 			 * The Admin Submission Queue is at index 0.
474 			 * It must not be changed at reset otherwise the
475 			 * emulation will be out of sync with the guest.
476 			 */
477 			if (i != 0) {
478 				sc->submit_queues[i].qbase = NULL;
479 				sc->submit_queues[i].size = 0;
480 				sc->submit_queues[i].cqid = 0;
481 			}
482 			sc->submit_queues[i].tail = 0;
483 			sc->submit_queues[i].head = 0;
484 			sc->submit_queues[i].busy = 0;
485 		}
486 	} else
487 		sc->submit_queues = calloc(sc->num_squeues + 1,
488 		                        sizeof(struct nvme_submission_queue));
489 
490 	if (sc->compl_queues != NULL) {
491 		for (int i = 0; i < sc->num_cqueues + 1; i++) {
492 			/* See Admin Submission Queue note above */
493 			if (i != 0) {
494 				sc->compl_queues[i].qbase = NULL;
495 				sc->compl_queues[i].size = 0;
496 			}
497 
498 			sc->compl_queues[i].tail = 0;
499 			sc->compl_queues[i].head = 0;
500 		}
501 	} else {
502 		sc->compl_queues = calloc(sc->num_cqueues + 1,
503 		                        sizeof(struct nvme_completion_queue));
504 
505 		for (int i = 0; i < sc->num_cqueues + 1; i++)
506 			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
507 	}
508 }
509 
510 static void
511 pci_nvme_reset(struct pci_nvme_softc *sc)
512 {
513 	pthread_mutex_lock(&sc->mtx);
514 	pci_nvme_reset_locked(sc);
515 	pthread_mutex_unlock(&sc->mtx);
516 }
517 
518 static void
519 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
520 {
521 	uint16_t acqs, asqs;
522 
523 	DPRINTF(("%s\r\n", __func__));
524 
525 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
526 	sc->submit_queues[0].size = asqs;
527 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
528 	            sizeof(struct nvme_command) * asqs);
529 
530 	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
531 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase));
532 
533 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
534 	    NVME_AQA_REG_ACQS_MASK) + 1;
535 	sc->compl_queues[0].size = acqs;
536 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
537 	         sizeof(struct nvme_completion) * acqs);
538 	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
539 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase));
540 }
541 
542 static int
543 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
544 	size_t len)
545 {
546 	uint8_t *dst;
547 	size_t bytes;
548 
549 	if (len > (8 * 1024)) {
550 		return (-1);
551 	}
552 
553 	/* Copy from the start of prp1 to the end of the physical page */
554 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
555 	bytes = MIN(bytes, len);
556 
557 	dst = vm_map_gpa(ctx, prp1, bytes);
558 	if (dst == NULL) {
559 		return (-1);
560 	}
561 
562 	memcpy(dst, src, bytes);
563 
564 	src += bytes;
565 
566 	len -= bytes;
567 	if (len == 0) {
568 		return (0);
569 	}
570 
571 	len = MIN(len, PAGE_SIZE);
572 
573 	dst = vm_map_gpa(ctx, prp2, len);
574 	if (dst == NULL) {
575 		return (-1);
576 	}
577 
578 	memcpy(dst, src, len);
579 
580 	return (0);
581 }
582 
583 static int
584 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
585 	struct nvme_completion* compl)
586 {
587 	uint16_t qid = command->cdw10 & 0xffff;
588 
589 	DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
590 	if (qid == 0 || qid > sc->num_squeues) {
591 		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
592 		        __func__, qid, sc->num_squeues));
593 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
594 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
595 		return (1);
596 	}
597 
598 	sc->submit_queues[qid].qbase = NULL;
599 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
600 	return (1);
601 }
602 
603 static int
604 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
605 	struct nvme_completion* compl)
606 {
607 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
608 		uint16_t qid = command->cdw10 & 0xffff;
609 		struct nvme_submission_queue *nsq;
610 
611 		if ((qid == 0) || (qid > sc->num_squeues)) {
612 			WPRINTF(("%s queue index %u > num_squeues %u\r\n",
613 			        __func__, qid, sc->num_squeues));
614 			pci_nvme_status_tc(&compl->status,
615 			    NVME_SCT_COMMAND_SPECIFIC,
616 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
617 			return (1);
618 		}
619 
620 		nsq = &sc->submit_queues[qid];
621 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
622 
623 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
624 		              sizeof(struct nvme_command) * (size_t)nsq->size);
625 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
626 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
627 
628 		DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
629 		        qid, nsq->size, nsq->qbase, nsq->cqid));
630 
631 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
632 
633 		DPRINTF(("%s completed creating IOSQ qid %u\r\n",
634 		         __func__, qid));
635 	} else {
636 		/*
637 		 * Guest sent a non-contiguous submission queue request.
638 		 * This is unsupported by this emulation.
639 		 */
640 		WPRINTF(("%s unsupported non-contig (list-based) "
641 		         "create i/o submission queue\r\n", __func__));
642 
643 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
644 	}
645 	return (1);
646 }
647 
648 static int
649 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
650 	struct nvme_completion* compl)
651 {
652 	uint16_t qid = command->cdw10 & 0xffff;
653 
654 	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
655 	if (qid == 0 || qid > sc->num_cqueues) {
656 		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
657 		        __func__, qid, sc->num_cqueues));
658 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
659 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
660 		return (1);
661 	}
662 
663 	sc->compl_queues[qid].qbase = NULL;
664 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
665 	return (1);
666 }
667 
668 static int
669 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
670 	struct nvme_completion* compl)
671 {
672 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
673 		uint16_t qid = command->cdw10 & 0xffff;
674 		struct nvme_completion_queue *ncq;
675 
676 		if ((qid == 0) || (qid > sc->num_cqueues)) {
677 			WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
678 			        __func__, qid, sc->num_cqueues));
679 			pci_nvme_status_tc(&compl->status,
680 			    NVME_SCT_COMMAND_SPECIFIC,
681 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
682 			return (1);
683 		}
684 
685 		ncq = &sc->compl_queues[qid];
686 		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
687 		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
688 		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
689 
690 		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
691 		             command->prp1,
692 		             sizeof(struct nvme_completion) * (size_t)ncq->size);
693 
694 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
695 	} else {
696 		/*
697 		 * Non-contig completion queue unsupported.
698 		 */
699 		WPRINTF(("%s unsupported non-contig (list-based) "
700 		         "create i/o completion queue\r\n",
701 		         __func__));
702 
703 		/* 0x12 = Invalid Use of Controller Memory Buffer */
704 		pci_nvme_status_genc(&compl->status, 0x12);
705 	}
706 
707 	return (1);
708 }
709 
710 static int
711 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
712 	struct nvme_completion* compl)
713 {
714 	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
715 	uint8_t logpage = command->cdw10 & 0xFF;
716 
717 	DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
718 
719 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
720 
721 	switch (logpage) {
722 	case NVME_LOG_ERROR:
723 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
724 		    command->prp2, (uint8_t *)&sc->err_log, logsize);
725 		break;
726 	case NVME_LOG_HEALTH_INFORMATION:
727 		/* TODO: present some smart info */
728 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
729 		    command->prp2, (uint8_t *)&sc->health_log, logsize);
730 		break;
731 	case NVME_LOG_FIRMWARE_SLOT:
732 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
733 		    command->prp2, (uint8_t *)&sc->fw_log, logsize);
734 		break;
735 	default:
736 		WPRINTF(("%s get log page %x command not supported\r\n",
737 		        __func__, logpage));
738 
739 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
740 		    NVME_SC_INVALID_LOG_PAGE);
741 	}
742 
743 	return (1);
744 }
745 
746 static int
747 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
748 	struct nvme_completion* compl)
749 {
750 	void *dest;
751 
752 	DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
753 	        command->cdw10 & 0xFF, command->nsid));
754 
755 	switch (command->cdw10 & 0xFF) {
756 	case 0x00: /* return Identify Namespace data structure */
757 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
758 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
759 		break;
760 	case 0x01: /* return Identify Controller data structure */
761 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
762 		    command->prp2, (uint8_t *)&sc->ctrldata,
763 		    sizeof(sc->ctrldata));
764 		break;
765 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
766 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
767 		                  sizeof(uint32_t) * 1024);
768 		((uint32_t *)dest)[0] = 1;
769 		((uint32_t *)dest)[1] = 0;
770 		break;
771 	case 0x11:
772 		pci_nvme_status_genc(&compl->status,
773 		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
774 		return (1);
775 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
776 	case 0x10:
777 	case 0x12:
778 	case 0x13:
779 	case 0x14:
780 	case 0x15:
781 	default:
782 		DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
783 		         __func__, command->cdw10 & 0xFF));
784 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
785 		return (1);
786 	}
787 
788 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
789 	return (1);
790 }
791 
792 static int
793 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
794 	struct nvme_completion* compl)
795 {
796 	uint16_t nqr;	/* Number of Queues Requested */
797 
798 	nqr = command->cdw11 & 0xFFFF;
799 	if (nqr == 0xffff) {
800 		WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
801 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
802 		return (-1);
803 	}
804 
805 	sc->num_squeues = ONE_BASED(nqr);
806 	if (sc->num_squeues > sc->max_queues) {
807 		DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
808 					sc->max_queues));
809 		sc->num_squeues = sc->max_queues;
810 	}
811 
812 	nqr = (command->cdw11 >> 16) & 0xFFFF;
813 	if (nqr == 0xffff) {
814 		WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
815 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
816 		return (-1);
817 	}
818 
819 	sc->num_cqueues = ONE_BASED(nqr);
820 	if (sc->num_cqueues > sc->max_queues) {
821 		DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
822 					sc->max_queues));
823 		sc->num_cqueues = sc->max_queues;
824 	}
825 
826 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
827 
828 	return (0);
829 }
830 
831 static int
832 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
833 	struct nvme_completion* compl)
834 {
835 	int feature = command->cdw10 & 0xFF;
836 	uint32_t iv;
837 
838 	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
839 	compl->cdw0 = 0;
840 
841 	switch (feature) {
842 	case NVME_FEAT_ARBITRATION:
843 		DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
844 		break;
845 	case NVME_FEAT_POWER_MANAGEMENT:
846 		DPRINTF(("  power management 0x%x\r\n", command->cdw11));
847 		break;
848 	case NVME_FEAT_LBA_RANGE_TYPE:
849 		DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
850 		break;
851 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
852 		DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
853 		break;
854 	case NVME_FEAT_ERROR_RECOVERY:
855 		DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
856 		break;
857 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
858 		DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
859 		break;
860 	case NVME_FEAT_NUMBER_OF_QUEUES:
861 		nvme_set_feature_queues(sc, command, compl);
862 		break;
863 	case NVME_FEAT_INTERRUPT_COALESCING:
864 		DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));
865 
866 		/* in uS */
867 		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
868 
869 		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
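		/*
		 * Illustrative: cdw11 = 0x0208 requests an aggregation time of
		 * 2 * 100 = 200 microseconds and a threshold of 8 completion
		 * queue entries.
		 */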
870 		break;
871 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
872 		iv = command->cdw11 & 0xFFFF;
873 
874 		DPRINTF(("  interrupt vector configuration 0x%x\r\n",
875 		        command->cdw11));
876 
877 		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
878 			if (sc->compl_queues[i].intr_vec == iv) {
879 				if (command->cdw11 & (1 << 16))
880 					sc->compl_queues[i].intr_en |=
881 					                      NVME_CQ_INTCOAL;
882 				else
883 					sc->compl_queues[i].intr_en &=
884 					                     ~NVME_CQ_INTCOAL;
885 			}
886 		}
887 		break;
888 	case NVME_FEAT_WRITE_ATOMICITY:
889 		DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
890 		break;
891 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
892 		DPRINTF(("  async event configuration 0x%x\r\n",
893 		        command->cdw11));
894 		sc->async_ev_config = command->cdw11;
895 		break;
896 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
897 		DPRINTF(("  software progress marker 0x%x\r\n",
898 		        command->cdw11));
899 		break;
900 	case 0x0C:
901 		DPRINTF(("  autonomous power state transition 0x%x\r\n",
902 		        command->cdw11));
903 		break;
904 	default:
905 		WPRINTF(("%s invalid feature\r\n", __func__));
906 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
907 		return (1);
908 	}
909 
910 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
911 	return (1);
912 }
913 
914 static int
915 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
916 	struct nvme_completion* compl)
917 {
918 	int feature = command->cdw10 & 0xFF;
919 
920 	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
921 
922 	compl->cdw0 = 0;
923 
924 	switch (feature) {
925 	case NVME_FEAT_ARBITRATION:
926 		DPRINTF(("  arbitration\r\n"));
927 		break;
928 	case NVME_FEAT_POWER_MANAGEMENT:
929 		DPRINTF(("  power management\r\n"));
930 		break;
931 	case NVME_FEAT_LBA_RANGE_TYPE:
932 		DPRINTF(("  lba range\r\n"));
933 		break;
934 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
935 		DPRINTF(("  temperature threshold\r\n"));
936 		switch ((command->cdw11 >> 20) & 0x3) {
937 		case 0:
938 			/* Over temp threshold */
939 			compl->cdw0 = 0xFFFF;
940 			break;
941 		case 1:
942 			/* Under temp threshold */
943 			compl->cdw0 = 0;
944 			break;
945 		default:
946 			WPRINTF(("  invalid threshold type select\r\n"));
947 			pci_nvme_status_genc(&compl->status,
948 			    NVME_SC_INVALID_FIELD);
949 			return (1);
950 		}
951 		break;
952 	case NVME_FEAT_ERROR_RECOVERY:
953 		DPRINTF(("  error recovery\r\n"));
954 		break;
955 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
956 		DPRINTF(("  volatile write cache\r\n"));
957 		break;
958 	case NVME_FEAT_NUMBER_OF_QUEUES:
959 		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
960 
961 		DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
962 		        compl->cdw0 & 0xFFFF,
963 		        (compl->cdw0 >> 16) & 0xFFFF));
964 
965 		break;
966 	case NVME_FEAT_INTERRUPT_COALESCING:
967 		DPRINTF(("  interrupt coalescing\r\n"));
968 		break;
969 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
970 		DPRINTF(("  interrupt vector configuration\r\n"));
971 		break;
972 	case NVME_FEAT_WRITE_ATOMICITY:
973 		DPRINTF(("  write atomicity\r\n"));
974 		break;
975 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
976 		DPRINTF(("  async event configuration\r\n"));
977 		compl->cdw0 = sc->async_ev_config;
978 		break;
979 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
980 		DPRINTF(("  software progress marker\r\n"));
981 		break;
982 	case 0x0C:
983 		DPRINTF(("  autonomous power state transition\r\n"));
984 		break;
985 	default:
986 		WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
987 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
988 		return (1);
989 	}
990 
991 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
992 	return (1);
993 }
994 
995 static int
996 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
997 	struct nvme_completion* compl)
998 {
999 	DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
1000 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
1001 
1002 	/* TODO: search for the command ID and abort it */
1003 
1004 	compl->cdw0 = 1;
1005 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1006 	return (1);
1007 }
1008 
1009 static int
1010 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1011 	struct nvme_command* command, struct nvme_completion* compl)
1012 {
1013 	DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));
1014 
1015 	/*
1016 	 * TODO: raise events when they happen based on the Set Features cmd.
1017 	 * These events happen async, so only set completion successful if
1018 	 * there is an event reflective of the request to get event.
1019 	 */
1020 	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1021 	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1022 	return (0);
1023 }
1024 
1025 static void
1026 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1027 {
1028 	struct nvme_completion compl;
1029 	struct nvme_command *cmd;
1030 	struct nvme_submission_queue *sq;
1031 	struct nvme_completion_queue *cq;
1032 	int do_intr = 0;
1033 	uint16_t sqhead;
1034 
1035 	DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));
1036 
1037 	sq = &sc->submit_queues[0];
1038 
1039 	sqhead = atomic_load_acq_short(&sq->head);
1040 
1041 	if (atomic_testandset_int(&sq->busy, 1)) {
1042 		DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
1043 		        __func__, sqhead, sq->tail));
1044 		return;
1045 	}
1046 
1047 	DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));
1048 
1049 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1050 		cmd = &(sq->qbase)[sqhead];
1051 		compl.cdw0 = 0;
1052 		compl.status = 0;
1053 
1054 		switch (cmd->opc) {
1055 		case NVME_OPC_DELETE_IO_SQ:
1056 			DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
1057 			do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
1058 			break;
1059 		case NVME_OPC_CREATE_IO_SQ:
1060 			DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
1061 			do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
1062 			break;
1063 		case NVME_OPC_DELETE_IO_CQ:
1064 			DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
1065 			do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
1066 			break;
1067 		case NVME_OPC_CREATE_IO_CQ:
1068 			DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
1069 			do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
1070 			break;
1071 		case NVME_OPC_GET_LOG_PAGE:
1072 			DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
1073 			do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
1074 			break;
1075 		case NVME_OPC_IDENTIFY:
1076 			DPRINTF(("%s command IDENTIFY\r\n", __func__));
1077 			do_intr |= nvme_opc_identify(sc, cmd, &compl);
1078 			break;
1079 		case NVME_OPC_ABORT:
1080 			DPRINTF(("%s command ABORT\r\n", __func__));
1081 			do_intr |= nvme_opc_abort(sc, cmd, &compl);
1082 			break;
1083 		case NVME_OPC_SET_FEATURES:
1084 			DPRINTF(("%s command SET_FEATURES\r\n", __func__));
1085 			do_intr |= nvme_opc_set_features(sc, cmd, &compl);
1086 			break;
1087 		case NVME_OPC_GET_FEATURES:
1088 			DPRINTF(("%s command GET_FEATURES\r\n", __func__));
1089 			do_intr |= nvme_opc_get_features(sc, cmd, &compl);
1090 			break;
1091 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1092 			DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
1093 			/* XXX don't care, unhandled for now
1094 			do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
1095 			*/
1096 			break;
1097 		default:
1098 			WPRINTF(("0x%x command is not implemented\r\n",
1099 			    cmd->opc));
1100 		}
1101 
1102 		/* for now skip async event generation */
1103 		if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
1104 			struct nvme_completion *cp;
1105 			int phase;
1106 
1107 			cq = &sc->compl_queues[0];
1108 
1109 			cp = &(cq->qbase)[cq->tail];
1110 			cp->cdw0 = compl.cdw0;
1111 			cp->sqid = 0;
1112 			cp->sqhd = sqhead;
1113 			cp->cid = cmd->cid;
1114 
1115 			phase = NVME_STATUS_GET_P(cp->status);
1116 			cp->status = compl.status;
1117 			pci_nvme_toggle_phase(&cp->status, phase);
1118 
1119 			cq->tail = (cq->tail + 1) % cq->size;
1120 		}
1121 		sqhead = (sqhead + 1) % sq->size;
1122 	}
1123 
1124 	DPRINTF(("setting sqhead %u\r\n", sqhead));
1125 	atomic_store_short(&sq->head, sqhead);
1126 	atomic_store_int(&sq->busy, 0);
1127 
1128 	if (do_intr)
1129 		pci_generate_msix(sc->nsc_pi, 0);
1130 
1131 }
1132 
1133 static int
1134 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1135 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1136 {
1137 	int iovidx;
1138 
1139 	if (req != NULL) {
1140 		/* concatenate contig block-iovs to minimize number of iovs */
1141 		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1142 			iovidx = req->io_req.br_iovcnt - 1;
1143 
1144 			req->io_req.br_iov[iovidx].iov_base =
1145 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1146 			                     req->prev_gpaddr, size);
1147 
1148 			req->prev_size += size;
1149 			req->io_req.br_resid += size;
1150 
1151 			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1152 		} else {
1153 			pthread_mutex_lock(&req->mtx);
1154 
1155 			iovidx = req->io_req.br_iovcnt;
1156 			if (iovidx == NVME_MAX_BLOCKIOVS) {
1157 				int err = 0;
1158 
1159 				DPRINTF(("large I/O, doing partial req\r\n"));
1160 
1161 				iovidx = 0;
1162 				req->io_req.br_iovcnt = 0;
1163 
1164 				req->io_req.br_callback = pci_nvme_io_partial;
1165 
1166 				if (!do_write)
1167 					err = blockif_read(sc->nvstore.ctx,
1168 					                   &req->io_req);
1169 				else
1170 					err = blockif_write(sc->nvstore.ctx,
1171 					                    &req->io_req);
1172 
1173 				/* wait until req completes before cont */
1174 				if (err == 0)
1175 					pthread_cond_wait(&req->cv, &req->mtx);
1176 			}
1177 			if (iovidx == 0) {
1178 				req->io_req.br_offset = lba;
1179 				req->io_req.br_resid = 0;
1180 				req->io_req.br_param = req;
1181 			}
1182 
1183 			req->io_req.br_iov[iovidx].iov_base =
1184 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1185 			                     gpaddr, size);
1186 
1187 			req->io_req.br_iov[iovidx].iov_len = size;
1188 
1189 			req->prev_gpaddr = gpaddr;
1190 			req->prev_size = size;
1191 			req->io_req.br_resid += size;
1192 
1193 			req->io_req.br_iovcnt++;
1194 
1195 			pthread_mutex_unlock(&req->mtx);
1196 		}
1197 	} else {
1198 		/* RAM buffer: read/write directly */
1199 		void *p = sc->nvstore.ctx;
1200 		void *gptr;
1201 
1202 		if ((lba + size) > sc->nvstore.size) {
1203 			WPRINTF(("%s request would overflow RAM backing\r\n", __func__));
1204 			return (-1);
1205 		}
1206 
1207 		p = (void *)((uintptr_t)p + (uintptr_t)lba);
1208 		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1209 		if (do_write)
1210 			memcpy(p, gptr, size);
1211 		else
1212 			memcpy(gptr, p, size);
1213 	}
1214 	return (0);
1215 }
1216 
1217 static void
1218 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1219 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1220 	uint32_t cdw0, uint16_t status, int ignore_busy)
1221 {
1222 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1223 	struct nvme_completion *compl;
1224 	int do_intr = 0;
1225 	int phase;
1226 
1227 	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
1228 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1229 		 NVME_STATUS_GET_SC(status)));
1230 
1231 	pthread_mutex_lock(&cq->mtx);
1232 
1233 	assert(cq->qbase != NULL);
1234 
1235 	compl = &cq->qbase[cq->tail];
1236 
1237 	compl->sqhd = atomic_load_acq_short(&sq->head);
1238 	compl->sqid = sqid;
1239 	compl->cid = cid;
1240 
1241 	/* toggle phase */
1242 	phase = NVME_STATUS_GET_P(compl->status);
1243 	compl->status = status;
1244 	pci_nvme_toggle_phase(&compl->status, phase);
1245 
1246 	cq->tail = (cq->tail + 1) % cq->size;
1247 
1248 	if (cq->intr_en & NVME_CQ_INTEN)
1249 		do_intr = 1;
1250 
1251 	pthread_mutex_unlock(&cq->mtx);
1252 
1253 	if (ignore_busy || !atomic_load_acq_int(&sq->busy))
1254 		if (do_intr)
1255 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1256 }
1257 
1258 static void
1259 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1260 {
1261 	req->sc = NULL;
1262 	req->nvme_sq = NULL;
1263 	req->sqid = 0;
1264 
1265 	pthread_mutex_lock(&sc->mtx);
1266 
1267 	req->next = sc->ioreqs_free;
1268 	sc->ioreqs_free = req;
1269 	sc->pending_ios--;
1270 
1271 	/* when no more IO pending, can set to ready if device reset/enabled */
1272 	if (sc->pending_ios == 0 &&
1273 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1274 		sc->regs.csts |= NVME_CSTS_RDY;
1275 
1276 	pthread_mutex_unlock(&sc->mtx);
1277 
1278 	sem_post(&sc->iosemlock);
1279 }
1280 
1281 static struct pci_nvme_ioreq *
1282 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1283 {
1284 	struct pci_nvme_ioreq *req = NULL;
1285 
1286 	sem_wait(&sc->iosemlock);
1287 	pthread_mutex_lock(&sc->mtx);
1288 
1289 	req = sc->ioreqs_free;
1290 	assert(req != NULL);
1291 
1292 	sc->ioreqs_free = req->next;
1293 
1294 	req->next = NULL;
1295 	req->sc = sc;
1296 
1297 	sc->pending_ios++;
1298 
1299 	pthread_mutex_unlock(&sc->mtx);
1300 
1301 	req->io_req.br_iovcnt = 0;
1302 	req->io_req.br_offset = 0;
1303 	req->io_req.br_resid = 0;
1304 	req->io_req.br_param = req;
1305 	req->prev_gpaddr = 0;
1306 	req->prev_size = 0;
1307 
1308 	return req;
1309 }
1310 
1311 static void
1312 pci_nvme_io_done(struct blockif_req *br, int err)
1313 {
1314 	struct pci_nvme_ioreq *req = br->br_param;
1315 	struct nvme_submission_queue *sq = req->nvme_sq;
1316 	uint16_t code, status;
1317 
1318 	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1319 
1320 	/* TODO return correct error */
1321 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1322 	pci_nvme_status_genc(&status, code);
1323 
1324 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1325 	pci_nvme_release_ioreq(req->sc, req);
1326 }
1327 
1328 static void
1329 pci_nvme_io_partial(struct blockif_req *br, int err)
1330 {
1331 	struct pci_nvme_ioreq *req = br->br_param;
1332 
1333 	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1334 
1335 	pthread_cond_signal(&req->cv);
1336 }
1337 
1338 
1339 static void
1340 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1341 {
1342 	struct nvme_submission_queue *sq;
1343 	uint16_t status;
1344 	uint16_t sqhead;
1345 	int err;
1346 
1347 	/* handle all submissions up to sq->tail index */
1348 	sq = &sc->submit_queues[idx];
1349 
1350 	if (atomic_testandset_int(&sq->busy, 1)) {
1351 		DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1352 		return;
1353 	}
1354 
1355 	sqhead = atomic_load_acq_short(&sq->head);
1356 
1357 	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1358 	         idx, sqhead, sq->tail, sq->qbase));
1359 
1360 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1361 		struct nvme_command *cmd;
1362 		struct pci_nvme_ioreq *req = NULL;
1363 		uint64_t lba;
1364 		uint64_t nblocks, bytes, size, cpsz;
1365 
1366 		/* TODO: support scatter gather list handling */
1367 
1368 		cmd = &sq->qbase[sqhead];
1369 		sqhead = (sqhead + 1) % sq->size;
1370 
1371 		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1372 
1373 		if (cmd->opc == NVME_OPC_FLUSH) {
1374 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1375 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1376 			                        status, 1);
1377 
1378 			continue;
1379 		} else if (cmd->opc == 0x08) {
1380 			/* TODO: write zeroes */
1381 			WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1382 			        __func__, lba, cmd->cdw12 & 0xFFFF));
1383 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1384 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1385 			                        status, 1);
1386 
1387 			continue;
1388 		}
1389 
1390 		nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1391 
1392 		bytes = nblocks * sc->nvstore.sectsz;
1393 
1394 		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1395 			req = pci_nvme_get_ioreq(sc);
1396 			req->nvme_sq = sq;
1397 			req->sqid = idx;
1398 		}
1399 
1400 		/*
1401 		 * If the data starts mid-page and flows into the next page,
1402 		 * the transfer spans additional PRP entries (handled below).
1403 		 */
1404 
1405 		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1406 		         "(%lu-bytes)\r\n",
1407 		         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1408 		         cmd->opc == NVME_OPC_WRITE ?
1409 			     "WRITE" : "READ",
1410 		         lba, nblocks, bytes));
1411 
1412 		cmd->prp1 &= ~(0x03UL);
1413 		cmd->prp2 &= ~(0x03UL);
1414 
1415 		DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
1416 
1417 		size = bytes;
1418 		lba *= sc->nvstore.sectsz;
1419 
1420 		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1421 
1422 		if (cpsz > bytes)
1423 			cpsz = bytes;
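		/*
		 * Illustrative: if prp1 ends in 0x200 and the transfer is 8192
		 * bytes, this first segment covers 4096 - 512 = 3584 bytes; the
		 * remaining 4608 bytes exceed one page, so prp2 is treated below
		 * as a PRP list rather than as a direct data pointer.
		 */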
1424 
1425 		if (req != NULL) {
1426 			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1427 			                        cmd->cdw10;
1428 			req->opc = cmd->opc;
1429 			req->cid = cmd->cid;
1430 			req->nsid = cmd->nsid;
1431 		}
1432 
1433 		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1434 		    cmd->opc == NVME_OPC_WRITE, lba);
1435 		lba += cpsz;
1436 		size -= cpsz;
1437 
1438 		if (size == 0)
1439 			goto iodone;
1440 
1441 		if (size <= PAGE_SIZE) {
1442 			/* prp2 is second (and final) page in transfer */
1443 
1444 			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1445 			    size,
1446 			    cmd->opc == NVME_OPC_WRITE,
1447 			    lba);
1448 		} else {
1449 			uint64_t *prp_list;
1450 			int i;
1451 
1452 			/* prp2 is pointer to a physical region page list */
1453 			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1454 			                            cmd->prp2, PAGE_SIZE);
1455 
1456 			i = 0;
1457 			while (size != 0) {
1458 				cpsz = MIN(size, PAGE_SIZE);
1459 
1460 				/*
1461 				 * Move to linked physical region page list
1462 				 * in last item.
1463 				 */
1464 				if (i == (NVME_PRP2_ITEMS-1) &&
1465 				    size > PAGE_SIZE) {
1466 					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1467 					prp_list = paddr_guest2host(
1468 					              sc->nsc_pi->pi_vmctx,
1469 					              prp_list[i], PAGE_SIZE);
1470 					i = 0;
1471 				}
1472 				if (prp_list[i] == 0) {
1473 					WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1474 					err = 1;
1475 					break;
1476 				}
1477 
1478 				err = pci_nvme_append_iov_req(sc, req,
1479 				    prp_list[i], cpsz,
1480 				    cmd->opc == NVME_OPC_WRITE, lba);
1481 				if (err)
1482 					break;
1483 
1484 				lba += cpsz;
1485 				size -= cpsz;
1486 				i++;
1487 			}
1488 		}
1489 
1490 iodone:
1491 		if (sc->nvstore.type == NVME_STOR_RAM) {
1492 			uint16_t code, status;
1493 
1494 			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1495 			    NVME_SC_SUCCESS;
1496 			pci_nvme_status_genc(&status, code);
1497 
1498 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1499 			                        status, 1);
1500 
1501 			continue;
1502 		}
1503 
1504 
1505 		if (err)
1506 			goto do_error;
1507 
1508 		req->io_req.br_callback = pci_nvme_io_done;
1509 
1510 		err = 0;
1511 		switch (cmd->opc) {
1512 		case NVME_OPC_READ:
1513 			err = blockif_read(sc->nvstore.ctx, &req->io_req);
1514 			break;
1515 		case NVME_OPC_WRITE:
1516 			err = blockif_write(sc->nvstore.ctx, &req->io_req);
1517 			break;
1518 		default:
1519 			WPRINTF(("%s unhandled io command 0x%x\r\n",
1520 				 __func__, cmd->opc));
1521 			err = 1;
1522 		}
1523 
1524 do_error:
1525 		if (err) {
1526 			uint16_t status;
1527 
1528 			pci_nvme_status_genc(&status,
1529 			    NVME_SC_DATA_TRANSFER_ERROR);
1530 
1531 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1532 			                        status, 1);
1533 			pci_nvme_release_ioreq(sc, req);
1534 		}
1535 	}
1536 
1537 	atomic_store_short(&sq->head, sqhead);
1538 	atomic_store_int(&sq->busy, 0);
1539 }
1540 
1541 static void
1542 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1543 	uint64_t idx, int is_sq, uint64_t value)
1544 {
1545 	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1546 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1547 
1548 	if (is_sq) {
1549 		atomic_store_short(&sc->submit_queues[idx].tail,
1550 		                   (uint16_t)value);
1551 
1552 		if (idx == 0) {
1553 			pci_nvme_handle_admin_cmd(sc, value);
1554 		} else {
1555 			/* submission queue; handle new entries in SQ */
1556 			if (idx > sc->num_squeues) {
1557 				WPRINTF(("%s SQ index %lu overflow from "
1558 				         "guest (max %u)\r\n",
1559 				         __func__, idx, sc->num_squeues));
1560 				return;
1561 			}
1562 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1563 		}
1564 	} else {
1565 		if (idx > sc->num_cqueues) {
1566 			WPRINTF(("%s queue index %lu overflow from "
1567 			         "guest (max %u)\r\n",
1568 			         __func__, idx, sc->num_cqueues));
1569 			return;
1570 		}
1571 
1572 		sc->compl_queues[idx].head = (uint16_t)value;
1573 	}
1574 }
1575 
1576 static void
1577 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1578 {
1579 	const char *s = iswrite ? "WRITE" : "READ";
1580 
1581 	switch (offset) {
1582 	case NVME_CR_CAP_LOW:
1583 		DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1584 		break;
1585 	case NVME_CR_CAP_HI:
1586 		DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1587 		break;
1588 	case NVME_CR_VS:
1589 		DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1590 		break;
1591 	case NVME_CR_INTMS:
1592 		DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1593 		break;
1594 	case NVME_CR_INTMC:
1595 		DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1596 		break;
1597 	case NVME_CR_CC:
1598 		DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1599 		break;
1600 	case NVME_CR_CSTS:
1601 		DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1602 		break;
1603 	case NVME_CR_NSSR:
1604 		DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1605 		break;
1606 	case NVME_CR_AQA:
1607 		DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1608 		break;
1609 	case NVME_CR_ASQ_LOW:
1610 		DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1611 		break;
1612 	case NVME_CR_ASQ_HI:
1613 		DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1614 		break;
1615 	case NVME_CR_ACQ_LOW:
1616 		DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1617 		break;
1618 	case NVME_CR_ACQ_HI:
1619 		DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1620 		break;
1621 	default:
1622 		DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1623 	}
1624 
1625 }
1626 
1627 static void
1628 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1629 	uint64_t offset, int size, uint64_t value)
1630 {
1631 	uint32_t ccreg;
1632 
1633 	if (offset >= NVME_DOORBELL_OFFSET) {
1634 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1635 		uint64_t idx = belloffset / 8; /* 8 bytes per queue: SQ tail + CQ head */
1636 		int is_sq = (belloffset % 8) < 4;
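		/*
		 * Illustrative: a write at doorbell offset 8 decodes to idx 1
		 * with is_sq true (the SQ1 tail doorbell); offset 12 decodes to
		 * the CQ1 head doorbell.
		 */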
1637 
1638 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1639 			WPRINTF(("guest attempted an overflow write offset "
1640 			         "0x%lx, val 0x%lx in %s",
1641 			         offset, value, __func__));
1642 			return;
1643 		}
1644 
1645 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1646 		return;
1647 	}
1648 
1649 	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1650 	        offset, size, value));
1651 
1652 	if (size != 4) {
1653 		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1654 		         "val 0x%lx) to bar0 in %s",
1655 		         size, offset, value, __func__));
1656 		/* TODO: shutdown device */
1657 		return;
1658 	}
1659 
1660 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1661 
1662 	pthread_mutex_lock(&sc->mtx);
1663 
1664 	switch (offset) {
1665 	case NVME_CR_CAP_LOW:
1666 	case NVME_CR_CAP_HI:
1667 		/* readonly */
1668 		break;
1669 	case NVME_CR_VS:
1670 		/* readonly */
1671 		break;
1672 	case NVME_CR_INTMS:
1673 		/* MSI-X, so ignore */
1674 		break;
1675 	case NVME_CR_INTMC:
1676 		/* MSI-X, so ignore */
1677 		break;
1678 	case NVME_CR_CC:
1679 		ccreg = (uint32_t)value;
1680 
1681 		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1682 		         "iocqes %u\r\n",
1683 		        __func__,
1684 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1685 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1686 			 NVME_CC_GET_IOCQES(ccreg)));
1687 
1688 		if (NVME_CC_GET_SHN(ccreg)) {
1689 			/* perform shutdown - flush out data to backend */
1690 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1691 			    NVME_CSTS_REG_SHST_SHIFT);
1692 			sc->regs.csts |= NVME_SHST_COMPLETE <<
1693 			    NVME_CSTS_REG_SHST_SHIFT;
1694 		}
1695 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1696 			if (NVME_CC_GET_EN(ccreg) == 0)
1697 				/* transition 1->0 causes controller reset */
1698 				pci_nvme_reset_locked(sc);
1699 			else
1700 				pci_nvme_init_controller(ctx, sc);
1701 		}
1702 
1703 		/* Insert the iocqes, iosqes and en bits from the write */
1704 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1705 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1706 		if (NVME_CC_GET_EN(ccreg) == 0) {
1707 			/* Insert the ams, mps and css bit fields */
1708 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1709 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1710 			sc->regs.csts &= ~NVME_CSTS_RDY;
1711 		} else if (sc->pending_ios == 0) {
1712 			sc->regs.csts |= NVME_CSTS_RDY;
1713 		}
1714 		break;
1715 	case NVME_CR_CSTS:
1716 		break;
1717 	case NVME_CR_NSSR:
1718 		/* ignore writes; don't support subsystem reset */
1719 		break;
1720 	case NVME_CR_AQA:
1721 		sc->regs.aqa = (uint32_t)value;
1722 		break;
1723 	case NVME_CR_ASQ_LOW:
1724 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1725 		               (0xFFFFF000 & value);
1726 		break;
1727 	case NVME_CR_ASQ_HI:
1728 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1729 		               (value << 32);
1730 		break;
1731 	case NVME_CR_ACQ_LOW:
1732 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1733 		               (0xFFFFF000 & value);
1734 		break;
1735 	case NVME_CR_ACQ_HI:
1736 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1737 		               (value << 32);
1738 		break;
1739 	default:
1740 		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1741 		         __func__, offset, value, size));
1742 	}
1743 	pthread_mutex_unlock(&sc->mtx);
1744 }
1745 
1746 static void
1747 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1748                 int baridx, uint64_t offset, int size, uint64_t value)
1749 {
1750 	struct pci_nvme_softc* sc = pi->pi_arg;
1751 
1752 	if (baridx == pci_msix_table_bar(pi) ||
1753 	    baridx == pci_msix_pba_bar(pi)) {
1754 		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1755 		         " value 0x%lx\r\n", baridx, offset, size, value));
1756 
1757 		pci_emul_msix_twrite(pi, offset, size, value);
1758 		return;
1759 	}
1760 
1761 	switch (baridx) {
1762 	case 0:
1763 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1764 		break;
1765 
1766 	default:
1767 		DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1768 		         __func__, baridx, value));
1769 	}
1770 }
1771 
1772 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
1773 	uint64_t offset, int size)
1774 {
1775 	uint64_t value;
1776 
1777 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1778 
1779 	if (offset < NVME_DOORBELL_OFFSET) {
1780 		void *p = &(sc->regs);
1781 		pthread_mutex_lock(&sc->mtx);
1782 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
1783 		pthread_mutex_unlock(&sc->mtx);
1784 	} else {
1785 		value = 0;
1786 		WPRINTF(("pci_nvme: read invalid offset 0x%lx\r\n", offset));
1787 	}
1788 
1789 	switch (size) {
1790 	case 1:
1791 		value &= 0xFF;
1792 		break;
1793 	case 2:
1794 		value &= 0xFFFF;
1795 		break;
1796 	case 4:
1797 		value &= 0xFFFFFFFF;
1798 		break;
1799 	}
1800 
1801 	DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1802 	         offset, size, (uint32_t)value));
1803 
1804 	return (value);
1805 }
1806 
1807 
1808 
1809 static uint64_t
1810 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1811     uint64_t offset, int size)
1812 {
1813 	struct pci_nvme_softc* sc = pi->pi_arg;
1814 
1815 	if (baridx == pci_msix_table_bar(pi) ||
1816 	    baridx == pci_msix_pba_bar(pi)) {
1817 		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1818 		        baridx, offset, size));
1819 
1820 		return pci_emul_msix_tread(pi, offset, size);
1821 	}
1822 
1823 	switch (baridx) {
1824 	case 0:
1825 		return pci_nvme_read_bar_0(sc, offset, size);
1826 
1827 	default:
1828 		DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
1829 	}
1830 
1831 	return (0);
1832 }
1833 
1834 
1835 static int
1836 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1837 {
1838 	char bident[sizeof("XX:X:X")];
1839 	char	*uopt, *xopts, *config;
1840 	uint32_t sectsz;
1841 	int optidx;
1842 
1843 	sc->max_queues = NVME_QUEUES;
1844 	sc->max_qentries = NVME_MAX_QENTRIES;
1845 	sc->ioslots = NVME_IOSLOTS;
1846 	sc->num_squeues = sc->max_queues;
1847 	sc->num_cqueues = sc->max_queues;
1848 	sectsz = 0;
1849 
1850 	uopt = strdup(opts);
1851 	optidx = 0;
1852 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1853 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1854 	for (xopts = strtok(uopt, ",");
1855 	     xopts != NULL;
1856 	     xopts = strtok(NULL, ",")) {
1857 
1858 		if ((config = strchr(xopts, '=')) != NULL)
1859 			*config++ = '\0';
1860 
1861 		if (!strcmp("maxq", xopts)) {
1862 			sc->max_queues = atoi(config);
1863 		} else if (!strcmp("qsz", xopts)) {
1864 			sc->max_qentries = atoi(config);
1865 		} else if (!strcmp("ioslots", xopts)) {
1866 			sc->ioslots = atoi(config);
1867 		} else if (!strcmp("sectsz", xopts)) {
1868 			sectsz = atoi(config);
1869 		} else if (!strcmp("ser", xopts)) {
1870 			/*
1871 			 * This field indicates the Product Serial Number in
1872 			 * 7-bit ASCII, unused bytes should be space characters.
1873 			 * Ref: NVMe v1.3c.
1874 			 */
1875 			cpywithpad((char *)sc->ctrldata.sn,
1876 			           sizeof(sc->ctrldata.sn), config, ' ');
1877 		} else if (!strcmp("ram", xopts)) {
1878 			uint64_t sz = strtoull(&xopts[4], NULL, 10);
1879 
1880 			sc->nvstore.type = NVME_STOR_RAM;
1881 			sc->nvstore.size = sz * 1024 * 1024;
1882 			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1883 			sc->nvstore.sectsz = 4096;
1884 			sc->nvstore.sectsz_bits = 12;
1885 			if (sc->nvstore.ctx == NULL) {
1886 				perror("Unable to allocate RAM");
1887 				free(uopt);
1888 				return (-1);
1889 			}
1890 		} else if (!strcmp("eui64", xopts)) {
1891 			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
1892 		} else if (optidx == 0) {
1893 			snprintf(bident, sizeof(bident), "%d:%d",
1894 			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1895 			sc->nvstore.ctx = blockif_open(xopts, bident);
1896 			if (sc->nvstore.ctx == NULL) {
1897 				perror("Could not open backing file");
1898 				free(uopt);
1899 				return (-1);
1900 			}
1901 			sc->nvstore.type = NVME_STOR_BLOCKIF;
1902 			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1903 		} else {
1904 			fprintf(stderr, "Invalid option %s\n", xopts);
1905 			free(uopt);
1906 			return (-1);
1907 		}
1908 
1909 		optidx++;
1910 	}
1911 	free(uopt);
1912 
1913 	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1914 		fprintf(stderr, "backing store not specified\n");
1915 		return (-1);
1916 	}
1917 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1918 		sc->nvstore.sectsz = sectsz;
1919 	else if (sc->nvstore.type != NVME_STOR_RAM)
1920 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
1921 	for (sc->nvstore.sectsz_bits = 9;
1922 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1923 	     sc->nvstore.sectsz_bits++);
1924 
1925 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1926 		sc->max_queues = NVME_QUEUES;
1927 
1928 	if (sc->max_qentries <= 0) {
1929 		fprintf(stderr, "Invalid qsz option\n");
1930 		return (-1);
1931 	}
1932 	if (sc->ioslots <= 0) {
1933 		fprintf(stderr, "Invalid ioslots option\n");
1934 		return (-1);
1935 	}
1936 
1937 	return (0);
1938 }
1939 
1940 static int
1941 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1942 {
1943 	struct pci_nvme_softc *sc;
1944 	uint32_t pci_membar_sz;
1945 	int	error;
1946 
1947 	error = 0;
1948 
1949 	sc = calloc(1, sizeof(struct pci_nvme_softc));
1950 	pi->pi_arg = sc;
1951 	sc->nsc_pi = pi;
1952 
1953 	error = pci_nvme_parse_opts(sc, opts);
1954 	if (error < 0)
1955 		goto done;
1956 	else
1957 		error = 0;
1958 
1959 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1960 	for (int i = 0; i < sc->ioslots; i++) {
1961 		if (i < (sc->ioslots-1))
1962 			sc->ioreqs[i].next = &sc->ioreqs[i+1];
1963 		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1964 		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1965 	}
1966 	sc->ioreqs_free = sc->ioreqs;
1967 	sc->intr_coales_aggr_thresh = 1;
1968 
1969 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1970 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1971 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1972 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1973 	pci_set_cfgdata8(pi, PCIR_PROGIF,
1974 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1975 
1976 	/*
1977 	 * Allocate size of NVMe registers + doorbell space for all queues.
1978 	 *
1979 	 * The specification requires a minimum memory I/O window size of 16K.
1980 	 * The Windows driver will refuse to start a device with a smaller
1981 	 * window.
1982 	 */
1983 	pci_membar_sz = sizeof(struct nvme_registers) +
1984 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
1985 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
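	/*
	 * Illustrative: with the default max_queues of 16, the doorbell area
	 * needs 2 * 4 * 17 = 136 bytes; the register block plus doorbells is
	 * well below 16 KiB, so NVME_MMIO_SPACE_MIN determines the BAR size.
	 */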
1986 
1987 	DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1988 
1989 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1990 	if (error) {
1991 		WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
1992 		goto done;
1993 	}
1994 
1995 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
1996 	if (error) {
1997 		WPRINTF(("%s pci add msixcap failed\r\n", __func__));
1998 		goto done;
1999 	}
2000 
2001 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2002 	if (error) {
2003 		WPRINTF(("%s pci add Express capability failed\r\n", __func__));
2004 		goto done;
2005 	}
2006 
2007 	pthread_mutex_init(&sc->mtx, NULL);
2008 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2009 
2010 	pci_nvme_reset(sc);
2011 	pci_nvme_init_ctrldata(sc);
2012 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64);
2013 	pci_nvme_init_logpages(sc);
2014 
2015 	pci_lintr_request(pi);
2016 
2017 done:
2018 	return (error);
2019 }
2020 
2021 
2022 struct pci_devemu pci_de_nvme = {
2023 	.pe_emu =	"nvme",
2024 	.pe_init =	pci_nvme_init,
2025 	.pe_barwrite =	pci_nvme_write,
2026 	.pe_barread =	pci_nvme_read
2027 };
2028 PCI_EMUL_SET(pci_de_nvme);
2029