/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *
 */
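/*
 * For illustration only, two hypothetical slot configurations built from the
 * options above (the device path, sizes, and serial shown here are arbitrary
 * examples, not required values):
 *
 *  -s 4,nvme,/dev/nvd0p1,maxq=8,qsz=512,ioslots=16,sectsz=512,ser=NVMEEMU01
 *  -s 4,nvme,ram=1024,ser=RAMDISKA
 */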
/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define	DPRINTF(params) if (nvme_debug) printf params
#define	WPRINTF(params) printf params

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MAX_BLOCKIOVS	512

/* helpers */

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
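/*
 * Illustrative example of the encoding above: with num_squeues = 16 and
 * num_cqueues = 16, NVME_FEATURE_NUM_QUEUES() yields
 * (16 - 1) | ((16 - 1) << 16) = 0x000F000F, i.e. both queue counts are
 * reported as 0's based values in completion dword 0, as Set/Get Features
 * (Number of Queues) expects.
 */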
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	uint32_t	size;
	uint16_t	tail;     /* nvme progress */
	uint16_t	head;     /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
	pthread_mutex_t	mtx;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	uint32_t	size;
	uint16_t	head;     /* nvme progress */
	uint16_t	tail;     /* guest progress */
	uint16_t	cqid;     /* completion queue id */
	int		busy;     /* queue is being processed */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
};

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	struct pci_nvme_ioreq *next;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;

	/*
	 * lock if all iovs consumed (big IO);
	 * complete transaction before continuing
	 */
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;

	struct blockif_req io_req;

	/* pad to fit up to 512 page descriptors from guest IO request */
	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;

	struct pci_nvme_ioreq *ioreqs;
	struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	/* controller features */
	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
	uint32_t	async_ev_config;         /* 0x0B: async event config */
};


static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

static __inline void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

	if (prev)
		*status &= ~NVME_STATUS_P;
	else
		*status |= NVME_STATUS_P;
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0x58;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0xfc;

	cd->mic = 0;

	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->acl = 2;
	cd->aerl = 4;

	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->fna = 0x03;

	cd->power_state[0].mp = 10;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution
at top of file 364 */ 365 static uint16_t 366 crc16(uint16_t crc, const void *buffer, unsigned int len) 367 { 368 const unsigned char *cp = buffer; 369 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 370 static uint16_t const crc16_table[256] = { 371 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 372 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 373 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 374 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 375 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 376 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 377 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 378 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 379 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 380 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 381 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 382 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 383 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 384 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 385 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 386 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 387 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 388 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 389 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 390 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 391 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 392 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 393 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 394 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 395 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 396 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 397 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 398 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 399 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 400 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 401 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 402 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 403 }; 404 405 while (len--) 406 crc = (((crc >> 8) & 0xffU) ^ 407 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 408 return crc; 409 } 410 411 static void 412 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 413 struct nvme_namespace_data *nd, uint32_t nsid, 414 uint64_t eui64) 415 { 416 417 nd->nsze = sc->nvstore.size / sc->nvstore.sectsz; 418 nd->ncap = nd->nsze; 419 nd->nuse = nd->nsze; 420 421 /* Get LBA and backstore information from backing store */ 422 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 
1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (eui64 == 0) {
		char *data = NULL;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	DPRINTF(("%s\r\n", __func__));

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	sc->num_cqueues = sc->num_squeues = sc->max_queues;
	if (sc->submit_queues != NULL) {
		for (int i = 0; i < sc->num_squeues + 1; i++) {
			/*
			 * The Admin Submission Queue is at index 0.
			 * It must not be changed at reset otherwise the
			 * emulation will be out of sync with the guest.
			 */
			if (i != 0) {
				sc->submit_queues[i].qbase = NULL;
				sc->submit_queues[i].size = 0;
				sc->submit_queues[i].cqid = 0;
			}
			sc->submit_queues[i].tail = 0;
			sc->submit_queues[i].head = 0;
			sc->submit_queues[i].busy = 0;
		}
	} else
		sc->submit_queues = calloc(sc->num_squeues + 1,
		    sizeof(struct nvme_submission_queue));

	if (sc->compl_queues != NULL) {
		for (int i = 0; i < sc->num_cqueues + 1; i++) {
			/* See Admin Submission Queue note above */
			if (i != 0) {
				sc->compl_queues[i].qbase = NULL;
				sc->compl_queues[i].size = 0;
			}

			sc->compl_queues[i].tail = 0;
			sc->compl_queues[i].head = 0;
		}
	} else {
		sc->compl_queues = calloc(sc->num_cqueues + 1,
		    sizeof(struct nvme_completion_queue));

		for (int i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
	}
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF(("%s\r\n", __func__));

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase));

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase));
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
	size_t len)
{
	uint8_t *dst;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	dst = vm_map_gpa(ctx, prp1, bytes);
	if (dst == NULL) {
		return (-1);
	}

	memcpy(dst, src, bytes);

	src += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	dst = vm_map_gpa(ctx, prp2, len);
	if (dst == NULL) {
		return (-1);
	}

	memcpy(dst, src, len);

	return (0);
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
	if (qid == 0 || qid > sc->num_squeues) {
		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
		    __func__, qid, sc->num_squeues));
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues)) {
			WPRINTF(("%s queue index %u > num_squeues %u\r\n",
			    __func__, qid, sc->num_squeues));
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);
		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid));

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF(("%s completed creating IOSQ qid %u\r\n",
		    __func__, qid));
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF(("%s unsupported non-contig (list-based) "
		    "create i/o submission queue\r\n", __func__));

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
	if (qid == 0 || qid > sc->num_cqueues) {
		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
		    __func__, qid, sc->num_cqueues));
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_completion_queue *ncq;

		if ((qid == 0) || (qid > sc->num_cqueues)) {
			WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
			    __func__, qid, sc->num_cqueues));
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		ncq = &sc->compl_queues[qid];
		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		/* Completion queue entries are struct nvme_completion */
		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
		    command->prp1,
		    sizeof(struct nvme_completion) * (size_t)ncq->size);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		/*
		 * Non-contig completion queue unsupported.
698 */ 699 WPRINTF(("%s unsupported non-contig (list-based) " 700 "create i/o completion queue\r\n", 701 __func__)); 702 703 /* 0x12 = Invalid Use of Controller Memory Buffer */ 704 pci_nvme_status_genc(&compl->status, 0x12); 705 } 706 707 return (1); 708 } 709 710 static int 711 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 712 struct nvme_completion* compl) 713 { 714 uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2; 715 uint8_t logpage = command->cdw10 & 0xFF; 716 717 DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize)); 718 719 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 720 721 switch (logpage) { 722 case NVME_LOG_ERROR: 723 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 724 command->prp2, (uint8_t *)&sc->err_log, logsize); 725 break; 726 case NVME_LOG_HEALTH_INFORMATION: 727 /* TODO: present some smart info */ 728 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 729 command->prp2, (uint8_t *)&sc->health_log, logsize); 730 break; 731 case NVME_LOG_FIRMWARE_SLOT: 732 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 733 command->prp2, (uint8_t *)&sc->fw_log, logsize); 734 break; 735 default: 736 WPRINTF(("%s get log page %x command not supported\r\n", 737 __func__, logpage)); 738 739 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 740 NVME_SC_INVALID_LOG_PAGE); 741 } 742 743 return (1); 744 } 745 746 static int 747 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 748 struct nvme_completion* compl) 749 { 750 void *dest; 751 752 DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__, 753 command->cdw10 & 0xFF, command->nsid)); 754 755 switch (command->cdw10 & 0xFF) { 756 case 0x00: /* return Identify Namespace data structure */ 757 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 758 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata)); 759 break; 760 case 0x01: /* return Identify Controller data structure */ 761 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 762 command->prp2, (uint8_t *)&sc->ctrldata, 763 sizeof(sc->ctrldata)); 764 break; 765 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 766 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 767 sizeof(uint32_t) * 1024); 768 ((uint32_t *)dest)[0] = 1; 769 ((uint32_t *)dest)[1] = 0; 770 break; 771 case 0x11: 772 pci_nvme_status_genc(&compl->status, 773 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 774 return (1); 775 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 776 case 0x10: 777 case 0x12: 778 case 0x13: 779 case 0x14: 780 case 0x15: 781 default: 782 DPRINTF(("%s unsupported identify command requested 0x%x\r\n", 783 __func__, command->cdw10 & 0xFF)); 784 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 785 return (1); 786 } 787 788 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 789 return (1); 790 } 791 792 static int 793 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command, 794 struct nvme_completion* compl) 795 { 796 uint16_t nqr; /* Number of Queues Requested */ 797 798 nqr = command->cdw11 & 0xFFFF; 799 if (nqr == 0xffff) { 800 WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr)); 801 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 802 return (-1); 803 } 804 805 sc->num_squeues = ONE_BASED(nqr); 806 if (sc->num_squeues > sc->max_queues) { 807 DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues, 808 sc->max_queues)); 809 sc->num_squeues = sc->max_queues; 810 } 811 812 nqr = (command->cdw11 >> 16) & 
0xFFFF; 813 if (nqr == 0xffff) { 814 WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr)); 815 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 816 return (-1); 817 } 818 819 sc->num_cqueues = ONE_BASED(nqr); 820 if (sc->num_cqueues > sc->max_queues) { 821 DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues, 822 sc->max_queues)); 823 sc->num_cqueues = sc->max_queues; 824 } 825 826 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 827 828 return (0); 829 } 830 831 static int 832 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, 833 struct nvme_completion* compl) 834 { 835 int feature = command->cdw10 & 0xFF; 836 uint32_t iv; 837 838 DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); 839 compl->cdw0 = 0; 840 841 switch (feature) { 842 case NVME_FEAT_ARBITRATION: 843 DPRINTF((" arbitration 0x%x\r\n", command->cdw11)); 844 break; 845 case NVME_FEAT_POWER_MANAGEMENT: 846 DPRINTF((" power management 0x%x\r\n", command->cdw11)); 847 break; 848 case NVME_FEAT_LBA_RANGE_TYPE: 849 DPRINTF((" lba range 0x%x\r\n", command->cdw11)); 850 break; 851 case NVME_FEAT_TEMPERATURE_THRESHOLD: 852 DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11)); 853 break; 854 case NVME_FEAT_ERROR_RECOVERY: 855 DPRINTF((" error recovery 0x%x\r\n", command->cdw11)); 856 break; 857 case NVME_FEAT_VOLATILE_WRITE_CACHE: 858 DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11)); 859 break; 860 case NVME_FEAT_NUMBER_OF_QUEUES: 861 nvme_set_feature_queues(sc, command, compl); 862 break; 863 case NVME_FEAT_INTERRUPT_COALESCING: 864 DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11)); 865 866 /* in uS */ 867 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; 868 869 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; 870 break; 871 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 872 iv = command->cdw11 & 0xFFFF; 873 874 DPRINTF((" interrupt vector configuration 0x%x\r\n", 875 command->cdw11)); 876 877 for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) { 878 if (sc->compl_queues[i].intr_vec == iv) { 879 if (command->cdw11 & (1 << 16)) 880 sc->compl_queues[i].intr_en |= 881 NVME_CQ_INTCOAL; 882 else 883 sc->compl_queues[i].intr_en &= 884 ~NVME_CQ_INTCOAL; 885 } 886 } 887 break; 888 case NVME_FEAT_WRITE_ATOMICITY: 889 DPRINTF((" write atomicity 0x%x\r\n", command->cdw11)); 890 break; 891 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 892 DPRINTF((" async event configuration 0x%x\r\n", 893 command->cdw11)); 894 sc->async_ev_config = command->cdw11; 895 break; 896 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 897 DPRINTF((" software progress marker 0x%x\r\n", 898 command->cdw11)); 899 break; 900 case 0x0C: 901 DPRINTF((" autonomous power state transition 0x%x\r\n", 902 command->cdw11)); 903 break; 904 default: 905 WPRINTF(("%s invalid feature\r\n", __func__)); 906 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 907 return (1); 908 } 909 910 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 911 return (1); 912 } 913 914 static int 915 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 916 struct nvme_completion* compl) 917 { 918 int feature = command->cdw10 & 0xFF; 919 920 DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); 921 922 compl->cdw0 = 0; 923 924 switch (feature) { 925 case NVME_FEAT_ARBITRATION: 926 DPRINTF((" arbitration\r\n")); 927 break; 928 case NVME_FEAT_POWER_MANAGEMENT: 929 DPRINTF((" power management\r\n")); 930 break; 931 case NVME_FEAT_LBA_RANGE_TYPE: 932 DPRINTF((" lba range\r\n")); 933 break; 934 
case NVME_FEAT_TEMPERATURE_THRESHOLD: 935 DPRINTF((" temperature threshold\r\n")); 936 switch ((command->cdw11 >> 20) & 0x3) { 937 case 0: 938 /* Over temp threshold */ 939 compl->cdw0 = 0xFFFF; 940 break; 941 case 1: 942 /* Under temp threshold */ 943 compl->cdw0 = 0; 944 break; 945 default: 946 WPRINTF((" invalid threshold type select\r\n")); 947 pci_nvme_status_genc(&compl->status, 948 NVME_SC_INVALID_FIELD); 949 return (1); 950 } 951 break; 952 case NVME_FEAT_ERROR_RECOVERY: 953 DPRINTF((" error recovery\r\n")); 954 break; 955 case NVME_FEAT_VOLATILE_WRITE_CACHE: 956 DPRINTF((" volatile write cache\r\n")); 957 break; 958 case NVME_FEAT_NUMBER_OF_QUEUES: 959 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 960 961 DPRINTF((" number of queues (submit %u, completion %u)\r\n", 962 compl->cdw0 & 0xFFFF, 963 (compl->cdw0 >> 16) & 0xFFFF)); 964 965 break; 966 case NVME_FEAT_INTERRUPT_COALESCING: 967 DPRINTF((" interrupt coalescing\r\n")); 968 break; 969 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 970 DPRINTF((" interrupt vector configuration\r\n")); 971 break; 972 case NVME_FEAT_WRITE_ATOMICITY: 973 DPRINTF((" write atomicity\r\n")); 974 break; 975 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 976 DPRINTF((" async event configuration\r\n")); 977 sc->async_ev_config = command->cdw11; 978 break; 979 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 980 DPRINTF((" software progress marker\r\n")); 981 break; 982 case 0x0C: 983 DPRINTF((" autonomous power state transition\r\n")); 984 break; 985 default: 986 WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature)); 987 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 988 return (1); 989 } 990 991 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 992 return (1); 993 } 994 995 static int 996 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 997 struct nvme_completion* compl) 998 { 999 DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__, 1000 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); 1001 1002 /* TODO: search for the command ID and abort it */ 1003 1004 compl->cdw0 = 1; 1005 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1006 return (1); 1007 } 1008 1009 static int 1010 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1011 struct nvme_command* command, struct nvme_completion* compl) 1012 { 1013 DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11)); 1014 1015 /* 1016 * TODO: raise events when they happen based on the Set Features cmd. 1017 * These events happen async, so only set completion successful if 1018 * there is an event reflective of the request to get event. 
1019 */ 1020 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1021 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1022 return (0); 1023 } 1024 1025 static void 1026 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1027 { 1028 struct nvme_completion compl; 1029 struct nvme_command *cmd; 1030 struct nvme_submission_queue *sq; 1031 struct nvme_completion_queue *cq; 1032 int do_intr = 0; 1033 uint16_t sqhead; 1034 1035 DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value)); 1036 1037 sq = &sc->submit_queues[0]; 1038 1039 sqhead = atomic_load_acq_short(&sq->head); 1040 1041 if (atomic_testandset_int(&sq->busy, 1)) { 1042 DPRINTF(("%s SQ busy, head %u, tail %u\r\n", 1043 __func__, sqhead, sq->tail)); 1044 return; 1045 } 1046 1047 DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail)); 1048 1049 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1050 cmd = &(sq->qbase)[sqhead]; 1051 compl.cdw0 = 0; 1052 compl.status = 0; 1053 1054 switch (cmd->opc) { 1055 case NVME_OPC_DELETE_IO_SQ: 1056 DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__)); 1057 do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl); 1058 break; 1059 case NVME_OPC_CREATE_IO_SQ: 1060 DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__)); 1061 do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl); 1062 break; 1063 case NVME_OPC_DELETE_IO_CQ: 1064 DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__)); 1065 do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl); 1066 break; 1067 case NVME_OPC_CREATE_IO_CQ: 1068 DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__)); 1069 do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl); 1070 break; 1071 case NVME_OPC_GET_LOG_PAGE: 1072 DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__)); 1073 do_intr |= nvme_opc_get_log_page(sc, cmd, &compl); 1074 break; 1075 case NVME_OPC_IDENTIFY: 1076 DPRINTF(("%s command IDENTIFY\r\n", __func__)); 1077 do_intr |= nvme_opc_identify(sc, cmd, &compl); 1078 break; 1079 case NVME_OPC_ABORT: 1080 DPRINTF(("%s command ABORT\r\n", __func__)); 1081 do_intr |= nvme_opc_abort(sc, cmd, &compl); 1082 break; 1083 case NVME_OPC_SET_FEATURES: 1084 DPRINTF(("%s command SET_FEATURES\r\n", __func__)); 1085 do_intr |= nvme_opc_set_features(sc, cmd, &compl); 1086 break; 1087 case NVME_OPC_GET_FEATURES: 1088 DPRINTF(("%s command GET_FEATURES\r\n", __func__)); 1089 do_intr |= nvme_opc_get_features(sc, cmd, &compl); 1090 break; 1091 case NVME_OPC_ASYNC_EVENT_REQUEST: 1092 DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__)); 1093 /* XXX dont care, unhandled for now 1094 do_intr |= nvme_opc_async_event_req(sc, cmd, &compl); 1095 */ 1096 break; 1097 default: 1098 WPRINTF(("0x%x command is not implemented\r\n", 1099 cmd->opc)); 1100 } 1101 1102 /* for now skip async event generation */ 1103 if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) { 1104 struct nvme_completion *cp; 1105 int phase; 1106 1107 cq = &sc->compl_queues[0]; 1108 1109 cp = &(cq->qbase)[cq->tail]; 1110 cp->cdw0 = compl.cdw0; 1111 cp->sqid = 0; 1112 cp->sqhd = sqhead; 1113 cp->cid = cmd->cid; 1114 1115 phase = NVME_STATUS_GET_P(cp->status); 1116 cp->status = compl.status; 1117 pci_nvme_toggle_phase(&cp->status, phase); 1118 1119 cq->tail = (cq->tail + 1) % cq->size; 1120 } 1121 sqhead = (sqhead + 1) % sq->size; 1122 } 1123 1124 DPRINTF(("setting sqhead %u\r\n", sqhead)); 1125 atomic_store_short(&sq->head, sqhead); 1126 atomic_store_int(&sq->busy, 0); 1127 1128 if (do_intr) 1129 pci_generate_msix(sc->nsc_pi, 0); 1130 1131 } 1132 1133 static int 1134 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq 
*req, 1135 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 1136 { 1137 int iovidx; 1138 1139 if (req != NULL) { 1140 /* concatenate contig block-iovs to minimize number of iovs */ 1141 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 1142 iovidx = req->io_req.br_iovcnt - 1; 1143 1144 req->io_req.br_iov[iovidx].iov_base = 1145 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1146 req->prev_gpaddr, size); 1147 1148 req->prev_size += size; 1149 req->io_req.br_resid += size; 1150 1151 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 1152 } else { 1153 pthread_mutex_lock(&req->mtx); 1154 1155 iovidx = req->io_req.br_iovcnt; 1156 if (iovidx == NVME_MAX_BLOCKIOVS) { 1157 int err = 0; 1158 1159 DPRINTF(("large I/O, doing partial req\r\n")); 1160 1161 iovidx = 0; 1162 req->io_req.br_iovcnt = 0; 1163 1164 req->io_req.br_callback = pci_nvme_io_partial; 1165 1166 if (!do_write) 1167 err = blockif_read(sc->nvstore.ctx, 1168 &req->io_req); 1169 else 1170 err = blockif_write(sc->nvstore.ctx, 1171 &req->io_req); 1172 1173 /* wait until req completes before cont */ 1174 if (err == 0) 1175 pthread_cond_wait(&req->cv, &req->mtx); 1176 } 1177 if (iovidx == 0) { 1178 req->io_req.br_offset = lba; 1179 req->io_req.br_resid = 0; 1180 req->io_req.br_param = req; 1181 } 1182 1183 req->io_req.br_iov[iovidx].iov_base = 1184 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1185 gpaddr, size); 1186 1187 req->io_req.br_iov[iovidx].iov_len = size; 1188 1189 req->prev_gpaddr = gpaddr; 1190 req->prev_size = size; 1191 req->io_req.br_resid += size; 1192 1193 req->io_req.br_iovcnt++; 1194 1195 pthread_mutex_unlock(&req->mtx); 1196 } 1197 } else { 1198 /* RAM buffer: read/write directly */ 1199 void *p = sc->nvstore.ctx; 1200 void *gptr; 1201 1202 if ((lba + size) > sc->nvstore.size) { 1203 WPRINTF(("%s write would overflow RAM\r\n", __func__)); 1204 return (-1); 1205 } 1206 1207 p = (void *)((uintptr_t)p + (uintptr_t)lba); 1208 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); 1209 if (do_write) 1210 memcpy(p, gptr, size); 1211 else 1212 memcpy(gptr, p, size); 1213 } 1214 return (0); 1215 } 1216 1217 static void 1218 pci_nvme_set_completion(struct pci_nvme_softc *sc, 1219 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 1220 uint32_t cdw0, uint16_t status, int ignore_busy) 1221 { 1222 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 1223 struct nvme_completion *compl; 1224 int do_intr = 0; 1225 int phase; 1226 1227 DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n", 1228 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 1229 NVME_STATUS_GET_SC(status))); 1230 1231 pthread_mutex_lock(&cq->mtx); 1232 1233 assert(cq->qbase != NULL); 1234 1235 compl = &cq->qbase[cq->tail]; 1236 1237 compl->sqhd = atomic_load_acq_short(&sq->head); 1238 compl->sqid = sqid; 1239 compl->cid = cid; 1240 1241 // toggle phase 1242 phase = NVME_STATUS_GET_P(compl->status); 1243 compl->status = status; 1244 pci_nvme_toggle_phase(&compl->status, phase); 1245 1246 cq->tail = (cq->tail + 1) % cq->size; 1247 1248 if (cq->intr_en & NVME_CQ_INTEN) 1249 do_intr = 1; 1250 1251 pthread_mutex_unlock(&cq->mtx); 1252 1253 if (ignore_busy || !atomic_load_acq_int(&sq->busy)) 1254 if (do_intr) 1255 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 1256 } 1257 1258 static void 1259 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 1260 { 1261 req->sc = NULL; 1262 req->nvme_sq = NULL; 1263 req->sqid = 0; 1264 1265 pthread_mutex_lock(&sc->mtx); 1266 1267 req->next = sc->ioreqs_free; 1268 
	sc->ioreqs_free = req;
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = sc->ioreqs_free;
	assert(req != NULL);

	sc->ioreqs_free = req->next;

	req->next = NULL;
	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return req;
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
	pci_nvme_release_ioreq(req->sc, req);
}

static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;

	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

	pthread_cond_signal(&req->cv);
}


static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;
	int err;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	if (atomic_testandset_int(&sq->busy, 1)) {
		DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
		return;
	}

	sqhead = atomic_load_acq_short(&sq->head);

	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
	    idx, sqhead, sq->tail, sq->qbase));

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req = NULL;
		uint64_t lba;
		uint64_t nblocks, bytes, size, cpsz;

		/* TODO: support scatter gather list handling */

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;

		if (cmd->opc == NVME_OPC_FLUSH) {
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);

			continue;
		} else if (cmd->opc == 0x08) {
			/* TODO: write zeroes */
			WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
			    __func__, lba, cmd->cdw12 & 0xFFFF));
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);

			continue;
		}

		nblocks = (cmd->cdw12 & 0xFFFF) + 1;

		bytes = nblocks * sc->nvstore.sectsz;

		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
			req = pci_nvme_get_ioreq(sc);
			req->nvme_sq = sq;
			req->sqid = idx;
		}

1400 /* 1401 * If data starts mid-page and flows into the next page, then 1402 * increase page count 1403 */ 1404 1405 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu " 1406 "(%lu-bytes)\r\n", 1407 sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size, 1408 cmd->opc == NVME_OPC_WRITE ? 1409 "WRITE" : "READ", 1410 lba, nblocks, bytes)); 1411 1412 cmd->prp1 &= ~(0x03UL); 1413 cmd->prp2 &= ~(0x03UL); 1414 1415 DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2)); 1416 1417 size = bytes; 1418 lba *= sc->nvstore.sectsz; 1419 1420 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); 1421 1422 if (cpsz > bytes) 1423 cpsz = bytes; 1424 1425 if (req != NULL) { 1426 req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | 1427 cmd->cdw10; 1428 req->opc = cmd->opc; 1429 req->cid = cmd->cid; 1430 req->nsid = cmd->nsid; 1431 } 1432 1433 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, 1434 cmd->opc == NVME_OPC_WRITE, lba); 1435 lba += cpsz; 1436 size -= cpsz; 1437 1438 if (size == 0) 1439 goto iodone; 1440 1441 if (size <= PAGE_SIZE) { 1442 /* prp2 is second (and final) page in transfer */ 1443 1444 err = pci_nvme_append_iov_req(sc, req, cmd->prp2, 1445 size, 1446 cmd->opc == NVME_OPC_WRITE, 1447 lba); 1448 } else { 1449 uint64_t *prp_list; 1450 int i; 1451 1452 /* prp2 is pointer to a physical region page list */ 1453 prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, 1454 cmd->prp2, PAGE_SIZE); 1455 1456 i = 0; 1457 while (size != 0) { 1458 cpsz = MIN(size, PAGE_SIZE); 1459 1460 /* 1461 * Move to linked physical region page list 1462 * in last item. 1463 */ 1464 if (i == (NVME_PRP2_ITEMS-1) && 1465 size > PAGE_SIZE) { 1466 assert((prp_list[i] & (PAGE_SIZE-1)) == 0); 1467 prp_list = paddr_guest2host( 1468 sc->nsc_pi->pi_vmctx, 1469 prp_list[i], PAGE_SIZE); 1470 i = 0; 1471 } 1472 if (prp_list[i] == 0) { 1473 WPRINTF(("PRP2[%d] = 0 !!!\r\n", i)); 1474 err = 1; 1475 break; 1476 } 1477 1478 err = pci_nvme_append_iov_req(sc, req, 1479 prp_list[i], cpsz, 1480 cmd->opc == NVME_OPC_WRITE, lba); 1481 if (err) 1482 break; 1483 1484 lba += cpsz; 1485 size -= cpsz; 1486 i++; 1487 } 1488 } 1489 1490 iodone: 1491 if (sc->nvstore.type == NVME_STOR_RAM) { 1492 uint16_t code, status; 1493 1494 code = err ? NVME_SC_LBA_OUT_OF_RANGE : 1495 NVME_SC_SUCCESS; 1496 pci_nvme_status_genc(&status, code); 1497 1498 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1499 status, 1); 1500 1501 continue; 1502 } 1503 1504 1505 if (err) 1506 goto do_error; 1507 1508 req->io_req.br_callback = pci_nvme_io_done; 1509 1510 err = 0; 1511 switch (cmd->opc) { 1512 case NVME_OPC_READ: 1513 err = blockif_read(sc->nvstore.ctx, &req->io_req); 1514 break; 1515 case NVME_OPC_WRITE: 1516 err = blockif_write(sc->nvstore.ctx, &req->io_req); 1517 break; 1518 default: 1519 WPRINTF(("%s unhandled io command 0x%x\r\n", 1520 __func__, cmd->opc)); 1521 err = 1; 1522 } 1523 1524 do_error: 1525 if (err) { 1526 uint16_t status; 1527 1528 pci_nvme_status_genc(&status, 1529 NVME_SC_DATA_TRANSFER_ERROR); 1530 1531 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1532 status, 1); 1533 pci_nvme_release_ioreq(sc, req); 1534 } 1535 } 1536 1537 atomic_store_short(&sq->head, sqhead); 1538 atomic_store_int(&sq->busy, 0); 1539 } 1540 1541 static void 1542 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 1543 uint64_t idx, int is_sq, uint64_t value) 1544 { 1545 DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n", 1546 idx, is_sq ? 
"SQ" : "CQ", value & 0xFFFF)); 1547 1548 if (is_sq) { 1549 atomic_store_short(&sc->submit_queues[idx].tail, 1550 (uint16_t)value); 1551 1552 if (idx == 0) { 1553 pci_nvme_handle_admin_cmd(sc, value); 1554 } else { 1555 /* submission queue; handle new entries in SQ */ 1556 if (idx > sc->num_squeues) { 1557 WPRINTF(("%s SQ index %lu overflow from " 1558 "guest (max %u)\r\n", 1559 __func__, idx, sc->num_squeues)); 1560 return; 1561 } 1562 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 1563 } 1564 } else { 1565 if (idx > sc->num_cqueues) { 1566 WPRINTF(("%s queue index %lu overflow from " 1567 "guest (max %u)\r\n", 1568 __func__, idx, sc->num_cqueues)); 1569 return; 1570 } 1571 1572 sc->compl_queues[idx].head = (uint16_t)value; 1573 } 1574 } 1575 1576 static void 1577 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 1578 { 1579 const char *s = iswrite ? "WRITE" : "READ"; 1580 1581 switch (offset) { 1582 case NVME_CR_CAP_LOW: 1583 DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s)); 1584 break; 1585 case NVME_CR_CAP_HI: 1586 DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s)); 1587 break; 1588 case NVME_CR_VS: 1589 DPRINTF(("%s %s NVME_CR_VS\r\n", func, s)); 1590 break; 1591 case NVME_CR_INTMS: 1592 DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s)); 1593 break; 1594 case NVME_CR_INTMC: 1595 DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s)); 1596 break; 1597 case NVME_CR_CC: 1598 DPRINTF(("%s %s NVME_CR_CC\r\n", func, s)); 1599 break; 1600 case NVME_CR_CSTS: 1601 DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s)); 1602 break; 1603 case NVME_CR_NSSR: 1604 DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s)); 1605 break; 1606 case NVME_CR_AQA: 1607 DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s)); 1608 break; 1609 case NVME_CR_ASQ_LOW: 1610 DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s)); 1611 break; 1612 case NVME_CR_ASQ_HI: 1613 DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s)); 1614 break; 1615 case NVME_CR_ACQ_LOW: 1616 DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s)); 1617 break; 1618 case NVME_CR_ACQ_HI: 1619 DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s)); 1620 break; 1621 default: 1622 DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset)); 1623 } 1624 1625 } 1626 1627 static void 1628 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 1629 uint64_t offset, int size, uint64_t value) 1630 { 1631 uint32_t ccreg; 1632 1633 if (offset >= NVME_DOORBELL_OFFSET) { 1634 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 1635 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 1636 int is_sq = (belloffset % 8) < 4; 1637 1638 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 1639 WPRINTF(("guest attempted an overflow write offset " 1640 "0x%lx, val 0x%lx in %s", 1641 offset, value, __func__)); 1642 return; 1643 } 1644 1645 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 1646 return; 1647 } 1648 1649 DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n", 1650 offset, size, value)); 1651 1652 if (size != 4) { 1653 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " 1654 "val 0x%lx) to bar0 in %s", 1655 size, offset, value, __func__)); 1656 /* TODO: shutdown device */ 1657 return; 1658 } 1659 1660 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 1661 1662 pthread_mutex_lock(&sc->mtx); 1663 1664 switch (offset) { 1665 case NVME_CR_CAP_LOW: 1666 case NVME_CR_CAP_HI: 1667 /* readonly */ 1668 break; 1669 case NVME_CR_VS: 1670 /* readonly */ 1671 break; 1672 case NVME_CR_INTMS: 1673 /* MSI-X, so ignore */ 1674 break; 1675 case NVME_CR_INTMC: 1676 /* MSI-X, so ignore */ 1677 break; 1678 
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u\r\n",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg)));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
		    __func__, offset, value, size));
	}
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
	int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx\r\n", baridx, offset, size, value));

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
		    __func__, baridx, value));
	}
}

static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
	    offset, size, (uint32_t)value));

	return (value);
}



static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
		    baridx, offset, size));

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
	}

	return (0);
}


static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 * Ref: NVMe v1.3c.
1874 */ 1875 cpywithpad((char *)sc->ctrldata.sn, 1876 sizeof(sc->ctrldata.sn), config, ' '); 1877 } else if (!strcmp("ram", xopts)) { 1878 uint64_t sz = strtoull(&xopts[4], NULL, 10); 1879 1880 sc->nvstore.type = NVME_STOR_RAM; 1881 sc->nvstore.size = sz * 1024 * 1024; 1882 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1883 sc->nvstore.sectsz = 4096; 1884 sc->nvstore.sectsz_bits = 12; 1885 if (sc->nvstore.ctx == NULL) { 1886 perror("Unable to allocate RAM"); 1887 free(uopt); 1888 return (-1); 1889 } 1890 } else if (!strcmp("eui64", xopts)) { 1891 sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0)); 1892 } else if (optidx == 0) { 1893 snprintf(bident, sizeof(bident), "%d:%d", 1894 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 1895 sc->nvstore.ctx = blockif_open(xopts, bident); 1896 if (sc->nvstore.ctx == NULL) { 1897 perror("Could not open backing file"); 1898 free(uopt); 1899 return (-1); 1900 } 1901 sc->nvstore.type = NVME_STOR_BLOCKIF; 1902 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 1903 } else { 1904 fprintf(stderr, "Invalid option %s\n", xopts); 1905 free(uopt); 1906 return (-1); 1907 } 1908 1909 optidx++; 1910 } 1911 free(uopt); 1912 1913 if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { 1914 fprintf(stderr, "backing store not specified\n"); 1915 return (-1); 1916 } 1917 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 1918 sc->nvstore.sectsz = sectsz; 1919 else if (sc->nvstore.type != NVME_STOR_RAM) 1920 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 1921 for (sc->nvstore.sectsz_bits = 9; 1922 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 1923 sc->nvstore.sectsz_bits++); 1924 1925 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 1926 sc->max_queues = NVME_QUEUES; 1927 1928 if (sc->max_qentries <= 0) { 1929 fprintf(stderr, "Invalid qsz option\n"); 1930 return (-1); 1931 } 1932 if (sc->ioslots <= 0) { 1933 fprintf(stderr, "Invalid ioslots option\n"); 1934 return (-1); 1935 } 1936 1937 return (0); 1938 } 1939 1940 static int 1941 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 1942 { 1943 struct pci_nvme_softc *sc; 1944 uint32_t pci_membar_sz; 1945 int error; 1946 1947 error = 0; 1948 1949 sc = calloc(1, sizeof(struct pci_nvme_softc)); 1950 pi->pi_arg = sc; 1951 sc->nsc_pi = pi; 1952 1953 error = pci_nvme_parse_opts(sc, opts); 1954 if (error < 0) 1955 goto done; 1956 else 1957 error = 0; 1958 1959 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 1960 for (int i = 0; i < sc->ioslots; i++) { 1961 if (i < (sc->ioslots-1)) 1962 sc->ioreqs[i].next = &sc->ioreqs[i+1]; 1963 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); 1964 pthread_cond_init(&sc->ioreqs[i].cv, NULL); 1965 } 1966 sc->ioreqs_free = sc->ioreqs; 1967 sc->intr_coales_aggr_thresh = 1; 1968 1969 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 1970 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 1971 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 1972 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 1973 pci_set_cfgdata8(pi, PCIR_PROGIF, 1974 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 1975 1976 /* 1977 * Allocate size of NVMe registers + doorbell space for all queues. 1978 * 1979 * The specification requires a minimum memory I/O window size of 16K. 1980 * The Windows driver will refuse to start a device with a smaller 1981 * window. 
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF(("%s pci add msixcap failed\r\n", __func__));
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF(("%s pci add Express capability failed\r\n", __func__));
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_reset(sc);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64);
	pci_nvme_init_logpages(sc);

	pci_lintr_request(pi);

done:
	return (error);
}


struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);