1 /*- 2 * BSD LICENSE 3 * Copyright (c) Intel Corporation. All rights reserved. 4 * Copyright (c) 2019, Nutanix Inc. All rights reserved. 5 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * NVMe over vfio-user transport 36 */ 37 38 #include <vfio-user/libvfio-user.h> 39 #include <vfio-user/pci_defs.h> 40 41 #include "spdk/barrier.h" 42 #include "spdk/stdinc.h" 43 #include "spdk/assert.h" 44 #include "spdk/thread.h" 45 #include "spdk/nvmf_transport.h" 46 #include "spdk/sock.h" 47 #include "spdk/string.h" 48 #include "spdk/util.h" 49 #include "spdk/log.h" 50 51 #include "transport.h" 52 53 #include "nvmf_internal.h" 54 55 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 56 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 57 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 58 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 59 60 #define NVME_DOORBELLS_OFFSET 0x1000 61 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000 62 63 /* 64 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 65 * available on PCI-X 2.0 and PCI Express buses 66 */ 67 #define NVME_REG_CFG_SIZE 0x1000 68 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 69 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8) 70 #define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 71 /* MSIX Table Size */ 72 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 73 /* MSIX Pending Bit Array Size */ 74 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000) 75 76 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 77 78 struct nvmf_vfio_user_req; 79 80 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 81 82 /* 1 more for PRP2 list itself */ 83 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 84 85 enum 
nvmf_vfio_user_req_state { 86 VFIO_USER_REQUEST_STATE_FREE = 0, 87 VFIO_USER_REQUEST_STATE_EXECUTING, 88 }; 89 90 /* NVMe device state representation */ 91 struct nvme_migr_sq_state { 92 uint16_t sqid; 93 uint16_t cqid; 94 uint32_t head; 95 uint32_t size; 96 uint32_t reserved; 97 uint64_t dma_addr; 98 }; 99 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 100 101 struct nvme_migr_cq_state { 102 uint16_t cqid; 103 uint16_t phase; 104 uint32_t tail; 105 uint32_t size; 106 uint32_t iv; 107 uint32_t ien; 108 uint32_t reserved; 109 uint64_t dma_addr; 110 }; 111 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 112 113 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 114 115 /* The device state is in the VFIO MIGRATION BAR(9) region; keep the device state page aligned. 116 * 117 * The NVMe device migration region is defined as below: 118 * ---------------------------------------------------------------------- 119 * | nvme_migr_device_state | private controller data | queue pairs | BARs | 120 * ---------------------------------------------------------------------- 121 * 122 * Keep nvme_migr_device_state at a fixed 0x1000 length; newly added fields 123 * can use the reserved space at the end of the data structure. 124 */ 125 struct nvme_migr_device_state { 126 /* Magic value to validate migration data */ 127 uint32_t magic; 128 /* Version used to check that the data is the same on the source and destination */ 129 uint32_t version; 130 131 /* The library uses this field to know how many fields in this 132 * structure are valid, starting at the beginning of this data 133 * structure. Newly added fields should use the `unused` memory 134 * space. 135 */ 136 uint32_t opts_size; 137 uint32_t reserved0; 138 139 /* BARs information */ 140 uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS]; 141 uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS]; 142 143 /* Queue pair start offset, relative to the beginning of this 144 * data structure. 145 */ 146 uint64_t qp_offset; 147 uint64_t qp_len; 148 149 /* Controller data structure */ 150 uint32_t num_io_queues; 151 uint32_t reserved1; 152 153 uint16_t reserved2[3]; 154 uint16_t nr_aers; 155 uint16_t aer_cids[NVMF_MIGR_MAX_PENDING_AERS]; 156 157 /* Controller private data offset and length, if present, relative to 158 * the beginning of this data structure. 159 */ 160 uint64_t private_data_offset; 161 uint64_t private_data_len; 162 163 /* Reserved memory space for newly added fields; this 164 * field is always at the end of this data structure.
165 */ 166 uint8_t unused[3356]; 167 }; 168 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_device_state) == 0x1000, "Incorrect size"); 169 170 struct vfio_user_nvme_migr_qp { 171 struct nvme_migr_sq_state sq; 172 struct nvme_migr_cq_state cq; 173 }; 174 175 /* NVMe state definition used temporarily to load/restore from/to NVMe migration BAR region */ 176 struct vfio_user_nvme_migr_state { 177 struct nvme_migr_device_state ctrlr_data; 178 struct nvmf_ctrlr_migr_data private_data; 179 struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 180 uint8_t bar0[NVME_REG_BAR0_SIZE]; 181 uint8_t cfg[NVME_REG_CFG_SIZE]; 182 }; 183 184 struct nvmf_vfio_user_req { 185 struct spdk_nvmf_request req; 186 struct spdk_nvme_cpl rsp; 187 struct spdk_nvme_cmd cmd; 188 189 enum nvmf_vfio_user_req_state state; 190 nvmf_vfio_user_req_cb_fn cb_fn; 191 void *cb_arg; 192 193 /* old CC before prop_set_cc fabric command */ 194 union spdk_nvme_cc_register cc; 195 196 TAILQ_ENTRY(nvmf_vfio_user_req) link; 197 198 struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS]; 199 uint8_t iovcnt; 200 201 /* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */ 202 uint8_t sg[]; 203 }; 204 205 /* 206 * Mapping of an NVMe queue. 207 * 208 * This holds the information tracking a local process mapping of an NVMe queue 209 * shared by the client. 210 */ 211 struct nvme_q_mapping { 212 /* iov of local process mapping. */ 213 struct iovec iov; 214 /* Stored sg, needed for unmap. */ 215 dma_sg_t *sg; 216 /* Client PRP of queue. */ 217 uint64_t prp1; 218 }; 219 220 enum nvmf_vfio_user_sq_state { 221 VFIO_USER_SQ_UNUSED = 0, 222 VFIO_USER_SQ_CREATED, 223 VFIO_USER_SQ_DELETED, 224 VFIO_USER_SQ_ACTIVE, 225 VFIO_USER_SQ_INACTIVE 226 }; 227 228 enum nvmf_vfio_user_cq_state { 229 VFIO_USER_CQ_UNUSED = 0, 230 VFIO_USER_CQ_CREATED, 231 VFIO_USER_CQ_DELETED, 232 }; 233 234 enum nvmf_vfio_user_ctrlr_state { 235 VFIO_USER_CTRLR_CREATING = 0, 236 VFIO_USER_CTRLR_RUNNING, 237 /* Quiesce requested by libvfio-user */ 238 VFIO_USER_CTRLR_PAUSING, 239 /* The NVMf subsystem is paused; it's safe to do PCI reset, memory register, 240 * memory unregister, and vfio migration state transitions in this state. 241 */ 242 VFIO_USER_CTRLR_PAUSED, 243 /* 244 * Implies that the NVMf subsystem is paused. The device will be unquiesced (PCI 245 * reset, memory register and unregister, or the controller in the destination VM 246 * has been restored), and an NVMf subsystem resume has been requested. 247 */ 248 VFIO_USER_CTRLR_RESUMING, 249 /* 250 * Implies that the NVMf subsystem is paused. Both the controller in the source VM 251 * and the one in the destination VM are in this state during live migration. 252 */ 253 VFIO_USER_CTRLR_MIGRATING 254 }; 255 256 /* Migration region used to record the NVMe device state data structure */ 257 struct vfio_user_migration_region { 258 uint64_t last_data_offset; 259 uint64_t pending_bytes; 260 }; 261 262 struct nvmf_vfio_user_sq { 263 struct spdk_nvmf_qpair qpair; 264 struct spdk_nvmf_transport_poll_group *group; 265 struct nvmf_vfio_user_ctrlr *ctrlr; 266 267 uint32_t qid; 268 /* Number of entries in queue. */ 269 uint32_t size; 270 struct nvme_q_mapping mapping; 271 enum nvmf_vfio_user_sq_state sq_state; 272 273 uint32_t head; 274 275 /* multiple SQs can be mapped to the same CQ */ 276 uint16_t cqid; 277 278 /* handle_queue_connect_rsp() is used both for the CREATE IO SQ response 279 * and for the SQ re-connect response in the destination VM. In the former case 280 * we post an NVMe completion to the VM; we do not set this flag when 281 * re-connecting SQs in the destination VM.
282 */ 283 bool post_create_io_sq_completion; 284 /* Copy of Create IO SQ command, this field is used together with 285 * `post_create_io_sq_completion` flag. 286 */ 287 struct spdk_nvme_cmd create_io_sq_cmd; 288 289 /* Currently unallocated reqs. */ 290 TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs; 291 /* Poll group entry */ 292 TAILQ_ENTRY(nvmf_vfio_user_sq) link; 293 /* Connected SQ entry */ 294 TAILQ_ENTRY(nvmf_vfio_user_sq) tailq; 295 }; 296 297 struct nvmf_vfio_user_cq { 298 struct spdk_nvmf_transport_poll_group *group; 299 struct spdk_thread *thread; 300 uint32_t cq_ref; 301 302 uint32_t qid; 303 /* Number of entries in queue. */ 304 uint32_t size; 305 struct nvme_q_mapping mapping; 306 enum nvmf_vfio_user_cq_state cq_state; 307 308 uint32_t tail; 309 bool phase; 310 311 uint16_t iv; 312 bool ien; 313 }; 314 315 struct nvmf_vfio_user_poll_group { 316 struct spdk_nvmf_transport_poll_group group; 317 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 318 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 319 }; 320 321 struct nvmf_vfio_user_ctrlr { 322 struct nvmf_vfio_user_endpoint *endpoint; 323 struct nvmf_vfio_user_transport *transport; 324 325 /* Connected SQs list */ 326 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 327 enum nvmf_vfio_user_ctrlr_state state; 328 329 struct vfio_user_migration_region migr_reg; 330 /* Controller is in source VM when doing live migration */ 331 bool in_source_vm; 332 333 struct spdk_thread *thread; 334 struct spdk_poller *vfu_ctx_poller; 335 336 bool queued_quiesce; 337 338 bool reset_shn; 339 340 uint16_t cntlid; 341 struct spdk_nvmf_ctrlr *ctrlr; 342 343 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 344 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 345 346 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 347 348 volatile uint32_t *doorbells; 349 350 /* internal CSTS.CFS register for vfio-user fatal errors */ 351 uint32_t cfs : 1; 352 }; 353 354 struct nvmf_vfio_user_endpoint { 355 vfu_ctx_t *vfu_ctx; 356 struct msixcap *msix; 357 vfu_pci_config_space_t *pci_config_space; 358 int devmem_fd; 359 volatile uint32_t *doorbells; 360 361 int migr_fd; 362 void *migr_data; 363 364 struct spdk_nvme_transport_id trid; 365 const struct spdk_nvmf_subsystem *subsystem; 366 367 struct nvmf_vfio_user_ctrlr *ctrlr; 368 pthread_mutex_t lock; 369 370 bool need_async_destroy; 371 372 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 373 }; 374 375 struct nvmf_vfio_user_transport_opts { 376 bool disable_mappable_bar0; 377 }; 378 379 struct nvmf_vfio_user_transport { 380 struct spdk_nvmf_transport transport; 381 struct nvmf_vfio_user_transport_opts transport_opts; 382 struct spdk_poller *accept_poller; 383 pthread_mutex_t lock; 384 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 385 386 pthread_mutex_t pg_lock; 387 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 388 struct nvmf_vfio_user_poll_group *next_pg; 389 }; 390 391 /* 392 * function prototypes 393 */ 394 static int 395 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 396 397 static struct nvmf_vfio_user_req * 398 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 399 400 /* 401 * Local process virtual address of a queue. 
402 */ 403 static inline void * 404 q_addr(struct nvme_q_mapping *mapping) 405 { 406 return mapping->iov.iov_base; 407 } 408 409 static inline int 410 queue_index(uint16_t qid, bool is_cq) 411 { 412 return (qid * 2) + is_cq; 413 } 414 415 static inline volatile uint32_t * 416 sq_headp(struct nvmf_vfio_user_sq *sq) 417 { 418 assert(sq != NULL); 419 return &sq->head; 420 } 421 422 static inline volatile uint32_t * 423 sq_dbl_tailp(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq) 424 { 425 assert(ctrlr != NULL); 426 assert(sq != NULL); 427 return &ctrlr->doorbells[queue_index(sq->qid, false)]; 428 } 429 430 static inline volatile uint32_t * 431 cq_dbl_headp(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq) 432 { 433 assert(ctrlr != NULL); 434 assert(cq != NULL); 435 return &ctrlr->doorbells[queue_index(cq->qid, true)]; 436 } 437 438 static inline volatile uint32_t * 439 cq_tailp(struct nvmf_vfio_user_cq *cq) 440 { 441 assert(cq != NULL); 442 return &cq->tail; 443 } 444 445 static inline void 446 sq_head_advance(struct nvmf_vfio_user_sq *sq) 447 { 448 assert(sq != NULL); 449 450 assert(*sq_headp(sq) < sq->size); 451 (*sq_headp(sq))++; 452 453 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 454 *sq_headp(sq) = 0; 455 } 456 } 457 458 static inline void 459 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 460 { 461 assert(cq != NULL); 462 463 assert(*cq_tailp(cq) < cq->size); 464 (*cq_tailp(cq))++; 465 466 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 467 *cq_tailp(cq) = 0; 468 cq->phase = !cq->phase; 469 } 470 } 471 472 static inline bool 473 cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq) 474 { 475 uint32_t qindex; 476 477 assert(ctrlr != NULL); 478 assert(cq != NULL); 479 480 qindex = *cq_tailp(cq) + 1; 481 if (spdk_unlikely(qindex == cq->size)) { 482 qindex = 0; 483 } 484 485 return qindex == *cq_dbl_headp(ctrlr, cq); 486 } 487 488 489 /* TODO: wrapper to data structure */ 490 static inline size_t 491 vfio_user_migr_data_len(void) 492 { 493 size_t len = 0; 494 495 len = NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * (sizeof(struct nvme_migr_sq_state) + sizeof( 496 struct nvme_migr_cq_state)); 497 len += sizeof(struct nvme_migr_device_state); 498 len += sizeof(struct nvmf_ctrlr_migr_data); 499 len += NVME_REG_BAR0_SIZE; 500 len += NVME_REG_CFG_SIZE; 501 /* BAR4 */ 502 len += NVME_BAR4_SIZE; 503 /* BAR5 */ 504 len += NVME_BAR5_SIZE; 505 506 return SPDK_ALIGN_CEIL(len, PAGE_SIZE); 507 } 508 509 static int 510 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 511 uint32_t max_iovcnt, uint32_t len, size_t mps, 512 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 513 { 514 uint64_t prp1, prp2; 515 void *vva; 516 uint32_t i; 517 uint32_t residue_len, nents; 518 uint64_t *prp_list; 519 uint32_t iovcnt; 520 521 assert(max_iovcnt > 0); 522 523 prp1 = cmd->dptr.prp.prp1; 524 prp2 = cmd->dptr.prp.prp2; 525 526 /* PRP1 may start at a non-page-aligned address */ 527 residue_len = mps - (prp1 % mps); 528 residue_len = spdk_min(len, residue_len); 529 530 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 531 if (spdk_unlikely(vva == NULL)) { 532 SPDK_ERRLOG("GPA to VVA failed\n"); 533 return -EINVAL; 534 } 535 len -= residue_len; 536 if (len && max_iovcnt < 2) { 537 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 538 return -ERANGE; 539 } 540 iovs[0].iov_base = vva; 541 iovs[0].iov_len = residue_len; 542 543 if (len) { 544 if (spdk_unlikely(prp2 == 0)) { 545
SPDK_ERRLOG("no PRP2, %d remaining\n", len); 546 return -EINVAL; 547 } 548 549 if (len <= mps) { 550 /* 2 PRP used */ 551 iovcnt = 2; 552 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 553 if (spdk_unlikely(vva == NULL)) { 554 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 555 prp2, len); 556 return -EINVAL; 557 } 558 iovs[1].iov_base = vva; 559 iovs[1].iov_len = len; 560 } else { 561 /* PRP list used */ 562 nents = (len + mps - 1) / mps; 563 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 564 SPDK_ERRLOG("Too many page entries\n"); 565 return -ERANGE; 566 } 567 568 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 569 if (spdk_unlikely(vva == NULL)) { 570 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 571 prp2, nents); 572 return -EINVAL; 573 } 574 prp_list = vva; 575 i = 0; 576 while (len != 0) { 577 residue_len = spdk_min(len, mps); 578 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 579 if (spdk_unlikely(vva == NULL)) { 580 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 581 prp_list[i], residue_len); 582 return -EINVAL; 583 } 584 iovs[i + 1].iov_base = vva; 585 iovs[i + 1].iov_len = residue_len; 586 len -= residue_len; 587 i++; 588 } 589 iovcnt = i + 1; 590 } 591 } else { 592 /* 1 PRP used */ 593 iovcnt = 1; 594 } 595 596 assert(iovcnt <= max_iovcnt); 597 return iovcnt; 598 } 599 600 static int 601 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 602 struct iovec *iovs, uint32_t max_iovcnt, 603 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 604 { 605 uint32_t i; 606 void *vva; 607 608 if (spdk_unlikely(max_iovcnt < num_sgls)) { 609 return -ERANGE; 610 } 611 612 for (i = 0; i < num_sgls; i++) { 613 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 614 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 615 return -EINVAL; 616 } 617 vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 618 if (spdk_unlikely(vva == NULL)) { 619 SPDK_ERRLOG("GPA to VVA failed\n"); 620 return -EINVAL; 621 } 622 iovs[i].iov_base = vva; 623 iovs[i].iov_len = sgls[i].unkeyed.length; 624 } 625 626 return num_sgls; 627 } 628 629 static int 630 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 631 uint32_t len, size_t mps, 632 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 633 { 634 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 635 uint32_t num_sgls, seg_len; 636 void *vva; 637 int ret; 638 uint32_t total_iovcnt = 0; 639 640 /* SGL cases */ 641 sgl = &cmd->dptr.sgl1; 642 643 /* only one SGL segment */ 644 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 645 assert(max_iovcnt > 0); 646 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 647 if (spdk_unlikely(vva == NULL)) { 648 SPDK_ERRLOG("GPA to VVA failed\n"); 649 return -EINVAL; 650 } 651 iovs[0].iov_base = vva; 652 iovs[0].iov_len = sgl->unkeyed.length; 653 assert(sgl->unkeyed.length == len); 654 655 return 1; 656 } 657 658 for (;;) { 659 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 660 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 661 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 662 return -EINVAL; 663 } 664 665 seg_len = sgl->unkeyed.length; 666 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 667 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 668 return -EINVAL; 669 
} 670 671 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 672 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 673 if (spdk_unlikely(vva == NULL)) { 674 SPDK_ERRLOG("GPA to VVA failed\n"); 675 return -EINVAL; 676 } 677 678 /* sgl point to the first segment */ 679 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 680 last_sgl = &sgl[num_sgls - 1]; 681 682 /* we are done */ 683 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 684 /* map whole sgl list */ 685 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 686 max_iovcnt - total_iovcnt, gpa_to_vva); 687 if (spdk_unlikely(ret < 0)) { 688 return ret; 689 } 690 total_iovcnt += ret; 691 692 return total_iovcnt; 693 } 694 695 if (num_sgls > 1) { 696 /* map whole sgl exclude last_sgl */ 697 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 698 max_iovcnt - total_iovcnt, gpa_to_vva); 699 if (spdk_unlikely(ret < 0)) { 700 return ret; 701 } 702 total_iovcnt += ret; 703 } 704 705 /* move to next level's segments */ 706 sgl = last_sgl; 707 } 708 709 return 0; 710 } 711 712 static int 713 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 714 uint32_t len, size_t mps, 715 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 716 { 717 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 718 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 719 } 720 721 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 722 } 723 724 static char * 725 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 726 { 727 return endpoint->trid.traddr; 728 } 729 730 static char * 731 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 732 { 733 if (!ctrlr || !ctrlr->endpoint) { 734 return "Null Ctrlr"; 735 } 736 737 return endpoint_id(ctrlr->endpoint); 738 } 739 740 static void 741 fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 742 { 743 assert(ctrlr != NULL); 744 745 if (ctrlr->cfs == 0) { 746 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(ctrlr)); 747 } 748 749 ctrlr->cfs = 1U; 750 } 751 752 static inline bool 753 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 754 { 755 assert(vu_ctrlr != NULL); 756 assert(vu_ctrlr->endpoint != NULL); 757 758 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 759 760 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 761 } 762 763 static void 764 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 765 { 766 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 767 768 if (endpoint->doorbells) { 769 munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 770 } 771 772 if (endpoint->devmem_fd > 0) { 773 close(endpoint->devmem_fd); 774 } 775 776 if (endpoint->migr_data) { 777 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 778 } 779 780 if (endpoint->migr_fd > 0) { 781 close(endpoint->migr_fd); 782 } 783 784 if (endpoint->vfu_ctx) { 785 vfu_destroy_ctx(endpoint->vfu_ctx); 786 } 787 788 pthread_mutex_destroy(&endpoint->lock); 789 free(endpoint); 790 } 791 792 /* called when process exits */ 793 static int 794 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 795 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 796 { 797 struct nvmf_vfio_user_transport *vu_transport; 798 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 799 800 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 801 802 vu_transport = SPDK_CONTAINEROF(transport, struct 
nvmf_vfio_user_transport, 803 transport); 804 805 spdk_poller_unregister(&vu_transport->accept_poller); 806 pthread_mutex_destroy(&vu_transport->lock); 807 pthread_mutex_destroy(&vu_transport->pg_lock); 808 809 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 810 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 811 nvmf_vfio_user_destroy_endpoint(endpoint); 812 } 813 814 free(vu_transport); 815 816 if (cb_fn) { 817 cb_fn(cb_arg); 818 } 819 820 return 0; 821 } 822 823 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 824 { 825 "disable_mappable_bar0", 826 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 827 spdk_json_decode_bool, true 828 }, 829 }; 830 831 static int 832 nvmf_vfio_user_accept(void *ctx); 833 834 static struct spdk_nvmf_transport * 835 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 836 { 837 struct nvmf_vfio_user_transport *vu_transport; 838 int err; 839 840 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 841 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 842 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 843 return NULL; 844 } 845 846 vu_transport = calloc(1, sizeof(*vu_transport)); 847 if (vu_transport == NULL) { 848 SPDK_ERRLOG("Transport alloc fail: %m\n"); 849 return NULL; 850 } 851 852 err = pthread_mutex_init(&vu_transport->lock, NULL); 853 if (err != 0) { 854 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 855 goto err; 856 } 857 TAILQ_INIT(&vu_transport->endpoints); 858 859 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 860 if (err != 0) { 861 pthread_mutex_destroy(&vu_transport->lock); 862 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 863 goto err; 864 } 865 TAILQ_INIT(&vu_transport->poll_groups); 866 867 if (opts->transport_specific != NULL && 868 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 869 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 870 vu_transport)) { 871 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 872 goto cleanup; 873 } 874 875 vu_transport->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, &vu_transport->transport, 876 opts->acceptor_poll_rate); 877 if (!vu_transport->accept_poller) { 878 goto cleanup; 879 } 880 881 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 882 vu_transport->transport_opts.disable_mappable_bar0); 883 884 return &vu_transport->transport; 885 886 cleanup: 887 pthread_mutex_destroy(&vu_transport->lock); 888 pthread_mutex_destroy(&vu_transport->pg_lock); 889 err: 890 free(vu_transport); 891 return NULL; 892 } 893 894 static uint32_t 895 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 896 { 897 assert(vu_ctrlr != NULL); 898 assert(vu_ctrlr->ctrlr != NULL); 899 900 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 901 } 902 903 static void * 904 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov, int prot) 905 { 906 int ret; 907 908 assert(ctx != NULL); 909 assert(sg != NULL); 910 assert(iov != NULL); 911 912 ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 913 if (ret < 0) { 914 return NULL; 915 } 916 917 ret = vfu_map_sg(ctx, sg, iov, 1, 0); 918 if (ret != 0) { 919 return NULL; 920 } 921 922 assert(iov->iov_base != NULL); 923 return iov->iov_base; 924 } 925 926 static int 927 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 928 
uint32_t q_size, bool is_cq, bool unmap) 929 { 930 uint64_t len; 931 void *ret; 932 933 assert(q_size); 934 assert(q_addr(mapping) == NULL); 935 936 if (is_cq) { 937 len = q_size * sizeof(struct spdk_nvme_cpl); 938 } else { 939 len = q_size * sizeof(struct spdk_nvme_cmd); 940 } 941 942 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 943 mapping->sg, &mapping->iov, 944 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 945 if (ret == NULL) { 946 return -EFAULT; 947 } 948 949 if (unmap) { 950 memset(q_addr(mapping), 0, len); 951 } 952 953 return 0; 954 } 955 956 static inline void 957 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 958 { 959 if (q_addr(mapping) != NULL) { 960 vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 961 &mapping->iov, 1); 962 mapping->iov.iov_base = NULL; 963 } 964 } 965 966 static int 967 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 968 { 969 struct nvmf_vfio_user_sq *sq; 970 const struct spdk_nvmf_registers *regs; 971 int ret; 972 973 assert(ctrlr != NULL); 974 975 sq = ctrlr->sqs[0]; 976 977 assert(sq != NULL); 978 assert(q_addr(&sq->mapping) == NULL); 979 /* XXX ctrlr->asq == 0 is a valid memory address */ 980 981 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 982 sq->qid = 0; 983 sq->size = regs->aqa.bits.asqs + 1; 984 sq->mapping.prp1 = regs->asq; 985 *sq_headp(sq) = 0; 986 sq->cqid = 0; 987 988 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 989 if (ret) { 990 return ret; 991 } 992 993 *sq_dbl_tailp(ctrlr, sq) = 0; 994 995 return 0; 996 } 997 998 static int 999 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1000 { 1001 struct nvmf_vfio_user_cq *cq; 1002 const struct spdk_nvmf_registers *regs; 1003 int ret; 1004 1005 assert(ctrlr != NULL); 1006 1007 cq = ctrlr->cqs[0]; 1008 1009 assert(cq != NULL); 1010 1011 assert(q_addr(&cq->mapping) == NULL); 1012 1013 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1014 assert(regs != NULL); 1015 cq->qid = 0; 1016 cq->size = regs->aqa.bits.acqs + 1; 1017 cq->mapping.prp1 = regs->acq; 1018 *cq_tailp(cq) = 0; 1019 cq->ien = true; 1020 cq->phase = true; 1021 1022 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1023 if (ret) { 1024 return ret; 1025 } 1026 1027 *cq_dbl_headp(ctrlr, cq) = 0; 1028 1029 return 0; 1030 } 1031 1032 static inline dma_sg_t * 1033 vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt) 1034 { 1035 return (dma_sg_t *)(vu_req->sg + iovcnt * dma_sg_size()); 1036 } 1037 1038 static void * 1039 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 1040 { 1041 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1042 struct spdk_nvmf_qpair *qpair; 1043 struct nvmf_vfio_user_req *vu_req; 1044 struct nvmf_vfio_user_sq *sq; 1045 void *ret; 1046 1047 assert(req != NULL); 1048 qpair = req->qpair; 1049 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1050 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1051 1052 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1053 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1054 vu_req_to_sg_t(vu_req, vu_req->iovcnt), 1055 &vu_req->iov[vu_req->iovcnt], prot); 1056 if (spdk_likely(ret != NULL)) { 1057 vu_req->iovcnt++; 1058 } 1059 return ret; 1060 } 1061 1062 static int 1063 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1064 struct iovec *iov, uint32_t length) 1065 { 1066 /* Map the PRP list from Guest physical memory to 1067 * local virtual memory addresses.
1068 */ 1069 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1070 length, 4096, _map_one); 1071 } 1072 1073 static int 1074 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1075 struct nvmf_vfio_user_sq *sq); 1076 1077 /* 1078 * Posts a CQE in the completion queue. 1079 * 1080 * @ctrlr: the vfio-user controller 1081 * @cq: the completion queue 1082 * @cdw0: cdw0 as reported by NVMf 1083 * @sqid: submission queue ID 1084 * @cid: command identifier in NVMe command 1085 * @sc: the NVMe CQE status code 1086 * @sct: the NVMe CQE status code type 1087 */ 1088 static int 1089 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1090 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1091 { 1092 struct spdk_nvme_cpl *cpl; 1093 const struct spdk_nvmf_registers *regs; 1094 int err; 1095 1096 assert(ctrlr != NULL); 1097 1098 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1099 return 0; 1100 } 1101 1102 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1103 if (regs->csts.bits.shst != SPDK_NVME_SHST_NORMAL) { 1104 SPDK_DEBUGLOG(nvmf_vfio, 1105 "%s: ignore completion SQ%d cid=%d status=%#x\n", 1106 ctrlr_id(ctrlr), sqid, cid, sc); 1107 return 0; 1108 } 1109 1110 if (cq_is_full(ctrlr, cq)) { 1111 SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n", 1112 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1113 *cq_dbl_headp(ctrlr, cq)); 1114 return -1; 1115 } 1116 1117 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1118 1119 assert(ctrlr->sqs[sqid] != NULL); 1120 SPDK_DEBUGLOG(nvmf_vfio, 1121 "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n", 1122 ctrlr_id(ctrlr), sqid, cid, sc, *sq_headp(ctrlr->sqs[sqid]), 1123 *cq_tailp(cq)); 1124 1125 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1126 cpl->sqid = sqid; 1127 cpl->cid = cid; 1128 cpl->cdw0 = cdw0; 1129 cpl->status.dnr = 0x0; 1130 cpl->status.m = 0x0; 1131 cpl->status.sct = sct; 1132 cpl->status.sc = sc; 1133 cpl->status.p = cq->phase; 1134 1135 /* Ensure the Completion Queue Entry is visible. */ 1136 spdk_wmb(); 1137 cq_tail_advance(cq); 1138 1139 /* 1140 * this function now executes at SPDK thread context, we 1141 * might be triggering interrupts from vfio-user thread context so 1142 * check for race conditions. 
1143 */ 1144 if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) { 1145 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1146 if (err != 0) { 1147 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1148 ctrlr_id(ctrlr)); 1149 return err; 1150 } 1151 } 1152 1153 return 0; 1154 } 1155 1156 static bool 1157 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 1158 { 1159 assert(vu_ctrlr != NULL); 1160 1161 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1162 return false; 1163 } 1164 1165 if (is_cq) { 1166 if (vu_ctrlr->cqs[qid] == NULL) { 1167 return false; 1168 } 1169 1170 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 1171 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 1172 } 1173 1174 if (vu_ctrlr->sqs[qid] == NULL) { 1175 return false; 1176 } 1177 1178 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 1179 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 1180 } 1181 1182 static void 1183 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1184 { 1185 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1186 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1187 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1188 free(vu_req); 1189 } 1190 } 1191 1192 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1193 * and the controller is being shut down or reset, then the CQ is 1194 * also deleted. 1195 */ 1196 static void 1197 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1198 { 1199 struct nvmf_vfio_user_cq *cq; 1200 uint16_t cqid; 1201 1202 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete SQ%d=%p done\n", ctrlr_id(vu_ctrlr), 1203 sq->qid, sq); 1204 1205 /* Free SQ resources */ 1206 unmap_q(vu_ctrlr, &sq->mapping); 1207 1208 free_sq_reqs(sq); 1209 1210 sq->size = 0; 1211 1212 sq->sq_state = VFIO_USER_SQ_DELETED; 1213 1214 /* Controller RESET and SHUTDOWN are special cases, 1215 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1216 * will disconnect IO queue pairs. 
1217 */ 1218 if (vu_ctrlr->reset_shn) { 1219 cqid = sq->cqid; 1220 cq = vu_ctrlr->cqs[cqid]; 1221 1222 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete CQ%d=%p\n", ctrlr_id(vu_ctrlr), 1223 cq->qid, cq); 1224 1225 if (cq->cq_ref) { 1226 cq->cq_ref--; 1227 } 1228 if (cq->cq_ref == 0) { 1229 unmap_q(vu_ctrlr, &cq->mapping); 1230 cq->size = 0; 1231 cq->cq_state = VFIO_USER_CQ_DELETED; 1232 cq->group = NULL; 1233 } 1234 } 1235 } 1236 1237 static void 1238 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1239 { 1240 struct nvmf_vfio_user_sq *sq; 1241 struct nvmf_vfio_user_cq *cq; 1242 1243 if (ctrlr == NULL) { 1244 return; 1245 } 1246 1247 sq = ctrlr->sqs[qid]; 1248 if (sq) { 1249 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid); 1250 unmap_q(ctrlr, &sq->mapping); 1251 1252 free_sq_reqs(sq); 1253 1254 free(sq->mapping.sg); 1255 free(sq); 1256 ctrlr->sqs[qid] = NULL; 1257 } 1258 1259 cq = ctrlr->cqs[qid]; 1260 if (cq) { 1261 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free CQ %u\n", ctrlr_id(ctrlr), qid); 1262 unmap_q(ctrlr, &cq->mapping); 1263 free(cq->mapping.sg); 1264 free(cq); 1265 ctrlr->cqs[qid] = NULL; 1266 } 1267 } 1268 1269 static int 1270 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1271 const uint16_t id) 1272 { 1273 struct nvmf_vfio_user_sq *sq; 1274 1275 assert(ctrlr != NULL); 1276 assert(transport != NULL); 1277 assert(ctrlr->sqs[id] == NULL); 1278 1279 sq = calloc(1, sizeof(*sq)); 1280 if (sq == NULL) { 1281 return -ENOMEM; 1282 } 1283 sq->mapping.sg = calloc(1, dma_sg_size()); 1284 if (sq->mapping.sg == NULL) { 1285 free(sq); 1286 return -ENOMEM; 1287 } 1288 1289 sq->qid = id; 1290 sq->qpair.qid = id; 1291 sq->qpair.transport = transport; 1292 sq->ctrlr = ctrlr; 1293 ctrlr->sqs[id] = sq; 1294 1295 TAILQ_INIT(&sq->free_reqs); 1296 1297 return 0; 1298 } 1299 1300 static int 1301 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1302 { 1303 struct nvmf_vfio_user_cq *cq; 1304 1305 assert(vu_ctrlr != NULL); 1306 assert(vu_ctrlr->cqs[id] == NULL); 1307 1308 cq = calloc(1, sizeof(*cq)); 1309 if (cq == NULL) { 1310 return -ENOMEM; 1311 } 1312 cq->mapping.sg = calloc(1, dma_sg_size()); 1313 if (cq->mapping.sg == NULL) { 1314 free(cq); 1315 return -ENOMEM; 1316 } 1317 1318 cq->qid = id; 1319 vu_ctrlr->cqs[id] = cq; 1320 1321 return 0; 1322 } 1323 1324 static int 1325 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1326 { 1327 struct nvmf_vfio_user_req *vu_req, *tmp; 1328 size_t req_size; 1329 uint32_t i; 1330 1331 req_size = sizeof(struct nvmf_vfio_user_req) + 1332 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1333 1334 for (i = 0; i < sq->size; i++) { 1335 struct spdk_nvmf_request *req; 1336 1337 vu_req = calloc(1, req_size); 1338 if (vu_req == NULL) { 1339 goto err; 1340 } 1341 1342 req = &vu_req->req; 1343 req->qpair = &sq->qpair; 1344 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1345 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1346 1347 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1348 } 1349 1350 return 0; 1351 1352 err: 1353 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1354 free(vu_req); 1355 } 1356 return -ENOMEM; 1357 } 1358 1359 static uint16_t 1360 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 1361 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1362 { 1363 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1364 struct nvmf_vfio_user_sq *sq; 1365 uint32_t qsize; 1366 uint16_t cqid; 1367 uint16_t qid; 1368 int err; 1369 1370 qid = 
cmd->cdw10_bits.create_io_q.qid; 1371 cqid = cmd->cdw11_bits.create_io_sq.cqid; 1372 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1373 1374 if (ctrlr->sqs[qid] == NULL) { 1375 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 1376 if (err != 0) { 1377 *sct = SPDK_NVME_SCT_GENERIC; 1378 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1379 } 1380 } 1381 1382 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1383 SPDK_ERRLOG("%s: invalid CQID %u\n", ctrlr_id(ctrlr), cqid); 1384 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1385 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1386 } 1387 1388 /* CQ must be created before SQ. */ 1389 if (!io_q_exists(ctrlr, cqid, true)) { 1390 SPDK_ERRLOG("%s: CQ%u does not exist\n", ctrlr_id(ctrlr), cqid); 1391 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1392 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 1393 } 1394 1395 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 1396 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 1397 *sct = SPDK_NVME_SCT_GENERIC; 1398 return SPDK_NVME_SC_INVALID_FIELD; 1399 } 1400 1401 sq = ctrlr->sqs[qid]; 1402 sq->size = qsize; 1403 1404 SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr), 1405 qid, cqid); 1406 1407 sq->mapping.prp1 = cmd->dptr.prp.prp1; 1408 1409 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1410 if (err) { 1411 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1412 *sct = SPDK_NVME_SCT_GENERIC; 1413 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1414 } 1415 1416 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped SQ%d IOVA=%#lx vaddr=%p\n", 1417 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1418 q_addr(&sq->mapping)); 1419 1420 err = alloc_sq_reqs(ctrlr, sq); 1421 if (err < 0) { 1422 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 1423 *sct = SPDK_NVME_SCT_GENERIC; 1424 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1425 } 1426 1427 sq->cqid = cqid; 1428 ctrlr->cqs[sq->cqid]->cq_ref++; 1429 sq->sq_state = VFIO_USER_SQ_CREATED; 1430 *sq_headp(sq) = 0; 1431 *sq_dbl_tailp(ctrlr, sq) = 0; 1432 1433 /* 1434 * Create our new I/O qpair. This asynchronously invokes, on a suitable 1435 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 1436 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 1437 * connect command. This command is then eventually completed via 1438 * handle_queue_connect_rsp(). 
1439 */ 1440 sq->create_io_sq_cmd = *cmd; 1441 sq->post_create_io_sq_completion = true; 1442 1443 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 1444 &sq->qpair); 1445 1446 *sct = SPDK_NVME_SCT_GENERIC; 1447 return SPDK_NVME_SC_SUCCESS; 1448 } 1449 1450 static uint16_t 1451 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 1452 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1453 { 1454 struct nvmf_vfio_user_cq *cq; 1455 uint32_t qsize; 1456 uint16_t qid; 1457 int err; 1458 1459 qid = cmd->cdw10_bits.create_io_q.qid; 1460 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1461 1462 if (ctrlr->cqs[qid] == NULL) { 1463 err = init_cq(ctrlr, qid); 1464 if (err != 0) { 1465 *sct = SPDK_NVME_SCT_GENERIC; 1466 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1467 } 1468 } 1469 1470 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 1471 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 1472 *sct = SPDK_NVME_SCT_GENERIC; 1473 return SPDK_NVME_SC_INVALID_FIELD; 1474 } 1475 1476 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 1477 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 1478 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1479 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 1480 } 1481 1482 cq = ctrlr->cqs[qid]; 1483 cq->size = qsize; 1484 1485 cq->mapping.prp1 = cmd->dptr.prp.prp1; 1486 1487 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1488 if (err) { 1489 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1490 *sct = SPDK_NVME_SCT_GENERIC; 1491 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1492 } 1493 1494 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped CQ%d IOVA=%#lx vaddr=%p\n", 1495 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1496 q_addr(&cq->mapping)); 1497 1498 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 1499 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 1500 cq->phase = true; 1501 cq->cq_state = VFIO_USER_CQ_CREATED; 1502 1503 *cq_tailp(cq) = 0; 1504 *cq_dbl_headp(ctrlr, cq) = 0; 1505 1506 *sct = SPDK_NVME_SCT_GENERIC; 1507 return SPDK_NVME_SC_SUCCESS; 1508 } 1509 1510 /* 1511 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 1512 * on error. 1513 */ 1514 static int 1515 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 1516 struct spdk_nvme_cmd *cmd, const bool is_cq) 1517 { 1518 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1519 uint16_t sct = SPDK_NVME_SCT_GENERIC; 1520 uint16_t sc = SPDK_NVME_SC_SUCCESS; 1521 uint32_t qsize; 1522 uint16_t qid; 1523 1524 assert(ctrlr != NULL); 1525 assert(cmd != NULL); 1526 1527 qid = cmd->cdw10_bits.create_io_q.qid; 1528 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1529 SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr), 1530 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 1531 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1532 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1533 goto out; 1534 } 1535 1536 if (io_q_exists(ctrlr, qid, is_cq)) { 1537 SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr), 1538 is_cq ? 
'C' : 'S', qid); 1539 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1540 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1541 goto out; 1542 } 1543 1544 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1545 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 1546 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 1547 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1548 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 1549 goto out; 1550 } 1551 1552 if (is_cq) { 1553 sc = handle_create_io_cq(ctrlr, cmd, &sct); 1554 } else { 1555 sc = handle_create_io_sq(ctrlr, cmd, &sct); 1556 1557 if (sct == SPDK_NVME_SCT_GENERIC && 1558 sc == SPDK_NVME_SC_SUCCESS) { 1559 /* Completion posted asynchronously. */ 1560 return 0; 1561 } 1562 } 1563 1564 out: 1565 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 1566 } 1567 1568 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 1569 * queue pair, so save the command in a context. 1570 */ 1571 struct vfio_user_delete_sq_ctx { 1572 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 1573 struct spdk_nvme_cmd delete_io_sq_cmd; 1574 }; 1575 1576 static void 1577 vfio_user_qpair_delete_cb(void *cb_arg) 1578 { 1579 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 1580 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 1581 1582 post_completion(vu_ctrlr, vu_ctrlr->cqs[0], 0, 0, ctx->delete_io_sq_cmd.cid, 1583 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 1584 free(ctx); 1585 } 1586 1587 /* 1588 * Deletes a completion or submission I/O queue. 1589 */ 1590 static int 1591 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 1592 struct spdk_nvme_cmd *cmd, const bool is_cq) 1593 { 1594 uint16_t sct = SPDK_NVME_SCT_GENERIC; 1595 uint16_t sc = SPDK_NVME_SC_SUCCESS; 1596 struct nvmf_vfio_user_sq *sq; 1597 struct nvmf_vfio_user_cq *cq; 1598 struct vfio_user_delete_sq_ctx *ctx; 1599 1600 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n", 1601 ctrlr_id(ctrlr), is_cq ? 'C' : 'S', 1602 cmd->cdw10_bits.delete_io_q.qid); 1603 1604 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 1605 SPDK_ERRLOG("%s: I/O %cQ%d does not exist\n", ctrlr_id(ctrlr), 1606 is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid); 1607 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1608 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1609 goto out; 1610 } 1611 1612 if (is_cq) { 1613 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 1614 if (cq->cq_ref) { 1615 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 1616 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1617 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 1618 goto out; 1619 } 1620 1621 unmap_q(ctrlr, &cq->mapping); 1622 cq->size = 0; 1623 cq->cq_state = VFIO_USER_CQ_DELETED; 1624 cq->group = NULL; 1625 } else { 1626 ctx = calloc(1, sizeof(*ctx)); 1627 if (!ctx) { 1628 sct = SPDK_NVME_SCT_GENERIC; 1629 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1630 goto out; 1631 } 1632 ctx->vu_ctrlr = ctrlr; 1633 ctx->delete_io_sq_cmd = *cmd; 1634 1635 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 1636 sq->sq_state = VFIO_USER_SQ_DELETED; 1637 assert(ctrlr->cqs[sq->cqid]->cq_ref); 1638 ctrlr->cqs[sq->cqid]->cq_ref--; 1639 1640 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 1641 return 0; 1642 } 1643 1644 out: 1645 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 1646 } 1647 1648 /* 1649 * Returns 0 on success and -errno on error. 
1650 */ 1651 static int 1652 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 1653 { 1654 assert(ctrlr != NULL); 1655 assert(cmd != NULL); 1656 1657 if (cmd->fuse != 0) { 1658 /* Fused admin commands are not supported. */ 1659 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 1660 SPDK_NVME_SC_INVALID_FIELD, 1661 SPDK_NVME_SCT_GENERIC); 1662 } 1663 1664 switch (cmd->opc) { 1665 case SPDK_NVME_OPC_CREATE_IO_CQ: 1666 case SPDK_NVME_OPC_CREATE_IO_SQ: 1667 return handle_create_io_q(ctrlr, cmd, 1668 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 1669 case SPDK_NVME_OPC_DELETE_IO_SQ: 1670 case SPDK_NVME_OPC_DELETE_IO_CQ: 1671 return handle_del_io_q(ctrlr, cmd, 1672 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 1673 default: 1674 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 1675 } 1676 } 1677 1678 static int 1679 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 1680 { 1681 struct nvmf_vfio_user_sq *sq = cb_arg; 1682 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 1683 uint16_t sqid, cqid; 1684 1685 assert(sq != NULL); 1686 assert(vu_req != NULL); 1687 assert(vu_ctrlr != NULL); 1688 1689 if (spdk_likely(vu_req->iovcnt)) { 1690 vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, 1691 vu_req_to_sg_t(vu_req, 0), 1692 vu_req->iov, vu_req->iovcnt); 1693 } 1694 sqid = sq->qid; 1695 cqid = sq->cqid; 1696 1697 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 1698 vu_req->req.rsp->nvme_cpl.cdw0, 1699 sqid, 1700 vu_req->req.cmd->nvme_cmd.cid, 1701 vu_req->req.rsp->nvme_cpl.status.sc, 1702 vu_req->req.rsp->nvme_cpl.status.sct); 1703 } 1704 1705 static int 1706 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 1707 struct spdk_nvme_cmd *cmd) 1708 { 1709 assert(sq != NULL); 1710 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 1711 return consume_admin_cmd(ctrlr, cmd); 1712 } 1713 1714 return handle_cmd_req(ctrlr, cmd, sq); 1715 } 1716 1717 /* Returns the number of commands processed, or a negative value on error. */ 1718 static int 1719 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 1720 struct nvmf_vfio_user_sq *sq) 1721 { 1722 struct spdk_nvme_cmd *queue; 1723 int count = 0; 1724 1725 assert(ctrlr != NULL); 1726 assert(sq != NULL); 1727 1728 queue = q_addr(&sq->mapping); 1729 while (*sq_headp(sq) != new_tail) { 1730 int err; 1731 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 1732 1733 count++; 1734 1735 /* 1736 * SQHD must contain the new head pointer, so we must increase 1737 * it before we generate a completion. 
1738 */ 1739 sq_head_advance(sq); 1740 1741 err = consume_cmd(ctrlr, sq, cmd); 1742 if (err != 0) { 1743 return err; 1744 } 1745 } 1746 1747 return count; 1748 } 1749 1750 static int 1751 enable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1752 { 1753 int err; 1754 1755 assert(ctrlr != NULL); 1756 1757 err = acq_setup(ctrlr); 1758 if (err != 0) { 1759 return err; 1760 } 1761 1762 err = asq_setup(ctrlr); 1763 if (err != 0) { 1764 return err; 1765 } 1766 1767 return 0; 1768 } 1769 1770 static void 1771 disable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1772 { 1773 assert(ctrlr->sqs[0] != NULL); 1774 assert(ctrlr->cqs[0] != NULL); 1775 1776 unmap_q(ctrlr, &ctrlr->sqs[0]->mapping); 1777 unmap_q(ctrlr, &ctrlr->cqs[0]->mapping); 1778 1779 ctrlr->sqs[0]->size = 0; 1780 *sq_headp(ctrlr->sqs[0]) = 0; 1781 ctrlr->cqs[0]->size = 0; 1782 *cq_dbl_headp(ctrlr, ctrlr->cqs[0]) = 0; 1783 } 1784 1785 static void 1786 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 1787 { 1788 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1789 struct nvmf_vfio_user_ctrlr *ctrlr; 1790 struct nvmf_vfio_user_sq *sq; 1791 struct nvmf_vfio_user_cq *cq; 1792 void *map_start, *map_end; 1793 int ret; 1794 1795 /* 1796 * We're not interested in any DMA regions that aren't mappable (we don't 1797 * support clients that don't share their memory). 1798 */ 1799 if (!info->vaddr) { 1800 return; 1801 } 1802 1803 map_start = info->mapping.iov_base; 1804 map_end = info->mapping.iov_base + info->mapping.iov_len; 1805 1806 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 1807 (info->mapping.iov_len & MASK_2MB)) { 1808 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 1809 info->vaddr, map_start, map_end); 1810 return; 1811 } 1812 1813 assert(endpoint != NULL); 1814 if (endpoint->ctrlr == NULL) { 1815 return; 1816 } 1817 ctrlr = endpoint->ctrlr; 1818 1819 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 1820 map_start, map_end); 1821 1822 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 1823 * check the protection bits before registering. 
1824 */ 1825 if (info->prot == (PROT_WRITE | PROT_READ)) { 1826 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 1827 if (ret) { 1828 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 1829 map_start, map_end, ret); 1830 } 1831 } 1832 1833 pthread_mutex_lock(&endpoint->lock); 1834 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 1835 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 1836 continue; 1837 } 1838 1839 cq = ctrlr->cqs[sq->cqid]; 1840 1841 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 1842 if (cq->size && q_addr(&cq->mapping) == NULL) { 1843 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 1844 if (ret) { 1845 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n", 1846 cq->qid, cq->mapping.prp1, 1847 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 1848 continue; 1849 } 1850 } 1851 1852 if (sq->size) { 1853 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 1854 if (ret) { 1855 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n", 1856 sq->qid, sq->mapping.prp1, 1857 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 1858 continue; 1859 } 1860 } 1861 sq->sq_state = VFIO_USER_SQ_ACTIVE; 1862 SPDK_DEBUGLOG(nvmf_vfio, "Remap SQ %u successfully\n", sq->qid); 1863 } 1864 pthread_mutex_unlock(&endpoint->lock); 1865 } 1866 1867 static void 1868 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 1869 { 1870 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1871 struct nvmf_vfio_user_sq *sq; 1872 struct nvmf_vfio_user_cq *cq; 1873 void *map_start, *map_end; 1874 int ret = 0; 1875 1876 if (!info->vaddr) { 1877 return; 1878 } 1879 1880 map_start = info->mapping.iov_base; 1881 map_end = info->mapping.iov_base + info->mapping.iov_len; 1882 1883 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 1884 (info->mapping.iov_len & MASK_2MB)) { 1885 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 1886 info->vaddr, map_start, map_end); 1887 return; 1888 } 1889 1890 assert(endpoint != NULL); 1891 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 1892 map_start, map_end); 1893 1894 if (endpoint->ctrlr != NULL) { 1895 struct nvmf_vfio_user_ctrlr *ctrlr; 1896 ctrlr = endpoint->ctrlr; 1897 1898 pthread_mutex_lock(&endpoint->lock); 1899 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 1900 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 1901 unmap_q(ctrlr, &sq->mapping); 1902 sq->sq_state = VFIO_USER_SQ_INACTIVE; 1903 } 1904 1905 cq = ctrlr->cqs[sq->cqid]; 1906 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 1907 unmap_q(ctrlr, &cq->mapping); 1908 } 1909 } 1910 pthread_mutex_unlock(&endpoint->lock); 1911 } 1912 1913 if (info->prot == (PROT_WRITE | PROT_READ)) { 1914 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 1915 if (ret) { 1916 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 1917 map_start, map_end, ret); 1918 } 1919 } 1920 } 1921 1922 static int 1923 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 1924 { 1925 struct nvmf_vfio_user_sq *sq = cb_arg; 1926 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 1927 int ret; 1928 1929 assert(sq != NULL); 1930 assert(req != NULL); 1931 1932 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 1933 assert(sq->ctrlr != NULL); 1934 assert(req != NULL); 1935 1936 memcpy(req->req.data, 1937 
&req->req.rsp->prop_get_rsp.value.u64, 1938 req->req.length); 1939 } else { 1940 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 1941 assert(sq->ctrlr != NULL); 1942 vu_ctrlr = sq->ctrlr; 1943 1944 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 1945 union spdk_nvme_cc_register cc, diff; 1946 1947 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 1948 diff.raw = cc.raw ^ req->cc.raw; 1949 1950 if (diff.bits.en) { 1951 if (cc.bits.en) { 1952 SPDK_DEBUGLOG(nvmf_vfio, "%s: MAP Admin queue\n", ctrlr_id(vu_ctrlr)); 1953 ret = enable_admin_queue(vu_ctrlr); 1954 if (ret) { 1955 SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(vu_ctrlr)); 1956 return ret; 1957 } 1958 sq->sq_state = VFIO_USER_SQ_ACTIVE; 1959 vu_ctrlr->reset_shn = false; 1960 } else { 1961 vu_ctrlr->reset_shn = true; 1962 } 1963 } 1964 1965 if (diff.bits.shn) { 1966 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 1967 vu_ctrlr->reset_shn = true; 1968 } 1969 } 1970 1971 if (vu_ctrlr->reset_shn) { 1972 SPDK_DEBUGLOG(nvmf_vfio, 1973 "%s: UNMAP Admin queue\n", 1974 ctrlr_id(vu_ctrlr)); 1975 sq->sq_state = VFIO_USER_SQ_INACTIVE; 1976 disable_admin_queue(vu_ctrlr); 1977 /* For PCIe controller reset or shutdown, we will drop all AER responses */ 1978 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 1979 } 1980 } 1981 } 1982 1983 return 0; 1984 } 1985 1986 /* 1987 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 1988 * doorbell is written via access_bar0_fn(). 1989 * 1990 * DSTRD is set to fixed value 0 for NVMf. 1991 * 1992 */ 1993 static int 1994 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 1995 const size_t count, loff_t pos, const bool is_write) 1996 { 1997 assert(ctrlr != NULL); 1998 assert(buf != NULL); 1999 2000 if (count != sizeof(uint32_t)) { 2001 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2002 ctrlr_id(ctrlr), count); 2003 errno = EINVAL; 2004 return -1; 2005 } 2006 2007 pos -= NVME_DOORBELLS_OFFSET; 2008 2009 /* pos must be dword aligned */ 2010 if ((pos & 0x3) != 0) { 2011 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2012 errno = EINVAL; 2013 return -1; 2014 } 2015 2016 /* convert byte offset to array index */ 2017 pos >>= 2; 2018 2019 if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) { 2020 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2021 errno = EINVAL; 2022 return -1; 2023 } 2024 2025 if (is_write) { 2026 ctrlr->doorbells[pos] = *buf; 2027 spdk_wmb(); 2028 } else { 2029 spdk_rmb(); 2030 *buf = ctrlr->doorbells[pos]; 2031 } 2032 return 0; 2033 } 2034 2035 static size_t 2036 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2037 char *buf, size_t count, loff_t pos, 2038 bool is_write) 2039 { 2040 struct nvmf_vfio_user_req *req; 2041 const struct spdk_nvmf_registers *regs; 2042 2043 /* Construct a Fabric Property Get/Set command and send it */ 2044 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2045 if (req == NULL) { 2046 errno = ENOBUFS; 2047 return -1; 2048 } 2049 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2050 req->cc.raw = regs->cc.raw; 2051 2052 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2053 req->cb_arg = vu_ctrlr->sqs[0]; 2054 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2055 req->req.cmd->prop_set_cmd.cid = 0; 2056 req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1; 2057 req->req.cmd->prop_set_cmd.ofst = pos; 2058 if (is_write) { 2059 req->req.cmd->prop_set_cmd.fctype = 
SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2060 if (req->req.cmd->prop_set_cmd.attrib.size) { 2061 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2062 } else { 2063 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2064 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2065 } 2066 } else { 2067 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2068 } 2069 req->req.length = count; 2070 req->req.data = buf; 2071 2072 spdk_nvmf_request_exec_fabrics(&req->req); 2073 2074 return count; 2075 } 2076 2077 static ssize_t 2078 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2079 bool is_write) 2080 { 2081 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2082 struct nvmf_vfio_user_ctrlr *ctrlr; 2083 int ret; 2084 2085 ctrlr = endpoint->ctrlr; 2086 if (endpoint->need_async_destroy || !ctrlr) { 2087 errno = EIO; 2088 return -1; 2089 } 2090 2091 SPDK_DEBUGLOG(nvmf_vfio, 2092 "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n", 2093 endpoint_id(endpoint), is_write ? "write" : "read", 2094 ctrlr, count, pos); 2095 2096 if (pos >= NVME_DOORBELLS_OFFSET) { 2097 /* 2098 * The fact that the doorbells can be memory mapped doesn't mean 2099 * that the client (VFIO in QEMU) is obliged to memory map them, 2100 * it might still elect to access them via regular read/write; 2101 * we might also have had disable_mappable_bar0 set. 2102 */ 2103 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2104 pos, is_write); 2105 if (ret == 0) { 2106 return count; 2107 } 2108 return ret; 2109 } 2110 2111 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 2112 } 2113 2114 static ssize_t 2115 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 2116 bool is_write) 2117 { 2118 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2119 2120 if (is_write) { 2121 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 2122 endpoint_id(endpoint), offset, offset + count); 2123 errno = EINVAL; 2124 return -1; 2125 } 2126 2127 if (offset + count > NVME_REG_CFG_SIZE) { 2128 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 2129 endpoint_id(endpoint), offset, count, 2130 NVME_REG_CFG_SIZE); 2131 errno = ERANGE; 2132 return -1; 2133 } 2134 2135 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 2136 2137 return count; 2138 } 2139 2140 static void 2141 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 2142 { 2143 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2144 2145 if (level >= LOG_DEBUG) { 2146 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2147 } else if (level >= LOG_INFO) { 2148 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2149 } else if (level >= LOG_NOTICE) { 2150 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 2151 } else if (level >= LOG_WARNING) { 2152 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 2153 } else { 2154 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 2155 } 2156 } 2157 2158 static int 2159 vfio_user_get_log_level(void) 2160 { 2161 int level; 2162 2163 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2164 return LOG_DEBUG; 2165 } 2166 2167 level = spdk_log_to_syslog_level(spdk_log_get_level()); 2168 if (level < 0) { 2169 return LOG_ERR; 2170 } 2171 2172 return level; 2173 } 2174 2175 static void 2176 init_pci_config_space(vfu_pci_config_space_t *p) 2177 { 2178 /* MLBAR */ 2179 p->hdr.bars[0].raw = 0x0; 2180 /* MUBAR */ 
2181 p->hdr.bars[1].raw = 0x0;
2182
2183 /* vendor specific, let's set them to zero for now */
2184 p->hdr.bars[3].raw = 0x0;
2185 p->hdr.bars[4].raw = 0x0;
2186 p->hdr.bars[5].raw = 0x0;
2187
2188 /* enable INTx */
2189 p->hdr.intr.ipin = 0x1;
2190 }
2191
2192 static void
2193 vfio_user_dev_migr_resume_done(struct spdk_nvmf_subsystem *subsystem,
2194 void *cb_arg, int status)
2195 {
2196 struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg;
2197
2198 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed with status %d\n", ctrlr_id(vu_ctrlr), status);
2199
2200 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2201 }
2202
2203 static void
2204 vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
2205 void *cb_arg, int status);
2206
2207 static void
2208 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem,
2209 void *cb_arg, int status)
2210 {
2211 struct nvmf_vfio_user_endpoint *endpoint = cb_arg;
2212 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
2213 int ret;
2214
2215 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed with status %d\n", endpoint_id(endpoint), status);
2216
2217 if (!vu_ctrlr) {
2218 return;
2219 }
2220 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2221
2222 /* Basically, once we call `vfu_device_quiesced` the device is unquiesced from
2223 * libvfio-user's perspective so from the moment `vfio_user_dev_quiesce_done` returns
2224 * libvfio-user might quiesce the device again. However, because resuming the NVMf
2225 * subsystem is an asynchronous operation, this quiesce might come _before_ the NVMf
2226 * subsystem has been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we
2227 * need to check whether a quiesce was requested.
2228 */
2229 if (vu_ctrlr->queued_quiesce) {
2230 SPDK_DEBUGLOG(nvmf_vfio, "%s has a queued quiesce event, pausing again\n", ctrlr_id(vu_ctrlr));
2231 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
2232 ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0,
2233 vfio_user_dev_quiesce_done, vu_ctrlr);
2234 if (ret < 0) {
2235 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2236 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret);
2237 }
2238 }
2239 }
2240
2241 static void
2242 vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
2243 void *cb_arg, int status)
2244 {
2245 struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg;
2246 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
2247 int ret;
2248
2249 SPDK_DEBUGLOG(nvmf_vfio, "%s paused with status %d\n", ctrlr_id(vu_ctrlr), status);
2250
2251 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING);
2252 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
2253 vfu_device_quiesced(endpoint->vfu_ctx, status);
2254 vu_ctrlr->queued_quiesce = false;
2255
2256 /* `vfu_device_quiesced` can change the migration state,
2257 * so we need to re-check `vu_ctrlr->state`.
2258 */
2259 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) {
2260 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr));
2261 return;
2262 }
2263
2264 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to resume\n", ctrlr_id(vu_ctrlr));
2265 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
2266 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
2267 vfio_user_endpoint_resume_done, endpoint);
2268 if (ret < 0) {
2269 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
2270 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret);
2271 }
2272 }
2273
2274 static int
2275 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx)
2276 {
2277 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2278 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
2279 int ret;
2280
2281 if (!vu_ctrlr) {
2282 return 0;
2283 }
2284
2285 /* The NVMf library destroys the controller when it no longer
2286 * has any connected queue pairs.
2287 */
2288 if (!nvmf_subsystem_get_ctrlr((struct spdk_nvmf_subsystem *)endpoint->subsystem,
2289 vu_ctrlr->cntlid)) {
2290 return 0;
2291 }
2292
2293 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr));
2294
2295 /* There is no race condition here as the device quiesce callback
2296 * and nvmf_prop_set_cc() are running in the same thread context.
2297 */
2298 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) {
2299 return 0;
2300 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) {
2301 return 0;
2302 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) {
2303 return 0;
2304 }
2305
2306 switch (vu_ctrlr->state) {
2307 case VFIO_USER_CTRLR_PAUSED:
2308 case VFIO_USER_CTRLR_MIGRATING:
2309 return 0;
2310 case VFIO_USER_CTRLR_RUNNING:
2311 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
2312 ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0,
2313 vfio_user_dev_quiesce_done, vu_ctrlr);
2314 if (ret < 0) {
2315 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
2316 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret);
2317 return 0;
2318 }
2319 break;
2320 case VFIO_USER_CTRLR_RESUMING:
2321 vu_ctrlr->queued_quiesce = true;
2322 SPDK_DEBUGLOG(nvmf_vfio, "%s is still resuming, deferring quiesce, current state %u\n", ctrlr_id(vu_ctrlr),
2323 vu_ctrlr->state);
2324 break;
2325 default:
2326 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING);
2327 break;
2328 }
2329
2330 errno = EBUSY;
2331 return -1;
2332 }
2333
2334 static void
2335 vfio_user_ctrlr_dump_migr_data(const char *name, struct vfio_user_nvme_migr_state *migr_data)
2336 {
2337 struct spdk_nvme_registers *regs;
2338 struct nvme_migr_sq_state *sq;
2339 struct nvme_migr_cq_state *cq;
2340 uint32_t *doorbell_base;
2341 uint32_t i;
2342
2343 SPDK_NOTICELOG("Dump %s\n", name);
2344
2345 regs = (struct spdk_nvme_registers *)migr_data->bar0;
2346 doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl;
2347
2348 SPDK_NOTICELOG("Registers\n");
2349 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw);
2350 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw);
2351 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw);
2352 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw);
2353 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw);
2354 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq);
2355 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq);
2356
2357 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_data.num_io_queues);
2358 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
2359 sq = &migr_data->qps[i].sq;
2360 cq = &migr_data->qps[i].cq;
2361
2362 if (sq->size) {
2363
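/*
 * Doorbell layout note: CAP.DSTRD is fixed at 0 for this transport, so each
 * doorbell is a 4-byte register and queue id i owns dword index 2 * i (SQ
 * tail) and 2 * i + 1 (CQ head) of the doorbell array, i.e. BAR0 byte
 * offsets 0x1000 + 8 * i and 0x1000 + 8 * i + 4. For example, queue 1 uses
 * 0x1008 and 0x100C. That is why the dump below prints
 * doorbell_base[i * 2] and doorbell_base[i * 2 + 1].
 */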
SPDK_NOTICELOG("SQID %u, SQ DOORBELL %u\n", sq->sqid, doorbell_base[i * 2]); 2364 SPDK_NOTICELOG("SQ SQID %u, CQID %u, HEAD %u, SIZE %u, DMA ADDR 0x%"PRIx64"\n", 2365 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 2366 } 2367 2368 if (cq->size) { 2369 SPDK_NOTICELOG("CQID %u, CQ DOORBELL %u\n", cq->cqid, doorbell_base[i * 2 + 1]); 2370 SPDK_NOTICELOG("CQ CQID %u, PHASE %u, TAIL %u, SIZE %u, IV %u, IEN %u, DMA ADDR 0x%"PRIx64"\n", 2371 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 2372 } 2373 } 2374 2375 SPDK_NOTICELOG("%s Dump Done\n", name); 2376 } 2377 2378 /* Read region 9 content and restore it to migration data structures */ 2379 static int 2380 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 2381 struct vfio_user_nvme_migr_state *migr_state) 2382 { 2383 void *data_ptr = endpoint->migr_data; 2384 2385 /* Load nvme_migr_device_state first */ 2386 memcpy(&migr_state->ctrlr_data, data_ptr, sizeof(struct nvme_migr_device_state)); 2387 /* TODO: version check */ 2388 if (migr_state->ctrlr_data.magic != VFIO_USER_NVME_MIGR_MAGIC) { 2389 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_data.magic); 2390 return -EINVAL; 2391 } 2392 2393 /* Load private controller data */ 2394 data_ptr = endpoint->migr_data + migr_state->ctrlr_data.private_data_offset; 2395 memcpy(&migr_state->private_data, data_ptr, migr_state->ctrlr_data.private_data_len); 2396 2397 /* Load queue pairs */ 2398 data_ptr = endpoint->migr_data + migr_state->ctrlr_data.qp_offset; 2399 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_data.qp_len); 2400 2401 /* Load BAR0 */ 2402 data_ptr = endpoint->migr_data + migr_state->ctrlr_data.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 2403 memcpy(&migr_state->bar0, data_ptr, migr_state->ctrlr_data.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 2404 2405 /* Load CFG */ 2406 data_ptr = endpoint->migr_data + migr_state->ctrlr_data.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 2407 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_data.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 2408 2409 return 0; 2410 } 2411 2412 2413 static void 2414 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2415 { 2416 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 2417 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 2418 struct nvmf_vfio_user_sq *sq; 2419 struct nvmf_vfio_user_cq *cq; 2420 struct vfio_user_nvme_migr_state migr_state = {}; 2421 uint64_t data_offset; 2422 void *data_ptr; 2423 int num_aers; 2424 struct spdk_nvme_registers *regs; 2425 uint32_t *doorbell_base; 2426 uint32_t i = 0; 2427 uint16_t sqid, cqid; 2428 2429 /* Save all data to vfio_user_nvme_migr_state first, then we will 2430 * copy it to device migration region at last. 
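 *
 * The layout written out below is, in order:
 *   offset 0:             struct nvme_migr_device_state (header, copied last)
 *   private_data_offset:  private controller data, sizeof(struct nvmf_ctrlr_migr_data) bytes
 *   qp_offset:            saved SQ/CQ states of the connected queue pairs, qp_len bytes
 *   bar_offset[BAR0]:     BAR0 registers and doorbells, NVME_REG_BAR0_SIZE bytes
 *   bar_offset[CFG]:      PCI configuration space, NVME_REG_CFG_SIZE bytes
 * Every offset and length is recorded in the header so that
 * vfio_user_migr_stream_to_data() can locate the sections on the destination.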
2431 */
2432
2433 /* save magic number */
2434 migr_state.ctrlr_data.magic = VFIO_USER_NVME_MIGR_MAGIC;
2435
2436 /* save controller data */
2437 num_aers = nvmf_ctrlr_save_aers(ctrlr, migr_state.ctrlr_data.aer_cids,
2438 256);
2439 assert(num_aers >= 0);
2440 migr_state.ctrlr_data.nr_aers = num_aers;
2441
2442 /* save controller private data */
2443 nvmf_ctrlr_save_migr_data(ctrlr, (struct nvmf_ctrlr_migr_data *)&migr_state.private_data);
2444
2445 /* save connected queue pairs */
2446 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) {
2447 /* save sq */
2448 sqid = sq->qid;
2449 migr_state.qps[sqid].sq.sqid = sq->qid;
2450 migr_state.qps[sqid].sq.cqid = sq->cqid;
2451 migr_state.qps[sqid].sq.head = *sq_headp(sq);
2452 migr_state.qps[sqid].sq.size = sq->size;
2453 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1;
2454
2455 /* save cq; in the shared CQ case the same cq may be saved multiple times */
2456 cqid = sq->cqid;
2457 cq = vu_ctrlr->cqs[cqid];
2458 migr_state.qps[cqid].cq.cqid = cqid;
2459 migr_state.qps[cqid].cq.tail = *cq_tailp(cq);
2460 migr_state.qps[cqid].cq.ien = cq->ien;
2461 migr_state.qps[cqid].cq.iv = cq->iv;
2462 migr_state.qps[cqid].cq.size = cq->size;
2463 migr_state.qps[cqid].cq.phase = cq->phase;
2464 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1;
2465 i++;
2466 }
2467
2468 assert(i > 0);
2469 migr_state.ctrlr_data.num_io_queues = i - 1;
2470
2471 regs = (struct spdk_nvme_registers *)&migr_state.bar0;
2472 /* Save mandatory registers to bar0 */
2473 regs->cap.raw = ctrlr->vcprop.cap.raw;
2474 regs->vs.raw = ctrlr->vcprop.vs.raw;
2475 regs->cc.raw = ctrlr->vcprop.cc.raw;
2476 regs->aqa.raw = ctrlr->vcprop.aqa.raw;
2477 regs->asq = ctrlr->vcprop.asq;
2478 regs->acq = ctrlr->vcprop.acq;
2479 /* Save doorbells */
2480 doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl;
2481 memcpy(doorbell_base, (void *)vu_ctrlr->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
2482
2483 /* Save PCI configuration space */
2484 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE);
2485
2486 /* Save all data to device migration region */
2487 data_ptr = endpoint->migr_data;
2488
2489 /* Copy private controller data */
2490 data_offset = sizeof(struct nvme_migr_device_state);
2491 data_ptr += data_offset;
2492 migr_state.ctrlr_data.private_data_offset = data_offset;
2493 migr_state.ctrlr_data.private_data_len = sizeof(struct nvmf_ctrlr_migr_data);
2494 memcpy(data_ptr, &migr_state.private_data, sizeof(struct nvmf_ctrlr_migr_data));
2495
2496 /* Copy queue pairs */
2497 data_offset += sizeof(struct nvmf_ctrlr_migr_data);
2498 data_ptr += sizeof(struct nvmf_ctrlr_migr_data);
2499 migr_state.ctrlr_data.qp_offset = data_offset;
2500 migr_state.ctrlr_data.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof(
2501 struct nvme_migr_cq_state));
2502 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_data.qp_len);
2503
2504 /* Copy BAR0 */
2505 data_offset += migr_state.ctrlr_data.qp_len;
2506 data_ptr += migr_state.ctrlr_data.qp_len;
2507 migr_state.ctrlr_data.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset;
2508 migr_state.ctrlr_data.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVME_REG_BAR0_SIZE;
2509 memcpy(data_ptr, &migr_state.bar0, NVME_REG_BAR0_SIZE);
2510
2511 /* Copy CFG */
2512 data_offset += NVME_REG_BAR0_SIZE;
2513 data_ptr += NVME_REG_BAR0_SIZE;
2514 migr_state.ctrlr_data.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset;
2515 migr_state.ctrlr_data.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE;
2516 memcpy(data_ptr, &migr_state.cfg,
NVME_REG_CFG_SIZE); 2517 2518 /* Copy device state finally */ 2519 memcpy(endpoint->migr_data, &migr_state.ctrlr_data, sizeof(struct nvme_migr_device_state)); 2520 2521 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2522 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state); 2523 } 2524 } 2525 2526 static int 2527 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2528 struct vfio_user_nvme_migr_state *migr_state) 2529 { 2530 uint32_t i, qsize = 0; 2531 uint16_t sqid, cqid; 2532 struct vfio_user_nvme_migr_qp migr_qp; 2533 void *addr; 2534 int ret; 2535 2536 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2537 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state); 2538 } 2539 2540 /* restore connected queue pairs */ 2541 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 2542 migr_qp = migr_state->qps[i]; 2543 2544 qsize = migr_qp.sq.size; 2545 if (qsize) { 2546 struct nvmf_vfio_user_sq *sq; 2547 2548 sqid = migr_qp.sq.sqid; 2549 if (sqid != i) { 2550 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 2551 return -EINVAL; 2552 } 2553 2554 /* allocate sq if necessary */ 2555 if (vu_ctrlr->sqs[sqid] == NULL) { 2556 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 2557 if (ret) { 2558 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 2559 return -EFAULT; 2560 } 2561 } 2562 2563 sq = vu_ctrlr->sqs[sqid]; 2564 2565 sq->size = qsize; 2566 2567 ret = alloc_sq_reqs(vu_ctrlr, sq); 2568 if (ret) { 2569 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 2570 return -EFAULT; 2571 } 2572 2573 /* restore sq */ 2574 sq->cqid = migr_qp.sq.cqid; 2575 *sq_headp(sq) = migr_qp.sq.head; 2576 sq->mapping.prp1 = migr_qp.sq.dma_addr; 2577 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 2578 sq->mapping.prp1, sq->size * 64, 2579 sq->mapping.sg, &sq->mapping.iov, 2580 PROT_READ); 2581 if (addr == NULL) { 2582 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 2583 sqid, sq->mapping.prp1, sq->size); 2584 return -EFAULT; 2585 } 2586 } 2587 2588 qsize = migr_qp.cq.size; 2589 if (qsize) { 2590 struct nvmf_vfio_user_cq *cq; 2591 2592 /* restore cq */ 2593 cqid = migr_qp.sq.cqid; 2594 assert(cqid == i); 2595 2596 /* allocate cq if necessary */ 2597 if (vu_ctrlr->cqs[cqid] == NULL) { 2598 ret = init_cq(vu_ctrlr, cqid); 2599 if (ret) { 2600 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 2601 return -EFAULT; 2602 } 2603 } 2604 2605 cq = vu_ctrlr->cqs[cqid]; 2606 2607 cq->size = qsize; 2608 2609 *cq_tailp(cq) = migr_qp.cq.tail; 2610 cq->mapping.prp1 = migr_qp.cq.dma_addr; 2611 cq->ien = migr_qp.cq.ien; 2612 cq->iv = migr_qp.cq.iv; 2613 cq->phase = migr_qp.cq.phase; 2614 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 2615 cq->mapping.prp1, cq->size * 16, 2616 cq->mapping.sg, &cq->mapping.iov, 2617 PROT_READ | PROT_WRITE); 2618 if (addr == NULL) { 2619 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 2620 cqid, cq->mapping.prp1, cq->size); 2621 return -EFAULT; 2622 } 2623 } 2624 } 2625 2626 return 0; 2627 } 2628 2629 static int 2630 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2631 { 2632 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 2633 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 2634 uint32_t *doorbell_base; 2635 struct vfio_user_nvme_migr_state migr_state = {}; 2636 struct spdk_nvme_registers *regs; 2637 struct spdk_nvme_cmd cmd; 2638 uint16_t i; 2639 int rc = 0; 2640 2641 assert(endpoint->migr_data != NULL); 2642 assert(ctrlr != NULL); 2643 rc = 
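/*
 * Parse the blob that the source wrote into the migration region. The queue
 * mappings rebuilt by vfio_user_migr_ctrlr_construct_qps() above map each SQ
 * as size * 64 bytes and each CQ as size * 16 bytes, i.e. one
 * sizeof(struct spdk_nvme_cmd) / sizeof(struct spdk_nvme_cpl) per entry.
 */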
vfio_user_migr_stream_to_data(endpoint, &migr_state);
2644 if (rc) {
2645 return rc;
2646 }
2647
2648 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state);
2649 if (rc) {
2650 return rc;
2651 }
2652
2653 /* restore PCI configuration space */
2654 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE);
2655
2656 regs = (struct spdk_nvme_registers *)&migr_state.bar0;
2657 doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl;
2658 /* restore doorbells from saved registers */
2659 memcpy((void *)vu_ctrlr->doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE);
2660
2661 /* restore controller registers after ADMIN queue connection */
2662 ctrlr->vcprop.cap.raw = regs->cap.raw;
2663 ctrlr->vcprop.vs.raw = regs->vs.raw;
2664 ctrlr->vcprop.cc.raw = regs->cc.raw;
2665 ctrlr->vcprop.aqa.raw = regs->aqa.raw;
2666 ctrlr->vcprop.asq = regs->asq;
2667 ctrlr->vcprop.acq = regs->acq;
2668
2669 /* restore controller private data */
2670 rc = nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.private_data);
2671 if (rc) {
2672 return rc;
2673 }
2674
2675 /* resubmit pending AERs */
2676 for (i = 0; i < migr_state.ctrlr_data.nr_aers; i++) {
2677 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr),
2678 migr_state.ctrlr_data.aer_cids[i]);
2679 memset(&cmd, 0, sizeof(cmd));
2680 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST;
2681 cmd.cid = migr_state.ctrlr_data.aer_cids[i];
2682 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]);
2683 if (rc) {
2684 break;
2685 }
2686 }
2687
2688 return rc;
2689 }
2690
2691 static void
2692 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
2693 {
2694 uint32_t i;
2695 struct nvmf_vfio_user_sq *sq;
2696
2697 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
2698 sq = vu_ctrlr->sqs[i];
2699 if (!sq || !sq->size) {
2700 continue;
2701 }
2702
2703 if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
2704 /* ADMIN queue pair is always in the poll group, just enable it */
2705 sq->sq_state = VFIO_USER_SQ_ACTIVE;
2706 } else {
2707 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair);
2708 }
2709 }
2710 }
2711
2712 static int
2713 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
2714 {
2715 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2716 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
2717 struct nvmf_vfio_user_sq *sq;
2718 int ret = 0;
2719
2720 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint),
2721 vu_ctrlr->state, state);
2722
2723 switch (state) {
2724 case VFU_MIGR_STATE_STOP_AND_COPY:
2725 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
2726 vfio_user_migr_ctrlr_save_data(vu_ctrlr);
2727 break;
2728 case VFU_MIGR_STATE_STOP:
2729 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
2730 break;
2731 case VFU_MIGR_STATE_PRE_COPY:
2732 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED);
2733 vu_ctrlr->migr_reg.pending_bytes = vfio_user_migr_data_len();
2734 vu_ctrlr->migr_reg.last_data_offset = 0;
2735 vu_ctrlr->in_source_vm = true;
2736 break;
2737 case VFU_MIGR_STATE_RESUME:
2738 /*
2739 * The destination ADMIN queue pair is connected when the VM starts, but
2740 * it isn't enabled yet in the destination VM, so for now the poll group
2741 * does nothing with the ADMIN queue pair.
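 *
 * For reference, the sequence handled by this switch is: on the source VM
 * the subsystem is quiesced first, PRE_COPY publishes pending_bytes,
 * STOP_AND_COPY serializes the controller into the migration region via
 * vfio_user_migr_ctrlr_save_data(), and RUNNING rolls the source back by
 * resuming the subsystem. On the destination VM, RESUME frees the
 * placeholder admin SQ resources (below) so they can be rebuilt from the
 * source's data, and RUNNING restores the controller via
 * vfio_user_migr_ctrlr_restore() and re-enables the saved SQs.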
2742 */ 2743 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 2744 break; 2745 } 2746 2747 assert(!vu_ctrlr->in_source_vm); 2748 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 2749 2750 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 2751 assert(sq != NULL); 2752 assert(sq->qpair.qid == 0); 2753 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2754 2755 /* Free ADMIN SQ resources first, SQ resources will be 2756 * allocated based on queue size from source VM. 2757 */ 2758 free_sq_reqs(sq); 2759 sq->size = 0; 2760 break; 2761 case VFU_MIGR_STATE_RUNNING: 2762 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 2763 break; 2764 } 2765 2766 if (!vu_ctrlr->in_source_vm) { 2767 /* Restore destination VM from BAR9 */ 2768 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 2769 if (ret) { 2770 break; 2771 } 2772 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 2773 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 2774 } else { 2775 /* Rollback source VM */ 2776 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 2777 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 2778 vfio_user_dev_migr_resume_done, vu_ctrlr); 2779 if (ret < 0) { 2780 /* TODO: fail controller with CFS bit set */ 2781 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 2782 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 2783 break; 2784 } 2785 } 2786 break; 2787 2788 default: 2789 return -EINVAL; 2790 } 2791 2792 return ret; 2793 } 2794 2795 static uint64_t 2796 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 2797 { 2798 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2799 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 2800 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 2801 2802 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u, pending bytes 0x%"PRIx64"\n", endpoint_id(endpoint), 2803 ctrlr->state, migr_reg->pending_bytes); 2804 2805 return migr_reg->pending_bytes; 2806 } 2807 2808 static int 2809 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 2810 { 2811 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2812 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 2813 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 2814 2815 if (migr_reg->last_data_offset == vfio_user_migr_data_len()) { 2816 *offset = vfio_user_migr_data_len(); 2817 if (size) { 2818 *size = 0; 2819 } 2820 migr_reg->pending_bytes = 0; 2821 } else { 2822 *offset = 0; 2823 if (size) { 2824 *size = vfio_user_migr_data_len(); 2825 if (ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 2826 vfio_user_migr_ctrlr_save_data(ctrlr); 2827 migr_reg->last_data_offset = vfio_user_migr_data_len(); 2828 } 2829 } 2830 } 2831 2832 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 2833 2834 return 0; 2835 } 2836 2837 static ssize_t 2838 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset) 2839 { 2840 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2841 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 2842 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 2843 2844 memcpy(buf, endpoint->migr_data, count); 2845 migr_reg->pending_bytes = 0; 2846 2847 return 0; 2848 } 2849 2850 static ssize_t 2851 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset) 2852 { 2853 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2854 2855 memcpy(endpoint->migr_data, buf, count); 2856 2857 return 
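/*
 * On the destination VM the client streams the blob saved by the source
 * into endpoint->migr_data here; it is parsed later by
 * vfio_user_migr_stream_to_data() from vfio_user_migr_ctrlr_restore() once
 * the device transitions to VFU_MIGR_STATE_RUNNING. The copy starts at the
 * beginning of the region regardless of 'offset', so the data is assumed to
 * arrive as a single write from offset 0.
 */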
0; 2858 } 2859 2860 static int 2861 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx, uint64_t count) 2862 { 2863 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 2864 2865 return 0; 2866 } 2867 2868 static int 2869 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 2870 struct nvmf_vfio_user_endpoint *endpoint) 2871 { 2872 int ret; 2873 ssize_t cap_offset; 2874 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 2875 struct iovec migr_sparse_mmap = {}; 2876 2877 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 2878 struct pxcap pxcap = { 2879 .hdr.id = PCI_CAP_ID_EXP, 2880 .pxcaps.ver = 0x2, 2881 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 2882 .pxdcap2.ctds = 0x1 2883 }; 2884 2885 struct msixcap msixcap = { 2886 .hdr.id = PCI_CAP_ID_MSIX, 2887 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 2888 .mtab = {.tbir = 0x4, .to = 0x0}, 2889 .mpba = {.pbir = 0x5, .pbao = 0x0} 2890 }; 2891 2892 struct iovec sparse_mmap[] = { 2893 { 2894 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 2895 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 2896 }, 2897 }; 2898 2899 const vfu_migration_callbacks_t migr_callbacks = { 2900 .version = VFU_MIGR_CALLBACKS_VERS, 2901 .transition = &vfio_user_migration_device_state_transition, 2902 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 2903 .prepare_data = &vfio_user_migration_prepare_data, 2904 .read_data = &vfio_user_migration_read_data, 2905 .data_written = &vfio_user_migration_data_written, 2906 .write_data = &vfio_user_migration_write_data 2907 }; 2908 2909 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 2910 if (ret < 0) { 2911 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 2912 return ret; 2913 } 2914 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 2915 /* 2916 * 0x02, controller uses the NVM Express programming interface 2917 * 0x08, non-volatile memory controller 2918 * 0x01, mass storage controller 2919 */ 2920 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 2921 2922 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 2923 if (cap_offset < 0) { 2924 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 2925 return ret; 2926 } 2927 2928 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 2929 if (cap_offset < 0) { 2930 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 2931 return ret; 2932 } 2933 2934 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 2935 if (cap_offset < 0) { 2936 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 2937 return ret; 2938 } 2939 2940 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 2941 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 2942 if (ret < 0) { 2943 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 2944 return ret; 2945 } 2946 2947 if (vu_transport->transport_opts.disable_mappable_bar0) { 2948 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 2949 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 2950 NULL, 0, -1, 0); 2951 } else { 2952 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 2953 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 2954 sparse_mmap, 1, endpoint->devmem_fd, 0); 2955 } 2956 2957 if (ret < 0) { 2958 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 2959 return ret; 2960 } 2961 2962 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 2963 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 2964 if (ret < 
0) { 2965 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 2966 return ret; 2967 } 2968 2969 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 2970 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 2971 if (ret < 0) { 2972 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 2973 return ret; 2974 } 2975 2976 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 2977 if (ret < 0) { 2978 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 2979 return ret; 2980 } 2981 2982 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 2983 if (ret < 0) { 2984 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 2985 return ret; 2986 } 2987 2988 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 2989 if (ret < 0) { 2990 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 2991 return ret; 2992 } 2993 2994 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 2995 2996 migr_sparse_mmap.iov_base = (void *)4096; 2997 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 2998 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 2999 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 3000 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 3001 1, endpoint->migr_fd, 0); 3002 if (ret < 0) { 3003 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 3004 return ret; 3005 } 3006 3007 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 3008 vfu_get_migr_register_area_size()); 3009 if (ret < 0) { 3010 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 3011 return ret; 3012 } 3013 3014 ret = vfu_realize_ctx(vfu_ctx); 3015 if (ret < 0) { 3016 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 3017 return ret; 3018 } 3019 3020 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 3021 assert(endpoint->pci_config_space != NULL); 3022 init_pci_config_space(endpoint->pci_config_space); 3023 3024 assert(cap_offset != 0); 3025 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 3026 3027 return 0; 3028 } 3029 3030 static void 3031 _free_ctrlr(void *ctx) 3032 { 3033 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 3034 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 3035 3036 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 3037 free(ctrlr); 3038 3039 if (endpoint && endpoint->need_async_destroy) { 3040 nvmf_vfio_user_destroy_endpoint(endpoint); 3041 } 3042 } 3043 3044 static void 3045 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 3046 { 3047 int i; 3048 assert(ctrlr != NULL); 3049 3050 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 3051 3052 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3053 free_qp(ctrlr, i); 3054 } 3055 3056 if (ctrlr->thread == spdk_get_thread()) { 3057 _free_ctrlr(ctrlr); 3058 } else { 3059 spdk_thread_send_msg(ctrlr->thread, _free_ctrlr, ctrlr); 3060 } 3061 } 3062 3063 static void 3064 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 3065 struct nvmf_vfio_user_endpoint *endpoint) 3066 { 3067 struct nvmf_vfio_user_ctrlr *ctrlr; 3068 int err = 0; 3069 3070 /* First, construct a vfio-user CUSTOM transport controller */ 3071 ctrlr = calloc(1, sizeof(*ctrlr)); 3072 if (ctrlr == NULL) { 3073 err = -ENOMEM; 3074 goto out; 3075 } 3076 /* We can only support one connection for now */ 3077 ctrlr->cntlid = 0x1; 3078 ctrlr->transport = transport; 3079 ctrlr->endpoint = 
endpoint; 3080 ctrlr->doorbells = endpoint->doorbells; 3081 TAILQ_INIT(&ctrlr->connected_sqs); 3082 3083 /* Then, construct an admin queue pair */ 3084 err = init_sq(ctrlr, &transport->transport, 0); 3085 if (err != 0) { 3086 free(ctrlr); 3087 goto out; 3088 } 3089 3090 err = init_cq(ctrlr, 0); 3091 if (err != 0) { 3092 free(ctrlr); 3093 goto out; 3094 } 3095 3096 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 3097 3098 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 3099 if (err != 0) { 3100 free(ctrlr); 3101 goto out; 3102 } 3103 endpoint->ctrlr = ctrlr; 3104 3105 /* Notify the generic layer about the new admin queue pair */ 3106 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 3107 3108 out: 3109 if (err != 0) { 3110 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 3111 endpoint_id(endpoint), strerror(-err)); 3112 } 3113 } 3114 3115 static int 3116 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 3117 const struct spdk_nvme_transport_id *trid, 3118 struct spdk_nvmf_listen_opts *listen_opts) 3119 { 3120 struct nvmf_vfio_user_transport *vu_transport; 3121 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 3122 char path[PATH_MAX] = {}; 3123 char uuid[PATH_MAX] = {}; 3124 int ret; 3125 3126 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 3127 transport); 3128 3129 pthread_mutex_lock(&vu_transport->lock); 3130 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 3131 /* Only compare traddr */ 3132 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 3133 pthread_mutex_unlock(&vu_transport->lock); 3134 return -EEXIST; 3135 } 3136 } 3137 pthread_mutex_unlock(&vu_transport->lock); 3138 3139 endpoint = calloc(1, sizeof(*endpoint)); 3140 if (!endpoint) { 3141 return -ENOMEM; 3142 } 3143 3144 pthread_mutex_init(&endpoint->lock, NULL); 3145 endpoint->devmem_fd = -1; 3146 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 3147 3148 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 3149 if (ret < 0 || ret >= PATH_MAX) { 3150 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 3151 ret = -1; 3152 goto out; 3153 } 3154 3155 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 3156 if (ret == -1) { 3157 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 3158 endpoint_id(endpoint), path, spdk_strerror(errno)); 3159 goto out; 3160 } 3161 3162 endpoint->devmem_fd = ret; 3163 ret = ftruncate(endpoint->devmem_fd, 3164 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 3165 if (ret != 0) { 3166 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 3167 spdk_strerror(errno)); 3168 goto out; 3169 } 3170 3171 endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 3172 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 3173 if (endpoint->doorbells == MAP_FAILED) { 3174 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 3175 endpoint->doorbells = NULL; 3176 ret = -1; 3177 goto out; 3178 } 3179 3180 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 3181 if (ret < 0 || ret >= PATH_MAX) { 3182 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 3183 spdk_strerror(errno)); 3184 ret = -1; 3185 goto out; 3186 } 3187 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 3188 if (ret == -1) { 3189 SPDK_ERRLOG("%s: failed to open device memory at %s: 
%s.\n",
3190 endpoint_id(endpoint), path, spdk_strerror(errno));
3191 goto out;
3192 }
3193
3194 endpoint->migr_fd = ret;
3195 ret = ftruncate(endpoint->migr_fd,
3196 vfu_get_migr_register_area_size() + vfio_user_migr_data_len());
3197 if (ret != 0) {
3198 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path,
3199 spdk_strerror(errno));
3200 goto out;
3201 }
3202
3203 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(),
3204 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size());
3205 if (endpoint->migr_data == MAP_FAILED) {
3206 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
3207 endpoint->migr_data = NULL;
3208 ret = -1;
3209 goto out;
3210 }
3211
3212 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
3213 if (ret < 0 || ret >= PATH_MAX) {
3214 SPDK_ERRLOG("%s: error to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno));
3215 ret = -1;
3216 goto out;
3217 }
3218
3219 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
3220 endpoint, VFU_DEV_TYPE_PCI);
3221 if (endpoint->vfu_ctx == NULL) {
3222 SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
3223 endpoint_id(endpoint));
3224 ret = -1;
3225 goto out;
3226 }
3227 vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, vfio_user_get_log_level());
3228
3229 ret = vfio_user_dev_info_fill(vu_transport, endpoint);
3230 if (ret < 0) {
3231 goto out;
3232 }
3233
3234 pthread_mutex_lock(&vu_transport->lock);
3235 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
3236 pthread_mutex_unlock(&vu_transport->lock);
3237
3238 SPDK_DEBUGLOG(nvmf_vfio, "%s: doorbells %p\n", uuid, endpoint->doorbells);
3239
3240 out:
3241 if (ret != 0) {
3242 nvmf_vfio_user_destroy_endpoint(endpoint);
3243 }
3244
3245 return ret;
3246 }
3247
3248 static void
3249 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
3250 const struct spdk_nvme_transport_id *trid)
3251 {
3252 struct nvmf_vfio_user_transport *vu_transport;
3253 struct nvmf_vfio_user_endpoint *endpoint, *tmp;
3254
3255 assert(trid != NULL);
3256 assert(trid->traddr != NULL);
3257
3258 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
3259
3260 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
3261 transport);
3262
3263 pthread_mutex_lock(&vu_transport->lock);
3264 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
3265 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
3266 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
3267 /* Defer freeing endpoint resources until the controller
3268 * is freed. There are two cases when running here:
3269 * 1. kill nvmf target while VM is connected
3270 * 2. remove listener via RPC call
3271 * nvmf library will disconnect all queue pairs.
3272 */ 3273 if (endpoint->ctrlr) { 3274 assert(!endpoint->need_async_destroy); 3275 endpoint->need_async_destroy = true; 3276 pthread_mutex_unlock(&vu_transport->lock); 3277 return; 3278 } 3279 3280 nvmf_vfio_user_destroy_endpoint(endpoint); 3281 pthread_mutex_unlock(&vu_transport->lock); 3282 return; 3283 } 3284 } 3285 pthread_mutex_unlock(&vu_transport->lock); 3286 3287 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 3288 } 3289 3290 static void 3291 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 3292 struct spdk_nvmf_subsystem *subsystem, 3293 struct spdk_nvmf_ctrlr_data *cdata) 3294 { 3295 cdata->vid = SPDK_PCI_VID_NUTANIX; 3296 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 3297 cdata->ieee[0] = 0x8d; 3298 cdata->ieee[1] = 0x6b; 3299 cdata->ieee[2] = 0x50; 3300 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 3301 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 3302 /* libvfio-user can only support 1 connection for now */ 3303 cdata->oncs.reservations = 0; 3304 } 3305 3306 static int 3307 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 3308 const struct spdk_nvmf_subsystem *subsystem, 3309 const struct spdk_nvme_transport_id *trid) 3310 { 3311 struct nvmf_vfio_user_transport *vu_transport; 3312 struct nvmf_vfio_user_endpoint *endpoint; 3313 3314 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 3315 3316 pthread_mutex_lock(&vu_transport->lock); 3317 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 3318 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 3319 break; 3320 } 3321 } 3322 pthread_mutex_unlock(&vu_transport->lock); 3323 3324 if (endpoint == NULL) { 3325 return -ENOENT; 3326 } 3327 3328 endpoint->subsystem = subsystem; 3329 3330 return 0; 3331 } 3332 3333 /* 3334 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 3335 * frequency. 3336 * 3337 * For each transport endpoint (which at the libvfio-user level corresponds to 3338 * a socket), if we don't currently have a controller set up, peek to see if the 3339 * socket is able to accept a new connection. 3340 * 3341 * This poller also takes care of handling the creation of any pending new 3342 * qpairs. 3343 */ 3344 static int 3345 nvmf_vfio_user_accept(void *ctx) 3346 { 3347 struct spdk_nvmf_transport *transport = ctx; 3348 struct nvmf_vfio_user_transport *vu_transport; 3349 struct nvmf_vfio_user_endpoint *endpoint; 3350 uint32_t count = 0; 3351 int err; 3352 3353 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 3354 transport); 3355 3356 pthread_mutex_lock(&vu_transport->lock); 3357 3358 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 3359 if (endpoint->ctrlr != NULL) { 3360 continue; 3361 } 3362 3363 err = vfu_attach_ctx(endpoint->vfu_ctx); 3364 if (err != 0) { 3365 if (errno == EAGAIN || errno == EWOULDBLOCK) { 3366 continue; 3367 } 3368 3369 pthread_mutex_unlock(&vu_transport->lock); 3370 return SPDK_POLLER_BUSY; 3371 } 3372 3373 count++; 3374 3375 /* Construct a controller */ 3376 nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 3377 } 3378 3379 pthread_mutex_unlock(&vu_transport->lock); 3380 3381 return count > 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 3382 } 3383 3384 static void 3385 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 3386 struct spdk_nvme_transport_id *trid, 3387 struct spdk_nvmf_discovery_log_page_entry *entry) 3388 { } 3389 3390 static struct spdk_nvmf_transport_poll_group * 3391 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport) 3392 { 3393 struct nvmf_vfio_user_transport *vu_transport; 3394 struct nvmf_vfio_user_poll_group *vu_group; 3395 3396 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 3397 3398 vu_group = calloc(1, sizeof(*vu_group)); 3399 if (vu_group == NULL) { 3400 SPDK_ERRLOG("Error allocating poll group: %m"); 3401 return NULL; 3402 } 3403 3404 TAILQ_INIT(&vu_group->sqs); 3405 3406 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 3407 transport); 3408 pthread_mutex_lock(&vu_transport->pg_lock); 3409 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 3410 if (vu_transport->next_pg == NULL) { 3411 vu_transport->next_pg = vu_group; 3412 } 3413 pthread_mutex_unlock(&vu_transport->pg_lock); 3414 3415 return &vu_group->group; 3416 } 3417 3418 static struct spdk_nvmf_transport_poll_group * 3419 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 3420 { 3421 struct nvmf_vfio_user_transport *vu_transport; 3422 struct nvmf_vfio_user_poll_group **vu_group; 3423 struct nvmf_vfio_user_sq *sq; 3424 struct nvmf_vfio_user_cq *cq; 3425 3426 struct spdk_nvmf_transport_poll_group *result; 3427 3428 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 3429 cq = sq->ctrlr->cqs[sq->cqid]; 3430 assert(cq != NULL); 3431 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 3432 3433 pthread_mutex_lock(&vu_transport->pg_lock); 3434 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 3435 pthread_mutex_unlock(&vu_transport->pg_lock); 3436 return NULL; 3437 } 3438 3439 /* If this is shared IO CQ case, just return the used CQ's poll group */ 3440 if (!nvmf_qpair_is_admin_queue(qpair)) { 3441 if (cq->group) { 3442 pthread_mutex_unlock(&vu_transport->pg_lock); 3443 return cq->group; 3444 } 3445 } 3446 3447 vu_group = &vu_transport->next_pg; 3448 assert(*vu_group != NULL); 3449 3450 result = &(*vu_group)->group; 3451 *vu_group = TAILQ_NEXT(*vu_group, link); 3452 if (*vu_group == NULL) { 3453 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 3454 } 3455 3456 if (cq->group == NULL) { 3457 cq->group = result; 3458 } 3459 3460 pthread_mutex_unlock(&vu_transport->pg_lock); 3461 return result; 3462 } 3463 3464 /* called when process exits */ 3465 static void 3466 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 3467 { 3468 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup;; 3469 struct nvmf_vfio_user_transport *vu_transport; 3470 3471 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 3472 3473 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 3474 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 3475 transport); 3476 3477 pthread_mutex_lock(&vu_transport->pg_lock); 3478 next_tgroup = TAILQ_NEXT(vu_group, link); 3479 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 3480 if (next_tgroup == NULL) { 3481 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 3482 } 3483 if (vu_transport->next_pg == vu_group) { 3484 vu_transport->next_pg = next_tgroup; 3485 } 3486 pthread_mutex_unlock(&vu_transport->pg_lock); 3487 3488 free(vu_group); 3489 } 3490 3491 static 
void 3492 _vfio_user_qpair_disconnect(void *ctx) 3493 { 3494 struct nvmf_vfio_user_sq *sq = ctx; 3495 3496 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 3497 } 3498 3499 /* The function is used when socket connection is destroyed */ 3500 static int 3501 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 3502 { 3503 struct nvmf_vfio_user_sq *sq; 3504 struct nvmf_vfio_user_endpoint *endpoint; 3505 3506 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 3507 3508 endpoint = ctrlr->endpoint; 3509 assert(endpoint != NULL); 3510 3511 pthread_mutex_lock(&endpoint->lock); 3512 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 3513 endpoint->ctrlr = NULL; 3514 free_ctrlr(ctrlr); 3515 pthread_mutex_unlock(&endpoint->lock); 3516 return 0; 3517 } 3518 3519 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 3520 /* add another round thread poll to avoid recursive endpoint lock */ 3521 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 3522 } 3523 pthread_mutex_unlock(&endpoint->lock); 3524 3525 return 0; 3526 } 3527 3528 /* 3529 * Poll for and process any incoming vfio-user messages. 3530 */ 3531 static int 3532 vfio_user_poll_vfu_ctx(void *ctx) 3533 { 3534 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 3535 int ret; 3536 3537 assert(ctrlr != NULL); 3538 3539 /* This will call access_bar0_fn() if there are any writes 3540 * to the portion of the BAR that is not mmap'd */ 3541 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 3542 if (spdk_unlikely(ret == -1)) { 3543 if (errno == EBUSY) { 3544 return SPDK_POLLER_BUSY; 3545 } 3546 3547 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 3548 3549 /* initiator shutdown or reset, waiting for another re-connect */ 3550 if (errno == ENOTCONN) { 3551 vfio_user_destroy_ctrlr(ctrlr); 3552 return SPDK_POLLER_BUSY; 3553 } 3554 3555 fail_ctrlr(ctrlr); 3556 } 3557 3558 return ret != 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 3559 } 3560 3561 struct vfio_user_post_cpl_ctx { 3562 struct nvmf_vfio_user_ctrlr *ctrlr; 3563 struct nvmf_vfio_user_cq *cq; 3564 struct spdk_nvme_cpl cpl; 3565 }; 3566 3567 static void 3568 _post_completion_msg(void *ctx) 3569 { 3570 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 3571 3572 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 3573 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 3574 free(cpl_ctx); 3575 } 3576 3577 static int 3578 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 3579 { 3580 struct nvmf_vfio_user_poll_group *vu_group; 3581 struct nvmf_vfio_user_sq *sq = cb_arg; 3582 struct nvmf_vfio_user_cq *cq; 3583 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 3584 struct nvmf_vfio_user_endpoint *endpoint; 3585 3586 assert(sq != NULL); 3587 assert(req != NULL); 3588 3589 vu_ctrlr = sq->ctrlr; 3590 assert(vu_ctrlr != NULL); 3591 endpoint = vu_ctrlr->endpoint; 3592 assert(endpoint != NULL); 3593 3594 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 3595 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 3596 endpoint->ctrlr = NULL; 3597 free_ctrlr(vu_ctrlr); 3598 return -1; 3599 } 3600 3601 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 3602 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 3603 3604 cq = vu_ctrlr->cqs[0]; 3605 assert(cq != NULL); 3606 3607 pthread_mutex_lock(&endpoint->lock); 3608 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3609 vu_ctrlr->cntlid = sq->qpair.ctrlr->cntlid; 3610 vu_ctrlr->thread = spdk_get_thread(); 3611 vu_ctrlr->ctrlr = sq->qpair.ctrlr; 3612 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3613 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, vu_ctrlr, 0); 3614 cq->thread = spdk_get_thread(); 3615 } else { 3616 /* For I/O queues this command was generated in response to an 3617 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 3618 * been completed. Complete it now. 3619 */ 3620 if (sq->post_create_io_sq_completion) { 3621 assert(cq->thread != NULL); 3622 if (cq->thread != spdk_get_thread()) { 3623 struct vfio_user_post_cpl_ctx *cpl_ctx; 3624 3625 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 3626 if (!cpl_ctx) { 3627 return -ENOMEM; 3628 } 3629 cpl_ctx->ctrlr = vu_ctrlr; 3630 cpl_ctx->cq = cq; 3631 cpl_ctx->cpl.sqid = 0; 3632 cpl_ctx->cpl.cdw0 = 0; 3633 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 3634 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 3635 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 3636 3637 spdk_thread_send_msg(cq->thread, _post_completion_msg, cpl_ctx); 3638 } else { 3639 post_completion(vu_ctrlr, cq, 0, 0, 3640 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 3641 } 3642 sq->post_create_io_sq_completion = false; 3643 } 3644 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3645 } 3646 3647 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 3648 pthread_mutex_unlock(&endpoint->lock); 3649 3650 free(req->req.data); 3651 req->req.data = NULL; 3652 3653 return 0; 3654 } 3655 3656 /* 3657 * Add the given qpair to the given poll group. New qpairs are added via 3658 * spdk_nvmf_tgt_new_qpair(), which picks a poll group, then calls back 3659 * here via nvmf_transport_poll_group_add(). 
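 *
 * The add path below emulates what a fabrics host would send explicitly: it
 * builds a Fabrics CONNECT command for the queue (CID 0, recfmt 0,
 * sqsize = sq->size - 1, qid 0 for the admin queue), fills the CONNECT data
 * with the endpoint's subsystem NQN and controller ID, executes it via
 * spdk_nvmf_request_exec_fabrics(), and completes the setup in
 * handle_queue_connect_rsp() when the command finishes.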
3660 */ 3661 static int 3662 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 3663 struct spdk_nvmf_qpair *qpair) 3664 { 3665 struct nvmf_vfio_user_sq *sq; 3666 struct nvmf_vfio_user_req *vu_req; 3667 struct nvmf_vfio_user_ctrlr *ctrlr; 3668 struct spdk_nvmf_request *req; 3669 struct spdk_nvmf_fabric_connect_data *data; 3670 bool admin; 3671 3672 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 3673 sq->group = group; 3674 ctrlr = sq->ctrlr; 3675 3676 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 3677 ctrlr_id(ctrlr), sq->qpair.qid, 3678 sq, qpair, group); 3679 3680 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 3681 3682 vu_req = get_nvmf_vfio_user_req(sq); 3683 if (vu_req == NULL) { 3684 return -1; 3685 } 3686 3687 req = &vu_req->req; 3688 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 3689 req->cmd->connect_cmd.cid = 0; 3690 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 3691 req->cmd->connect_cmd.recfmt = 0; 3692 req->cmd->connect_cmd.sqsize = sq->size - 1; 3693 req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid; 3694 3695 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 3696 req->data = calloc(1, req->length); 3697 if (req->data == NULL) { 3698 nvmf_vfio_user_req_free(req); 3699 return -ENOMEM; 3700 } 3701 3702 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 3703 data->cntlid = ctrlr->cntlid; 3704 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 3705 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 3706 3707 vu_req->cb_fn = handle_queue_connect_rsp; 3708 vu_req->cb_arg = sq; 3709 3710 SPDK_DEBUGLOG(nvmf_vfio, 3711 "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n", 3712 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 3713 3714 spdk_nvmf_request_exec_fabrics(req); 3715 return 0; 3716 } 3717 3718 static int 3719 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 3720 struct spdk_nvmf_qpair *qpair) 3721 { 3722 struct nvmf_vfio_user_sq *sq; 3723 struct nvmf_vfio_user_poll_group *vu_group; 3724 3725 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 3726 3727 SPDK_DEBUGLOG(nvmf_vfio, 3728 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 3729 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 3730 3731 3732 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 3733 TAILQ_REMOVE(&vu_group->sqs, sq, link); 3734 3735 return 0; 3736 } 3737 3738 static void 3739 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 3740 { 3741 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 3742 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 3743 vu_req->iovcnt = 0; 3744 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 3745 3746 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 3747 } 3748 3749 static int 3750 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 3751 { 3752 struct nvmf_vfio_user_sq *sq; 3753 struct nvmf_vfio_user_req *vu_req; 3754 3755 assert(req != NULL); 3756 3757 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 3758 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 3759 3760 _nvmf_vfio_user_req_free(sq, vu_req); 3761 3762 return 0; 3763 } 3764 3765 static int 3766 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 3767 { 3768 struct nvmf_vfio_user_sq *sq; 3769 struct nvmf_vfio_user_req *vu_req; 3770 3771 assert(req != NULL); 3772 3773 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 3774 sq = SPDK_CONTAINEROF(req->qpair, 
struct nvmf_vfio_user_sq, qpair); 3775 3776 if (vu_req->cb_fn != NULL) { 3777 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 3778 fail_ctrlr(sq->ctrlr); 3779 } 3780 } 3781 3782 _nvmf_vfio_user_req_free(sq, vu_req); 3783 3784 return 0; 3785 } 3786 3787 static void 3788 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 3789 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 3790 { 3791 struct nvmf_vfio_user_sq *sq; 3792 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 3793 struct nvmf_vfio_user_endpoint *endpoint; 3794 3795 assert(qpair != NULL); 3796 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 3797 vu_ctrlr = sq->ctrlr; 3798 endpoint = vu_ctrlr->endpoint; 3799 3800 pthread_mutex_lock(&endpoint->lock); 3801 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 3802 delete_sq_done(vu_ctrlr, sq); 3803 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 3804 endpoint->ctrlr = NULL; 3805 free_ctrlr(vu_ctrlr); 3806 } 3807 pthread_mutex_unlock(&endpoint->lock); 3808 3809 if (cb_fn) { 3810 cb_fn(cb_arg); 3811 } 3812 } 3813 3814 /** 3815 * Returns a preallocated request, or NULL if there isn't one available. 3816 */ 3817 static struct nvmf_vfio_user_req * 3818 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 3819 { 3820 struct nvmf_vfio_user_req *req; 3821 3822 if (sq == NULL) { 3823 return NULL; 3824 } 3825 3826 req = TAILQ_FIRST(&sq->free_reqs); 3827 if (req == NULL) { 3828 return NULL; 3829 } 3830 3831 TAILQ_REMOVE(&sq->free_reqs, req, link); 3832 3833 return req; 3834 } 3835 3836 static int 3837 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 3838 { 3839 uint16_t nr; 3840 uint32_t nlb, nsid; 3841 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 3842 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 3843 struct spdk_nvmf_ns *ns; 3844 3845 nsid = cmd->nsid; 3846 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 3847 if (ns == NULL || ns->bdev == NULL) { 3848 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 3849 return -EINVAL; 3850 } 3851 3852 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 3853 nr = cmd->cdw10_bits.dsm.nr + 1; 3854 return nr * sizeof(struct spdk_nvme_dsm_range); 3855 } 3856 3857 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 3858 return nlb * spdk_bdev_get_block_size(ns->bdev); 3859 } 3860 3861 static int 3862 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 3863 { 3864 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 3865 uint32_t len = 0; 3866 uint8_t fid; 3867 int iovcnt; 3868 3869 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 3870 req->length = 0; 3871 req->data = NULL; 3872 3873 if (req->xfer == SPDK_NVME_DATA_NONE) { 3874 return 0; 3875 } 3876 3877 switch (cmd->opc) { 3878 case SPDK_NVME_OPC_IDENTIFY: 3879 len = 4096; 3880 break; 3881 case SPDK_NVME_OPC_GET_LOG_PAGE: 3882 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 3883 break; 3884 case SPDK_NVME_OPC_GET_FEATURES: 3885 case SPDK_NVME_OPC_SET_FEATURES: 3886 fid = cmd->cdw10_bits.set_features.fid; 3887 switch (fid) { 3888 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 3889 len = 4096; 3890 break; 3891 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 3892 len = 256; 3893 break; 3894 case SPDK_NVME_FEAT_TIMESTAMP: 3895 len = 8; 3896 break; 3897 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 3898 len = 512; 3899 break; 3900 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 3901 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 3902 len = 16; 3903 } else { 3904 len = 8; 3905 } 3906 break; 3907 default: 3908 return 0; 
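/*
 * The lengths above are the fixed payload sizes the NVMe spec defines for
 * these features; any other FID is treated as carrying no data buffer, so
 * nothing is mapped for it. For Get Log Page the length works out to
 * (((NUMDU << 16) | NUMDL) + 1) * 4 bytes, e.g. NUMDL = 0x3FF with
 * NUMDU = 0 requests 1024 dwords = 4096 bytes.
 */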
static int
map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
	uint32_t len = 0;
	uint8_t fid;
	int iovcnt;

	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
	req->length = 0;
	req->data = NULL;

	if (req->xfer == SPDK_NVME_DATA_NONE) {
		return 0;
	}

	switch (cmd->opc) {
	case SPDK_NVME_OPC_IDENTIFY:
		len = 4096;
		break;
	case SPDK_NVME_OPC_GET_LOG_PAGE:
		len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4;
		break;
	case SPDK_NVME_OPC_GET_FEATURES:
	case SPDK_NVME_OPC_SET_FEATURES:
		fid = cmd->cdw10_bits.set_features.fid;
		switch (fid) {
		case SPDK_NVME_FEAT_LBA_RANGE_TYPE:
			len = 4096;
			break;
		case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
			len = 256;
			break;
		case SPDK_NVME_FEAT_TIMESTAMP:
			len = 8;
			break;
		case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
			len = 512;
			break;
		case SPDK_NVME_FEAT_HOST_IDENTIFIER:
			if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) {
				len = 16;
			} else {
				len = 8;
			}
			break;
		default:
			return 0;
		}
		break;
	default:
		return 0;
	}

	/* Admin commands do not use SGLs. */
	if (cmd->psdt != 0) {
		return -EINVAL;
	}

	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len);
	if (iovcnt < 0) {
		SPDK_ERRLOG("%s: map Admin Opc %x failed\n",
			    ctrlr_id(ctrlr), cmd->opc);
		return -1;
	}
	req->length = len;
	req->data = req->iov[0].iov_base;
	req->iovcnt = iovcnt;

	return 0;
}

/*
 * Map an I/O command's buffers.
 *
 * Returns 0 on success and -errno on failure.
 */
static int
map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
	int len, iovcnt;
	struct spdk_nvme_cmd *cmd;

	assert(ctrlr != NULL);
	assert(req != NULL);

	cmd = &req->cmd->nvme_cmd;
	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
	req->length = 0;
	req->data = NULL;

	if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
		return 0;
	}

	len = get_nvmf_io_req_length(req);
	if (len < 0) {
		return -EINVAL;
	}
	req->length = len;

	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length);
	if (iovcnt < 0) {
		SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc);
		return -EFAULT;
	}
	req->data = req->iov[0].iov_base;
	req->iovcnt = iovcnt;

	return 0;
}
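
/*
 * Handle a single command fetched from a submission queue: grab a free
 * request slot, copy in the SQE, map any data buffers, and hand the request
 * to the NVMf core for execution. Completions are posted from the
 * handle_cmd_rsp() callback; if no request slot is available, or mapping
 * fails, an internal-error completion is posted directly.
 */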
static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
	       struct nvmf_vfio_user_sq *sq)
{
	int err;
	struct nvmf_vfio_user_req *vu_req;
	struct spdk_nvmf_request *req;

	assert(ctrlr != NULL);
	assert(cmd != NULL);

	vu_req = get_nvmf_vfio_user_req(sq);
	if (spdk_unlikely(vu_req == NULL)) {
		SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc);
		return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid,
				       SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC);
	}
	req = &vu_req->req;

	assert(req->qpair != NULL);
	SPDK_DEBUGLOG(nvmf_vfio, "%s: handle qid%u, req opc=%#x cid=%d\n",
		      ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid);

	vu_req->cb_fn = handle_cmd_rsp;
	vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);
	req->cmd->nvme_cmd = *cmd;

	if (nvmf_qpair_is_admin_queue(req->qpair)) {
		err = map_admin_cmd_req(ctrlr, req);
	} else {
		switch (cmd->opc) {
		case SPDK_NVME_OPC_RESERVATION_REGISTER:
		case SPDK_NVME_OPC_RESERVATION_REPORT:
		case SPDK_NVME_OPC_RESERVATION_ACQUIRE:
		case SPDK_NVME_OPC_RESERVATION_RELEASE:
			err = -ENOTSUP;
			break;
		default:
			err = map_io_cmd_req(ctrlr, req);
			break;
		}
	}

	if (spdk_unlikely(err < 0)) {
		SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n",
			    ctrlr_id(ctrlr), cmd->opc);
		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
		err = handle_cmd_rsp(vu_req, vu_req->cb_arg);
		_nvmf_vfio_user_req_free(sq, vu_req);
		return err;
	}

	vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING;
	spdk_nvmf_request_exec(req);

	return 0;
}

/* Returns the number of commands processed, or a negative value on error. */
static int
nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;
	uint32_t new_tail;
	int count = 0;

	assert(sq != NULL);

	ctrlr = sq->ctrlr;

	/* On aarch64 platforms, doorbell updates from the guest VM may not be
	 * visible on the SPDK target side because of a memory type mismatch:
	 * the guest maps the doorbells as device memory, while the SPDK target
	 * treats them as normal memory. See
	 * "https://developer.arm.com/documentation/102376/0100/
	 * Memory-aliasing-and-mismatched-memory-types". A barrier such as
	 * spdk_mb() alone cannot fix this; invalidating the cache line with
	 * "dc civac" does.
	 */
	spdk_ivdt_dcache(sq_dbl_tailp(ctrlr, sq));

	/* Load-Acquire. */
	new_tail = *sq_dbl_tailp(ctrlr, sq);

	/*
	 * Ensure that changes to the queue are visible to us.
	 * The host driver should write the queue first, do a wmb(), and then
	 * update the SQ tail doorbell (their Store-Release).
	 */
	spdk_rmb();

	new_tail = new_tail & 0xffffu;
	if (spdk_unlikely(new_tail >= sq->size)) {
		union spdk_nvme_async_event_completion event = {};

		SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid SQ%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid,
			      new_tail);
		event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR;
		event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE;
		nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event);

		return 0;
	}

	if (*sq_headp(sq) == new_tail) {
		return 0;
	}

	count = handle_sq_tdbl_write(ctrlr, new_tail, sq);
	if (count < 0) {
		fail_ctrlr(ctrlr);
	}

	return count;
}

/*
 * vfio-user transport poll handler. Note that the library context is polled in
 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
 * active qpairs.
 *
 * Returns the number of commands processed, or a negative value on error.
 */
static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_sq *sq, *tmp;
	int count = 0;

	assert(group != NULL);

	spdk_rmb();

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) {
		int ret;

		if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
			continue;
		}

		ret = nvmf_vfio_user_sq_poll(sq);

		if (ret < 0) {
			return ret;
		}

		count += ret;
	}

	return count;
}

static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
				    struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvme_transport_id *trid)
{
	return 0;
}

static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
				     struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static void
nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_request *req_to_abort = NULL;
	struct spdk_nvmf_request *temp_req = NULL;
	uint16_t cid;

	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;

	/* Iterate with a temporary pointer so that the abort command's own
	 * request is not clobbered by the loop.
	 */
	TAILQ_FOREACH(temp_req, &qpair->outstanding, link) {
		struct nvmf_vfio_user_req *vu_req;

		vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req);

		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
			req_to_abort = temp_req;
			break;
		}
	}

	if (req_to_abort == NULL) {
		spdk_nvmf_request_complete(req);
		return;
	}

	req->req_to_abort = req_to_abort;
	nvmf_ctrlr_abort_request(req);
}

static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
	opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
	opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
	opts->in_capsule_data_size = 0;
	opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
	opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
	opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
	opts->num_shared_buffers = 0;
	opts->buf_cache_size = 0;
	opts->association_timeout = 0;
	opts->transport_specific = NULL;
}
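
/*
 * Function table hooking the vfio-user transport into the generic NVMf
 * transport layer; the transport is registered via
 * SPDK_NVMF_TRANSPORT_REGISTER() at the end of this file.
 */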
const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
	.name = "VFIOUSER",
	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
	.opts_init = nvmf_vfio_user_opts_init,
	.create = nvmf_vfio_user_create,
	.destroy = nvmf_vfio_user_destroy,

	.listen = nvmf_vfio_user_listen,
	.stop_listen = nvmf_vfio_user_stop_listen,
	.cdata_init = nvmf_vfio_user_cdata_init,
	.listen_associate = nvmf_vfio_user_listen_associate,

	.listener_discover = nvmf_vfio_user_discover,

	.poll_group_create = nvmf_vfio_user_poll_group_create,
	.get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group,
	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
	.poll_group_add = nvmf_vfio_user_poll_group_add,
	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
	.poll_group_poll = nvmf_vfio_user_poll_group_poll,

	.req_free = nvmf_vfio_user_req_free,
	.req_complete = nvmf_vfio_user_req_complete,

	.qpair_fini = nvmf_vfio_user_close_qpair,
	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,
};

SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)