1 /*- 2 * BSD LICENSE 3 * Copyright (c) Intel Corporation. All rights reserved. 4 * Copyright (c) 2019, Nutanix Inc. All rights reserved. 5 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * NVMe over vfio-user transport 36 */ 37 38 #include <vfio-user/libvfio-user.h> 39 #include <vfio-user/pci_defs.h> 40 41 #include "spdk/barrier.h" 42 #include "spdk/stdinc.h" 43 #include "spdk/assert.h" 44 #include "spdk/thread.h" 45 #include "spdk/nvmf_transport.h" 46 #include "spdk/sock.h" 47 #include "spdk/string.h" 48 #include "spdk/util.h" 49 #include "spdk/log.h" 50 51 #include "transport.h" 52 53 #include "nvmf_internal.h" 54 55 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 56 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 57 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 58 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 59 60 #define NVME_DOORBELLS_OFFSET 0x1000 61 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000 62 63 /* 64 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 65 * available on PCI-X 2.0 and PCI Express buses 66 */ 67 #define NVME_REG_CFG_SIZE 0x1000 68 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 69 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8) 70 #define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 71 /* MSIX Table Size */ 72 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 73 /* MSIX Pending Bit Array Size */ 74 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000) 75 76 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 77 78 struct nvmf_vfio_user_req; 79 80 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 81 82 /* 1 more for PRP2 list itself */ 83 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 84 85 enum 
nvmf_vfio_user_req_state {
	VFIO_USER_REQUEST_STATE_FREE = 0,
	VFIO_USER_REQUEST_STATE_EXECUTING,
};

/* NVMe device state representation */
struct nvme_migr_sq_state {
	uint16_t	sqid;
	uint16_t	cqid;
	uint32_t	head;
	uint32_t	size;
	uint32_t	reserved;
	uint64_t	dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size");

struct nvme_migr_cq_state {
	uint16_t	cqid;
	uint16_t	phase;
	uint32_t	tail;
	uint32_t	size;
	uint32_t	iv;
	uint32_t	ien;
	uint32_t	reserved;
	uint64_t	dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size");

#define VFIO_USER_NVME_MIGR_MAGIC	0xAFEDBC23

/* The device state is stored in the VFIO MIGRATION BAR(9) region; keep the
 * device state page aligned.
 *
 * The NVMe device migration region is laid out as follows:
 * -------------------------------------------------------------------------
 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs |
 * -------------------------------------------------------------------------
 *
 * vfio_user_nvme_migr_header has a fixed length of 0x1000 bytes; newly added
 * fields should use the reserved space at the end of the data structure.
 */
struct vfio_user_nvme_migr_header {
	/* Magic value to validate migration data */
	uint32_t	magic;
	/* Version used to check that the data is the same on source and destination */
	uint32_t	version;

	/* The library uses this field to know how many fields in this
	 * structure are valid, starting at the beginning of this data
	 * structure. Fields added in the future should use the `unused`
	 * memory space.
	 */
	uint32_t	opts_size;
	uint32_t	reserved0;

	/* BARs information */
	uint64_t	bar_offset[VFU_PCI_DEV_NUM_REGIONS];
	uint64_t	bar_len[VFU_PCI_DEV_NUM_REGIONS];

	/* Queue pair start offset, relative to the beginning of this
	 * data structure.
	 */
	uint64_t	qp_offset;
	uint64_t	qp_len;

	/* Controller data structure */
	uint32_t	num_io_queues;
	uint32_t	reserved1;

	/* TODO: this part will be moved to common nvmf controller data */
	uint16_t	reserved2[3];
	uint16_t	nr_aers;
	uint16_t	aer_cids[NVMF_MIGR_MAX_PENDING_AERS];

	/* NVMf controller data offset and length, if present, relative to
	 * the beginning of this data structure.
	 */
	uint64_t	nvmf_data_offset;
	uint64_t	nvmf_data_len;

	/* Reserved memory space for newly added fields; this field is
	 * always kept at the end of this data structure.
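	 *
	 * Together with `opts_size` above this provides forward compatibility:
	 * a reader that understands fewer fields than the writer simply
	 * ignores the tail of the header, while the total size stays 0x1000.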
 */
	uint8_t		unused[3356];
};
SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");

struct vfio_user_nvme_migr_qp {
	struct nvme_migr_sq_state	sq;
	struct nvme_migr_cq_state	cq;
};

/* NVMe state definition used to load/restore from/to the NVMe migration BAR region */
struct vfio_user_nvme_migr_state {
	struct vfio_user_nvme_migr_header	ctrlr_header;
	struct nvmf_ctrlr_migr_data		nvmf_data;
	struct vfio_user_nvme_migr_qp		qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	uint8_t					bar0[NVME_REG_BAR0_SIZE];
	uint8_t					cfg[NVME_REG_CFG_SIZE];
};

struct nvmf_vfio_user_req {
	struct spdk_nvmf_request		req;
	struct spdk_nvme_cpl			rsp;
	struct spdk_nvme_cmd			cmd;

	enum nvmf_vfio_user_req_state		state;
	nvmf_vfio_user_req_cb_fn		cb_fn;
	void					*cb_arg;

	/* old CC before prop_set_cc fabric command */
	union spdk_nvme_cc_register		cc;

	TAILQ_ENTRY(nvmf_vfio_user_req)		link;

	struct iovec				iov[NVMF_VFIO_USER_MAX_IOVECS];
	uint8_t					iovcnt;

	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
	uint8_t					sg[];
};

/*
 * Mapping of an NVMe queue.
 *
 * This holds the information tracking a local process mapping of an NVMe queue
 * shared by the client.
 */
struct nvme_q_mapping {
	/* iov of local process mapping. */
	struct iovec iov;
	/* Stored sg, needed for unmap. */
	dma_sg_t *sg;
	/* Client PRP of queue. */
	uint64_t prp1;
};

enum nvmf_vfio_user_sq_state {
	VFIO_USER_SQ_UNUSED = 0,
	VFIO_USER_SQ_CREATED,
	VFIO_USER_SQ_DELETED,
	VFIO_USER_SQ_ACTIVE,
	VFIO_USER_SQ_INACTIVE
};

enum nvmf_vfio_user_cq_state {
	VFIO_USER_CQ_UNUSED = 0,
	VFIO_USER_CQ_CREATED,
	VFIO_USER_CQ_DELETED,
};

enum nvmf_vfio_user_ctrlr_state {
	VFIO_USER_CTRLR_CREATING = 0,
	VFIO_USER_CTRLR_RUNNING,
	/* Quiesce requested by libvfio-user */
	VFIO_USER_CTRLR_PAUSING,
	/* The NVMf subsystem is paused; it is safe to do a PCI reset, memory
	 * register, memory unregister, and vfio migration state transition in
	 * this state.
	 */
	VFIO_USER_CTRLR_PAUSED,
	/*
	 * Implies that the NVMf subsystem is paused. The device will be
	 * unquiesced (PCI reset, memory register and unregister, the
	 * controller in the destination VM has been restored), and an NVMf
	 * subsystem resume has been requested.
	 */
	VFIO_USER_CTRLR_RESUMING,
	/*
	 * Implies that the NVMf subsystem is paused. Both the controller in
	 * the source VM and the one in the destination VM are in this state
	 * during live migration.
	 */
	VFIO_USER_CTRLR_MIGRATING
};

/* Migration region used to record the NVMe device state data structure */
struct vfio_user_migration_region {
	uint64_t last_data_offset;
	uint64_t pending_bytes;
};

struct nvmf_vfio_user_sq {
	struct spdk_nvmf_qpair			qpair;
	struct spdk_nvmf_transport_poll_group	*group;
	struct nvmf_vfio_user_ctrlr		*ctrlr;

	uint32_t				qid;
	/* Number of entries in queue. */
	uint32_t				size;
	struct nvme_q_mapping			mapping;
	enum nvmf_vfio_user_sq_state		sq_state;

	uint32_t				head;

	/* multiple SQs can be mapped to the same CQ */
	uint16_t				cqid;

	/* handle_queue_connect_rsp() is used both for the CREATE IO SQ
	 * response and for the SQ re-connect response in the destination VM.
	 * In the former case we post an NVMe completion to the VM; this flag
	 * is not set when re-connecting SQs in the destination VM.
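	 *
	 * The CREATE IO SQ flow is: the admin command arrives, we call
	 * spdk_nvmf_tgt_new_qpair(), the poll group add callback issues a
	 * fabrics CONNECT, and handle_queue_connect_rsp() finally posts the
	 * CQE back to the guest.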
283 */ 284 bool post_create_io_sq_completion; 285 /* Copy of Create IO SQ command, this field is used together with 286 * `post_create_io_sq_completion` flag. 287 */ 288 struct spdk_nvme_cmd create_io_sq_cmd; 289 290 /* Currently unallocated reqs. */ 291 TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs; 292 /* Poll group entry */ 293 TAILQ_ENTRY(nvmf_vfio_user_sq) link; 294 /* Connected SQ entry */ 295 TAILQ_ENTRY(nvmf_vfio_user_sq) tailq; 296 }; 297 298 struct nvmf_vfio_user_cq { 299 struct spdk_nvmf_transport_poll_group *group; 300 struct spdk_thread *thread; 301 uint32_t cq_ref; 302 303 uint32_t qid; 304 /* Number of entries in queue. */ 305 uint32_t size; 306 struct nvme_q_mapping mapping; 307 enum nvmf_vfio_user_cq_state cq_state; 308 309 uint32_t tail; 310 bool phase; 311 312 uint16_t iv; 313 bool ien; 314 }; 315 316 struct nvmf_vfio_user_poll_group { 317 struct spdk_nvmf_transport_poll_group group; 318 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 319 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 320 }; 321 322 struct nvmf_vfio_user_ctrlr { 323 struct nvmf_vfio_user_endpoint *endpoint; 324 struct nvmf_vfio_user_transport *transport; 325 326 /* Connected SQs list */ 327 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 328 enum nvmf_vfio_user_ctrlr_state state; 329 330 struct vfio_user_migration_region migr_reg; 331 /* Controller is in source VM when doing live migration */ 332 bool in_source_vm; 333 334 struct spdk_thread *thread; 335 struct spdk_poller *vfu_ctx_poller; 336 struct spdk_interrupt *intr; 337 int intr_fd; 338 339 bool queued_quiesce; 340 341 bool reset_shn; 342 343 uint16_t cntlid; 344 struct spdk_nvmf_ctrlr *ctrlr; 345 346 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 347 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 348 349 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 350 351 volatile uint32_t *doorbells; 352 }; 353 354 struct nvmf_vfio_user_endpoint { 355 struct nvmf_vfio_user_transport *transport; 356 vfu_ctx_t *vfu_ctx; 357 struct spdk_poller *accept_poller; 358 struct spdk_thread *accept_thread; 359 struct msixcap *msix; 360 vfu_pci_config_space_t *pci_config_space; 361 int devmem_fd; 362 int accept_intr_fd; 363 struct spdk_interrupt *accept_intr; 364 365 volatile uint32_t *doorbells; 366 367 int migr_fd; 368 void *migr_data; 369 370 struct spdk_nvme_transport_id trid; 371 const struct spdk_nvmf_subsystem *subsystem; 372 373 struct nvmf_vfio_user_ctrlr *ctrlr; 374 pthread_mutex_t lock; 375 376 bool need_async_destroy; 377 378 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 379 }; 380 381 struct nvmf_vfio_user_transport_opts { 382 bool disable_mappable_bar0; 383 }; 384 385 struct nvmf_vfio_user_transport { 386 struct spdk_nvmf_transport transport; 387 struct nvmf_vfio_user_transport_opts transport_opts; 388 bool intr_mode_supported; 389 pthread_mutex_t lock; 390 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 391 392 pthread_mutex_t pg_lock; 393 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 394 struct nvmf_vfio_user_poll_group *next_pg; 395 }; 396 397 /* 398 * function prototypes 399 */ 400 static int 401 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 402 403 static struct nvmf_vfio_user_req * 404 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 405 406 /* 407 * Local process virtual address of a queue. 
 */
static inline void *
q_addr(struct nvme_q_mapping *mapping)
{
	return mapping->iov.iov_base;
}

/* Each queue pair uses a pair of doorbells: the SQ tail doorbell at dword
 * index qid * 2 and the CQ head doorbell at qid * 2 + 1 (DSTRD is 0, so
 * doorbells are 4 bytes apart).
 */
static inline int
queue_index(uint16_t qid, bool is_cq)
{
	return (qid * 2) + is_cq;
}

static inline volatile uint32_t *
sq_headp(struct nvmf_vfio_user_sq *sq)
{
	assert(sq != NULL);
	return &sq->head;
}

static inline volatile uint32_t *
sq_dbl_tailp(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq)
{
	assert(ctrlr != NULL);
	assert(sq != NULL);
	return &ctrlr->doorbells[queue_index(sq->qid, false)];
}

static inline volatile uint32_t *
cq_dbl_headp(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq)
{
	assert(ctrlr != NULL);
	assert(cq != NULL);
	return &ctrlr->doorbells[queue_index(cq->qid, true)];
}

static inline volatile uint32_t *
cq_tailp(struct nvmf_vfio_user_cq *cq)
{
	assert(cq != NULL);
	return &cq->tail;
}

static inline void
sq_head_advance(struct nvmf_vfio_user_sq *sq)
{
	assert(sq != NULL);

	assert(*sq_headp(sq) < sq->size);
	(*sq_headp(sq))++;

	if (spdk_unlikely(*sq_headp(sq) == sq->size)) {
		*sq_headp(sq) = 0;
	}
}

static inline void
cq_tail_advance(struct nvmf_vfio_user_cq *cq)
{
	assert(cq != NULL);

	assert(*cq_tailp(cq) < cq->size);
	(*cq_tailp(cq))++;

	if (spdk_unlikely(*cq_tailp(cq) == cq->size)) {
		*cq_tailp(cq) = 0;
		cq->phase = !cq->phase;
	}
}

static inline bool
cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq)
{
	uint32_t qindex;

	assert(ctrlr != NULL);
	assert(cq != NULL);

	qindex = *cq_tailp(cq) + 1;
	if (spdk_unlikely(qindex == cq->size)) {
		qindex = 0;
	}

	return qindex == *cq_dbl_headp(ctrlr, cq);
}

static inline size_t
vfio_user_migr_data_len(void)
{
	return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE);
}

static int
nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
		  uint32_t max_iovcnt, uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint64_t prp1, prp2;
	void *vva;
	uint32_t i;
	uint32_t residue_len, nents;
	uint64_t *prp_list;
	uint32_t iovcnt;

	assert(max_iovcnt > 0);

	prp1 = cmd->dptr.prp.prp1;
	prp2 = cmd->dptr.prp.prp2;

	/* PRP1 may start at an unaligned page address */
	residue_len = mps - (prp1 % mps);
	residue_len = spdk_min(len, residue_len);

	vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE);
	if (spdk_unlikely(vva == NULL)) {
		SPDK_ERRLOG("GPA to VVA failed\n");
		return -EINVAL;
	}
	len -= residue_len;
	if (len && max_iovcnt < 2) {
		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
		return -ERANGE;
	}
	iovs[0].iov_base = vva;
	iovs[0].iov_len = residue_len;

	if (len) {
		if (spdk_unlikely(prp2 == 0)) {
			SPDK_ERRLOG("no PRP2, %d remaining\n", len);
			return -EINVAL;
		}

		if (len <= mps) {
			/* 2 PRPs used */
			iovcnt = 2;
			vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n",
					    prp2, len);
				return -EINVAL;
			}
			iovs[1].iov_base = vva;
			iovs[1].iov_len = len;
		} else {
			/* PRP list used */
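			/*
			 * Worked example: a 16 KiB transfer whose PRP1 starts
			 * 512 bytes into a page (mps = 4096) leaves
			 * len = 16384 - 3584 = 12800 bytes here, so
			 * nents = 4 PRP entries are read from PRP2 and mapped
			 * into iovs[1..4] (chunks of 4096, 4096, 4096, 512),
			 * giving iovcnt = 5.
			 */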
			nents = (len + mps - 1) / mps;
			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
				SPDK_ERRLOG("Too many page entries\n");
				return -ERANGE;
			}

			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
					    prp2, nents);
				return -EINVAL;
			}
			prp_list = vva;
			i = 0;
			while (len != 0) {
				residue_len = spdk_min(len, mps);
				vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE);
				if (spdk_unlikely(vva == NULL)) {
					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
						    prp_list[i], residue_len);
					return -EINVAL;
				}
				iovs[i + 1].iov_base = vva;
				iovs[i + 1].iov_len = residue_len;
				len -= residue_len;
				i++;
			}
			iovcnt = i + 1;
		}
	} else {
		/* 1 PRP used */
		iovcnt = 1;
	}

	assert(iovcnt <= max_iovcnt);
	return iovcnt;
}

static int
nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
		       struct iovec *iovs, uint32_t max_iovcnt,
		       void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint32_t i;
	void *vva;

	if (spdk_unlikely(max_iovcnt < num_sgls)) {
		return -ERANGE;
	}

	for (i = 0; i < num_sgls; i++) {
		if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type);
			return -EINVAL;
		}
		vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[i].iov_base = vva;
		iovs[i].iov_len = sgls[i].unkeyed.length;
	}

	return num_sgls;
}

static int
nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
		  uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	struct spdk_nvme_sgl_descriptor *sgl, *last_sgl;
	uint32_t num_sgls, seg_len;
	void *vva;
	int ret;
	uint32_t total_iovcnt = 0;

	/* SGL cases */
	sgl = &cmd->dptr.sgl1;

	/* only one SGL segment */
	if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
		assert(max_iovcnt > 0);
		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[0].iov_base = vva;
		iovs[0].iov_len = sgl->unkeyed.length;
		assert(sgl->unkeyed.length == len);

		return 1;
	}

	for (;;) {
		if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) &&
				  (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type);
			return -EINVAL;
		}

		seg_len = sgl->unkeyed.length;
		if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) {
			SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len);
			return -EINVAL;
		}

		num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor);
		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}

		/* sgl now points to the first descriptor of this segment */
		sgl = (struct spdk_nvme_sgl_descriptor *)vva;
		last_sgl = &sgl[num_sgls - 1];

		/* we are done */
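		/*
		 * A segment's last descriptor is either a data block, in which
		 * case the whole segment is mapped and the walk ends, or a
		 * (LAST_)SEGMENT descriptor pointing at the next segment, in
		 * which case everything but that last descriptor is mapped
		 * and the walk continues there.
		 */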
		if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
			/* map whole sgl list */
			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt],
						     max_iovcnt - total_iovcnt, gpa_to_vva);
			if (spdk_unlikely(ret < 0)) {
				return ret;
			}
			total_iovcnt += ret;

			return total_iovcnt;
		}

		if (num_sgls > 1) {
			/* map the whole segment excluding last_sgl */
			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt],
						     max_iovcnt - total_iovcnt, gpa_to_vva);
			if (spdk_unlikely(ret < 0)) {
				return ret;
			}
			total_iovcnt += ret;
		}

		/* move on to the next segment */
		sgl = last_sgl;
	}

	return 0;
}

static int
nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
	     uint32_t len, size_t mps,
	     void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	if (cmd->psdt == SPDK_NVME_PSDT_PRP) {
		return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
	}

	return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
}

static char *
endpoint_id(struct nvmf_vfio_user_endpoint *endpoint)
{
	return endpoint->trid.traddr;
}

static char *
ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	if (!ctrlr || !ctrlr->endpoint) {
		return "Null Ctrlr";
	}

	return endpoint_id(ctrlr->endpoint);
}

static void
fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
{
	const struct spdk_nvmf_registers *regs;

	assert(vu_ctrlr != NULL);
	assert(vu_ctrlr->ctrlr != NULL);

	regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr);
	if (regs->csts.bits.cfs == 0) {
		SPDK_ERRLOG("%s: failing controller\n", ctrlr_id(vu_ctrlr));
	}

	nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr);
}

static inline bool
ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
{
	assert(vu_ctrlr != NULL);
	assert(vu_ctrlr->endpoint != NULL);

	vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space;

	return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe);
}

static void
nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint)
{
	SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint));

	spdk_interrupt_unregister(&endpoint->accept_intr);
	spdk_poller_unregister(&endpoint->accept_poller);

	if (endpoint->doorbells) {
		munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
	}

	if (endpoint->devmem_fd > 0) {
		close(endpoint->devmem_fd);
	}

	if (endpoint->migr_data) {
		munmap(endpoint->migr_data, vfio_user_migr_data_len());
	}

	if (endpoint->migr_fd > 0) {
		close(endpoint->migr_fd);
	}

	if (endpoint->vfu_ctx) {
		vfu_destroy_ctx(endpoint->vfu_ctx);
	}

	pthread_mutex_destroy(&endpoint->lock);
	free(endpoint);
}

/* called when process exits */
static int
nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport,
		       spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;

	SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n");

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_destroy(&vu_transport->lock);
pthread_mutex_destroy(&vu_transport->pg_lock); 805 806 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 807 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 808 nvmf_vfio_user_destroy_endpoint(endpoint); 809 } 810 811 free(vu_transport); 812 813 if (cb_fn) { 814 cb_fn(cb_arg); 815 } 816 817 return 0; 818 } 819 820 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 821 { 822 "disable_mappable_bar0", 823 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 824 spdk_json_decode_bool, true 825 }, 826 }; 827 828 static struct spdk_nvmf_transport * 829 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 830 { 831 struct nvmf_vfio_user_transport *vu_transport; 832 int err; 833 834 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 835 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 836 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 837 return NULL; 838 } 839 840 vu_transport = calloc(1, sizeof(*vu_transport)); 841 if (vu_transport == NULL) { 842 SPDK_ERRLOG("Transport alloc fail: %m\n"); 843 return NULL; 844 } 845 846 err = pthread_mutex_init(&vu_transport->lock, NULL); 847 if (err != 0) { 848 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 849 goto err; 850 } 851 TAILQ_INIT(&vu_transport->endpoints); 852 853 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 854 if (err != 0) { 855 pthread_mutex_destroy(&vu_transport->lock); 856 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 857 goto err; 858 } 859 TAILQ_INIT(&vu_transport->poll_groups); 860 861 if (opts->transport_specific != NULL && 862 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 863 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 864 vu_transport)) { 865 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 866 goto cleanup; 867 } 868 869 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 870 vu_transport->transport_opts.disable_mappable_bar0); 871 872 /* 873 * To support interrupt mode, the transport must be configured with 874 * mappable BAR0 disabled: we need a vfio-user message to wake us up 875 * when a client writes new doorbell values to BAR0, via the 876 * libvfio-user socket fd. 
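	 *
	 * With a mappable BAR0, the client writes doorbells directly into
	 * shared memory and no message is generated, so there would be nothing
	 * to wake an interrupt-driven poll group; hence intr_mode_supported is
	 * simply tied to disable_mappable_bar0 below.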
877 */ 878 vu_transport->intr_mode_supported = 879 vu_transport->transport_opts.disable_mappable_bar0; 880 881 return &vu_transport->transport; 882 883 cleanup: 884 pthread_mutex_destroy(&vu_transport->lock); 885 pthread_mutex_destroy(&vu_transport->pg_lock); 886 err: 887 free(vu_transport); 888 return NULL; 889 } 890 891 static uint32_t 892 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 893 { 894 assert(vu_ctrlr != NULL); 895 assert(vu_ctrlr->ctrlr != NULL); 896 897 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 898 } 899 900 static void * 901 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov, int prot) 902 { 903 int ret; 904 905 assert(ctx != NULL); 906 assert(sg != NULL); 907 assert(iov != NULL); 908 909 ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 910 if (ret < 0) { 911 return NULL; 912 } 913 914 ret = vfu_map_sg(ctx, sg, iov, 1, 0); 915 if (ret != 0) { 916 return NULL; 917 } 918 919 assert(iov->iov_base != NULL); 920 return iov->iov_base; 921 } 922 923 static int 924 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 925 uint32_t q_size, bool is_cq, bool unmap) 926 { 927 uint64_t len; 928 void *ret; 929 930 assert(q_size); 931 assert(q_addr(mapping) == NULL); 932 933 if (is_cq) { 934 len = q_size * sizeof(struct spdk_nvme_cpl); 935 } else { 936 len = q_size * sizeof(struct spdk_nvme_cmd); 937 } 938 939 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 940 mapping->sg, &mapping->iov, 941 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 942 if (ret == NULL) { 943 return -EFAULT; 944 } 945 946 if (unmap) { 947 memset(q_addr(mapping), 0, len); 948 } 949 950 return 0; 951 } 952 953 static inline void 954 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 955 { 956 if (q_addr(mapping) != NULL) { 957 vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 958 &mapping->iov, 1); 959 mapping->iov.iov_base = NULL; 960 } 961 } 962 963 static int 964 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 965 { 966 struct nvmf_vfio_user_sq *sq; 967 const struct spdk_nvmf_registers *regs; 968 int ret; 969 970 assert(ctrlr != NULL); 971 972 sq = ctrlr->sqs[0]; 973 974 assert(sq != NULL); 975 assert(q_addr(&sq->mapping) == NULL); 976 /* XXX ctrlr->asq == 0 is a valid memory address */ 977 978 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 979 sq->qid = 0; 980 sq->size = regs->aqa.bits.asqs + 1; 981 sq->mapping.prp1 = regs->asq; 982 *sq_headp(sq) = 0; 983 sq->cqid = 0; 984 985 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 986 if (ret) { 987 return ret; 988 } 989 990 *sq_dbl_tailp(ctrlr, sq) = 0; 991 992 return 0; 993 } 994 995 static int 996 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 997 { 998 struct nvmf_vfio_user_cq *cq; 999 const struct spdk_nvmf_registers *regs; 1000 int ret; 1001 1002 assert(ctrlr != NULL); 1003 1004 cq = ctrlr->cqs[0]; 1005 1006 assert(cq != NULL); 1007 1008 assert(q_addr(&cq->mapping) == NULL); 1009 1010 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1011 assert(regs != NULL); 1012 cq->qid = 0; 1013 cq->size = regs->aqa.bits.acqs + 1; 1014 cq->mapping.prp1 = regs->acq; 1015 *cq_tailp(cq) = 0; 1016 cq->ien = true; 1017 cq->phase = true; 1018 1019 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1020 if (ret) { 1021 return ret; 1022 } 1023 1024 *cq_dbl_headp(ctrlr, cq) = 0; 1025 1026 return 0; 1027 } 1028 1029 static inline dma_sg_t * 1030 vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt) 1031 { 1032 return 
(dma_sg_t *)(vu_req->sg + iovcnt * dma_sg_size());
}

static void *
_map_one(void *prv, uint64_t addr, uint64_t len, int prot)
{
	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
	struct spdk_nvmf_qpair *qpair;
	struct nvmf_vfio_user_req *vu_req;
	struct nvmf_vfio_user_sq *sq;
	void *ret;

	assert(req != NULL);
	qpair = req->qpair;
	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);

	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
	ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len,
		      vu_req_to_sg_t(vu_req, vu_req->iovcnt),
		      &vu_req->iov[vu_req->iovcnt], prot);
	if (spdk_likely(ret != NULL)) {
		vu_req->iovcnt++;
	}
	return ret;
}

static int
vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
		  struct iovec *iov, uint32_t length)
{
	/* Map the command's PRP list (or SGLs) from guest physical memory to
	 * local virtual memory addresses.
	 */
	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
			    length, 4096, _map_one);
}

static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
	       struct nvmf_vfio_user_sq *sq);

/*
 * Posts a CQE in the completion queue.
 *
 * @ctrlr: the vfio-user controller
 * @cq: the completion queue
 * @cdw0: cdw0 as reported by NVMf
 * @sqid: submission queue ID
 * @cid: command identifier in NVMe command
 * @sc: the NVMe CQE status code
 * @sct: the NVMe CQE status code type
 */
static int
post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq,
		uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct)
{
	struct spdk_nvme_cpl *cpl;
	const struct spdk_nvmf_registers *regs;
	int err;

	assert(ctrlr != NULL);

	if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) {
		return 0;
	}

	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
	if (regs->csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
		SPDK_DEBUGLOG(nvmf_vfio,
			      "%s: ignore completion SQ%d cid=%d status=%#x\n",
			      ctrlr_id(ctrlr), sqid, cid, sc);
		return 0;
	}

	if (cq_is_full(ctrlr, cq)) {
		SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
			    ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq),
			    *cq_dbl_headp(ctrlr, cq));
		return -1;
	}

	cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq);

	assert(ctrlr->sqs[sqid] != NULL);
	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
		      ctrlr_id(ctrlr), sqid, cid, sc, *sq_headp(ctrlr->sqs[sqid]),
		      *cq_tailp(cq));

	cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]);
	cpl->sqid = sqid;
	cpl->cid = cid;
	cpl->cdw0 = cdw0;
	cpl->status.dnr = 0x0;
	cpl->status.m = 0x0;
	cpl->status.sct = sct;
	cpl->status.sc = sc;
	cpl->status.p = cq->phase;

	/* Ensure the Completion Queue Entry is visible. */
	spdk_wmb();
	cq_tail_advance(cq);

	/*
	 * This function now executes in SPDK thread context, but interrupts
	 * might be triggered from vfio-user thread context, so
	 * check for race conditions.
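	 *
	 * The interrupt is only raised when the guest has interrupts enabled
	 * at the PCI level (ctrlr_interrupt_enabled()) and the CQ was created
	 * with IEN set; cq->iv selects the MSI-X vector passed to
	 * vfu_irq_trigger().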
	 */
	if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) {
		err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
		if (err != 0) {
			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
				    ctrlr_id(ctrlr));
			return err;
		}
	}

	return 0;
}

static bool
io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq)
{
	assert(vu_ctrlr != NULL);

	if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) {
		return false;
	}

	if (is_cq) {
		if (vu_ctrlr->cqs[qid] == NULL) {
			return false;
		}

		return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED &&
			vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED);
	}

	if (vu_ctrlr->sqs[qid] == NULL) {
		return false;
	}

	return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED &&
		vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED);
}

static void
free_sq_reqs(struct nvmf_vfio_user_sq *sq)
{
	while (!TAILQ_EMPTY(&sq->free_reqs)) {
		struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs);
		TAILQ_REMOVE(&sq->free_reqs, vu_req, link);
		free(vu_req);
	}
}

/* Deletes an SQ. If this SQ is the last user of the associated CQ
 * and the controller is being shut down or reset, then the CQ is
 * also deleted.
 */
static void
delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_cq *cq;
	uint16_t cqid;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete SQ%d=%p done\n", ctrlr_id(vu_ctrlr),
		      sq->qid, sq);

	/* Free SQ resources */
	unmap_q(vu_ctrlr, &sq->mapping);

	free_sq_reqs(sq);

	sq->size = 0;

	sq->sq_state = VFIO_USER_SQ_DELETED;

	/* Controller RESET and SHUTDOWN are special cases: the VM may not
	 * send DELETE IO SQ/CQ commands, and the NVMf library will
	 * disconnect the I/O queue pairs.
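	 * In that case we drop this SQ's reference on its CQ here, and tear
	 * the CQ down once the last SQ using it is gone.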
1214 */ 1215 if (vu_ctrlr->reset_shn) { 1216 cqid = sq->cqid; 1217 cq = vu_ctrlr->cqs[cqid]; 1218 1219 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete CQ%d=%p\n", ctrlr_id(vu_ctrlr), 1220 cq->qid, cq); 1221 1222 if (cq->cq_ref) { 1223 cq->cq_ref--; 1224 } 1225 if (cq->cq_ref == 0) { 1226 unmap_q(vu_ctrlr, &cq->mapping); 1227 cq->size = 0; 1228 cq->cq_state = VFIO_USER_CQ_DELETED; 1229 cq->group = NULL; 1230 } 1231 } 1232 } 1233 1234 static void 1235 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1236 { 1237 struct nvmf_vfio_user_sq *sq; 1238 struct nvmf_vfio_user_cq *cq; 1239 1240 if (ctrlr == NULL) { 1241 return; 1242 } 1243 1244 sq = ctrlr->sqs[qid]; 1245 if (sq) { 1246 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid); 1247 unmap_q(ctrlr, &sq->mapping); 1248 1249 free_sq_reqs(sq); 1250 1251 free(sq->mapping.sg); 1252 free(sq); 1253 ctrlr->sqs[qid] = NULL; 1254 } 1255 1256 cq = ctrlr->cqs[qid]; 1257 if (cq) { 1258 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free CQ %u\n", ctrlr_id(ctrlr), qid); 1259 unmap_q(ctrlr, &cq->mapping); 1260 free(cq->mapping.sg); 1261 free(cq); 1262 ctrlr->cqs[qid] = NULL; 1263 } 1264 } 1265 1266 static int 1267 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1268 const uint16_t id) 1269 { 1270 struct nvmf_vfio_user_sq *sq; 1271 1272 assert(ctrlr != NULL); 1273 assert(transport != NULL); 1274 assert(ctrlr->sqs[id] == NULL); 1275 1276 sq = calloc(1, sizeof(*sq)); 1277 if (sq == NULL) { 1278 return -ENOMEM; 1279 } 1280 sq->mapping.sg = calloc(1, dma_sg_size()); 1281 if (sq->mapping.sg == NULL) { 1282 free(sq); 1283 return -ENOMEM; 1284 } 1285 1286 sq->qid = id; 1287 sq->qpair.qid = id; 1288 sq->qpair.transport = transport; 1289 sq->ctrlr = ctrlr; 1290 ctrlr->sqs[id] = sq; 1291 1292 TAILQ_INIT(&sq->free_reqs); 1293 1294 return 0; 1295 } 1296 1297 static int 1298 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1299 { 1300 struct nvmf_vfio_user_cq *cq; 1301 1302 assert(vu_ctrlr != NULL); 1303 assert(vu_ctrlr->cqs[id] == NULL); 1304 1305 cq = calloc(1, sizeof(*cq)); 1306 if (cq == NULL) { 1307 return -ENOMEM; 1308 } 1309 cq->mapping.sg = calloc(1, dma_sg_size()); 1310 if (cq->mapping.sg == NULL) { 1311 free(cq); 1312 return -ENOMEM; 1313 } 1314 1315 cq->qid = id; 1316 vu_ctrlr->cqs[id] = cq; 1317 1318 return 0; 1319 } 1320 1321 static int 1322 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1323 { 1324 struct nvmf_vfio_user_req *vu_req, *tmp; 1325 size_t req_size; 1326 uint32_t i; 1327 1328 req_size = sizeof(struct nvmf_vfio_user_req) + 1329 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1330 1331 for (i = 0; i < sq->size; i++) { 1332 struct spdk_nvmf_request *req; 1333 1334 vu_req = calloc(1, req_size); 1335 if (vu_req == NULL) { 1336 goto err; 1337 } 1338 1339 req = &vu_req->req; 1340 req->qpair = &sq->qpair; 1341 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1342 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1343 1344 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1345 } 1346 1347 return 0; 1348 1349 err: 1350 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1351 free(vu_req); 1352 } 1353 return -ENOMEM; 1354 } 1355 1356 static uint16_t 1357 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 1358 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1359 { 1360 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1361 struct nvmf_vfio_user_sq *sq; 1362 uint32_t qsize; 1363 uint16_t cqid; 1364 uint16_t qid; 1365 int err; 1366 1367 qid = 
cmd->cdw10_bits.create_io_q.qid; 1368 cqid = cmd->cdw11_bits.create_io_sq.cqid; 1369 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1370 1371 if (ctrlr->sqs[qid] == NULL) { 1372 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 1373 if (err != 0) { 1374 *sct = SPDK_NVME_SCT_GENERIC; 1375 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1376 } 1377 } 1378 1379 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1380 SPDK_ERRLOG("%s: invalid CQID %u\n", ctrlr_id(ctrlr), cqid); 1381 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1382 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1383 } 1384 1385 /* CQ must be created before SQ. */ 1386 if (!io_q_exists(ctrlr, cqid, true)) { 1387 SPDK_ERRLOG("%s: CQ%u does not exist\n", ctrlr_id(ctrlr), cqid); 1388 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1389 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 1390 } 1391 1392 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 1393 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 1394 *sct = SPDK_NVME_SCT_GENERIC; 1395 return SPDK_NVME_SC_INVALID_FIELD; 1396 } 1397 1398 sq = ctrlr->sqs[qid]; 1399 sq->size = qsize; 1400 1401 SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr), 1402 qid, cqid); 1403 1404 sq->mapping.prp1 = cmd->dptr.prp.prp1; 1405 1406 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1407 if (err) { 1408 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1409 *sct = SPDK_NVME_SCT_GENERIC; 1410 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1411 } 1412 1413 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped SQ%d IOVA=%#lx vaddr=%p\n", 1414 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1415 q_addr(&sq->mapping)); 1416 1417 err = alloc_sq_reqs(ctrlr, sq); 1418 if (err < 0) { 1419 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 1420 *sct = SPDK_NVME_SCT_GENERIC; 1421 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1422 } 1423 1424 sq->cqid = cqid; 1425 ctrlr->cqs[sq->cqid]->cq_ref++; 1426 sq->sq_state = VFIO_USER_SQ_CREATED; 1427 *sq_headp(sq) = 0; 1428 *sq_dbl_tailp(ctrlr, sq) = 0; 1429 1430 /* 1431 * Create our new I/O qpair. This asynchronously invokes, on a suitable 1432 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 1433 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 1434 * connect command. This command is then eventually completed via 1435 * handle_queue_connect_rsp(). 
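	 *
	 * The CQE for this CREATE IO SQ command is therefore posted from
	 * handle_queue_connect_rsp() (see post_create_io_sq_completion below),
	 * not from this function.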
1436 */ 1437 sq->create_io_sq_cmd = *cmd; 1438 sq->post_create_io_sq_completion = true; 1439 1440 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 1441 &sq->qpair); 1442 1443 *sct = SPDK_NVME_SCT_GENERIC; 1444 return SPDK_NVME_SC_SUCCESS; 1445 } 1446 1447 static uint16_t 1448 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 1449 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1450 { 1451 struct nvmf_vfio_user_cq *cq; 1452 uint32_t qsize; 1453 uint16_t qid; 1454 int err; 1455 1456 qid = cmd->cdw10_bits.create_io_q.qid; 1457 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1458 1459 if (ctrlr->cqs[qid] == NULL) { 1460 err = init_cq(ctrlr, qid); 1461 if (err != 0) { 1462 *sct = SPDK_NVME_SCT_GENERIC; 1463 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1464 } 1465 } 1466 1467 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 1468 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 1469 *sct = SPDK_NVME_SCT_GENERIC; 1470 return SPDK_NVME_SC_INVALID_FIELD; 1471 } 1472 1473 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 1474 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 1475 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1476 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 1477 } 1478 1479 cq = ctrlr->cqs[qid]; 1480 cq->size = qsize; 1481 1482 cq->mapping.prp1 = cmd->dptr.prp.prp1; 1483 1484 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1485 if (err) { 1486 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1487 *sct = SPDK_NVME_SCT_GENERIC; 1488 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1489 } 1490 1491 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped CQ%d IOVA=%#lx vaddr=%p\n", 1492 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1493 q_addr(&cq->mapping)); 1494 1495 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 1496 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 1497 cq->phase = true; 1498 cq->cq_state = VFIO_USER_CQ_CREATED; 1499 1500 *cq_tailp(cq) = 0; 1501 *cq_dbl_headp(ctrlr, cq) = 0; 1502 1503 *sct = SPDK_NVME_SCT_GENERIC; 1504 return SPDK_NVME_SC_SUCCESS; 1505 } 1506 1507 /* 1508 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 1509 * on error. 1510 */ 1511 static int 1512 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 1513 struct spdk_nvme_cmd *cmd, const bool is_cq) 1514 { 1515 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1516 uint16_t sct = SPDK_NVME_SCT_GENERIC; 1517 uint16_t sc = SPDK_NVME_SC_SUCCESS; 1518 uint32_t qsize; 1519 uint16_t qid; 1520 1521 assert(ctrlr != NULL); 1522 assert(cmd != NULL); 1523 1524 qid = cmd->cdw10_bits.create_io_q.qid; 1525 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1526 SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr), 1527 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 1528 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1529 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1530 goto out; 1531 } 1532 1533 if (io_q_exists(ctrlr, qid, is_cq)) { 1534 SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr), 1535 is_cq ? 
'C' : 'S', qid); 1536 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1537 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1538 goto out; 1539 } 1540 1541 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1542 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 1543 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 1544 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1545 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 1546 goto out; 1547 } 1548 1549 if (is_cq) { 1550 sc = handle_create_io_cq(ctrlr, cmd, &sct); 1551 } else { 1552 sc = handle_create_io_sq(ctrlr, cmd, &sct); 1553 1554 if (sct == SPDK_NVME_SCT_GENERIC && 1555 sc == SPDK_NVME_SC_SUCCESS) { 1556 /* Completion posted asynchronously. */ 1557 return 0; 1558 } 1559 } 1560 1561 out: 1562 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 1563 } 1564 1565 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 1566 * queue pair, so save the command in a context. 1567 */ 1568 struct vfio_user_delete_sq_ctx { 1569 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 1570 struct spdk_nvme_cmd delete_io_sq_cmd; 1571 }; 1572 1573 static void 1574 vfio_user_qpair_delete_cb(void *cb_arg) 1575 { 1576 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 1577 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 1578 1579 post_completion(vu_ctrlr, vu_ctrlr->cqs[0], 0, 0, ctx->delete_io_sq_cmd.cid, 1580 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 1581 free(ctx); 1582 } 1583 1584 /* 1585 * Deletes a completion or submission I/O queue. 1586 */ 1587 static int 1588 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 1589 struct spdk_nvme_cmd *cmd, const bool is_cq) 1590 { 1591 uint16_t sct = SPDK_NVME_SCT_GENERIC; 1592 uint16_t sc = SPDK_NVME_SC_SUCCESS; 1593 struct nvmf_vfio_user_sq *sq; 1594 struct nvmf_vfio_user_cq *cq; 1595 struct vfio_user_delete_sq_ctx *ctx; 1596 1597 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n", 1598 ctrlr_id(ctrlr), is_cq ? 'C' : 'S', 1599 cmd->cdw10_bits.delete_io_q.qid); 1600 1601 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 1602 SPDK_ERRLOG("%s: I/O %cQ%d does not exist\n", ctrlr_id(ctrlr), 1603 is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid); 1604 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1605 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1606 goto out; 1607 } 1608 1609 if (is_cq) { 1610 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 1611 if (cq->cq_ref) { 1612 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 1613 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1614 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 1615 goto out; 1616 } 1617 1618 unmap_q(ctrlr, &cq->mapping); 1619 cq->size = 0; 1620 cq->cq_state = VFIO_USER_CQ_DELETED; 1621 cq->group = NULL; 1622 } else { 1623 ctx = calloc(1, sizeof(*ctx)); 1624 if (!ctx) { 1625 sct = SPDK_NVME_SCT_GENERIC; 1626 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1627 goto out; 1628 } 1629 ctx->vu_ctrlr = ctrlr; 1630 ctx->delete_io_sq_cmd = *cmd; 1631 1632 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 1633 sq->sq_state = VFIO_USER_SQ_DELETED; 1634 assert(ctrlr->cqs[sq->cqid]->cq_ref); 1635 ctrlr->cqs[sq->cqid]->cq_ref--; 1636 1637 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 1638 return 0; 1639 } 1640 1641 out: 1642 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 1643 } 1644 1645 /* 1646 * Returns 0 on success and -errno on error. 
1647 */ 1648 static int 1649 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 1650 { 1651 assert(ctrlr != NULL); 1652 assert(cmd != NULL); 1653 1654 if (cmd->fuse != 0) { 1655 /* Fused admin commands are not supported. */ 1656 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 1657 SPDK_NVME_SC_INVALID_FIELD, 1658 SPDK_NVME_SCT_GENERIC); 1659 } 1660 1661 switch (cmd->opc) { 1662 case SPDK_NVME_OPC_CREATE_IO_CQ: 1663 case SPDK_NVME_OPC_CREATE_IO_SQ: 1664 return handle_create_io_q(ctrlr, cmd, 1665 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 1666 case SPDK_NVME_OPC_DELETE_IO_SQ: 1667 case SPDK_NVME_OPC_DELETE_IO_CQ: 1668 return handle_del_io_q(ctrlr, cmd, 1669 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 1670 default: 1671 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 1672 } 1673 } 1674 1675 static int 1676 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 1677 { 1678 struct nvmf_vfio_user_sq *sq = cb_arg; 1679 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 1680 uint16_t sqid, cqid; 1681 1682 assert(sq != NULL); 1683 assert(vu_req != NULL); 1684 assert(vu_ctrlr != NULL); 1685 1686 if (spdk_likely(vu_req->iovcnt)) { 1687 vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, 1688 vu_req_to_sg_t(vu_req, 0), 1689 vu_req->iov, vu_req->iovcnt); 1690 } 1691 sqid = sq->qid; 1692 cqid = sq->cqid; 1693 1694 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 1695 vu_req->req.rsp->nvme_cpl.cdw0, 1696 sqid, 1697 vu_req->req.cmd->nvme_cmd.cid, 1698 vu_req->req.rsp->nvme_cpl.status.sc, 1699 vu_req->req.rsp->nvme_cpl.status.sct); 1700 } 1701 1702 static int 1703 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 1704 struct spdk_nvme_cmd *cmd) 1705 { 1706 assert(sq != NULL); 1707 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 1708 return consume_admin_cmd(ctrlr, cmd); 1709 } 1710 1711 return handle_cmd_req(ctrlr, cmd, sq); 1712 } 1713 1714 /* Returns the number of commands processed, or a negative value on error. */ 1715 static int 1716 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 1717 struct nvmf_vfio_user_sq *sq) 1718 { 1719 struct spdk_nvme_cmd *queue; 1720 int count = 0; 1721 1722 assert(ctrlr != NULL); 1723 assert(sq != NULL); 1724 1725 queue = q_addr(&sq->mapping); 1726 while (*sq_headp(sq) != new_tail) { 1727 int err; 1728 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 1729 1730 count++; 1731 1732 /* 1733 * SQHD must contain the new head pointer, so we must increase 1734 * it before we generate a completion. 
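		 *
		 * post_completion() copies *sq_headp() into cpl->sqhd, so
		 * advancing the head first lets the host see how far the
		 * controller has consumed this SQ.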
1735 */ 1736 sq_head_advance(sq); 1737 1738 err = consume_cmd(ctrlr, sq, cmd); 1739 if (err != 0) { 1740 return err; 1741 } 1742 } 1743 1744 return count; 1745 } 1746 1747 static int 1748 enable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1749 { 1750 int err; 1751 1752 assert(ctrlr != NULL); 1753 1754 err = acq_setup(ctrlr); 1755 if (err != 0) { 1756 return err; 1757 } 1758 1759 err = asq_setup(ctrlr); 1760 if (err != 0) { 1761 return err; 1762 } 1763 1764 return 0; 1765 } 1766 1767 static void 1768 disable_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr) 1769 { 1770 assert(ctrlr->sqs[0] != NULL); 1771 assert(ctrlr->cqs[0] != NULL); 1772 1773 unmap_q(ctrlr, &ctrlr->sqs[0]->mapping); 1774 unmap_q(ctrlr, &ctrlr->cqs[0]->mapping); 1775 1776 ctrlr->sqs[0]->size = 0; 1777 *sq_headp(ctrlr->sqs[0]) = 0; 1778 ctrlr->cqs[0]->size = 0; 1779 *cq_dbl_headp(ctrlr, ctrlr->cqs[0]) = 0; 1780 } 1781 1782 static void 1783 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 1784 { 1785 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1786 struct nvmf_vfio_user_ctrlr *ctrlr; 1787 struct nvmf_vfio_user_sq *sq; 1788 struct nvmf_vfio_user_cq *cq; 1789 void *map_start, *map_end; 1790 int ret; 1791 1792 /* 1793 * We're not interested in any DMA regions that aren't mappable (we don't 1794 * support clients that don't share their memory). 1795 */ 1796 if (!info->vaddr) { 1797 return; 1798 } 1799 1800 map_start = info->mapping.iov_base; 1801 map_end = info->mapping.iov_base + info->mapping.iov_len; 1802 1803 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 1804 (info->mapping.iov_len & MASK_2MB)) { 1805 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 1806 info->vaddr, map_start, map_end); 1807 return; 1808 } 1809 1810 assert(endpoint != NULL); 1811 if (endpoint->ctrlr == NULL) { 1812 return; 1813 } 1814 ctrlr = endpoint->ctrlr; 1815 1816 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 1817 map_start, map_end); 1818 1819 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 1820 * check the protection bits before registering. 
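	 *
	 * Only regions mapped both readable and writable are registered with
	 * spdk_mem_register(); 2 MB alignment of the address and length was
	 * already checked against MASK_2MB above.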
1821 */ 1822 if (info->prot == (PROT_WRITE | PROT_READ)) { 1823 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 1824 if (ret) { 1825 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 1826 map_start, map_end, ret); 1827 } 1828 } 1829 1830 pthread_mutex_lock(&endpoint->lock); 1831 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 1832 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 1833 continue; 1834 } 1835 1836 cq = ctrlr->cqs[sq->cqid]; 1837 1838 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 1839 if (cq->size && q_addr(&cq->mapping) == NULL) { 1840 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 1841 if (ret) { 1842 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n", 1843 cq->qid, cq->mapping.prp1, 1844 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 1845 continue; 1846 } 1847 } 1848 1849 if (sq->size) { 1850 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 1851 if (ret) { 1852 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n", 1853 sq->qid, sq->mapping.prp1, 1854 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 1855 continue; 1856 } 1857 } 1858 sq->sq_state = VFIO_USER_SQ_ACTIVE; 1859 SPDK_DEBUGLOG(nvmf_vfio, "Remap SQ %u successfully\n", sq->qid); 1860 } 1861 pthread_mutex_unlock(&endpoint->lock); 1862 } 1863 1864 static void 1865 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 1866 { 1867 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 1868 struct nvmf_vfio_user_sq *sq; 1869 struct nvmf_vfio_user_cq *cq; 1870 void *map_start, *map_end; 1871 int ret = 0; 1872 1873 if (!info->vaddr) { 1874 return; 1875 } 1876 1877 map_start = info->mapping.iov_base; 1878 map_end = info->mapping.iov_base + info->mapping.iov_len; 1879 1880 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 1881 (info->mapping.iov_len & MASK_2MB)) { 1882 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 1883 info->vaddr, map_start, map_end); 1884 return; 1885 } 1886 1887 assert(endpoint != NULL); 1888 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 1889 map_start, map_end); 1890 1891 if (endpoint->ctrlr != NULL) { 1892 struct nvmf_vfio_user_ctrlr *ctrlr; 1893 ctrlr = endpoint->ctrlr; 1894 1895 pthread_mutex_lock(&endpoint->lock); 1896 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 1897 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 1898 unmap_q(ctrlr, &sq->mapping); 1899 sq->sq_state = VFIO_USER_SQ_INACTIVE; 1900 } 1901 1902 cq = ctrlr->cqs[sq->cqid]; 1903 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 1904 unmap_q(ctrlr, &cq->mapping); 1905 } 1906 } 1907 pthread_mutex_unlock(&endpoint->lock); 1908 } 1909 1910 if (info->prot == (PROT_WRITE | PROT_READ)) { 1911 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 1912 if (ret) { 1913 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 1914 map_start, map_end, ret); 1915 } 1916 } 1917 } 1918 1919 static int 1920 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 1921 { 1922 struct nvmf_vfio_user_sq *sq = cb_arg; 1923 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 1924 int ret; 1925 1926 assert(sq != NULL); 1927 assert(req != NULL); 1928 1929 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 1930 assert(sq->ctrlr != NULL); 1931 assert(req != NULL); 1932 1933 memcpy(req->req.data, 1934 
&req->req.rsp->prop_get_rsp.value.u64, 1935 req->req.length); 1936 } else { 1937 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 1938 assert(sq->ctrlr != NULL); 1939 vu_ctrlr = sq->ctrlr; 1940 1941 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 1942 union spdk_nvme_cc_register cc, diff; 1943 1944 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 1945 diff.raw = cc.raw ^ req->cc.raw; 1946 1947 if (diff.bits.en) { 1948 if (cc.bits.en) { 1949 SPDK_DEBUGLOG(nvmf_vfio, "%s: MAP Admin queue\n", ctrlr_id(vu_ctrlr)); 1950 ret = enable_admin_queue(vu_ctrlr); 1951 if (ret) { 1952 SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(vu_ctrlr)); 1953 return ret; 1954 } 1955 sq->sq_state = VFIO_USER_SQ_ACTIVE; 1956 vu_ctrlr->reset_shn = false; 1957 } else { 1958 vu_ctrlr->reset_shn = true; 1959 } 1960 } 1961 1962 if (diff.bits.shn) { 1963 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 1964 vu_ctrlr->reset_shn = true; 1965 } 1966 } 1967 1968 if (vu_ctrlr->reset_shn) { 1969 SPDK_DEBUGLOG(nvmf_vfio, 1970 "%s: UNMAP Admin queue\n", 1971 ctrlr_id(vu_ctrlr)); 1972 sq->sq_state = VFIO_USER_SQ_INACTIVE; 1973 disable_admin_queue(vu_ctrlr); 1974 /* For PCIe controller reset or shutdown, we will drop all AER responses */ 1975 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 1976 } 1977 } 1978 } 1979 1980 return 0; 1981 } 1982 1983 /* 1984 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 1985 * doorbell is written via access_bar0_fn(). 1986 * 1987 * DSTRD is set to fixed value 0 for NVMf. 1988 * 1989 */ 1990 static int 1991 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 1992 const size_t count, loff_t pos, const bool is_write) 1993 { 1994 assert(ctrlr != NULL); 1995 assert(buf != NULL); 1996 1997 if (count != sizeof(uint32_t)) { 1998 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 1999 ctrlr_id(ctrlr), count); 2000 errno = EINVAL; 2001 return -1; 2002 } 2003 2004 pos -= NVME_DOORBELLS_OFFSET; 2005 2006 /* pos must be dword aligned */ 2007 if ((pos & 0x3) != 0) { 2008 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2009 errno = EINVAL; 2010 return -1; 2011 } 2012 2013 /* convert byte offset to array index */ 2014 pos >>= 2; 2015 2016 if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) { 2017 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2018 errno = EINVAL; 2019 return -1; 2020 } 2021 2022 if (is_write) { 2023 ctrlr->doorbells[pos] = *buf; 2024 spdk_wmb(); 2025 } else { 2026 spdk_rmb(); 2027 *buf = ctrlr->doorbells[pos]; 2028 } 2029 return 0; 2030 } 2031 2032 static size_t 2033 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2034 char *buf, size_t count, loff_t pos, 2035 bool is_write) 2036 { 2037 struct nvmf_vfio_user_req *req; 2038 const struct spdk_nvmf_registers *regs; 2039 2040 /* Construct a Fabric Property Get/Set command and send it */ 2041 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2042 if (req == NULL) { 2043 errno = ENOBUFS; 2044 return -1; 2045 } 2046 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2047 req->cc.raw = regs->cc.raw; 2048 2049 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2050 req->cb_arg = vu_ctrlr->sqs[0]; 2051 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2052 req->req.cmd->prop_set_cmd.cid = 0; 2053 req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1; 2054 req->req.cmd->prop_set_cmd.ofst = pos; 2055 if (is_write) { 2056 req->req.cmd->prop_set_cmd.fctype = 
SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2057 if (req->req.cmd->prop_set_cmd.attrib.size) { 2058 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2059 } else { 2060 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2061 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2062 } 2063 } else { 2064 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2065 } 2066 req->req.length = count; 2067 req->req.data = buf; 2068 2069 spdk_nvmf_request_exec_fabrics(&req->req); 2070 2071 return count; 2072 } 2073 2074 static ssize_t 2075 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2076 bool is_write) 2077 { 2078 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2079 struct nvmf_vfio_user_ctrlr *ctrlr; 2080 int ret; 2081 2082 ctrlr = endpoint->ctrlr; 2083 if (endpoint->need_async_destroy || !ctrlr) { 2084 errno = EIO; 2085 return -1; 2086 } 2087 2088 SPDK_DEBUGLOG(nvmf_vfio, 2089 "%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n", 2090 endpoint_id(endpoint), is_write ? "write" : "read", 2091 ctrlr, count, pos); 2092 2093 if (pos >= NVME_DOORBELLS_OFFSET) { 2094 /* 2095 * The fact that the doorbells can be memory mapped doesn't mean 2096 * that the client (VFIO in QEMU) is obliged to memory map them, 2097 * it might still elect to access them via regular read/write; 2098 * we might also have had disable_mappable_bar0 set. 2099 */ 2100 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2101 pos, is_write); 2102 if (ret == 0) { 2103 return count; 2104 } 2105 return ret; 2106 } 2107 2108 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 2109 } 2110 2111 static ssize_t 2112 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 2113 bool is_write) 2114 { 2115 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2116 2117 if (is_write) { 2118 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 2119 endpoint_id(endpoint), offset, offset + count); 2120 errno = EINVAL; 2121 return -1; 2122 } 2123 2124 if (offset + count > NVME_REG_CFG_SIZE) { 2125 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 2126 endpoint_id(endpoint), offset, count, 2127 NVME_REG_CFG_SIZE); 2128 errno = ERANGE; 2129 return -1; 2130 } 2131 2132 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 2133 2134 return count; 2135 } 2136 2137 static void 2138 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 2139 { 2140 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2141 2142 if (level >= LOG_DEBUG) { 2143 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2144 } else if (level >= LOG_INFO) { 2145 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2146 } else if (level >= LOG_NOTICE) { 2147 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 2148 } else if (level >= LOG_WARNING) { 2149 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 2150 } else { 2151 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 2152 } 2153 } 2154 2155 static int 2156 vfio_user_get_log_level(void) 2157 { 2158 int level; 2159 2160 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2161 return LOG_DEBUG; 2162 } 2163 2164 level = spdk_log_to_syslog_level(spdk_log_get_level()); 2165 if (level < 0) { 2166 return LOG_ERR; 2167 } 2168 2169 return level; 2170 } 2171 2172 static void 2173 init_pci_config_space(vfu_pci_config_space_t *p) 2174 { 2175 /* MLBAR */ 2176 p->hdr.bars[0].raw = 0x0; 2177 /* MUBAR */ 
	p->hdr.bars[1].raw = 0x0;

	/* vendor specific, let's set them to zero for now */
	p->hdr.bars[3].raw = 0x0;
	p->hdr.bars[4].raw = 0x0;
	p->hdr.bars[5].raw = 0x0;

	/* enable INTx */
	p->hdr.intr.ipin = 0x1;
}

static void
vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
			   void *cb_arg, int status);

static void
vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem,
			       void *cb_arg, int status)
{
	struct nvmf_vfio_user_endpoint *endpoint = cb_arg;
	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
	int ret;

	SPDK_DEBUGLOG(nvmf_vfio, "%s resume done with status %d\n", endpoint_id(endpoint), status);

	if (!vu_ctrlr) {
		return;
	}
	vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;

	/* Basically, once we call `vfu_device_quiesced` the device is
	 * unquiesced from libvfio-user's perspective, so from the moment
	 * `vfio_user_dev_quiesce_done` returns libvfio-user might quiesce the
	 * device again. However, because resuming the NVMf subsystem is an
	 * asynchronous operation, that quiesce might arrive _before_ the NVMf
	 * subsystem has actually been resumed, so in the callback of
	 * `spdk_nvmf_subsystem_resume` we need to check whether a quiesce was
	 * requested.
	 */
	if (vu_ctrlr->queued_quiesce) {
		SPDK_DEBUGLOG(nvmf_vfio, "%s has a queued quiesce event, pause again\n", ctrlr_id(vu_ctrlr));
		vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
		ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0,
						vfio_user_dev_quiesce_done, vu_ctrlr);
		if (ret < 0) {
			vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
			SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret);
		}
	}
}

static void
vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
			   void *cb_arg, int status)
{
	struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg;
	struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
	int ret;

	SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", ctrlr_id(vu_ctrlr), status);

	assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING);
	vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
	vfu_device_quiesced(endpoint->vfu_ctx, status);
	vu_ctrlr->queued_quiesce = false;

	/* `vfu_device_quiesced` can change the migration state,
	 * so we need to re-check `vu_ctrlr->state`.
	 */
	if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) {
		SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr));
		return;
	}

	SPDK_DEBUGLOG(nvmf_vfio, "%s starting to resume\n", ctrlr_id(vu_ctrlr));
	vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
	ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
					 vfio_user_endpoint_resume_done, endpoint);
	if (ret < 0) {
		vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
		SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret);
	}
}

static int
vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx)
{
	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
	int ret;

	if (!vu_ctrlr) {
		return 0;
	}

	/* The NVMf library destroys the controller once it has no
	 * connected queue pairs left.
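	 * If the controller has already gone away there is nothing left
	 * for us to quiesce, so report the device as quiesced right away.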
2273 */ 2274 if (!nvmf_subsystem_get_ctrlr((struct spdk_nvmf_subsystem *)endpoint->subsystem, 2275 vu_ctrlr->cntlid)) { 2276 return 0; 2277 } 2278 2279 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 2280 2281 /* There is no race condition here as device quiesce callback 2282 * and nvmf_prop_set_cc() are running in the same thread context. 2283 */ 2284 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 2285 return 0; 2286 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 2287 return 0; 2288 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 2289 return 0; 2290 } 2291 2292 switch (vu_ctrlr->state) { 2293 case VFIO_USER_CTRLR_PAUSED: 2294 case VFIO_USER_CTRLR_MIGRATING: 2295 return 0; 2296 case VFIO_USER_CTRLR_RUNNING: 2297 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 2298 ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0, 2299 vfio_user_dev_quiesce_done, vu_ctrlr); 2300 if (ret < 0) { 2301 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 2302 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret); 2303 return 0; 2304 } 2305 break; 2306 case VFIO_USER_CTRLR_RESUMING: 2307 vu_ctrlr->queued_quiesce = true; 2308 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 2309 vu_ctrlr->state); 2310 break; 2311 default: 2312 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 2313 break; 2314 } 2315 2316 errno = EBUSY; 2317 return -1; 2318 } 2319 2320 static void 2321 vfio_user_ctrlr_dump_migr_data(const char *name, struct vfio_user_nvme_migr_state *migr_data) 2322 { 2323 struct spdk_nvme_registers *regs; 2324 struct nvme_migr_sq_state *sq; 2325 struct nvme_migr_cq_state *cq; 2326 uint32_t *doorbell_base; 2327 uint32_t i; 2328 2329 SPDK_NOTICELOG("Dump %s\n", name); 2330 2331 regs = (struct spdk_nvme_registers *)migr_data->bar0; 2332 doorbell_base = (uint32_t *)®s->doorbell[0].sq_tdbl; 2333 2334 SPDK_NOTICELOG("Registers\n"); 2335 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 2336 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 2337 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 2338 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 2339 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 2340 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 2341 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 2342 2343 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 2344 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 2345 sq = &migr_data->qps[i].sq; 2346 cq = &migr_data->qps[i].cq; 2347 2348 if (sq->size) { 2349 SPDK_NOTICELOG("SQID %u, SQ DOORBELL %u\n", sq->sqid, doorbell_base[i * 2]); 2350 SPDK_NOTICELOG("SQ SQID %u, CQID %u, HEAD %u, SIZE %u, DMA ADDR 0x%"PRIx64"\n", 2351 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 2352 } 2353 2354 if (cq->size) { 2355 SPDK_NOTICELOG("CQID %u, CQ DOORBELL %u\n", cq->cqid, doorbell_base[i * 2 + 1]); 2356 SPDK_NOTICELOG("CQ CQID %u, PHASE %u, TAIL %u, SIZE %u, IV %u, IEN %u, DMA ADDR 0x%"PRIx64"\n", 2357 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 2358 } 2359 } 2360 2361 SPDK_NOTICELOG("%s Dump Done\n", name); 2362 } 2363 2364 /* Read region 9 content and restore it to migration data structures */ 2365 static int 2366 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 2367 struct vfio_user_nvme_migr_state *migr_state) 2368 { 2369 void *data_ptr = endpoint->migr_data; 2370 2371 /* Load vfio_user_nvme_migr_header first */ 2372 memcpy(&migr_state->ctrlr_header, 
data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 2373 /* TODO: version check */ 2374 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 2375 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 2376 return -EINVAL; 2377 } 2378 2379 /* Load nvmf controller data */ 2380 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 2381 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 2382 2383 /* Load queue pairs */ 2384 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 2385 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 2386 2387 /* Load BAR0 */ 2388 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 2389 memcpy(&migr_state->bar0, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 2390 2391 /* Load CFG */ 2392 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 2393 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 2394 2395 return 0; 2396 } 2397 2398 2399 static void 2400 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2401 { 2402 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 2403 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 2404 struct nvmf_vfio_user_sq *sq; 2405 struct nvmf_vfio_user_cq *cq; 2406 struct vfio_user_nvme_migr_state migr_state = {}; 2407 uint64_t data_offset; 2408 void *data_ptr; 2409 int num_aers; 2410 struct spdk_nvme_registers *regs; 2411 uint32_t *doorbell_base; 2412 uint32_t i = 0; 2413 uint16_t sqid, cqid; 2414 2415 /* Save all data to vfio_user_nvme_migr_state first, then we will 2416 * copy it to device migration region at last. 
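	 * The header records the offset and length of each section, so it is
	 * copied into the region only after everything else has been laid out.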
2417 */ 2418 2419 /* save magic number */ 2420 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 2421 2422 /* save controller data */ 2423 num_aers = nvmf_ctrlr_save_aers(ctrlr, migr_state.ctrlr_header.aer_cids, 2424 256); 2425 assert(num_aers >= 0); 2426 migr_state.ctrlr_header.nr_aers = num_aers; 2427 2428 /* save nvmf controller data */ 2429 nvmf_ctrlr_save_migr_data(ctrlr, (struct nvmf_ctrlr_migr_data *)&migr_state.nvmf_data); 2430 2431 /* save connected queue pairs */ 2432 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 2433 /* save sq */ 2434 sqid = sq->qid; 2435 migr_state.qps[sqid].sq.sqid = sq->qid; 2436 migr_state.qps[sqid].sq.cqid = sq->cqid; 2437 migr_state.qps[sqid].sq.head = *sq_headp(sq); 2438 migr_state.qps[sqid].sq.size = sq->size; 2439 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 2440 2441 /* save cq, for shared cq case, cq may be saved multiple times */ 2442 cqid = sq->cqid; 2443 cq = vu_ctrlr->cqs[cqid]; 2444 migr_state.qps[cqid].cq.cqid = cqid; 2445 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 2446 migr_state.qps[cqid].cq.ien = cq->ien; 2447 migr_state.qps[cqid].cq.iv = cq->iv; 2448 migr_state.qps[cqid].cq.size = cq->size; 2449 migr_state.qps[cqid].cq.phase = cq->phase; 2450 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 2451 i++; 2452 } 2453 2454 assert(i > 0); 2455 migr_state.ctrlr_header.num_io_queues = i - 1; 2456 2457 regs = (struct spdk_nvme_registers *)&migr_state.bar0; 2458 /* Save mandarory registers to bar0 */ 2459 regs->csts.raw = ctrlr->vcprop.csts.raw; 2460 regs->cap.raw = ctrlr->vcprop.cap.raw; 2461 regs->vs.raw = ctrlr->vcprop.vs.raw; 2462 regs->cc.raw = ctrlr->vcprop.cc.raw; 2463 regs->aqa.raw = ctrlr->vcprop.aqa.raw; 2464 regs->asq = ctrlr->vcprop.asq; 2465 regs->acq = ctrlr->vcprop.acq; 2466 /* Save doorbells */ 2467 doorbell_base = (uint32_t *)®s->doorbell[0].sq_tdbl; 2468 memcpy(doorbell_base, (void *)vu_ctrlr->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 2469 2470 /* Save PCI configuration space */ 2471 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 2472 2473 /* Save all data to device migration region */ 2474 data_ptr = endpoint->migr_data; 2475 2476 /* Copy nvmf controller data */ 2477 data_offset = sizeof(struct vfio_user_nvme_migr_header); 2478 data_ptr += data_offset; 2479 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 2480 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct nvmf_ctrlr_migr_data); 2481 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct nvmf_ctrlr_migr_data)); 2482 2483 /* Copy queue pairs */ 2484 data_offset += sizeof(struct nvmf_ctrlr_migr_data); 2485 data_ptr += sizeof(struct nvmf_ctrlr_migr_data); 2486 migr_state.ctrlr_header.qp_offset = data_offset; 2487 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 2488 struct nvme_migr_cq_state)); 2489 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 2490 2491 /* Copy BAR0 */ 2492 data_offset += migr_state.ctrlr_header.qp_len; 2493 data_ptr += migr_state.ctrlr_header.qp_len; 2494 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 2495 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVME_REG_BAR0_SIZE; 2496 memcpy(data_ptr, &migr_state.bar0, NVME_REG_BAR0_SIZE); 2497 2498 /* Copy CFG */ 2499 data_offset += NVME_REG_BAR0_SIZE; 2500 data_ptr += NVME_REG_BAR0_SIZE; 2501 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 2502 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = 
NVME_REG_CFG_SIZE; 2503 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 2504 2505 /* Copy nvme migration header finally */ 2506 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 2507 2508 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2509 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state); 2510 } 2511 } 2512 2513 /* 2514 * If we are about to close the connection, we need to unregister the interrupt, 2515 * as the library will subsequently close the file descriptor we registered. 2516 */ 2517 static int 2518 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 2519 { 2520 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2521 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 2522 2523 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 2524 2525 if (type == VFU_RESET_LOST_CONN) { 2526 if (ctrlr != NULL) { 2527 spdk_interrupt_unregister(&ctrlr->intr); 2528 ctrlr->intr_fd = -1; 2529 } 2530 return 0; 2531 } 2532 2533 /* FIXME: much more needed here. */ 2534 2535 return 0; 2536 } 2537 2538 static int 2539 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2540 struct vfio_user_nvme_migr_state *migr_state) 2541 { 2542 uint32_t i, qsize = 0; 2543 uint16_t sqid, cqid; 2544 struct vfio_user_nvme_migr_qp migr_qp; 2545 void *addr; 2546 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 2547 int ret; 2548 2549 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2550 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state); 2551 } 2552 2553 /* restore submission queues */ 2554 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 2555 migr_qp = migr_state->qps[i]; 2556 2557 qsize = migr_qp.sq.size; 2558 if (qsize) { 2559 struct nvmf_vfio_user_sq *sq; 2560 2561 sqid = migr_qp.sq.sqid; 2562 if (sqid != i) { 2563 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 2564 return -EINVAL; 2565 } 2566 2567 /* allocate sq if necessary */ 2568 if (vu_ctrlr->sqs[sqid] == NULL) { 2569 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 2570 if (ret) { 2571 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 2572 return -EFAULT; 2573 } 2574 } 2575 2576 sq = vu_ctrlr->sqs[sqid]; 2577 sq->size = qsize; 2578 2579 ret = alloc_sq_reqs(vu_ctrlr, sq); 2580 if (ret) { 2581 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 2582 return -EFAULT; 2583 } 2584 2585 /* restore sq */ 2586 sq->sq_state = VFIO_USER_SQ_CREATED; 2587 sq->cqid = migr_qp.sq.cqid; 2588 *sq_headp(sq) = migr_qp.sq.head; 2589 sq->mapping.prp1 = migr_qp.sq.dma_addr; 2590 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 2591 sq->mapping.prp1, sq->size * 64, 2592 sq->mapping.sg, &sq->mapping.iov, 2593 PROT_READ); 2594 if (addr == NULL) { 2595 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 2596 sqid, sq->mapping.prp1, sq->size); 2597 return -EFAULT; 2598 } 2599 cqs_ref[sq->cqid]++; 2600 } 2601 } 2602 2603 /* restore completion queues */ 2604 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 2605 migr_qp = migr_state->qps[i]; 2606 2607 qsize = migr_qp.cq.size; 2608 if (qsize) { 2609 struct nvmf_vfio_user_cq *cq; 2610 2611 /* restore cq */ 2612 cqid = migr_qp.sq.cqid; 2613 assert(cqid == i); 2614 2615 /* allocate cq if necessary */ 2616 if (vu_ctrlr->cqs[cqid] == NULL) { 2617 ret = init_cq(vu_ctrlr, cqid); 2618 if (ret) { 2619 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 2620 return -EFAULT; 2621 } 2622 } 2623 2624 cq = vu_ctrlr->cqs[cqid]; 2625 2626 cq->size = 
qsize; 2627 2628 cq->cq_state = VFIO_USER_CQ_CREATED; 2629 cq->cq_ref = cqs_ref[cqid]; 2630 *cq_tailp(cq) = migr_qp.cq.tail; 2631 cq->mapping.prp1 = migr_qp.cq.dma_addr; 2632 cq->ien = migr_qp.cq.ien; 2633 cq->iv = migr_qp.cq.iv; 2634 cq->phase = migr_qp.cq.phase; 2635 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 2636 cq->mapping.prp1, cq->size * 16, 2637 cq->mapping.sg, &cq->mapping.iov, 2638 PROT_READ | PROT_WRITE); 2639 if (addr == NULL) { 2640 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 2641 cqid, cq->mapping.prp1, cq->size); 2642 return -EFAULT; 2643 } 2644 } 2645 } 2646 2647 return 0; 2648 } 2649 2650 static int 2651 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2652 { 2653 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 2654 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 2655 uint32_t *doorbell_base; 2656 struct vfio_user_nvme_migr_state migr_state = {}; 2657 struct spdk_nvme_registers *regs; 2658 struct spdk_nvme_cmd cmd; 2659 uint16_t i; 2660 int rc = 0; 2661 2662 assert(endpoint->migr_data != NULL); 2663 assert(ctrlr != NULL); 2664 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 2665 if (rc) { 2666 return rc; 2667 } 2668 2669 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 2670 if (rc) { 2671 return rc; 2672 } 2673 2674 /* restore PCI configuration space */ 2675 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 2676 2677 regs = (struct spdk_nvme_registers *)&migr_state.bar0; 2678 doorbell_base = (uint32_t *)®s->doorbell[0].sq_tdbl; 2679 /* restore doorbells from saved registers */ 2680 memcpy((void *)vu_ctrlr->doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 2681 2682 /* restore controller registers after ADMIN queue connection */ 2683 ctrlr->vcprop.csts.raw = regs->csts.raw; 2684 ctrlr->vcprop.cap.raw = regs->cap.raw; 2685 ctrlr->vcprop.vs.raw = regs->vs.raw; 2686 ctrlr->vcprop.cc.raw = regs->cc.raw; 2687 ctrlr->vcprop.aqa.raw = regs->aqa.raw; 2688 ctrlr->vcprop.asq = regs->asq; 2689 ctrlr->vcprop.acq = regs->acq; 2690 2691 /* restore nvmf controller data */ 2692 rc = nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 2693 if (rc) { 2694 return rc; 2695 } 2696 2697 /* resubmit pending AERs */ 2698 for (i = 0; i < migr_state.ctrlr_header.nr_aers; i++) { 2699 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 2700 migr_state.ctrlr_header.aer_cids[i]); 2701 memset(&cmd, 0, sizeof(cmd)); 2702 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 2703 cmd.cid = migr_state.ctrlr_header.aer_cids[i]; 2704 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 2705 if (rc) { 2706 break; 2707 } 2708 } 2709 2710 return rc; 2711 } 2712 2713 static void 2714 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2715 { 2716 uint32_t i; 2717 struct nvmf_vfio_user_sq *sq; 2718 2719 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 2720 sq = vu_ctrlr->sqs[i]; 2721 if (!sq || !sq->size) { 2722 continue; 2723 } 2724 2725 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 2726 /* ADMIN queue pair is always in the poll group, just enable it */ 2727 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2728 } else { 2729 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 2730 } 2731 } 2732 } 2733 2734 static int 2735 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 2736 { 2737 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2738 struct nvmf_vfio_user_ctrlr 
*vu_ctrlr = endpoint->ctrlr; 2739 struct nvmf_vfio_user_sq *sq; 2740 int ret = 0; 2741 2742 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 2743 vu_ctrlr->state, state); 2744 2745 switch (state) { 2746 case VFU_MIGR_STATE_STOP_AND_COPY: 2747 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 2748 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 2749 break; 2750 case VFU_MIGR_STATE_STOP: 2751 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 2752 break; 2753 case VFU_MIGR_STATE_PRE_COPY: 2754 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 2755 vu_ctrlr->migr_reg.pending_bytes = vfio_user_migr_data_len(); 2756 vu_ctrlr->migr_reg.last_data_offset = 0; 2757 vu_ctrlr->in_source_vm = true; 2758 break; 2759 case VFU_MIGR_STATE_RESUME: 2760 /* 2761 * Destination ADMIN queue pair is connected when starting the VM, 2762 * but the ADMIN queue pair isn't enabled in destination VM, the poll 2763 * group will do nothing to ADMIN queue pair for now. 2764 */ 2765 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 2766 break; 2767 } 2768 2769 assert(!vu_ctrlr->in_source_vm); 2770 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 2771 2772 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 2773 assert(sq != NULL); 2774 assert(sq->qpair.qid == 0); 2775 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2776 2777 /* Free ADMIN SQ resources first, SQ resources will be 2778 * allocated based on queue size from source VM. 2779 */ 2780 free_sq_reqs(sq); 2781 sq->size = 0; 2782 break; 2783 case VFU_MIGR_STATE_RUNNING: 2784 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 2785 break; 2786 } 2787 2788 if (!vu_ctrlr->in_source_vm) { 2789 /* Restore destination VM from BAR9 */ 2790 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 2791 if (ret) { 2792 break; 2793 } 2794 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 2795 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 2796 } else { 2797 /* Rollback source VM */ 2798 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 2799 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 2800 vfio_user_endpoint_resume_done, endpoint); 2801 if (ret < 0) { 2802 /* TODO: fail controller with CFS bit set */ 2803 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 2804 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 2805 break; 2806 } 2807 } 2808 break; 2809 2810 default: 2811 return -EINVAL; 2812 } 2813 2814 return ret; 2815 } 2816 2817 static uint64_t 2818 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 2819 { 2820 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2821 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 2822 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 2823 2824 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u, pending bytes 0x%"PRIx64"\n", endpoint_id(endpoint), 2825 ctrlr->state, migr_reg->pending_bytes); 2826 2827 return migr_reg->pending_bytes; 2828 } 2829 2830 static int 2831 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 2832 { 2833 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2834 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 2835 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 2836 2837 if (migr_reg->last_data_offset == vfio_user_migr_data_len()) { 2838 *offset = vfio_user_migr_data_len(); 2839 if (size) { 2840 *size = 0; 2841 } 2842 migr_reg->pending_bytes = 0; 2843 } else { 2844 *offset = 0; 2845 if (size) { 2846 *size = vfio_user_migr_data_len(); 2847 if (ctrlr->state == 
VFIO_USER_CTRLR_MIGRATING) { 2848 vfio_user_migr_ctrlr_save_data(ctrlr); 2849 migr_reg->last_data_offset = vfio_user_migr_data_len(); 2850 } 2851 } 2852 } 2853 2854 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 2855 2856 return 0; 2857 } 2858 2859 static ssize_t 2860 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset) 2861 { 2862 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2863 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 2864 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 2865 2866 memcpy(buf, endpoint->migr_data, count); 2867 migr_reg->pending_bytes = 0; 2868 2869 return 0; 2870 } 2871 2872 static ssize_t 2873 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset) 2874 { 2875 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2876 2877 memcpy(endpoint->migr_data, buf, count); 2878 2879 return 0; 2880 } 2881 2882 static int 2883 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx, uint64_t count) 2884 { 2885 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 2886 2887 return 0; 2888 } 2889 2890 static int 2891 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 2892 struct nvmf_vfio_user_endpoint *endpoint) 2893 { 2894 int ret; 2895 ssize_t cap_offset; 2896 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 2897 struct iovec migr_sparse_mmap = {}; 2898 2899 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 2900 struct pxcap pxcap = { 2901 .hdr.id = PCI_CAP_ID_EXP, 2902 .pxcaps.ver = 0x2, 2903 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 2904 .pxdcap2.ctds = 0x1 2905 }; 2906 2907 struct msixcap msixcap = { 2908 .hdr.id = PCI_CAP_ID_MSIX, 2909 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 2910 .mtab = {.tbir = 0x4, .to = 0x0}, 2911 .mpba = {.pbir = 0x5, .pbao = 0x0} 2912 }; 2913 2914 struct iovec sparse_mmap[] = { 2915 { 2916 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 2917 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 2918 }, 2919 }; 2920 2921 const vfu_migration_callbacks_t migr_callbacks = { 2922 .version = VFU_MIGR_CALLBACKS_VERS, 2923 .transition = &vfio_user_migration_device_state_transition, 2924 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 2925 .prepare_data = &vfio_user_migration_prepare_data, 2926 .read_data = &vfio_user_migration_read_data, 2927 .data_written = &vfio_user_migration_data_written, 2928 .write_data = &vfio_user_migration_write_data 2929 }; 2930 2931 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 2932 if (ret < 0) { 2933 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 2934 return ret; 2935 } 2936 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 2937 /* 2938 * 0x02, controller uses the NVM Express programming interface 2939 * 0x08, non-volatile memory controller 2940 * 0x01, mass storage controller 2941 */ 2942 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 2943 2944 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 2945 if (cap_offset < 0) { 2946 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 2947 return ret; 2948 } 2949 2950 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 2951 if (cap_offset < 0) { 2952 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 2953 return ret; 2954 } 2955 2956 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 2957 if (cap_offset < 0) { 2958 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 
2959 return ret; 2960 } 2961 2962 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 2963 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 2964 if (ret < 0) { 2965 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 2966 return ret; 2967 } 2968 2969 if (vu_transport->transport_opts.disable_mappable_bar0) { 2970 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 2971 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 2972 NULL, 0, -1, 0); 2973 } else { 2974 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 2975 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 2976 sparse_mmap, 1, endpoint->devmem_fd, 0); 2977 } 2978 2979 if (ret < 0) { 2980 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 2981 return ret; 2982 } 2983 2984 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 2985 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 2986 if (ret < 0) { 2987 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 2988 return ret; 2989 } 2990 2991 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 2992 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 2993 if (ret < 0) { 2994 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 2995 return ret; 2996 } 2997 2998 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 2999 if (ret < 0) { 3000 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 3001 return ret; 3002 } 3003 3004 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 3005 if (ret < 0) { 3006 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 3007 return ret; 3008 } 3009 3010 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 3011 if (ret < 0) { 3012 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 3013 return ret; 3014 } 3015 3016 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 3017 if (ret < 0) { 3018 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 3019 return ret; 3020 } 3021 3022 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 3023 3024 migr_sparse_mmap.iov_base = (void *)4096; 3025 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 3026 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 3027 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 3028 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 3029 1, endpoint->migr_fd, 0); 3030 if (ret < 0) { 3031 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 3032 return ret; 3033 } 3034 3035 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 3036 vfu_get_migr_register_area_size()); 3037 if (ret < 0) { 3038 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 3039 return ret; 3040 } 3041 3042 ret = vfu_realize_ctx(vfu_ctx); 3043 if (ret < 0) { 3044 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 3045 return ret; 3046 } 3047 3048 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 3049 assert(endpoint->pci_config_space != NULL); 3050 init_pci_config_space(endpoint->pci_config_space); 3051 3052 assert(cap_offset != 0); 3053 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 3054 3055 return 0; 3056 } 3057 3058 static int nvmf_vfio_user_accept(void *ctx); 3059 3060 static void 3061 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool 
interrupt_mode) 3062 { 3063 /* Nothing for us to do here. */ 3064 } 3065 3066 /* 3067 * Register an "accept" poller: this is polling for incoming vfio-user socket 3068 * connections (on the listening socket). 3069 * 3070 * We need to do this on first listening, and also after destroying a 3071 * controller, so we can accept another connection. 3072 */ 3073 static int 3074 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 3075 { 3076 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 3077 3078 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 3079 3080 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 3081 endpoint, poll_rate_us); 3082 3083 if (!endpoint->accept_poller) { 3084 return -1; 3085 } 3086 3087 endpoint->accept_thread = spdk_get_thread(); 3088 3089 if (!spdk_interrupt_mode_is_enabled()) { 3090 return 0; 3091 } 3092 3093 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 3094 assert(endpoint->accept_intr_fd != -1); 3095 3096 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 3097 nvmf_vfio_user_accept, endpoint); 3098 3099 assert(endpoint->accept_intr != NULL); 3100 3101 spdk_poller_register_interrupt(endpoint->accept_poller, 3102 set_intr_mode_noop, NULL); 3103 return 0; 3104 } 3105 3106 static void 3107 _vfio_user_relisten(void *ctx) 3108 { 3109 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3110 3111 vfio_user_register_accept_poller(endpoint); 3112 } 3113 3114 static void 3115 _free_ctrlr(void *ctx) 3116 { 3117 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 3118 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 3119 3120 spdk_interrupt_unregister(&ctrlr->intr); 3121 ctrlr->intr_fd = -1; 3122 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 3123 3124 free(ctrlr); 3125 3126 if (endpoint == NULL) { 3127 return; 3128 } 3129 3130 if (endpoint->need_async_destroy) { 3131 nvmf_vfio_user_destroy_endpoint(endpoint); 3132 } else { 3133 spdk_thread_send_msg(endpoint->accept_thread, 3134 _vfio_user_relisten, endpoint); 3135 } 3136 } 3137 3138 static void 3139 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 3140 { 3141 int i; 3142 assert(ctrlr != NULL); 3143 3144 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 3145 3146 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3147 free_qp(ctrlr, i); 3148 } 3149 3150 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 3151 } 3152 3153 static int 3154 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 3155 struct nvmf_vfio_user_endpoint *endpoint) 3156 { 3157 struct nvmf_vfio_user_ctrlr *ctrlr; 3158 int err = 0; 3159 3160 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 3161 3162 /* First, construct a vfio-user CUSTOM transport controller */ 3163 ctrlr = calloc(1, sizeof(*ctrlr)); 3164 if (ctrlr == NULL) { 3165 err = -ENOMEM; 3166 goto out; 3167 } 3168 /* We can only support one connection for now */ 3169 ctrlr->cntlid = 0x1; 3170 ctrlr->intr_fd = -1; 3171 ctrlr->transport = transport; 3172 ctrlr->endpoint = endpoint; 3173 ctrlr->doorbells = endpoint->doorbells; 3174 TAILQ_INIT(&ctrlr->connected_sqs); 3175 3176 /* Then, construct an admin queue pair */ 3177 err = init_sq(ctrlr, &transport->transport, 0); 3178 if (err != 0) { 3179 free(ctrlr); 3180 goto out; 3181 } 3182 3183 err = init_cq(ctrlr, 0); 3184 if (err != 0) { 3185 free(ctrlr); 3186 goto out; 3187 } 3188 3189 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 3190 3191 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 3192 if 
(err != 0) { 3193 free(ctrlr); 3194 goto out; 3195 } 3196 endpoint->ctrlr = ctrlr; 3197 3198 /* Notify the generic layer about the new admin queue pair */ 3199 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 3200 3201 out: 3202 if (err != 0) { 3203 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 3204 endpoint_id(endpoint), strerror(-err)); 3205 } 3206 3207 return err; 3208 } 3209 3210 static int 3211 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 3212 const struct spdk_nvme_transport_id *trid, 3213 struct spdk_nvmf_listen_opts *listen_opts) 3214 { 3215 struct nvmf_vfio_user_transport *vu_transport; 3216 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 3217 char path[PATH_MAX] = {}; 3218 char uuid[PATH_MAX] = {}; 3219 int ret; 3220 3221 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 3222 transport); 3223 3224 pthread_mutex_lock(&vu_transport->lock); 3225 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 3226 /* Only compare traddr */ 3227 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 3228 pthread_mutex_unlock(&vu_transport->lock); 3229 return -EEXIST; 3230 } 3231 } 3232 pthread_mutex_unlock(&vu_transport->lock); 3233 3234 endpoint = calloc(1, sizeof(*endpoint)); 3235 if (!endpoint) { 3236 return -ENOMEM; 3237 } 3238 3239 pthread_mutex_init(&endpoint->lock, NULL); 3240 endpoint->devmem_fd = -1; 3241 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 3242 endpoint->transport = vu_transport; 3243 3244 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 3245 if (ret < 0 || ret >= PATH_MAX) { 3246 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 3247 ret = -1; 3248 goto out; 3249 } 3250 3251 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 3252 if (ret == -1) { 3253 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 3254 endpoint_id(endpoint), path, spdk_strerror(errno)); 3255 goto out; 3256 } 3257 3258 endpoint->devmem_fd = ret; 3259 ret = ftruncate(endpoint->devmem_fd, 3260 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 3261 if (ret != 0) { 3262 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 3263 spdk_strerror(errno)); 3264 goto out; 3265 } 3266 3267 endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 3268 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 3269 if (endpoint->doorbells == MAP_FAILED) { 3270 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 3271 endpoint->doorbells = NULL; 3272 ret = -1; 3273 goto out; 3274 } 3275 3276 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 3277 if (ret < 0 || ret >= PATH_MAX) { 3278 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 3279 spdk_strerror(errno)); 3280 ret = -1; 3281 goto out; 3282 } 3283 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 3284 if (ret == -1) { 3285 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 3286 endpoint_id(endpoint), path, spdk_strerror(errno)); 3287 goto out; 3288 } 3289 3290 endpoint->migr_fd = ret; 3291 ret = ftruncate(endpoint->migr_fd, 3292 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 3293 if (ret != 0) { 3294 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 3295 spdk_strerror(errno)); 3296 goto out; 3297 } 3298 3299 
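	/* Map only the device-state part of the migration file: the mmap offset
	 * skips the migration register area at the start of the region, which is
	 * managed by libvfio-user itself.
	 */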
	endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(),
				   PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size());
	if (endpoint->migr_data == MAP_FAILED) {
		SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
		endpoint->migr_data = NULL;
		ret = -1;
		goto out;
	}

	ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
	if (ret < 0 || ret >= PATH_MAX) {
		SPDK_ERRLOG("%s: failed to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno));
		ret = -1;
		goto out;
	}

	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
					   endpoint, VFU_DEV_TYPE_PCI);
	if (endpoint->vfu_ctx == NULL) {
		SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
			    endpoint_id(endpoint));
		ret = -1;
		goto out;
	}
	vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, vfio_user_get_log_level());

	ret = vfio_user_dev_info_fill(vu_transport, endpoint);
	if (ret < 0) {
		goto out;
	}

	ret = vfio_user_register_accept_poller(endpoint);

	if (ret != 0) {
		goto out;
	}

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
	pthread_mutex_unlock(&vu_transport->lock);

out:
	if (ret != 0) {
		nvmf_vfio_user_destroy_endpoint(endpoint);
	}

	return ret;
}

static void
nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
			   const struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;

	assert(trid != NULL);
	assert(trid->traddr != NULL);

	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
			/* Defer freeing endpoint resources until the controller
			 * is freed. There are two cases when we can get here:
			 * 1. the nvmf target is killed while a VM is connected;
			 * 2. the listener is removed via an RPC call.
			 * In both cases the NVMf library will disconnect all queue pairs.
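			 * Once the controller itself is finally freed, _free_ctrlr()
			 * notices need_async_destroy and destroys the endpoint too.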
3372 */ 3373 if (endpoint->ctrlr) { 3374 assert(!endpoint->need_async_destroy); 3375 endpoint->need_async_destroy = true; 3376 pthread_mutex_unlock(&vu_transport->lock); 3377 return; 3378 } 3379 3380 nvmf_vfio_user_destroy_endpoint(endpoint); 3381 pthread_mutex_unlock(&vu_transport->lock); 3382 return; 3383 } 3384 } 3385 pthread_mutex_unlock(&vu_transport->lock); 3386 3387 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 3388 } 3389 3390 static void 3391 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 3392 struct spdk_nvmf_subsystem *subsystem, 3393 struct spdk_nvmf_ctrlr_data *cdata) 3394 { 3395 cdata->vid = SPDK_PCI_VID_NUTANIX; 3396 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 3397 cdata->ieee[0] = 0x8d; 3398 cdata->ieee[1] = 0x6b; 3399 cdata->ieee[2] = 0x50; 3400 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 3401 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 3402 /* libvfio-user can only support 1 connection for now */ 3403 cdata->oncs.reservations = 0; 3404 } 3405 3406 static int 3407 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 3408 const struct spdk_nvmf_subsystem *subsystem, 3409 const struct spdk_nvme_transport_id *trid) 3410 { 3411 struct nvmf_vfio_user_transport *vu_transport; 3412 struct nvmf_vfio_user_endpoint *endpoint; 3413 3414 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 3415 3416 pthread_mutex_lock(&vu_transport->lock); 3417 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 3418 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 3419 break; 3420 } 3421 } 3422 pthread_mutex_unlock(&vu_transport->lock); 3423 3424 if (endpoint == NULL) { 3425 return -ENOENT; 3426 } 3427 3428 endpoint->subsystem = subsystem; 3429 3430 return 0; 3431 } 3432 3433 /* 3434 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 3435 * frequency. 3436 * 3437 * For this endpoint (which at the libvfio-user level corresponds to a socket), 3438 * if we don't currently have a controller set up, peek to see if the socket is 3439 * able to accept a new connection. 3440 */ 3441 static int 3442 nvmf_vfio_user_accept(void *ctx) 3443 { 3444 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3445 struct nvmf_vfio_user_transport *vu_transport; 3446 int err; 3447 3448 vu_transport = endpoint->transport; 3449 3450 if (endpoint->ctrlr != NULL) { 3451 return SPDK_POLLER_IDLE; 3452 } 3453 3454 err = vfu_attach_ctx(endpoint->vfu_ctx); 3455 3456 if (err == 0) { 3457 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 3458 3459 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 3460 3461 if (err == 0) { 3462 /* 3463 * Unregister ourselves: now we've accepted a 3464 * connection, there is nothing for us to poll for, and 3465 * we will poll the connection via vfu_run_ctx() 3466 * instead. 
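			 * (vfu_run_ctx() is driven by the per-controller vfu_ctx_poller
			 * that handle_queue_connect_rsp() registers once the admin
			 * queue connects.)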
3467 */ 3468 spdk_interrupt_unregister(&endpoint->accept_intr); 3469 spdk_poller_unregister(&endpoint->accept_poller); 3470 } 3471 3472 return SPDK_POLLER_BUSY; 3473 } 3474 3475 if (errno == EAGAIN || errno == EWOULDBLOCK) { 3476 return SPDK_POLLER_IDLE; 3477 } 3478 3479 return SPDK_POLLER_BUSY; 3480 } 3481 3482 static void 3483 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 3484 struct spdk_nvme_transport_id *trid, 3485 struct spdk_nvmf_discovery_log_page_entry *entry) 3486 { } 3487 3488 static struct spdk_nvmf_transport_poll_group * 3489 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 3490 struct spdk_nvmf_poll_group *group) 3491 { 3492 struct nvmf_vfio_user_transport *vu_transport; 3493 struct nvmf_vfio_user_poll_group *vu_group; 3494 3495 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 3496 3497 vu_group = calloc(1, sizeof(*vu_group)); 3498 if (vu_group == NULL) { 3499 SPDK_ERRLOG("Error allocating poll group: %m"); 3500 return NULL; 3501 } 3502 3503 TAILQ_INIT(&vu_group->sqs); 3504 3505 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 3506 transport); 3507 pthread_mutex_lock(&vu_transport->pg_lock); 3508 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 3509 if (vu_transport->next_pg == NULL) { 3510 vu_transport->next_pg = vu_group; 3511 } 3512 pthread_mutex_unlock(&vu_transport->pg_lock); 3513 3514 if (!spdk_interrupt_mode_is_enabled()) { 3515 return &vu_group->group; 3516 } 3517 3518 /* 3519 * Only allow the poll group to work in interrupt mode if the transport 3520 * supports it. It's our responsibility to register the actual interrupt 3521 * later (in handle_queue_connect_rsp()) that processes everything in 3522 * the poll group: for us, that's the libvfio-user context, and the 3523 * actual qpairs. 3524 * 3525 * Note that this only works in the case that nothing else shares the 3526 * spdk_nvmf_poll_group. 3527 * 3528 * If not supported, this will effectively always wake up to poll the 3529 * poll group. 3530 */ 3531 3532 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 3533 transport); 3534 3535 if (!vu_transport->intr_mode_supported) { 3536 SPDK_WARNLOG("vfio-user interrupt mode not supported\n"); 3537 return &vu_group->group; 3538 } 3539 3540 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 3541 NULL); 3542 3543 return &vu_group->group; 3544 } 3545 3546 static struct spdk_nvmf_transport_poll_group * 3547 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 3548 { 3549 struct nvmf_vfio_user_transport *vu_transport; 3550 struct nvmf_vfio_user_poll_group **vu_group; 3551 struct nvmf_vfio_user_sq *sq; 3552 struct nvmf_vfio_user_cq *cq; 3553 3554 struct spdk_nvmf_transport_poll_group *result = NULL; 3555 3556 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 3557 cq = sq->ctrlr->cqs[sq->cqid]; 3558 assert(cq != NULL); 3559 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 3560 3561 pthread_mutex_lock(&vu_transport->pg_lock); 3562 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 3563 goto out; 3564 } 3565 3566 if (!nvmf_qpair_is_admin_queue(qpair)) { 3567 /* 3568 * If this is shared IO CQ case, just return the used CQ's poll 3569 * group, so I/O completions don't have to use 3570 * spdk_thread_send_msg(). 
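		 * (handle_queue_connect_rsp() otherwise has to bounce the CREATE IO
		 * SQ completion over to the CQ's thread with spdk_thread_send_msg().)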
3571 */ 3572 if (cq->group != NULL) { 3573 result = cq->group; 3574 goto out; 3575 } 3576 3577 /* 3578 * If we're in interrupt mode, align all qpairs for a controller 3579 * on the same poll group, to avoid complications in 3580 * vfio_user_handle_intr(). 3581 */ 3582 if (spdk_interrupt_mode_is_enabled() && 3583 vu_transport->intr_mode_supported) { 3584 result = sq->ctrlr->sqs[0]->group; 3585 goto out; 3586 } 3587 3588 } 3589 3590 vu_group = &vu_transport->next_pg; 3591 assert(*vu_group != NULL); 3592 3593 result = &(*vu_group)->group; 3594 *vu_group = TAILQ_NEXT(*vu_group, link); 3595 if (*vu_group == NULL) { 3596 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 3597 } 3598 3599 if (cq->group == NULL) { 3600 cq->group = result; 3601 } 3602 3603 out: 3604 pthread_mutex_unlock(&vu_transport->pg_lock); 3605 return result; 3606 } 3607 3608 /* called when process exits */ 3609 static void 3610 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 3611 { 3612 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup;; 3613 struct nvmf_vfio_user_transport *vu_transport; 3614 3615 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 3616 3617 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 3618 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 3619 transport); 3620 3621 pthread_mutex_lock(&vu_transport->pg_lock); 3622 next_tgroup = TAILQ_NEXT(vu_group, link); 3623 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 3624 if (next_tgroup == NULL) { 3625 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 3626 } 3627 if (vu_transport->next_pg == vu_group) { 3628 vu_transport->next_pg = next_tgroup; 3629 } 3630 pthread_mutex_unlock(&vu_transport->pg_lock); 3631 3632 free(vu_group); 3633 } 3634 3635 static void 3636 _vfio_user_qpair_disconnect(void *ctx) 3637 { 3638 struct nvmf_vfio_user_sq *sq = ctx; 3639 3640 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 3641 } 3642 3643 /* The function is used when socket connection is destroyed */ 3644 static int 3645 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 3646 { 3647 struct nvmf_vfio_user_sq *sq; 3648 struct nvmf_vfio_user_endpoint *endpoint; 3649 3650 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 3651 3652 endpoint = ctrlr->endpoint; 3653 assert(endpoint != NULL); 3654 3655 pthread_mutex_lock(&endpoint->lock); 3656 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 3657 endpoint->ctrlr = NULL; 3658 free_ctrlr(ctrlr); 3659 pthread_mutex_unlock(&endpoint->lock); 3660 return 0; 3661 } 3662 3663 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 3664 /* add another round thread poll to avoid recursive endpoint lock */ 3665 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 3666 } 3667 pthread_mutex_unlock(&endpoint->lock); 3668 3669 return 0; 3670 } 3671 3672 /* 3673 * Poll for and process any incoming vfio-user messages. 
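 *
 * Returns SPDK_POLLER_BUSY if any messages were processed or the controller
 * had to be torn down, SPDK_POLLER_IDLE otherwise.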
 */
static int
vfio_user_poll_vfu_ctx(void *ctx)
{
	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
	int ret;

	assert(ctrlr != NULL);

	/* This will call access_bar0_fn() if there are any reads or writes
	 * to the portion of the BAR that is not mmap'd */
	ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
	if (spdk_unlikely(ret == -1)) {
		if (errno == EBUSY) {
			return SPDK_POLLER_IDLE;
		}

		spdk_poller_unregister(&ctrlr->vfu_ctx_poller);

		/*
		 * We lost the client; the reset callback will already have
		 * unregistered the interrupt.
		 */
		if (errno == ENOTCONN) {
			vfio_user_destroy_ctrlr(ctrlr);
			return SPDK_POLLER_BUSY;
		}

		/*
		 * We might not have got a reset callback in this case, so
		 * explicitly unregister the interrupt here.
		 */
		spdk_interrupt_unregister(&ctrlr->intr);
		ctrlr->intr_fd = -1;
		fail_ctrlr(ctrlr);
	}

	return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

struct vfio_user_post_cpl_ctx {
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct nvmf_vfio_user_cq *cq;
	struct spdk_nvme_cpl cpl;
};

static void
_post_completion_msg(void *ctx)
{
	struct vfio_user_post_cpl_ctx *cpl_ctx = ctx;

	post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid,
			cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct);
	free(cpl_ctx);
}

static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group);

static int
vfio_user_handle_intr(void *ctx)
{
	struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
	int ret;

	assert(ctrlr != NULL);
	assert(ctrlr->sqs[0] != NULL);
	assert(ctrlr->sqs[0]->group != NULL);

	vfio_user_poll_vfu_ctx(ctrlr);

	/*
	 * See nvmf_vfio_user_get_optimal_poll_group() for why it's OK to only
	 * poll this poll group.
	 */
	ret = nvmf_vfio_user_poll_group_poll(ctrlr->sqs[0]->group);

	return ret != 0 ?
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 3751 } 3752 3753 static int 3754 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 3755 { 3756 struct nvmf_vfio_user_poll_group *vu_group; 3757 struct nvmf_vfio_user_sq *sq = cb_arg; 3758 struct nvmf_vfio_user_cq *cq; 3759 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 3760 struct nvmf_vfio_user_endpoint *endpoint; 3761 3762 assert(sq != NULL); 3763 assert(req != NULL); 3764 3765 vu_ctrlr = sq->ctrlr; 3766 assert(vu_ctrlr != NULL); 3767 endpoint = vu_ctrlr->endpoint; 3768 assert(endpoint != NULL); 3769 3770 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 3771 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 3772 endpoint->ctrlr = NULL; 3773 free_ctrlr(vu_ctrlr); 3774 return -1; 3775 } 3776 3777 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 3778 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 3779 3780 cq = vu_ctrlr->cqs[0]; 3781 assert(cq != NULL); 3782 3783 pthread_mutex_lock(&endpoint->lock); 3784 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3785 vu_ctrlr->cntlid = sq->qpair.ctrlr->cntlid; 3786 vu_ctrlr->thread = spdk_get_thread(); 3787 vu_ctrlr->ctrlr = sq->qpair.ctrlr; 3788 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3789 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, vu_ctrlr, 0); 3790 3791 cq->thread = spdk_get_thread(); 3792 3793 if (spdk_interrupt_mode_is_enabled() && 3794 endpoint->transport->intr_mode_supported) { 3795 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 3796 assert(vu_ctrlr->intr_fd != -1); 3797 3798 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 3799 vfio_user_handle_intr, 3800 vu_ctrlr); 3801 3802 assert(vu_ctrlr->intr != NULL); 3803 3804 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 3805 set_intr_mode_noop, 3806 vu_ctrlr); 3807 } 3808 } else { 3809 /* For I/O queues this command was generated in response to an 3810 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 3811 * been completed. Complete it now. 3812 */ 3813 if (sq->post_create_io_sq_completion) { 3814 assert(cq->thread != NULL); 3815 if (cq->thread != spdk_get_thread()) { 3816 struct vfio_user_post_cpl_ctx *cpl_ctx; 3817 3818 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 3819 if (!cpl_ctx) { 3820 return -ENOMEM; 3821 } 3822 cpl_ctx->ctrlr = vu_ctrlr; 3823 cpl_ctx->cq = cq; 3824 cpl_ctx->cpl.sqid = 0; 3825 cpl_ctx->cpl.cdw0 = 0; 3826 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 3827 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 3828 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 3829 3830 spdk_thread_send_msg(cq->thread, _post_completion_msg, cpl_ctx); 3831 } else { 3832 post_completion(vu_ctrlr, cq, 0, 0, 3833 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 3834 } 3835 sq->post_create_io_sq_completion = false; 3836 } 3837 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3838 } 3839 3840 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 3841 pthread_mutex_unlock(&endpoint->lock); 3842 3843 free(req->req.data); 3844 req->req.data = NULL; 3845 3846 return 0; 3847 } 3848 3849 /* 3850 * Add the given qpair to the given poll group. New qpairs are added via 3851 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 3852 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 3853 * nvmf_transport_poll_group_add(). 
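 *
 * Because a vfio-user guest never sends NVMe-oF fabrics commands itself, we
 * construct the fabric CONNECT command on its behalf here and execute it;
 * handle_queue_connect_rsp() finishes the job once it completes.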

/*
 * Add the given qpair to the given poll group. New qpairs are added via
 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via
 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via
 * nvmf_transport_poll_group_add().
 */
static int
nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
			      struct spdk_nvmf_qpair *qpair)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_req *vu_req;
	struct nvmf_vfio_user_ctrlr *ctrlr;
	struct spdk_nvmf_request *req;
	struct spdk_nvmf_fabric_connect_data *data;
	bool admin;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	sq->group = group;
	ctrlr = sq->ctrlr;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n",
		      ctrlr_id(ctrlr), sq->qpair.qid,
		      sq, qpair, group);

	admin = nvmf_qpair_is_admin_queue(&sq->qpair);

	vu_req = get_nvmf_vfio_user_req(sq);
	if (vu_req == NULL) {
		return -1;
	}

	req = &vu_req->req;
	req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
	req->cmd->connect_cmd.cid = 0;
	req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
	req->cmd->connect_cmd.recfmt = 0;
	req->cmd->connect_cmd.sqsize = sq->size - 1;
	req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;

	req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
	req->data = calloc(1, req->length);
	if (req->data == NULL) {
		nvmf_vfio_user_req_free(req);
		return -ENOMEM;
	}

	data = (struct spdk_nvmf_fabric_connect_data *)req->data;
	data->cntlid = ctrlr->cntlid;
	snprintf(data->subnqn, sizeof(data->subnqn), "%s",
		 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));

	vu_req->cb_fn = handle_queue_connect_rsp;
	vu_req->cb_arg = sq;

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: sending connect fabrics command for QID=%#x cntlid=%#x\n",
		      ctrlr_id(ctrlr), qpair->qid, data->cntlid);

	spdk_nvmf_request_exec_fabrics(req);
	return 0;
}

static int
nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
				 struct spdk_nvmf_qpair *qpair)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_poll_group *vu_group;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);

	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n",
		      ctrlr_id(sq->ctrlr), qpair->qid, qpair, group);

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
	TAILQ_REMOVE(&vu_group->sqs, sq, link);

	return 0;
}

static void
_nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req)
{
	memset(&vu_req->cmd, 0, sizeof(vu_req->cmd));
	memset(&vu_req->rsp, 0, sizeof(vu_req->rsp));
	vu_req->iovcnt = 0;
	vu_req->state = VFIO_USER_REQUEST_STATE_FREE;

	TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link);
}

static int
nvmf_vfio_user_req_free(struct spdk_nvmf_request *req)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_req *vu_req;

	assert(req != NULL);

	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);

	_nvmf_vfio_user_req_free(sq, vu_req);

	return 0;
}

static int
nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_req *vu_req;

	assert(req != NULL);

	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);

	if (vu_req->cb_fn != NULL) {
		if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) {
			fail_ctrlr(sq->ctrlr);
		}
	}

	_nvmf_vfio_user_req_free(sq, vu_req);

	return 0;
}
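
/*
 * Illustrative sketch (not part of the transport): the recycling pattern used
 * by the request free list above. Objects are taken from and returned to a
 * per-queue TAILQ so the command hot path never allocates; per-use state is
 * scrubbed on release, as _nvmf_vfio_user_req_free() does for the command and
 * response buffers. The example_* pool below is hypothetical.
 */
struct example_pool_item {
	TAILQ_ENTRY(example_pool_item) link;
	uint32_t scratch;
};

TAILQ_HEAD(example_pool, example_pool_item);

static inline struct example_pool_item *
example_pool_get(struct example_pool *pool)
{
	struct example_pool_item *item = TAILQ_FIRST(pool);

	if (item != NULL) {
		TAILQ_REMOVE(pool, item, link);
	}

	return item;
}

static inline void
example_pool_put(struct example_pool *pool, struct example_pool_item *item)
{
	/* Reset per-use state before making the item reusable. */
	item->scratch = 0;
	TAILQ_INSERT_TAIL(pool, item, link);
}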

static void
nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
			   spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
	struct nvmf_vfio_user_endpoint *endpoint;

	assert(qpair != NULL);
	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	vu_ctrlr = sq->ctrlr;
	endpoint = vu_ctrlr->endpoint;

	pthread_mutex_lock(&endpoint->lock);
	TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq);
	delete_sq_done(vu_ctrlr, sq);
	if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) {
		endpoint->ctrlr = NULL;
		free_ctrlr(vu_ctrlr);
	}
	pthread_mutex_unlock(&endpoint->lock);

	if (cb_fn) {
		cb_fn(cb_arg);
	}
}

/**
 * Returns a preallocated request, or NULL if there isn't one available.
 */
static struct nvmf_vfio_user_req *
get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_req *req;

	if (sq == NULL) {
		return NULL;
	}

	req = TAILQ_FIRST(&sq->free_reqs);
	if (req == NULL) {
		return NULL;
	}

	TAILQ_REMOVE(&sq->free_reqs, req, link);

	return req;
}

static int
get_nvmf_io_req_length(struct spdk_nvmf_request *req)
{
	uint16_t nr;
	uint32_t nlb, nsid;
	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
	struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
	struct spdk_nvmf_ns *ns;

	nsid = cmd->nsid;
	ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
	if (ns == NULL || ns->bdev == NULL) {
		SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid);
		return -EINVAL;
	}

	if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
		nr = cmd->cdw10_bits.dsm.nr + 1;
		return nr * sizeof(struct spdk_nvme_dsm_range);
	}

	nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
	return nlb * spdk_bdev_get_block_size(ns->bdev);
}
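
/*
 * Illustrative sketch (not part of the transport): the arithmetic used by
 * get_nvmf_io_req_length() above. NLB in CDW12 is zero-based, so an NLB field
 * of 7 on a 512-byte-block namespace means (7 + 1) * 512 = 4096 bytes, and a
 * DSM command with NR = 0 carries (0 + 1) range descriptors. The helper and
 * its block_size parameter are hypothetical.
 */
static inline uint32_t
example_rw_xfer_len(uint32_t cdw12, uint32_t block_size)
{
	/* Bits 15:0 of CDW12 hold the zero-based Number of Logical Blocks. */
	uint32_t nlb = (cdw12 & 0xffffu) + 1;

	return nlb * block_size;
}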

static int
map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
	struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
	uint32_t len = 0;
	uint8_t fid;
	int iovcnt;

	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
	req->length = 0;
	req->data = NULL;

	if (req->xfer == SPDK_NVME_DATA_NONE) {
		return 0;
	}

	switch (cmd->opc) {
	case SPDK_NVME_OPC_IDENTIFY:
		len = 4096;
		break;
	case SPDK_NVME_OPC_GET_LOG_PAGE:
		len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4;
		break;
	case SPDK_NVME_OPC_GET_FEATURES:
	case SPDK_NVME_OPC_SET_FEATURES:
		fid = cmd->cdw10_bits.set_features.fid;
		switch (fid) {
		case SPDK_NVME_FEAT_LBA_RANGE_TYPE:
			len = 4096;
			break;
		case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
			len = 256;
			break;
		case SPDK_NVME_FEAT_TIMESTAMP:
			len = 8;
			break;
		case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
			len = 512;
			break;
		case SPDK_NVME_FEAT_HOST_IDENTIFIER:
			if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) {
				len = 16;
			} else {
				len = 8;
			}
			break;
		default:
			return 0;
		}
		break;
	default:
		return 0;
	}

	/* Admin commands will not use SGLs (PSDT must be zero). */
	if (cmd->psdt != 0) {
		return -EINVAL;
	}

	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len);
	if (iovcnt < 0) {
		SPDK_ERRLOG("%s: map Admin Opc %x failed\n",
			    ctrlr_id(ctrlr), cmd->opc);
		return -1;
	}
	req->length = len;
	req->data = req->iov[0].iov_base;
	req->iovcnt = iovcnt;

	return 0;
}
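
/*
 * Illustrative sketch (not part of the transport): how the Get Log Page
 * length above is derived. NUMDU and NUMDL combine into a zero-based dword
 * count, so NUMDL = 1023 with NUMDU = 0 requests (1023 + 1) * 4 = 4096 bytes.
 * The helper below is a standalone restatement of that formula.
 */
static inline uint32_t
example_get_log_page_len(uint16_t numdu, uint16_t numdl)
{
	uint32_t numd = ((uint32_t)numdu << 16) | numdl;

	/* NUMD is zero-based and counts dwords (4 bytes each). */
	return (numd + 1) * 4;
}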

/*
 * Map an I/O command's buffers.
 *
 * Returns 0 on success and -errno on failure.
 */
static int
map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
	int len, iovcnt;
	struct spdk_nvme_cmd *cmd;

	assert(ctrlr != NULL);
	assert(req != NULL);

	cmd = &req->cmd->nvme_cmd;
	req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
	req->length = 0;
	req->data = NULL;

	if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
		return 0;
	}

	len = get_nvmf_io_req_length(req);
	if (len < 0) {
		return -EINVAL;
	}
	req->length = len;

	iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length);
	if (iovcnt < 0) {
		SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc);
		return -EFAULT;
	}
	req->data = req->iov[0].iov_base;
	req->iovcnt = iovcnt;

	return 0;
}

static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
	       struct nvmf_vfio_user_sq *sq)
{
	int err;
	struct nvmf_vfio_user_req *vu_req;
	struct spdk_nvmf_request *req;

	assert(ctrlr != NULL);
	assert(cmd != NULL);

	vu_req = get_nvmf_vfio_user_req(sq);
	if (spdk_unlikely(vu_req == NULL)) {
		SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc);
		return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid,
				       SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC);
	}
	req = &vu_req->req;

	assert(req->qpair != NULL);
	SPDK_DEBUGLOG(nvmf_vfio, "%s: handle qid%u, req opc=%#x cid=%d\n",
		      ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid);

	vu_req->cb_fn = handle_cmd_rsp;
	vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);
	req->cmd->nvme_cmd = *cmd;

	if (nvmf_qpair_is_admin_queue(req->qpair)) {
		err = map_admin_cmd_req(ctrlr, req);
	} else {
		switch (cmd->opc) {
		case SPDK_NVME_OPC_RESERVATION_REGISTER:
		case SPDK_NVME_OPC_RESERVATION_REPORT:
		case SPDK_NVME_OPC_RESERVATION_ACQUIRE:
		case SPDK_NVME_OPC_RESERVATION_RELEASE:
			err = -ENOTSUP;
			break;
		default:
			err = map_io_cmd_req(ctrlr, req);
			break;
		}
	}

	if (spdk_unlikely(err < 0)) {
		SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n",
			    ctrlr_id(ctrlr), cmd->opc);
		req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
		err = handle_cmd_rsp(vu_req, vu_req->cb_arg);
		_nvmf_vfio_user_req_free(sq, vu_req);
		return err;
	}

	vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING;
	spdk_nvmf_request_exec(req);

	return 0;
}

/*
 * Returns the number of commands processed, or a negative value on error.
 */
static int
nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;
	uint32_t new_tail;
	int count = 0;

	assert(sq != NULL);

	ctrlr = sq->ctrlr;

	/* On aarch64 platforms, doorbell updates from the guest VM may not be
	 * visible to the SPDK target because of mismatched memory types: the
	 * guest maps the doorbells as device memory, while the target treats
	 * them as normal memory, which causes problems on ARM. See
	 * https://developer.arm.com/documentation/102376/0100/Memory-aliasing-and-mismatched-memory-types
	 * Using spdk_mb() alone cannot fix this; invalidating the cache
	 * (as "dc civac" does) may.
	 */
	spdk_ivdt_dcache(sq_dbl_tailp(ctrlr, sq));

	/* Load-Acquire. */
	new_tail = *sq_dbl_tailp(ctrlr, sq);

	/*
	 * Ensure that changes to the queue are visible to us.
	 * The host driver should write the queue first, do a wmb(), and then
	 * update the SQ tail doorbell (their Store-Release).
	 */
	spdk_rmb();

	new_tail = new_tail & 0xffffu;
	if (spdk_unlikely(new_tail >= sq->size)) {
		union spdk_nvme_async_event_completion event = {};

		SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid SQ%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid,
			      new_tail);
		event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR;
		event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE;
		nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event);

		return 0;
	}

	if (*sq_headp(sq) == new_tail) {
		return 0;
	}

	count = handle_sq_tdbl_write(ctrlr, new_tail, sq);
	if (count < 0) {
		fail_ctrlr(ctrlr);
	}

	return count;
}
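
/*
 * Illustrative sketch (not part of the transport): the publish/consume
 * contract that nvmf_vfio_user_sq_poll() depends on, restated on a toy ring.
 * The producer writes an entry, issues a write barrier, then publishes the
 * new tail; the consumer reads the tail, issues a read barrier, then reads
 * entries up to that tail. The example_ring type is hypothetical.
 */
struct example_ring {
	volatile uint32_t tail;
	uint32_t entries[16];
};

static inline void
example_ring_produce(struct example_ring *ring, uint32_t entry)
{
	uint32_t tail = ring->tail;

	ring->entries[tail] = entry;

	/* Make the entry visible before publishing the new tail, mirroring
	 * the guest driver's wmb() before its doorbell write. */
	spdk_wmb();

	ring->tail = (tail + 1) % SPDK_COUNTOF(ring->entries);
}

static inline uint32_t
example_ring_consume(struct example_ring *ring, uint32_t *out, uint32_t head)
{
	uint32_t tail = ring->tail;
	uint32_t n = 0;

	/* Pairs with the producer's spdk_wmb(): entries up to the observed
	 * tail are only read after this barrier, like the spdk_rmb() above. */
	spdk_rmb();

	while (head != tail) {
		out[n++] = ring->entries[head];
		head = (head + 1) % SPDK_COUNTOF(ring->entries);
	}

	return n;
}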

/*
 * vfio-user transport poll handler. Note that the library context is polled in
 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
 * active qpairs.
 *
 * Returns the number of commands processed, or a negative value on error.
 */
static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_sq *sq, *tmp;
	int count = 0;

	assert(group != NULL);

	spdk_rmb();

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) {
		int ret;

		if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
			continue;
		}

		ret = nvmf_vfio_user_sq_poll(sq);

		if (ret < 0) {
			return ret;
		}

		count += ret;
	}

	return count;
}

static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
				    struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvme_transport_id *trid)
{
	return 0;
}

static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
				     struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static void
nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_request *req_to_abort = NULL;
	struct spdk_nvmf_request *temp_req = NULL;
	uint16_t cid;

	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;

	TAILQ_FOREACH(temp_req, &qpair->outstanding, link) {
		struct nvmf_vfio_user_req *vu_req;

		vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req);

		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
			req_to_abort = temp_req;
			break;
		}
	}

	if (req_to_abort == NULL) {
		spdk_nvmf_request_complete(req);
		return;
	}

	req->req_to_abort = req_to_abort;
	nvmf_ctrlr_abort_request(req);
}

static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
	opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
	opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
	opts->in_capsule_data_size = 0;
	opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
	opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
	opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
	opts->num_shared_buffers = 0;
	opts->buf_cache_size = 0;
	opts->association_timeout = 0;
	opts->transport_specific = NULL;
}

const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
	.name = "VFIOUSER",
	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
	.opts_init = nvmf_vfio_user_opts_init,
	.create = nvmf_vfio_user_create,
	.destroy = nvmf_vfio_user_destroy,

	.listen = nvmf_vfio_user_listen,
	.stop_listen = nvmf_vfio_user_stop_listen,
	.cdata_init = nvmf_vfio_user_cdata_init,
	.listen_associate = nvmf_vfio_user_listen_associate,

	.listener_discover = nvmf_vfio_user_discover,

	.poll_group_create = nvmf_vfio_user_poll_group_create,
	.get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group,
	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
	.poll_group_add = nvmf_vfio_user_poll_group_add,
	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
	.poll_group_poll = nvmf_vfio_user_poll_group_poll,

	.req_free = nvmf_vfio_user_req_free,
	.req_complete = nvmf_vfio_user_req_complete,

	.qpair_fini = nvmf_vfio_user_close_qpair,
	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,
};

SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
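
/*
 * Illustrative usage sketch (application side, not part of this file): the
 * transport registered above is selected by its name, "VFIOUSER". Assuming an
 * SPDK tree that provides spdk_nvmf_transport_opts_init() and the synchronous
 * spdk_nvmf_transport_create(), an application could create it with the
 * defaults from nvmf_vfio_user_opts_init() roughly as follows:
 *
 *	struct spdk_nvmf_transport_opts opts = {};
 *	struct spdk_nvmf_transport *transport;
 *
 *	if (!spdk_nvmf_transport_opts_init("VFIOUSER", &opts, sizeof(opts))) {
 *		return;
 *	}
 *	transport = spdk_nvmf_transport_create("VFIOUSER", &opts);
 *
 * The transport would then be attached to a target with
 * spdk_nvmf_tgt_add_transport(), typically driven through the target
 * application's RPCs rather than called directly.
 */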