/*-
 *   BSD LICENSE
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019-2022, Nutanix Inc. All rights reserved.
 *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NVMe over vfio-user transport
 */

#include <vfio-user/libvfio-user.h>
#include <vfio-user/pci_defs.h>

#include "spdk/barrier.h"
#include "spdk/stdinc.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf_transport.h"
#include "spdk/sock.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"

#include "transport.h"

#include "nvmf_internal.h"

#define SWAP(x, y)                  \
	do                          \
	{                           \
		typeof(x) _tmp = x; \
		x = y;              \
		y = _tmp;           \
	} while (0)

#define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
#define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
#define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
#define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE

#define NVME_DOORBELLS_OFFSET 0x1000
#define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000
#define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2
#define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3
#define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX

/*
 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space
 * available on PCI-X 2.0 and PCI Express buses
 */
#define NVME_REG_CFG_SIZE 0x1000
#define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE)
#define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8)
#define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR
/* MSIX Table Size */
#define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000)
/* MSIX Pending Bit Array Size */
#define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000)

#define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR \
	(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4)

struct nvmf_vfio_user_req;

typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);

/* 1 more for PRP2 list itself */
#define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1)

enum nvmf_vfio_user_req_state {
	VFIO_USER_REQUEST_STATE_FREE = 0,
	VFIO_USER_REQUEST_STATE_EXECUTING,
};

/* NVMe device state representation */
struct nvme_migr_sq_state {
	uint16_t sqid;
	uint16_t cqid;
	uint32_t head;
	uint32_t size;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size");

struct nvme_migr_cq_state {
	uint16_t cqid;
	uint16_t phase;
	uint32_t tail;
	uint32_t size;
	uint32_t iv;
	uint32_t ien;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size");

#define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23

/* The device state is in the VFIO MIGRATION BAR(9) region; keep the device state page aligned.
 *
 * The NVMe device migration region is laid out as below:
 * -------------------------------------------------------------------------
 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs |
 * -------------------------------------------------------------------------
 *
 * Keep vfio_user_nvme_migr_header at a fixed 0x1000-byte length; newly added
 * fields should use the reserved space at the end of the data structure.
 */
struct vfio_user_nvme_migr_header {
	/* Magic value to validate migration data */
	uint32_t magic;
	/* Version to check that the data is the same from source to destination */
	uint32_t version;

	/* The library uses this field to know how many fields in this
	 * structure are valid, starting at the beginning of this data
	 * structure. Fields added in the future should use the `unused`
	 * memory space.
	 */
	uint32_t opts_size;
	uint32_t reserved0;

	/* BARs information */
	uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS];
	uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS];

	/* Queue pair start offset, starting at the beginning of this
	 * data structure.
	 */
	uint64_t qp_offset;
	uint64_t qp_len;

	/* Controller data structure */
	uint32_t num_io_queues;
	uint32_t reserved1;

	/* TODO: this part will be moved to common nvmf controller data */
	uint16_t reserved2[3];
	uint16_t nr_aers;
	uint16_t aer_cids[NVMF_MIGR_MAX_PENDING_AERS];

	/* NVMf controller data offset and length, if present, starting at
	 * the beginning of this data structure.
	 */
	uint64_t nvmf_data_offset;
	uint64_t nvmf_data_len;

	/*
	 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA
	 * address.
	 */
	bool sdbl;

	/* Shadow doorbell DMA addresses. */
	uint64_t shadow_doorbell_buffer;
	uint64_t eventidx_buffer;

	/* Reserved memory space for fields added in the future. This area is
	 * always at the end of this data structure.
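	 * New fields taking space from here should be covered by a larger `opts_size`.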
	 */
	uint8_t unused[3336];
};
SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");

struct vfio_user_nvme_migr_qp {
	struct nvme_migr_sq_state sq;
	struct nvme_migr_cq_state cq;
};

/* NVMe state definition used to load/restore from/to NVMe migration BAR region */
struct vfio_user_nvme_migr_state {
	struct vfio_user_nvme_migr_header ctrlr_header;
	struct nvmf_ctrlr_migr_data nvmf_data;
	struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	uint8_t bar0[NVME_REG_BAR0_SIZE];
	uint8_t cfg[NVME_REG_CFG_SIZE];
};

struct nvmf_vfio_user_req {
	struct spdk_nvmf_request req;
	struct spdk_nvme_cpl rsp;
	struct spdk_nvme_cmd cmd;

	enum nvmf_vfio_user_req_state state;
	nvmf_vfio_user_req_cb_fn cb_fn;
	void *cb_arg;

	/* old CC before prop_set_cc fabric command */
	union spdk_nvme_cc_register cc;

	TAILQ_ENTRY(nvmf_vfio_user_req) link;

	struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS];
	uint8_t iovcnt;

	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
	uint8_t sg[];
};

/*
 * Mapping of an NVMe queue.
 *
 * This holds the information tracking a local process mapping of an NVMe queue
 * shared by the client.
 */
struct nvme_q_mapping {
	/* iov of local process mapping. */
	struct iovec iov;
	/* Stored sg, needed for unmap. */
	dma_sg_t *sg;
	/* Client PRP of queue. */
	uint64_t prp1;
};

enum nvmf_vfio_user_sq_state {
	VFIO_USER_SQ_UNUSED = 0,
	VFIO_USER_SQ_CREATED,
	VFIO_USER_SQ_DELETED,
	VFIO_USER_SQ_ACTIVE,
	VFIO_USER_SQ_INACTIVE
};

enum nvmf_vfio_user_cq_state {
	VFIO_USER_CQ_UNUSED = 0,
	VFIO_USER_CQ_CREATED,
	VFIO_USER_CQ_DELETED,
};

enum nvmf_vfio_user_ctrlr_state {
	VFIO_USER_CTRLR_CREATING = 0,
	VFIO_USER_CTRLR_RUNNING,
	/* Quiesce requested by libvfio-user */
	VFIO_USER_CTRLR_PAUSING,
	/* NVMf subsystem is paused; it's safe to do PCI reset, memory register,
	 * memory unregister, and vfio migration state transitions in this state.
	 */
	VFIO_USER_CTRLR_PAUSED,
	/*
	 * Implies that the NVMf subsystem is paused. The device will be unquiesced (PCI
	 * reset, memory register and unregister, controller in destination VM has
	 * been restored). NVMf subsystem resume has been requested.
	 */
	VFIO_USER_CTRLR_RESUMING,
	/*
	 * Implies that the NVMf subsystem is paused. Both the controller in the
	 * source VM and the controller in the destination VM are in this state
	 * when doing live migration.
	 */
	VFIO_USER_CTRLR_MIGRATING
};

/* Migration region to record NVMe device state data structure */
struct vfio_user_migration_region {
	uint64_t last_data_offset;
	uint64_t pending_bytes;
};

struct nvmf_vfio_user_sq {
	struct spdk_nvmf_qpair qpair;
	struct spdk_nvmf_transport_poll_group *group;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_sq_state sq_state;

	uint32_t head;
	volatile uint32_t *dbl_tailp;

	/* Whether a shadow doorbell eventidx needs setting.
	 */
	bool need_rearm;

	/* multiple SQs can be mapped to the same CQ */
	uint16_t cqid;

	/* handle_queue_connect_rsp() can be used both for the CREATE IO SQ response
	 * and the SQ re-connect response in the destination VM. In the former case
	 * we post an NVMe completion to the VM; we do not set this flag when
	 * re-connecting SQs in the destination VM.
	 */
	bool post_create_io_sq_completion;
	/* Copy of the Create IO SQ command; this field is used together with the
	 * `post_create_io_sq_completion` flag.
	 */
	struct spdk_nvme_cmd create_io_sq_cmd;

	/* Currently unallocated reqs. */
	TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs;
	/* Poll group entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) link;
	/* Connected SQ entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) tailq;
};

struct nvmf_vfio_user_cq {
	struct spdk_nvmf_transport_poll_group *group;
	struct spdk_thread *thread;
	uint32_t cq_ref;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_cq_state cq_state;

	uint32_t tail;
	volatile uint32_t *dbl_headp;

	bool phase;

	uint16_t iv;
	bool ien;

	uint32_t last_head;
	uint32_t last_trigger_irq_tail;
};

struct nvmf_vfio_user_poll_group {
	struct spdk_nvmf_transport_poll_group group;
	TAILQ_ENTRY(nvmf_vfio_user_poll_group) link;
	TAILQ_HEAD(, nvmf_vfio_user_sq) sqs;
};

struct nvmf_vfio_user_shadow_doorbells {
	volatile uint32_t *shadow_doorbells;
	volatile uint32_t *eventidxs;
	dma_sg_t *sgs;
	struct iovec *iovs;
};

struct nvmf_vfio_user_ctrlr {
	struct nvmf_vfio_user_endpoint *endpoint;
	struct nvmf_vfio_user_transport *transport;

	/* Connected SQs list */
	TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs;
	enum nvmf_vfio_user_ctrlr_state state;

	struct vfio_user_migration_region migr_reg;
	/* Controller is in source VM when doing live migration */
	bool in_source_vm;

	struct spdk_thread *thread;
	struct spdk_poller *vfu_ctx_poller;
	struct spdk_interrupt *intr;
	int intr_fd;

	bool queued_quiesce;

	bool reset_shn;

	uint16_t cntlid;
	struct spdk_nvmf_ctrlr *ctrlr;

	struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];

	TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link;

	volatile uint32_t *bar0_doorbells;
	struct nvmf_vfio_user_shadow_doorbells *sdbl;
	/*
	 * Shadow doorbells PRPs to provide during the stop-and-copy state.
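	 * These are the guest-supplied Doorbell Buffer Config PRPs, stashed so
	 * they can be restored on the migration destination.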
	 */
	uint64_t shadow_doorbell_buffer;
	uint64_t eventidx_buffer;

	bool self_kick_requested;
};

struct nvmf_vfio_user_endpoint {
	struct nvmf_vfio_user_transport *transport;
	vfu_ctx_t *vfu_ctx;
	struct spdk_poller *accept_poller;
	struct spdk_thread *accept_thread;
	bool interrupt_mode;
	struct msixcap *msix;
	vfu_pci_config_space_t *pci_config_space;
	int devmem_fd;
	int accept_intr_fd;
	struct spdk_interrupt *accept_intr;

	volatile uint32_t *bar0_doorbells;

	int migr_fd;
	void *migr_data;

	struct spdk_nvme_transport_id trid;
	const struct spdk_nvmf_subsystem *subsystem;

	struct nvmf_vfio_user_ctrlr *ctrlr;
	pthread_mutex_t lock;

	bool need_async_destroy;

	TAILQ_ENTRY(nvmf_vfio_user_endpoint) link;
};

struct nvmf_vfio_user_transport_opts {
	bool disable_mappable_bar0;
	bool disable_adaptive_irq;
	bool disable_shadow_doorbells;
};

struct nvmf_vfio_user_transport {
	struct spdk_nvmf_transport transport;
	struct nvmf_vfio_user_transport_opts transport_opts;
	bool intr_mode_supported;
	pthread_mutex_t lock;
	TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints;

	pthread_mutex_t pg_lock;
	TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups;
	struct nvmf_vfio_user_poll_group *next_pg;
};

/*
 * function prototypes
 */
static int
nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);

static struct nvmf_vfio_user_req *
get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq);

/*
 * Local process virtual address of a queue.
 */
static inline void *
q_addr(struct nvme_q_mapping *mapping)
{
	return mapping->iov.iov_base;
}

static inline int
queue_index(uint16_t qid, bool is_cq)
{
	return (qid * 2) + is_cq;
}

static inline volatile uint32_t *
sq_headp(struct nvmf_vfio_user_sq *sq)
{
	assert(sq != NULL);
	return &sq->head;
}

static inline volatile uint32_t *
sq_dbl_tailp(struct nvmf_vfio_user_sq *sq)
{
	assert(sq != NULL);
	return sq->dbl_tailp;
}

static inline volatile uint32_t *
cq_dbl_headp(struct nvmf_vfio_user_cq *cq)
{
	assert(cq != NULL);
	return cq->dbl_headp;
}

static inline volatile uint32_t *
cq_tailp(struct nvmf_vfio_user_cq *cq)
{
	assert(cq != NULL);
	return &cq->tail;
}

static inline void
sq_head_advance(struct nvmf_vfio_user_sq *sq)
{
	assert(sq != NULL);

	assert(*sq_headp(sq) < sq->size);
	(*sq_headp(sq))++;

	if (spdk_unlikely(*sq_headp(sq) == sq->size)) {
		*sq_headp(sq) = 0;
	}
}

static inline void
cq_tail_advance(struct nvmf_vfio_user_cq *cq)
{
	assert(cq != NULL);

	assert(*cq_tailp(cq) < cq->size);
	(*cq_tailp(cq))++;

	if (spdk_unlikely(*cq_tailp(cq) == cq->size)) {
		*cq_tailp(cq) = 0;
		cq->phase = !cq->phase;
	}
}

static inline bool
cq_is_full(struct nvmf_vfio_user_cq *cq)
{
	uint32_t qindex;

	assert(cq != NULL);

	qindex = *cq_tailp(cq) + 1;
	if (spdk_unlikely(qindex == cq->size)) {
		qindex = 0;
	}

	return qindex == *cq_dbl_headp(cq);
}

static bool
io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq)
{
	assert(vu_ctrlr != NULL);

	if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) {
		return false;
	}

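	/*
	 * A queue exists only if its slot is allocated and it is neither in the
	 * UNUSED (never created) nor the DELETED state.
	 */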
	if (is_cq) {
		if (vu_ctrlr->cqs[qid] == NULL) {
			return false;
		}

		return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED &&
			vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED);
	}

	if (vu_ctrlr->sqs[qid] == NULL) {
		return false;
	}

	return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED &&
		vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED);
}

static inline size_t
vfio_user_migr_data_len(void)
{
	return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE);
}

static int
vfio_user_handle_intr(void *ctx);

/*
 * Wrap vfio_user_handle_intr() such that it can be used with
 * spdk_thread_send_msg().
 * Pollers have type int (*)(void *) while message functions should have type
 * void (*)(void *), so simply discard the returned value.
 */
static void
vfio_user_handle_intr_wrapper(void *ctx)
{
	vfio_user_handle_intr(ctx);
}

static inline int
self_kick(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	assert(ctrlr != NULL);
	assert(ctrlr->thread != NULL);

	if (ctrlr->self_kick_requested) {
		return 0;
	}

	ctrlr->self_kick_requested = true;

	return spdk_thread_send_msg(ctrlr->thread,
				    vfio_user_handle_intr_wrapper,
				    ctrlr);
}

/*
 * Make the given DMA address and length available (locally mapped) via iov.
 */
static void *
map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg,
	struct iovec *iov, int prot)
{
	int ret;

	assert(ctx != NULL);
	assert(sg != NULL);
	assert(iov != NULL);

	ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot);
	if (ret < 0) {
		return NULL;
	}

	ret = vfu_map_sg(ctx, sg, iov, 1, 0);
	if (ret != 0) {
		return NULL;
	}

	assert(iov->iov_base != NULL);
	return iov->iov_base;
}

static int
nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
		  uint32_t max_iovcnt, uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint64_t prp1, prp2;
	void *vva;
	uint32_t i;
	uint32_t residue_len, nents;
	uint64_t *prp_list;
	uint32_t iovcnt;

	assert(max_iovcnt > 0);

	prp1 = cmd->dptr.prp.prp1;
	prp2 = cmd->dptr.prp.prp2;

	/* PRP1 may start with an unaligned page address */
	residue_len = mps - (prp1 % mps);
	residue_len = spdk_min(len, residue_len);

	vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE);
	if (spdk_unlikely(vva == NULL)) {
		SPDK_ERRLOG("GPA to VVA failed\n");
		return -EINVAL;
	}
	len -= residue_len;
	if (len && max_iovcnt < 2) {
		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
		return -ERANGE;
	}
	iovs[0].iov_base = vva;
	iovs[0].iov_len = residue_len;

	if (len) {
		if (spdk_unlikely(prp2 == 0)) {
			SPDK_ERRLOG("no PRP2, %d remaining\n", len);
			return -EINVAL;
		}

		if (len <= mps) {
			/* 2 PRP used */
			iovcnt = 2;
			vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n",
					    prp2, len);
				return -EINVAL;
			}
			iovs[1].iov_base = vva;
			iovs[1].iov_len = len;
		} else {
			/* PRP list used */
			nents = (len + mps - 1) / mps;
			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
				SPDK_ERRLOG("Too many page entries\n");
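				/*
				 * The nents data pages plus the initial
				 * (possibly unaligned) PRP1 chunk need more
				 * iovecs than the caller supplied.
				 */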
				return -ERANGE;
			}

			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
					    prp2, nents);
				return -EINVAL;
			}
			prp_list = vva;
			i = 0;
			while (len != 0) {
				residue_len = spdk_min(len, mps);
				vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE);
				if (spdk_unlikely(vva == NULL)) {
					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
						    prp_list[i], residue_len);
					return -EINVAL;
				}
				iovs[i + 1].iov_base = vva;
				iovs[i + 1].iov_len = residue_len;
				len -= residue_len;
				i++;
			}
			iovcnt = i + 1;
		}
	} else {
		/* 1 PRP used */
		iovcnt = 1;
	}

	assert(iovcnt <= max_iovcnt);
	return iovcnt;
}

static int
nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
		       struct iovec *iovs, uint32_t max_iovcnt,
		       void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint32_t i;
	void *vva;

	if (spdk_unlikely(max_iovcnt < num_sgls)) {
		return -ERANGE;
	}

	for (i = 0; i < num_sgls; i++) {
		if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type);
			return -EINVAL;
		}
		vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[i].iov_base = vva;
		iovs[i].iov_len = sgls[i].unkeyed.length;
	}

	return num_sgls;
}

static int
nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
		  uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	struct spdk_nvme_sgl_descriptor *sgl, *last_sgl;
	uint32_t num_sgls, seg_len;
	void *vva;
	int ret;
	uint32_t total_iovcnt = 0;

	/* SGL cases */
	sgl = &cmd->dptr.sgl1;

	/* only one SGL segment */
	if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
		assert(max_iovcnt > 0);
		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[0].iov_base = vva;
		iovs[0].iov_len = sgl->unkeyed.length;
		assert(sgl->unkeyed.length == len);

		return 1;
	}

	for (;;) {
		if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) &&
				  (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type);
			return -EINVAL;
		}

		seg_len = sgl->unkeyed.length;
		if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) {
			SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len);
			return -EINVAL;
		}

		num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor);
		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}

		/* sgl points to the first segment */
		sgl = (struct spdk_nvme_sgl_descriptor *)vva;
		last_sgl = &sgl[num_sgls - 1];

		/* we are done */
		if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
			/* map whole sgl list */
			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls,
						     &iovs[total_iovcnt],
						     max_iovcnt - total_iovcnt, gpa_to_vva);
			if (spdk_unlikely(ret < 0)) {
				return ret;
			}
			total_iovcnt += ret;

			return total_iovcnt;
		}

		if (num_sgls > 1) {
			/* map whole sgl excluding last_sgl */
			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt],
						     max_iovcnt - total_iovcnt, gpa_to_vva);
			if (spdk_unlikely(ret < 0)) {
				return ret;
			}
			total_iovcnt += ret;
		}

		/* move to next level's segments */
		sgl = last_sgl;
	}

	return 0;
}

static int
nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
	     uint32_t len, size_t mps,
	     void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	if (cmd->psdt == SPDK_NVME_PSDT_PRP) {
		return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
	}

	return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
}

static char *
endpoint_id(struct nvmf_vfio_user_endpoint *endpoint)
{
	return endpoint->trid.traddr;
}

static char *
ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	if (!ctrlr || !ctrlr->endpoint) {
		return "Null Ctrlr";
	}

	return endpoint_id(ctrlr->endpoint);
}

/*
 * For each queue, update the location of its doorbell to the correct location:
 * either our own BAR0, or the guest's configured shadow doorbell area.
 *
 * The Admin queue (qid: 0) does not ever use shadow doorbells.
 */
static void
vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow)
{
	volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells :
				       ctrlr->bar0_doorbells;

	assert(doorbells != NULL);

	for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
		struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i];
		struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i];

		if (sq != NULL) {
			sq->dbl_tailp = doorbells + queue_index(sq->qid, false);
		}

		if (cq != NULL) {
			cq->dbl_headp = doorbells + queue_index(cq->qid, true);
		}
	}
}

static void
unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl)
{
	assert(vfu_ctx != NULL);
	assert(sdbl != NULL);

	/*
	 * An allocation error would result in only one of the two being
	 * non-NULL. If that is the case, no memory should have been mapped.
	 */
	if (sdbl->iovs == NULL || sdbl->sgs == NULL) {
		return;
	}

	for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) {
		struct iovec *iov;
		dma_sg_t *sg;

		if (!sdbl->iovs[i].iov_len) {
			continue;
		}

		sg = (dma_sg_t *)((uintptr_t)sdbl->sgs + i * dma_sg_size());
		iov = sdbl->iovs + i;

		vfu_unmap_sg(vfu_ctx, sg, iov, 1);
	}
}

static void
free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl)
{
	if (sdbl == NULL) {
		return;
	}

	unmap_sdbl(vfu_ctx, sdbl);

	/*
	 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped,
	 * not allocated, so don't free() them.
	 */
	free(sdbl->sgs);
	free(sdbl->iovs);
	free(sdbl);
}

static struct nvmf_vfio_user_shadow_doorbells *
map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len)
{
	struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL;
	dma_sg_t *sg2 = NULL;
	void *p;

	assert(vfu_ctx != NULL);

	sdbl = calloc(1, sizeof(*sdbl));
	if (sdbl == NULL) {
		goto err;
	}

	sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size());
	sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs));
	if (sdbl->sgs == NULL || sdbl->iovs == NULL) {
		goto err;
	}

	/* Map shadow doorbell buffer (PRP1). */
	p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs,
		    PROT_READ | PROT_WRITE);

	if (p == NULL) {
		goto err;
	}

	/*
	 * Map eventidx buffer (PRP2).
	 * Should only be written to by the controller.
	 */

	sg2 = (dma_sg_t *)((uintptr_t)sdbl->sgs + dma_sg_size());

	p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1,
		    PROT_READ | PROT_WRITE);

	if (p == NULL) {
		goto err;
	}

	sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base;
	sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base;

	return sdbl;

err:
	free_sdbl(vfu_ctx, sdbl);
	return NULL;
}

/*
 * Copy doorbells from one buffer to the other, during switches between BAR0
 * doorbells and shadow doorbells.
 */
static void
copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr,
	       const volatile uint32_t *from, volatile uint32_t *to)
{
	assert(ctrlr != NULL);
	assert(from != NULL);
	assert(to != NULL);

	SPDK_DEBUGLOG(vfio_user_db,
		      "%s: migrating shadow doorbells from %p to %p\n",
		      ctrlr_id(ctrlr), from, to);

	/* Can't use memcpy because it doesn't respect volatile semantics.
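	 * Copy the doorbell values one 32-bit word at a time instead.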
	 */
	for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) {
		if (ctrlr->sqs[i] != NULL) {
			to[queue_index(i, false)] = from[queue_index(i, false)];
		}

		if (ctrlr->cqs[i] != NULL) {
			to[queue_index(i, true)] = from[queue_index(i, true)];
		}
	}
}

static void
fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
{
	const struct spdk_nvmf_registers *regs;

	assert(vu_ctrlr != NULL);
	assert(vu_ctrlr->ctrlr != NULL);

	regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr);
	if (regs->csts.bits.cfs == 0) {
		SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr));
	}

	nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr);
}

static inline bool
ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
{
	assert(vu_ctrlr != NULL);
	assert(vu_ctrlr->endpoint != NULL);

	vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space;

	return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe);
}

static void
nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint)
{
	SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint));

	spdk_interrupt_unregister(&endpoint->accept_intr);
	spdk_poller_unregister(&endpoint->accept_poller);

	if (endpoint->bar0_doorbells) {
		munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
	}

	if (endpoint->devmem_fd > 0) {
		close(endpoint->devmem_fd);
	}

	if (endpoint->migr_data) {
		munmap(endpoint->migr_data, vfio_user_migr_data_len());
	}

	if (endpoint->migr_fd > 0) {
		close(endpoint->migr_fd);
	}

	if (endpoint->vfu_ctx) {
		vfu_destroy_ctx(endpoint->vfu_ctx);
	}

	pthread_mutex_destroy(&endpoint->lock);
	free(endpoint);
}

/* called when process exits */
static int
nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport,
		       spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;

	SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n");

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_destroy(&vu_transport->lock);
	pthread_mutex_destroy(&vu_transport->pg_lock);

	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
		TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
		nvmf_vfio_user_destroy_endpoint(endpoint);
	}

	free(vu_transport);

	if (cb_fn) {
		cb_fn(cb_arg);
	}

	return 0;
}

static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = {
	{
		"disable_mappable_bar0",
		offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0),
		spdk_json_decode_bool, true
	},
	{
		"disable_adaptive_irq",
		offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq),
		spdk_json_decode_bool, true
	},
	{
		"disable_shadow_doorbells",
		offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells),
		spdk_json_decode_bool, true
	},
};

static struct spdk_nvmf_transport *
nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts)
{
	struct nvmf_vfio_user_transport
		*vu_transport;
	int err;

	if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) {
		SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n",
			    opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR);
		return NULL;
	}

	vu_transport = calloc(1, sizeof(*vu_transport));
	if (vu_transport == NULL) {
		SPDK_ERRLOG("Transport alloc fail: %m\n");
		return NULL;
	}

	err = pthread_mutex_init(&vu_transport->lock, NULL);
	if (err != 0) {
		SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
		goto err;
	}
	TAILQ_INIT(&vu_transport->endpoints);

	err = pthread_mutex_init(&vu_transport->pg_lock, NULL);
	if (err != 0) {
		pthread_mutex_destroy(&vu_transport->lock);
		SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
		goto err;
	}
	TAILQ_INIT(&vu_transport->poll_groups);

	if (opts->transport_specific != NULL &&
	    spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder,
					    SPDK_COUNTOF(vfio_user_transport_opts_decoder),
					    vu_transport)) {
		SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n");
		goto cleanup;
	}

	/*
	 * To support interrupt mode, the transport must be configured with
	 * mappable BAR0 disabled: we need a vfio-user message to wake us up
	 * when a client writes new doorbell values to BAR0, via the
	 * libvfio-user socket fd.
	 */
	vu_transport->intr_mode_supported =
		vu_transport->transport_opts.disable_mappable_bar0;

	/*
	 * If BAR0 is mappable, it doesn't make sense to support shadow
	 * doorbells, so explicitly turn it off.
	 */
	if (!vu_transport->transport_opts.disable_mappable_bar0) {
		vu_transport->transport_opts.disable_shadow_doorbells = true;
	}

	SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n",
		      vu_transport->transport_opts.disable_mappable_bar0);
	SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n",
		      vu_transport->transport_opts.disable_adaptive_irq);
	SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n",
		      vu_transport->transport_opts.disable_shadow_doorbells);

	return &vu_transport->transport;

cleanup:
	pthread_mutex_destroy(&vu_transport->lock);
	pthread_mutex_destroy(&vu_transport->pg_lock);
err:
	free(vu_transport);
	return NULL;
}

static uint32_t
max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr)
{
	assert(vu_ctrlr != NULL);
	assert(vu_ctrlr->ctrlr != NULL);

	return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1;
}

static uint32_t
doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr)
{
	assert(vu_ctrlr != NULL);
	assert(vu_ctrlr->ctrlr != NULL);

	return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd;
}

static uintptr_t
memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr)
{
	uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12;
	return 1ul << memory_page_shift;
}

static uintptr_t
memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr)
{
	return ~(memory_page_size(ctrlr) - 1);
}

static int
map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping,
      uint32_t q_size, bool is_cq, bool unmap)
{
	uint64_t len;
	void
		*ret;

	assert(q_size);
	assert(q_addr(mapping) == NULL);

	if (is_cq) {
		len = q_size * sizeof(struct spdk_nvme_cpl);
	} else {
		len = q_size * sizeof(struct spdk_nvme_cmd);
	}

	ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len,
		      mapping->sg, &mapping->iov,
		      is_cq ? PROT_READ | PROT_WRITE : PROT_READ);
	if (ret == NULL) {
		return -EFAULT;
	}

	if (unmap) {
		memset(q_addr(mapping), 0, len);
	}

	return 0;
}

static inline void
unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping)
{
	if (q_addr(mapping) != NULL) {
		vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, mapping->sg,
			     &mapping->iov, 1);
		mapping->iov.iov_base = NULL;
	}
}

static int
asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	struct nvmf_vfio_user_sq *sq;
	const struct spdk_nvmf_registers *regs;
	int ret;

	assert(ctrlr != NULL);

	sq = ctrlr->sqs[0];

	assert(sq != NULL);
	assert(q_addr(&sq->mapping) == NULL);
	/* XXX ctrlr->asq == 0 is a valid memory address */

	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
	sq->qid = 0;
	sq->size = regs->aqa.bits.asqs + 1;
	sq->mapping.prp1 = regs->asq;
	*sq_headp(sq) = 0;
	sq->cqid = 0;

	ret = map_q(ctrlr, &sq->mapping, sq->size, false, true);
	if (ret) {
		return ret;
	}

	/* The Admin queue (qid: 0) does not ever use shadow doorbells. */
	sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false);

	*sq_dbl_tailp(sq) = 0;

	return 0;
}

/*
 * Updates eventidx to set an SQ into interrupt or polling mode.
 *
 * Returns false if the current SQ tail does not match the SQ head, as
 * this means that the host has submitted more items to the queue while we were
 * not looking - or during the event index update. In that case, we must retry,
 * or otherwise make sure we are going to wake up again.
 */
static bool
set_sq_eventidx(struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;
	volatile uint32_t *sq_tail_eidx;
	uint32_t old_tail, new_tail;

	assert(sq != NULL);
	assert(sq->ctrlr != NULL);
	assert(sq->ctrlr->sdbl != NULL);
	assert(sq->need_rearm);

	ctrlr = sq->ctrlr;

	SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n",
		      ctrlr_id(ctrlr), sq->qid);

	sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false);

	assert(ctrlr->endpoint != NULL);

	if (!ctrlr->endpoint->interrupt_mode) {
		/* No synchronisation necessary. */
		*sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL;
		return true;
	}

	old_tail = *sq_dbl_tailp(sq);
	*sq_tail_eidx = old_tail;

	/*
	 * Ensure that the event index is updated before re-reading the tail
	 * doorbell. If it's not, then the host might race us and update the
	 * tail after the second read but before the event index is written, so
	 * it won't write to BAR0 and we'll miss the update.
	 *
	 * The driver should provide similar ordering with an mb().
	 */
	spdk_mb();

	/*
	 * Check if the host has updated the tail doorbell after we've read it
	 * for the first time, but before the event index was written.
	 * If that's the case, then we've lost the race and we need to update
	 * the event index again (after polling the queue, since the host won't
	 * write to BAR0).
	 */
	new_tail = *sq_dbl_tailp(sq);

	/*
	 * We might poll the queue straight after this function returns if the
	 * tail has been updated, so we need to ensure that any changes to the
	 * queue will be visible to us if the doorbell has been updated.
	 *
	 * The driver should provide similar ordering with a wmb() to ensure
	 * that the queue is written before it updates the tail doorbell.
	 */
	spdk_rmb();

	SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, "
		      "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail,
		      new_tail, *sq_headp(sq));

	if (new_tail == *sq_headp(sq)) {
		sq->need_rearm = false;
		return true;
	}

	/*
	 * We've lost the race: the tail was updated since we last polled,
	 * including if it happened within this routine.
	 *
	 * The caller should retry after polling (think of this as a cmpxchg
	 * loop); if we go to sleep while the SQ is not empty, then we won't
	 * process the remaining events.
	 */
	return false;
}

static int
nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq);

/*
 * Arrange for an SQ to interrupt us if written. Returns non-zero if we
 * processed some SQ entries.
 */
static int
set_sq_intr_mode(struct nvmf_vfio_user_ctrlr *ctrlr,
		 struct nvmf_vfio_user_sq *sq)
{
	int count = 0;
	size_t i;

	if (!sq->need_rearm) {
		return 0;
	}

	for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) {
		int ret;

		if (set_sq_eventidx(sq)) {
			/* We won the race and set eventidx; done. */
			return count;
		}

		ret = nvmf_vfio_user_sq_poll(sq);

		count += (ret < 0) ? 1 : ret;

		/*
		 * set_sq_eventidx() hit the race, so we expected
		 * to process at least one command from this queue.
		 * If there were no new commands waiting for us, then
		 * we must have hit an unexpected race condition.
		 */
		if (ret == 0) {
			SPDK_ERRLOG("%s: unexpected race condition detected "
				    "while updating the shadow doorbell buffer\n",
				    ctrlr_id(ctrlr));

			fail_ctrlr(ctrlr);
			return count;
		}
	}

	SPDK_DEBUGLOG(vfio_user_db,
		      "%s: set_sq_eventidx() lost the race %zu times\n",
		      ctrlr_id(ctrlr), i);

	/*
	 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as
	 * we raced with the producer too many times; force ourselves to wake up
	 * instead. We'll process all queues at that point.
	 */
	self_kick(ctrlr);

	return count;
}

/*
 * We're in interrupt mode, and potentially about to go to sleep. We need to
 * make sure any further I/O submissions are guaranteed to wake us up: for
 * shadow doorbells that means we may need to go through set_sq_eventidx() for
 * every SQ that needs re-arming.
 *
 * Returns non-zero if we processed something.
 */
static int
set_ctrlr_intr_mode(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	int count = 0;

	assert(ctrlr != NULL);

	if (ctrlr->sdbl == NULL) {
		return 0;
	}

	/*
	 * The admin queue (qid: 0) doesn't use the shadow doorbell buffer, so
	 * skip it.
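	 * Start at qid 1 and re-arm every I/O SQ that currently exists.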
	 */
	for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) {
		if (!io_q_exists(ctrlr, i, false)) {
			continue;
		}

		count += set_sq_intr_mode(ctrlr, ctrlr->sqs[i]);
	}

	return count;
}

static int
acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	struct nvmf_vfio_user_cq *cq;
	const struct spdk_nvmf_registers *regs;
	int ret;

	assert(ctrlr != NULL);

	cq = ctrlr->cqs[0];

	assert(cq != NULL);

	assert(q_addr(&cq->mapping) == NULL);

	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
	assert(regs != NULL);
	cq->qid = 0;
	cq->size = regs->aqa.bits.acqs + 1;
	cq->mapping.prp1 = regs->acq;
	*cq_tailp(cq) = 0;
	cq->ien = true;
	cq->phase = true;

	ret = map_q(ctrlr, &cq->mapping, cq->size, true, true);
	if (ret) {
		return ret;
	}

	/* The Admin queue (qid: 0) does not ever use shadow doorbells. */
	cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true);

	*cq_dbl_headp(cq) = 0;

	return 0;
}

static inline dma_sg_t *
vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt)
{
	return (dma_sg_t *)(vu_req->sg + iovcnt * dma_sg_size());
}

static void *
_map_one(void *prv, uint64_t addr, uint64_t len, int prot)
{
	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
	struct spdk_nvmf_qpair *qpair;
	struct nvmf_vfio_user_req *vu_req;
	struct nvmf_vfio_user_sq *sq;
	void *ret;

	assert(req != NULL);
	qpair = req->qpair;
	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);

	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
	ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len,
		      vu_req_to_sg_t(vu_req, vu_req->iovcnt),
		      &vu_req->iov[vu_req->iovcnt], prot);
	if (spdk_likely(ret != NULL)) {
		vu_req->iovcnt++;
	}
	return ret;
}

static int
vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
		  struct iovec *iov, uint32_t length)
{
	/* Map the PRP list from guest physical memory to our
	 * virtual memory address space.
	 */
	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
			    length, 4096, _map_one);
}

static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
	       struct nvmf_vfio_user_sq *sq);

static inline int
adaptive_irq_enabled(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq)
{
	return (!spdk_interrupt_mode_is_enabled() && cq->qid != 0 &&
		!ctrlr->transport->transport_opts.disable_adaptive_irq);

}

/*
 * Posts a CQE in the completion queue.
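 * If the controller is shutting down (CSTS.SHST is not NORMAL), the completion
 * is silently dropped.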
 *
 * @ctrlr: the vfio-user controller
 * @cq: the completion queue
 * @cdw0: cdw0 as reported by NVMf
 * @sqid: submission queue ID
 * @cid: command identifier in NVMe command
 * @sc: the NVMe CQE status code
 * @sct: the NVMe CQE status code type
 */
static int
post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq,
		uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct)
{
	struct spdk_nvme_status cpl_status = { 0 };
	const struct spdk_nvmf_registers *regs;
	struct spdk_nvme_cpl *cpl;
	int err;

	assert(ctrlr != NULL);

	if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) {
		return 0;
	}

	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
	if (regs->csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
		SPDK_DEBUGLOG(nvmf_vfio,
			      "%s: ignore completion sqid:%d cid=%d status=%#x\n",
			      ctrlr_id(ctrlr), sqid, cid, sc);
		return 0;
	}

	if (cq_is_full(cq)) {
		SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n",
			    ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq),
			    *cq_dbl_headp(cq));
		return -1;
	}

	cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq);

	assert(ctrlr->sqs[sqid] != NULL);
	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: request complete sqid:%d cid=%d status=%#x "
		      "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc,
		      *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq));

	cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]);
	cpl->sqid = sqid;
	cpl->cid = cid;
	cpl->cdw0 = cdw0;

	/*
	 * This is a bitfield: instead of setting the individual bits we need
	 * directly in cpl->status, which would cause a read-modify-write cycle,
	 * we'll avoid reading from the CPL altogether by filling in a local
	 * cpl_status variable, then writing the whole thing.
	 */
	cpl_status.sct = sct;
	cpl_status.sc = sc;
	cpl_status.p = cq->phase;
	cpl->status = cpl_status;

	/* Ensure the Completion Queue Entry is visible. */
	spdk_wmb();
	cq_tail_advance(cq);

	/*
	 * This function now executes at SPDK thread context, but we
	 * might be triggering interrupts from vfio-user thread context, so
	 * check for race conditions.
	 */
	if (!adaptive_irq_enabled(ctrlr, cq) &&
	    cq->ien && ctrlr_interrupt_enabled(ctrlr)) {
		err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
		if (err != 0) {
			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
				    ctrlr_id(ctrlr));
			return err;
		}
	}

	return 0;
}

static void
free_sq_reqs(struct nvmf_vfio_user_sq *sq)
{
	while (!TAILQ_EMPTY(&sq->free_reqs)) {
		struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs);
		TAILQ_REMOVE(&sq->free_reqs, vu_req, link);
		free(vu_req);
	}
}

/* Deletes an SQ. If this SQ is the last user of the associated CQ
 * and the controller is being shut down or reset, then the CQ is
 * also deleted.
 */
static void
delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_cq *cq;
	uint16_t cqid;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr),
		      sq->qid, sq);

	/* Free SQ resources */
	unmap_q(vu_ctrlr, &sq->mapping);

	free_sq_reqs(sq);

	sq->size = 0;

	sq->sq_state = VFIO_USER_SQ_DELETED;

	/* Controller RESET and SHUTDOWN are special cases:
	 * the VM may not send DELETE IO SQ/CQ commands, and the NVMf library
	 * will disconnect the I/O queue pairs.
	 */
	if (vu_ctrlr->reset_shn) {
		cqid = sq->cqid;
		cq = vu_ctrlr->cqs[cqid];

		SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr),
			      cq->qid, cq);

		if (cq->cq_ref) {
			cq->cq_ref--;
		}
		if (cq->cq_ref == 0) {
			unmap_q(vu_ctrlr, &cq->mapping);
			cq->size = 0;
			cq->cq_state = VFIO_USER_CQ_DELETED;
			cq->group = NULL;
		}
	}
}

static void
free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_cq *cq;

	if (ctrlr == NULL) {
		return;
	}

	sq = ctrlr->sqs[qid];
	if (sq) {
		SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid);
		unmap_q(ctrlr, &sq->mapping);

		free_sq_reqs(sq);

		free(sq->mapping.sg);
		free(sq);
		ctrlr->sqs[qid] = NULL;
	}

	cq = ctrlr->cqs[qid];
	if (cq) {
		SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid);
		unmap_q(ctrlr, &cq->mapping);
		free(cq->mapping.sg);
		free(cq);
		ctrlr->cqs[qid] = NULL;
	}
}

static int
init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport,
	const uint16_t id)
{
	struct nvmf_vfio_user_sq *sq;

	assert(ctrlr != NULL);
	assert(transport != NULL);
	assert(ctrlr->sqs[id] == NULL);

	sq = calloc(1, sizeof(*sq));
	if (sq == NULL) {
		return -ENOMEM;
	}
	sq->mapping.sg = calloc(1, dma_sg_size());
	if (sq->mapping.sg == NULL) {
		free(sq);
		return -ENOMEM;
	}

	sq->qid = id;
	sq->qpair.qid = id;
	sq->qpair.transport = transport;
	sq->ctrlr = ctrlr;
	ctrlr->sqs[id] = sq;

	TAILQ_INIT(&sq->free_reqs);

	return 0;
}

static int
init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id)
{
	struct nvmf_vfio_user_cq *cq;

	assert(vu_ctrlr != NULL);
	assert(vu_ctrlr->cqs[id] == NULL);

	cq = calloc(1, sizeof(*cq));
	if (cq == NULL) {
		return -ENOMEM;
	}
	cq->mapping.sg = calloc(1, dma_sg_size());
	if (cq->mapping.sg == NULL) {
		free(cq);
		return -ENOMEM;
	}

	cq->qid = id;
	vu_ctrlr->cqs[id] = cq;

	return 0;
}

static int
alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_req *vu_req, *tmp;
	size_t req_size;
	uint32_t i;

	req_size = sizeof(struct nvmf_vfio_user_req) +
		   (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS);

	for (i = 0; i < sq->size; i++) {
		struct spdk_nvmf_request *req;

		vu_req = calloc(1, req_size);
		if (vu_req == NULL) {
			goto err;
		}

		req = &vu_req->req;
		req->qpair = &sq->qpair;
		req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
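		/* Per-request storage for the submission queue entry. */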
		req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;
		req->stripped_data = NULL;

		TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link);
	}

	return 0;

err:
	TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) {
		free(vu_req);
	}
	return -ENOMEM;
}

static volatile uint32_t *
ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	return ctrlr->sdbl != NULL ?
	       ctrlr->sdbl->shadow_doorbells :
	       ctrlr->bar0_doorbells;
}

static uint16_t
handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr,
		    struct spdk_nvme_cmd *cmd, uint16_t *sct)
{
	struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport;
	struct nvmf_vfio_user_sq *sq;
	uint32_t qsize;
	uint16_t cqid;
	uint16_t qid;
	int err;

	qid = cmd->cdw10_bits.create_io_q.qid;
	cqid = cmd->cdw11_bits.create_io_sq.cqid;
	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;

	if (ctrlr->sqs[qid] == NULL) {
		err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid);
		if (err != 0) {
			*sct = SPDK_NVME_SCT_GENERIC;
			return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		}
	}

	if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) {
		SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid);
		*sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
	}

	/* CQ must be created before SQ. */
	if (!io_q_exists(ctrlr, cqid, true)) {
		SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid);
		*sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
	}

	if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
		SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
		*sct = SPDK_NVME_SCT_GENERIC;
		return SPDK_NVME_SC_INVALID_FIELD;
	}

	sq = ctrlr->sqs[qid];
	sq->size = qsize;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr),
		      qid, cqid);

	sq->mapping.prp1 = cmd->dptr.prp.prp1;

	err = map_q(ctrlr, &sq->mapping, sq->size, false, true);
	if (err) {
		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
		*sct = SPDK_NVME_SCT_GENERIC;
		return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}

	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n",
		      ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1,
		      q_addr(&sq->mapping));

	err = alloc_sq_reqs(ctrlr, sq);
	if (err < 0) {
		SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr));
		*sct = SPDK_NVME_SCT_GENERIC;
		return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}

	sq->cqid = cqid;
	ctrlr->cqs[sq->cqid]->cq_ref++;
	sq->sq_state = VFIO_USER_SQ_CREATED;
	*sq_headp(sq) = 0;

	sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false);

	/*
	 * We should always reset the doorbells.
	 *
	 * The Specification prohibits the controller from writing to the shadow
	 * doorbell buffer, however older versions of the Linux NVMe driver
	 * don't reset the shadow doorbell buffer after a Queue-Level or
	 * Controller-Level reset, which means that we're left with garbage
	 * doorbell values.
	 */
	*sq_dbl_tailp(sq) = 0;

	if (ctrlr->sdbl != NULL) {
		sq->need_rearm = true;

		if (!set_sq_eventidx(sq)) {
			SPDK_ERRLOG("%s: host updated SQ tail doorbell before "
				    "sqid:%hu was initialized\n",
				    ctrlr_id(ctrlr), qid);
			fail_ctrlr(ctrlr);
			*sct = SPDK_NVME_SCT_GENERIC;
			return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		}
	}

	/*
	 * Create our new I/O qpair. This asynchronously invokes, on a suitable
	 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will
	 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics
	 * connect command. This command is then eventually completed via
	 * handle_queue_connect_rsp().
	 */
	sq->create_io_sq_cmd = *cmd;
	sq->post_create_io_sq_completion = true;

	spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt,
				&sq->qpair);

	*sct = SPDK_NVME_SCT_GENERIC;
	return SPDK_NVME_SC_SUCCESS;
}

static uint16_t
handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr,
		    struct spdk_nvme_cmd *cmd, uint16_t *sct)
{
	struct nvmf_vfio_user_cq *cq;
	uint32_t qsize;
	uint16_t qid;
	int err;

	qid = cmd->cdw10_bits.create_io_q.qid;
	qsize = cmd->cdw10_bits.create_io_q.qsize + 1;

	if (ctrlr->cqs[qid] == NULL) {
		err = init_cq(ctrlr, qid);
		if (err != 0) {
			*sct = SPDK_NVME_SCT_GENERIC;
			return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
		}
	}

	if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
		SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr));
		*sct = SPDK_NVME_SCT_GENERIC;
		return SPDK_NVME_SC_INVALID_FIELD;
	}

	if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) {
		SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr));
		*sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
		return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR;
	}

	cq = ctrlr->cqs[qid];
	cq->size = qsize;

	cq->mapping.prp1 = cmd->dptr.prp.prp1;

	cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true);

	err = map_q(ctrlr, &cq->mapping, cq->size, true, true);
	if (err) {
		SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
		*sct = SPDK_NVME_SCT_GENERIC;
		return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}

	SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n",
		      ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1,
		      q_addr(&cq->mapping));

	cq->ien = cmd->cdw11_bits.create_io_cq.ien;
	cq->iv = cmd->cdw11_bits.create_io_cq.iv;
	cq->phase = true;
	cq->cq_state = VFIO_USER_CQ_CREATED;

	*cq_tailp(cq) = 0;

	/*
	 * We should always reset the doorbells.
	 *
	 * The Specification prohibits the controller from writing to the shadow
	 * doorbell buffer, however older versions of the Linux NVMe driver
	 * don't reset the shadow doorbell buffer after a Queue-Level or
	 * Controller-Level reset, which means that we're left with garbage
	 * doorbell values.
	 */
	*cq_dbl_headp(cq) = 0;

	*sct = SPDK_NVME_SCT_GENERIC;
	return SPDK_NVME_SC_SUCCESS;
}

/*
 * Creates a completion or submission I/O queue. Returns 0 on success, -errno
 * on error.
2039 */ 2040 static int 2041 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2042 struct spdk_nvme_cmd *cmd, const bool is_cq) 2043 { 2044 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2045 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2046 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2047 uint32_t qsize; 2048 uint16_t qid; 2049 2050 assert(ctrlr != NULL); 2051 assert(cmd != NULL); 2052 2053 qid = cmd->cdw10_bits.create_io_q.qid; 2054 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2055 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2056 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2057 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2058 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2059 goto out; 2060 } 2061 2062 if (io_q_exists(ctrlr, qid, is_cq)) { 2063 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2064 is_cq ? 'c' : 's', qid); 2065 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2066 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2067 goto out; 2068 } 2069 2070 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2071 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2072 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2073 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2074 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2075 goto out; 2076 } 2077 2078 if (is_cq) { 2079 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2080 } else { 2081 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2082 2083 if (sct == SPDK_NVME_SCT_GENERIC && 2084 sc == SPDK_NVME_SC_SUCCESS) { 2085 /* Completion posted asynchronously. */ 2086 return 0; 2087 } 2088 } 2089 2090 out: 2091 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2092 } 2093 2094 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2095 * queue pair, so save the command in a context. 2096 */ 2097 struct vfio_user_delete_sq_ctx { 2098 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2099 struct spdk_nvme_cmd delete_io_sq_cmd; 2100 }; 2101 2102 static void 2103 vfio_user_qpair_delete_cb(void *cb_arg) 2104 { 2105 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2106 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2107 2108 post_completion(vu_ctrlr, vu_ctrlr->cqs[0], 0, 0, ctx->delete_io_sq_cmd.cid, 2109 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2110 free(ctx); 2111 } 2112 2113 /* 2114 * Deletes a completion or submission I/O queue. 2115 */ 2116 static int 2117 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2118 struct spdk_nvme_cmd *cmd, const bool is_cq) 2119 { 2120 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2121 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2122 struct nvmf_vfio_user_sq *sq; 2123 struct nvmf_vfio_user_cq *cq; 2124 struct vfio_user_delete_sq_ctx *ctx; 2125 2126 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2127 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2128 cmd->cdw10_bits.delete_io_q.qid); 2129 2130 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2131 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2132 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2133 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2134 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2135 goto out; 2136 } 2137 2138 if (is_cq) { 2139 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2140 if (cq->cq_ref) { 2141 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2142 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2143 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2144 goto out; 2145 } 2146 2147 unmap_q(ctrlr, &cq->mapping); 2148 cq->size = 0; 2149 cq->cq_state = VFIO_USER_CQ_DELETED; 2150 cq->group = NULL; 2151 } else { 2152 ctx = calloc(1, sizeof(*ctx)); 2153 if (!ctx) { 2154 sct = SPDK_NVME_SCT_GENERIC; 2155 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2156 goto out; 2157 } 2158 ctx->vu_ctrlr = ctrlr; 2159 ctx->delete_io_sq_cmd = *cmd; 2160 2161 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2162 sq->sq_state = VFIO_USER_SQ_DELETED; 2163 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2164 ctrlr->cqs[sq->cqid]->cq_ref--; 2165 2166 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 2167 return 0; 2168 } 2169 2170 out: 2171 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2172 } 2173 2174 /* 2175 * Configures Shadow Doorbells. 2176 */ 2177 static int 2178 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2179 { 2180 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2181 uint32_t dstrd; 2182 uintptr_t page_size, page_mask; 2183 uint64_t prp1, prp2; 2184 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2185 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2186 2187 assert(ctrlr != NULL); 2188 assert(ctrlr->endpoint != NULL); 2189 assert(cmd != NULL); 2190 2191 dstrd = doorbell_stride(ctrlr); 2192 page_size = memory_page_size(ctrlr); 2193 page_mask = memory_page_mask(ctrlr); 2194 2195 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2196 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2197 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2198 ctrlr_id(ctrlr)); 2199 2200 goto out; 2201 } 2202 2203 /* Verify guest physical addresses passed as PRPs. */ 2204 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2205 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2206 ctrlr_id(ctrlr)); 2207 2208 goto out; 2209 } 2210 2211 prp1 = cmd->dptr.prp.prp1; 2212 prp2 = cmd->dptr.prp.prp2; 2213 2214 SPDK_DEBUGLOG(nvmf_vfio, 2215 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2216 ctrlr_id(ctrlr), prp1, prp2); 2217 2218 if (prp1 == prp2 2219 || prp1 != (prp1 & page_mask) 2220 || prp2 != (prp2 & page_mask)) { 2221 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2222 ctrlr_id(ctrlr)); 2223 2224 goto out; 2225 } 2226 2227 /* Map guest physical addresses to our virtual address space. 
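 * PRP1 is the shadow doorbell buffer and PRP2 the EventIdx buffer; both have
 * already been validated above as distinct, page-aligned GPAs.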
*/ 2228 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2229 if (sdbl == NULL) { 2230 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2231 ctrlr_id(ctrlr)); 2232 2233 goto out; 2234 } 2235 2236 ctrlr->shadow_doorbell_buffer = prp1; 2237 ctrlr->eventidx_buffer = prp2; 2238 2239 SPDK_DEBUGLOG(nvmf_vfio, 2240 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2241 ctrlr_id(ctrlr), 2242 sdbl->iovs[0].iov_base, 2243 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2244 sdbl->iovs[1].iov_base, 2245 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2246 2247 2248 /* 2249 * Set all possible CQ head doorbells to polling mode now, such that we 2250 * don't have to worry about it later if the host creates more queues. 2251 * 2252 * We only ever want interrupts for writes to the SQ tail doorbells 2253 * (which are initialised in set_ctrlr_intr_mode() below). 2254 */ 2255 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2256 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2257 if (ctrlr->sqs[i] != NULL) { 2258 ctrlr->sqs[i]->need_rearm = true; 2259 } 2260 } 2261 2262 /* Update controller. */ 2263 SWAP(ctrlr->sdbl, sdbl); 2264 2265 /* 2266 * Copy doorbells from either the previous shadow doorbell buffer or the 2267 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2268 * 2269 * This needs to account for older versions of the Linux NVMe driver, 2270 * which don't clear out the buffer after a controller reset. 2271 */ 2272 copy_doorbells(ctrlr, sdbl != NULL ? 2273 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2274 ctrlr->sdbl->shadow_doorbells); 2275 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2276 2277 /* Update event index buffer and poll queues if necessary. */ 2278 set_ctrlr_intr_mode(ctrlr); 2279 2280 sc = SPDK_NVME_SC_SUCCESS; 2281 2282 out: 2283 /* 2284 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2285 * more than once (pointless, but not prohibited by the spec), or 2286 * in case of an error. 2287 * 2288 * If this is the first time Doorbell Buffer Config was processed, 2289 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2290 * free_sdbl() becomes a noop. 2291 */ 2292 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2293 2294 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2295 } 2296 2297 /* Returns 0 on success and -errno on error. */ 2298 static int 2299 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2300 { 2301 assert(ctrlr != NULL); 2302 assert(cmd != NULL); 2303 2304 if (cmd->fuse != 0) { 2305 /* Fused admin commands are not supported. 
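 * Complete them right away with Invalid Field in Command instead of passing
 * them down to the NVMf layer.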
*/ 2306 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2307 SPDK_NVME_SC_INVALID_FIELD, 2308 SPDK_NVME_SCT_GENERIC); 2309 } 2310 2311 switch (cmd->opc) { 2312 case SPDK_NVME_OPC_CREATE_IO_CQ: 2313 case SPDK_NVME_OPC_CREATE_IO_SQ: 2314 return handle_create_io_q(ctrlr, cmd, 2315 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2316 case SPDK_NVME_OPC_DELETE_IO_SQ: 2317 case SPDK_NVME_OPC_DELETE_IO_CQ: 2318 return handle_del_io_q(ctrlr, cmd, 2319 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2320 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2321 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2322 return handle_doorbell_buffer_config(ctrlr, cmd); 2323 } 2324 /* FALLTHROUGH */ 2325 default: 2326 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2327 } 2328 } 2329 2330 static int 2331 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2332 { 2333 struct nvmf_vfio_user_sq *sq = cb_arg; 2334 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2335 uint16_t sqid, cqid; 2336 2337 assert(sq != NULL); 2338 assert(vu_req != NULL); 2339 assert(vu_ctrlr != NULL); 2340 2341 if (spdk_likely(vu_req->iovcnt)) { 2342 vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, 2343 vu_req_to_sg_t(vu_req, 0), 2344 vu_req->iov, vu_req->iovcnt); 2345 } 2346 sqid = sq->qid; 2347 cqid = sq->cqid; 2348 2349 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2350 vu_req->req.rsp->nvme_cpl.cdw0, 2351 sqid, 2352 vu_req->req.cmd->nvme_cmd.cid, 2353 vu_req->req.rsp->nvme_cpl.status.sc, 2354 vu_req->req.rsp->nvme_cpl.status.sct); 2355 } 2356 2357 static int 2358 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2359 struct spdk_nvme_cmd *cmd) 2360 { 2361 assert(sq != NULL); 2362 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 2363 return consume_admin_cmd(ctrlr, cmd); 2364 } 2365 2366 return handle_cmd_req(ctrlr, cmd, sq); 2367 } 2368 2369 /* Returns the number of commands processed, or a negative value on error. */ 2370 static int 2371 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2372 struct nvmf_vfio_user_sq *sq) 2373 { 2374 struct spdk_nvme_cmd *queue; 2375 int count = 0; 2376 2377 assert(ctrlr != NULL); 2378 assert(sq != NULL); 2379 2380 /* 2381 * Submission queue index has moved past the event index, so it needs to 2382 * be re-armed before we go to sleep. 2383 */ 2384 sq->need_rearm = true; 2385 2386 queue = q_addr(&sq->mapping); 2387 while (*sq_headp(sq) != new_tail) { 2388 int err; 2389 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 2390 2391 count++; 2392 2393 /* 2394 * SQHD must contain the new head pointer, so we must increase 2395 * it before we generate a completion. 2396 */ 2397 sq_head_advance(sq); 2398 2399 err = consume_cmd(ctrlr, sq, cmd); 2400 if (err != 0) { 2401 return err; 2402 } 2403 } 2404 2405 return count; 2406 } 2407 2408 static void 2409 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2410 { 2411 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2412 struct nvmf_vfio_user_ctrlr *ctrlr; 2413 struct nvmf_vfio_user_sq *sq; 2414 struct nvmf_vfio_user_cq *cq; 2415 void *map_start, *map_end; 2416 int ret; 2417 2418 /* 2419 * We're not interested in any DMA regions that aren't mappable (we don't 2420 * support clients that don't share their memory). 
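 * Such regions are reported with a NULL vaddr, so there is nothing here for
 * us to register or remap.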
2421 */ 2422 if (!info->vaddr) { 2423 return; 2424 } 2425 2426 map_start = info->mapping.iov_base; 2427 map_end = info->mapping.iov_base + info->mapping.iov_len; 2428 2429 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2430 (info->mapping.iov_len & MASK_2MB)) { 2431 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2432 info->vaddr, map_start, map_end); 2433 return; 2434 } 2435 2436 assert(endpoint != NULL); 2437 if (endpoint->ctrlr == NULL) { 2438 return; 2439 } 2440 ctrlr = endpoint->ctrlr; 2441 2442 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2443 map_start, map_end); 2444 2445 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2446 * check the protection bits before registering. 2447 */ 2448 if (info->prot == (PROT_WRITE | PROT_READ)) { 2449 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2450 if (ret) { 2451 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2452 map_start, map_end, ret); 2453 } 2454 } 2455 2456 pthread_mutex_lock(&endpoint->lock); 2457 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2458 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2459 continue; 2460 } 2461 2462 cq = ctrlr->cqs[sq->cqid]; 2463 2464 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2465 if (cq->size && q_addr(&cq->mapping) == NULL) { 2466 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2467 if (ret) { 2468 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2469 cq->qid, cq->mapping.prp1, 2470 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2471 continue; 2472 } 2473 } 2474 2475 if (sq->size) { 2476 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2477 if (ret) { 2478 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2479 sq->qid, sq->mapping.prp1, 2480 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2481 continue; 2482 } 2483 } 2484 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2485 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2486 } 2487 pthread_mutex_unlock(&endpoint->lock); 2488 } 2489 2490 static void 2491 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2492 { 2493 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2494 struct nvmf_vfio_user_sq *sq; 2495 struct nvmf_vfio_user_cq *cq; 2496 void *map_start, *map_end; 2497 int ret = 0; 2498 2499 if (!info->vaddr) { 2500 return; 2501 } 2502 2503 map_start = info->mapping.iov_base; 2504 map_end = info->mapping.iov_base + info->mapping.iov_len; 2505 2506 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2507 (info->mapping.iov_len & MASK_2MB)) { 2508 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2509 info->vaddr, map_start, map_end); 2510 return; 2511 } 2512 2513 assert(endpoint != NULL); 2514 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2515 map_start, map_end); 2516 2517 if (endpoint->ctrlr != NULL) { 2518 struct nvmf_vfio_user_ctrlr *ctrlr; 2519 ctrlr = endpoint->ctrlr; 2520 2521 pthread_mutex_lock(&endpoint->lock); 2522 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2523 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2524 unmap_q(ctrlr, &sq->mapping); 2525 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2526 } 2527 2528 cq = ctrlr->cqs[sq->cqid]; 2529 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2530 unmap_q(ctrlr, 
&cq->mapping); 2531 } 2532 } 2533 2534 if (ctrlr->sdbl != NULL) { 2535 size_t i; 2536 2537 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2538 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2539 2540 if (iov_base >= map_start && iov_base < map_end) { 2541 copy_doorbells(ctrlr, 2542 ctrlr->sdbl->shadow_doorbells, 2543 ctrlr->bar0_doorbells); 2544 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2545 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2546 ctrlr->sdbl = NULL; 2547 break; 2548 } 2549 } 2550 } 2551 2552 pthread_mutex_unlock(&endpoint->lock); 2553 } 2554 2555 if (info->prot == (PROT_WRITE | PROT_READ)) { 2556 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2557 if (ret) { 2558 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2559 map_start, map_end, ret); 2560 } 2561 } 2562 } 2563 2564 /* Used to initiate a controller-level reset or a controller shutdown. */ 2565 static void 2566 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2567 { 2568 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2569 ctrlr_id(vu_ctrlr)); 2570 2571 /* Unmap Admin queue. */ 2572 2573 assert(vu_ctrlr->sqs[0] != NULL); 2574 assert(vu_ctrlr->cqs[0] != NULL); 2575 2576 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2577 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2578 2579 vu_ctrlr->sqs[0]->size = 0; 2580 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2581 2582 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2583 2584 vu_ctrlr->cqs[0]->size = 0; 2585 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2586 2587 /* 2588 * For PCIe controller reset or shutdown, we will drop all AER 2589 * responses. 2590 */ 2591 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2592 2593 /* Free the shadow doorbell buffer. */ 2594 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2595 vu_ctrlr->sdbl = NULL; 2596 } 2597 2598 /* Used to re-enable the controller after a controller-level reset. 
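 * Re-establishes the admin completion and submission queue mappings before
 * marking the admin SQ active again.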
*/ 2599 static int 2600 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2601 { 2602 int err; 2603 2604 assert(vu_ctrlr != NULL); 2605 2606 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2607 ctrlr_id(vu_ctrlr)); 2608 2609 err = acq_setup(vu_ctrlr); 2610 if (err != 0) { 2611 return err; 2612 } 2613 2614 err = asq_setup(vu_ctrlr); 2615 if (err != 0) { 2616 return err; 2617 } 2618 2619 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2620 2621 return 0; 2622 } 2623 2624 static int 2625 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2626 { 2627 struct nvmf_vfio_user_sq *sq = cb_arg; 2628 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2629 int ret; 2630 2631 assert(sq != NULL); 2632 assert(req != NULL); 2633 2634 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2635 assert(sq->ctrlr != NULL); 2636 assert(req != NULL); 2637 2638 memcpy(req->req.data, 2639 &req->req.rsp->prop_get_rsp.value.u64, 2640 req->req.length); 2641 } else { 2642 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2643 assert(sq->ctrlr != NULL); 2644 vu_ctrlr = sq->ctrlr; 2645 2646 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 2647 union spdk_nvme_cc_register cc, diff; 2648 2649 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2650 diff.raw = cc.raw ^ req->cc.raw; 2651 2652 if (diff.bits.en) { 2653 if (cc.bits.en) { 2654 ret = enable_ctrlr(vu_ctrlr); 2655 if (ret) { 2656 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2657 return ret; 2658 } 2659 vu_ctrlr->reset_shn = false; 2660 } else { 2661 vu_ctrlr->reset_shn = true; 2662 } 2663 } 2664 2665 if (diff.bits.shn) { 2666 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2667 vu_ctrlr->reset_shn = true; 2668 } 2669 } 2670 2671 if (vu_ctrlr->reset_shn) { 2672 disable_ctrlr(vu_ctrlr); 2673 } 2674 } 2675 } 2676 2677 return 0; 2678 } 2679 2680 /* 2681 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2682 * doorbell is written via access_bar0_fn(). 2683 * 2684 * DSTRD is set to fixed value 0 for NVMf. 2685 * 2686 */ 2687 static int 2688 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2689 const size_t count, loff_t pos, const bool is_write) 2690 { 2691 assert(ctrlr != NULL); 2692 assert(buf != NULL); 2693 2694 if (!is_write) { 2695 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2696 ctrlr_id(ctrlr), pos); 2697 errno = EPERM; 2698 return -1; 2699 } 2700 2701 if (count != sizeof(uint32_t)) { 2702 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2703 ctrlr_id(ctrlr), count); 2704 errno = EINVAL; 2705 return -1; 2706 } 2707 2708 pos -= NVME_DOORBELLS_OFFSET; 2709 2710 /* pos must be dword aligned */ 2711 if ((pos & 0x3) != 0) { 2712 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2713 errno = EINVAL; 2714 return -1; 2715 } 2716 2717 /* convert byte offset to array index */ 2718 pos >>= 2; 2719 2720 if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) { 2721 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2722 errno = EINVAL; 2723 return -1; 2724 } 2725 2726 ctrlr->bar0_doorbells[pos] = *buf; 2727 spdk_wmb(); 2728 2729 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2730 ctrlr_id(ctrlr), (pos & 1) ? 
"cqid" : "sqid", 2731 pos / 2, *buf); 2732 2733 2734 return 0; 2735 } 2736 2737 static size_t 2738 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2739 char *buf, size_t count, loff_t pos, 2740 bool is_write) 2741 { 2742 struct nvmf_vfio_user_req *req; 2743 const struct spdk_nvmf_registers *regs; 2744 2745 /* Construct a Fabric Property Get/Set command and send it */ 2746 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2747 if (req == NULL) { 2748 errno = ENOBUFS; 2749 return -1; 2750 } 2751 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2752 req->cc.raw = regs->cc.raw; 2753 2754 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2755 req->cb_arg = vu_ctrlr->sqs[0]; 2756 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2757 req->req.cmd->prop_set_cmd.cid = 0; 2758 req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1; 2759 req->req.cmd->prop_set_cmd.ofst = pos; 2760 if (is_write) { 2761 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2762 if (req->req.cmd->prop_set_cmd.attrib.size) { 2763 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2764 } else { 2765 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2766 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2767 } 2768 } else { 2769 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2770 } 2771 req->req.length = count; 2772 req->req.data = buf; 2773 2774 spdk_nvmf_request_exec_fabrics(&req->req); 2775 2776 return count; 2777 } 2778 2779 static ssize_t 2780 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2781 bool is_write) 2782 { 2783 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2784 struct nvmf_vfio_user_ctrlr *ctrlr; 2785 int ret; 2786 2787 ctrlr = endpoint->ctrlr; 2788 if (endpoint->need_async_destroy || !ctrlr) { 2789 errno = EIO; 2790 return -1; 2791 } 2792 2793 if (pos >= NVME_DOORBELLS_OFFSET) { 2794 /* 2795 * The fact that the doorbells can be memory mapped doesn't mean 2796 * that the client (VFIO in QEMU) is obliged to memory map them, 2797 * it might still elect to access them via regular read/write; 2798 * we might also have had disable_mappable_bar0 set. 
2799 */ 2800 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2801 pos, is_write); 2802 if (ret == 0) { 2803 return count; 2804 } 2805 return ret; 2806 } 2807 2808 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 2809 } 2810 2811 static ssize_t 2812 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 2813 bool is_write) 2814 { 2815 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2816 2817 if (is_write) { 2818 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 2819 endpoint_id(endpoint), offset, offset + count); 2820 errno = EINVAL; 2821 return -1; 2822 } 2823 2824 if (offset + count > NVME_REG_CFG_SIZE) { 2825 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 2826 endpoint_id(endpoint), offset, count, 2827 NVME_REG_CFG_SIZE); 2828 errno = ERANGE; 2829 return -1; 2830 } 2831 2832 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 2833 2834 return count; 2835 } 2836 2837 static void 2838 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 2839 { 2840 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2841 2842 if (level >= LOG_DEBUG) { 2843 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2844 } else if (level >= LOG_INFO) { 2845 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2846 } else if (level >= LOG_NOTICE) { 2847 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 2848 } else if (level >= LOG_WARNING) { 2849 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 2850 } else { 2851 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 2852 } 2853 } 2854 2855 static int 2856 vfio_user_get_log_level(void) 2857 { 2858 int level; 2859 2860 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2861 return LOG_DEBUG; 2862 } 2863 2864 level = spdk_log_to_syslog_level(spdk_log_get_level()); 2865 if (level < 0) { 2866 return LOG_ERR; 2867 } 2868 2869 return level; 2870 } 2871 2872 static void 2873 init_pci_config_space(vfu_pci_config_space_t *p) 2874 { 2875 /* MLBAR */ 2876 p->hdr.bars[0].raw = 0x0; 2877 /* MUBAR */ 2878 p->hdr.bars[1].raw = 0x0; 2879 2880 /* vendor specific, let's set them to zero for now */ 2881 p->hdr.bars[3].raw = 0x0; 2882 p->hdr.bars[4].raw = 0x0; 2883 p->hdr.bars[5].raw = 0x0; 2884 2885 /* enable INTx */ 2886 p->hdr.intr.ipin = 0x1; 2887 } 2888 2889 static void 2890 vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem, 2891 void *cb_arg, int status); 2892 2893 static void 2894 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 2895 void *cb_arg, int status) 2896 { 2897 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 2898 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2899 int ret; 2900 2901 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed with status %d\n", endpoint_id(endpoint), status); 2902 2903 if (!vu_ctrlr) { 2904 return; 2905 } 2906 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 2907 2908 /* Basically, once we call `vfu_device_quiesced` the device is unquiesced from 2909 * libvfio-user's perspective, so from the moment `vfio_user_dev_quiesce_done` returns 2910 * libvfio-user might quiesce the device again. However, because resuming the NVMf subsystem is 2911 * an asynchronous operation, this quiesce might come _before_ the NVMf subsystem has 2912 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we need to check 2913 * whether a quiesce was requested.
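 * A queued request is recorded in vu_ctrlr->queued_quiesce and is handled
 * below by pausing the subsystem again.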
2914 */ 2915 if (vu_ctrlr->queued_quiesce) { 2916 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, pause again\n", ctrlr_id(vu_ctrlr)); 2917 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 2918 ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0, 2919 vfio_user_dev_quiesce_done, vu_ctrlr); 2920 if (ret < 0) { 2921 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 2922 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret); 2923 } 2924 } 2925 } 2926 2927 static void 2928 vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem, 2929 void *cb_arg, int status) 2930 { 2931 struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg; 2932 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 2933 int ret; 2934 2935 SPDK_DEBUGLOG(nvmf_vfio, "%s paused done with status %d\n", ctrlr_id(vu_ctrlr), status); 2936 2937 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 2938 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 2939 vfu_device_quiesced(endpoint->vfu_ctx, status); 2940 vu_ctrlr->queued_quiesce = false; 2941 2942 /* `vfu_device_quiesced` can change the migration state, 2943 * so we need to re-check `vu_ctrlr->state`. 2944 */ 2945 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 2946 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 2947 return; 2948 } 2949 2950 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 2951 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 2952 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 2953 vfio_user_endpoint_resume_done, endpoint); 2954 if (ret < 0) { 2955 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 2956 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 2957 } 2958 } 2959 2960 static int 2961 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 2962 { 2963 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2964 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2965 int ret; 2966 2967 if (!vu_ctrlr) { 2968 return 0; 2969 } 2970 2971 /* NVMf library will destruct controller when no 2972 * connected queue pairs. 2973 */ 2974 if (!nvmf_subsystem_get_ctrlr((struct spdk_nvmf_subsystem *)endpoint->subsystem, 2975 vu_ctrlr->cntlid)) { 2976 return 0; 2977 } 2978 2979 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 2980 2981 /* There is no race condition here as device quiesce callback 2982 * and nvmf_prop_set_cc() are running in the same thread context. 
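 * If the controller is disabled, not yet ready, or already shut down, there
 * is nothing to quiesce.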
2983 */ 2984 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 2985 return 0; 2986 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 2987 return 0; 2988 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 2989 return 0; 2990 } 2991 2992 switch (vu_ctrlr->state) { 2993 case VFIO_USER_CTRLR_PAUSED: 2994 case VFIO_USER_CTRLR_MIGRATING: 2995 return 0; 2996 case VFIO_USER_CTRLR_RUNNING: 2997 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 2998 ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0, 2999 vfio_user_dev_quiesce_done, vu_ctrlr); 3000 if (ret < 0) { 3001 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3002 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret); 3003 return 0; 3004 } 3005 break; 3006 case VFIO_USER_CTRLR_RESUMING: 3007 vu_ctrlr->queued_quiesce = true; 3008 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy, deferring quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3009 vu_ctrlr->state); 3010 break; 3011 default: 3012 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3013 break; 3014 } 3015 3016 errno = EBUSY; 3017 return -1; 3018 } 3019 3020 static void 3021 vfio_user_ctrlr_dump_migr_data(const char *name, 3022 struct vfio_user_nvme_migr_state *migr_data, 3023 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3024 { 3025 struct spdk_nvme_registers *regs; 3026 struct nvme_migr_sq_state *sq; 3027 struct nvme_migr_cq_state *cq; 3028 uint32_t *doorbell_base; 3029 uint32_t i; 3030 3031 SPDK_NOTICELOG("Dump %s\n", name); 3032 3033 regs = (struct spdk_nvme_registers *)migr_data->bar0; 3034 doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl; 3035 3036 SPDK_NOTICELOG("Registers\n"); 3037 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3038 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3039 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3040 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3041 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3042 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3043 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3044 3045 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3046 3047 if (sdbl != NULL) { 3048 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3049 migr_data->ctrlr_header.shadow_doorbell_buffer); 3050 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3051 migr_data->ctrlr_header.eventidx_buffer); 3052 } 3053 3054 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3055 sq = &migr_data->qps[i].sq; 3056 cq = &migr_data->qps[i].cq; 3057 3058 if (sq->size) { 3059 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3060 if (i > 0 && sdbl != NULL) { 3061 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3062 sq->sqid, 3063 sdbl->shadow_doorbells[queue_index(i, false)], 3064 sdbl->eventidxs[queue_index(i, false)]); 3065 } 3066 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3067 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3068 } 3069 3070 if (cq->size) { 3071 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3072 if (i > 0 && sdbl != NULL) { 3073 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3074 cq->cqid, 3075 sdbl->shadow_doorbells[queue_index(i, true)], 3076 sdbl->eventidxs[queue_index(i, true)]); 3077 } 3078 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3079 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3080 } 3081 } 3082 3083 SPDK_NOTICELOG("%s Dump Done\n", name);
3084 } 3085 3086 /* Read region 9 content and restore it to migration data structures */ 3087 static int 3088 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3089 struct vfio_user_nvme_migr_state *migr_state) 3090 { 3091 void *data_ptr = endpoint->migr_data; 3092 3093 /* Load vfio_user_nvme_migr_header first */ 3094 memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3095 /* TODO: version check */ 3096 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3097 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3098 return -EINVAL; 3099 } 3100 3101 /* Load nvmf controller data */ 3102 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3103 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3104 3105 /* Load queue pairs */ 3106 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3107 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3108 3109 /* Load BAR0 */ 3110 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3111 memcpy(&migr_state->bar0, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3112 3113 /* Load CFG */ 3114 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3115 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3116 3117 return 0; 3118 } 3119 3120 3121 static void 3122 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3123 { 3124 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3125 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3126 struct nvmf_vfio_user_sq *sq; 3127 struct nvmf_vfio_user_cq *cq; 3128 struct vfio_user_nvme_migr_state migr_state = {}; 3129 uint64_t data_offset; 3130 void *data_ptr; 3131 int num_aers; 3132 struct spdk_nvme_registers *regs; 3133 uint32_t *doorbell_base; 3134 uint32_t i = 0; 3135 uint16_t sqid, cqid; 3136 3137 /* Save all data to vfio_user_nvme_migr_state first, then we will 3138 * copy it to device migration region at last. 
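 * The header itself is copied into the region last, once all of the offsets
 * and lengths it describes have been filled in.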
3139 */ 3140 3141 /* save magic number */ 3142 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3143 3144 /* save controller data */ 3145 num_aers = nvmf_ctrlr_save_aers(ctrlr, migr_state.ctrlr_header.aer_cids, 3146 256); 3147 assert(num_aers >= 0); 3148 migr_state.ctrlr_header.nr_aers = num_aers; 3149 3150 /* save nvmf controller data */ 3151 nvmf_ctrlr_save_migr_data(ctrlr, (struct nvmf_ctrlr_migr_data *)&migr_state.nvmf_data); 3152 3153 /* save connected queue pairs */ 3154 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3155 /* save sq */ 3156 sqid = sq->qid; 3157 migr_state.qps[sqid].sq.sqid = sq->qid; 3158 migr_state.qps[sqid].sq.cqid = sq->cqid; 3159 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3160 migr_state.qps[sqid].sq.size = sq->size; 3161 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3162 3163 /* save cq, for shared cq case, cq may be saved multiple times */ 3164 cqid = sq->cqid; 3165 cq = vu_ctrlr->cqs[cqid]; 3166 migr_state.qps[cqid].cq.cqid = cqid; 3167 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3168 migr_state.qps[cqid].cq.ien = cq->ien; 3169 migr_state.qps[cqid].cq.iv = cq->iv; 3170 migr_state.qps[cqid].cq.size = cq->size; 3171 migr_state.qps[cqid].cq.phase = cq->phase; 3172 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3173 i++; 3174 } 3175 3176 assert(i > 0); 3177 migr_state.ctrlr_header.num_io_queues = i - 1; 3178 3179 regs = (struct spdk_nvme_registers *)&migr_state.bar0; 3180 /* Save mandatory registers to bar0 */ 3181 regs->csts.raw = ctrlr->vcprop.csts.raw; 3182 regs->cap.raw = ctrlr->vcprop.cap.raw; 3183 regs->vs.raw = ctrlr->vcprop.vs.raw; 3184 regs->cc.raw = ctrlr->vcprop.cc.raw; 3185 regs->aqa.raw = ctrlr->vcprop.aqa.raw; 3186 regs->asq = ctrlr->vcprop.asq; 3187 regs->acq = ctrlr->vcprop.acq; 3188 /* Save doorbells */ 3189 doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl; 3190 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3191 3192 /* Save PCI configuration space */ 3193 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3194 3195 /* Save all data to device migration region */ 3196 data_ptr = endpoint->migr_data; 3197 3198 /* Copy nvmf controller data */ 3199 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3200 data_ptr += data_offset; 3201 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3202 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct nvmf_ctrlr_migr_data); 3203 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct nvmf_ctrlr_migr_data)); 3204 3205 /* Copy queue pairs */ 3206 data_offset += sizeof(struct nvmf_ctrlr_migr_data); 3207 data_ptr += sizeof(struct nvmf_ctrlr_migr_data); 3208 migr_state.ctrlr_header.qp_offset = data_offset; 3209 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3210 struct nvme_migr_cq_state)); 3211 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3212 3213 /* Copy BAR0 */ 3214 data_offset += migr_state.ctrlr_header.qp_len; 3215 data_ptr += migr_state.ctrlr_header.qp_len; 3216 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3217 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVME_REG_BAR0_SIZE; 3218 memcpy(data_ptr, &migr_state.bar0, NVME_REG_BAR0_SIZE); 3219 3220 /* Copy CFG */ 3221 data_offset += NVME_REG_BAR0_SIZE; 3222 data_ptr += NVME_REG_BAR0_SIZE; 3223 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3224 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] =
NVME_REG_CFG_SIZE; 3225 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3226 3227 /* copy shadow doorbells */ 3228 if (vu_ctrlr->sdbl != NULL) { 3229 migr_state.ctrlr_header.sdbl = true; 3230 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3231 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3232 } 3233 3234 /* Copy nvme migration header finally */ 3235 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3236 3237 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3238 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3239 } 3240 } 3241 3242 /* 3243 * If we are about to close the connection, we need to unregister the interrupt, 3244 * as the library will subsequently close the file descriptor we registered. 3245 */ 3246 static int 3247 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3248 { 3249 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3250 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3251 3252 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3253 3254 if (type == VFU_RESET_LOST_CONN) { 3255 if (ctrlr != NULL) { 3256 spdk_interrupt_unregister(&ctrlr->intr); 3257 ctrlr->intr_fd = -1; 3258 } 3259 return 0; 3260 } 3261 3262 /* FIXME: LOST_CONN case ? */ 3263 if (ctrlr->sdbl != NULL) { 3264 free_sdbl(vfu_ctx, ctrlr->sdbl); 3265 ctrlr->sdbl = NULL; 3266 } 3267 3268 /* FIXME: much more needed here. */ 3269 3270 return 0; 3271 } 3272 3273 static int 3274 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3275 struct vfio_user_nvme_migr_state *migr_state) 3276 { 3277 uint32_t i, qsize = 0; 3278 uint16_t sqid, cqid; 3279 struct vfio_user_nvme_migr_qp migr_qp; 3280 void *addr; 3281 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3282 int ret; 3283 3284 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3285 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3286 } 3287 3288 /* restore submission queues */ 3289 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3290 migr_qp = migr_state->qps[i]; 3291 3292 qsize = migr_qp.sq.size; 3293 if (qsize) { 3294 struct nvmf_vfio_user_sq *sq; 3295 3296 sqid = migr_qp.sq.sqid; 3297 if (sqid != i) { 3298 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3299 return -EINVAL; 3300 } 3301 3302 /* allocate sq if necessary */ 3303 if (vu_ctrlr->sqs[sqid] == NULL) { 3304 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3305 if (ret) { 3306 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3307 return -EFAULT; 3308 } 3309 } 3310 3311 sq = vu_ctrlr->sqs[sqid]; 3312 sq->size = qsize; 3313 3314 ret = alloc_sq_reqs(vu_ctrlr, sq); 3315 if (ret) { 3316 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3317 return -EFAULT; 3318 } 3319 3320 /* restore sq */ 3321 sq->sq_state = VFIO_USER_SQ_CREATED; 3322 sq->cqid = migr_qp.sq.cqid; 3323 *sq_headp(sq) = migr_qp.sq.head; 3324 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3325 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3326 sq->mapping.prp1, sq->size * 64, 3327 sq->mapping.sg, &sq->mapping.iov, 3328 PROT_READ); 3329 if (addr == NULL) { 3330 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3331 sqid, sq->mapping.prp1, sq->size); 3332 return -EFAULT; 3333 } 3334 cqs_ref[sq->cqid]++; 3335 } 3336 } 3337 3338 /* restore completion queues */ 3339 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3340 migr_qp = migr_state->qps[i]; 
3341 3342 qsize = migr_qp.cq.size; 3343 if (qsize) { 3344 struct nvmf_vfio_user_cq *cq; 3345 3346 /* restore cq */ 3347 cqid = migr_qp.sq.cqid; 3348 assert(cqid == i); 3349 3350 /* allocate cq if necessary */ 3351 if (vu_ctrlr->cqs[cqid] == NULL) { 3352 ret = init_cq(vu_ctrlr, cqid); 3353 if (ret) { 3354 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3355 return -EFAULT; 3356 } 3357 } 3358 3359 cq = vu_ctrlr->cqs[cqid]; 3360 3361 cq->size = qsize; 3362 3363 cq->cq_state = VFIO_USER_CQ_CREATED; 3364 cq->cq_ref = cqs_ref[cqid]; 3365 *cq_tailp(cq) = migr_qp.cq.tail; 3366 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3367 cq->ien = migr_qp.cq.ien; 3368 cq->iv = migr_qp.cq.iv; 3369 cq->phase = migr_qp.cq.phase; 3370 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3371 cq->mapping.prp1, cq->size * 16, 3372 cq->mapping.sg, &cq->mapping.iov, 3373 PROT_READ | PROT_WRITE); 3374 if (addr == NULL) { 3375 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3376 cqid, cq->mapping.prp1, cq->size); 3377 return -EFAULT; 3378 } 3379 } 3380 } 3381 3382 return 0; 3383 } 3384 3385 static int 3386 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3387 { 3388 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3389 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3390 uint32_t *doorbell_base; 3391 struct vfio_user_nvme_migr_state migr_state = {}; 3392 struct spdk_nvme_registers *regs; 3393 struct spdk_nvme_cmd cmd; 3394 uint16_t i; 3395 int rc = 0; 3396 3397 assert(endpoint->migr_data != NULL); 3398 assert(ctrlr != NULL); 3399 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3400 if (rc) { 3401 return rc; 3402 } 3403 3404 /* restore shadow doorbells */ 3405 if (migr_state.ctrlr_header.sdbl) { 3406 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3407 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3408 migr_state.ctrlr_header.shadow_doorbell_buffer, 3409 migr_state.ctrlr_header.eventidx_buffer, 3410 memory_page_size(vu_ctrlr)); 3411 if (sdbl == NULL) { 3412 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3413 ctrlr_id(vu_ctrlr)); 3414 return -1; 3415 } 3416 3417 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3418 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3419 3420 SWAP(vu_ctrlr->sdbl, sdbl); 3421 } 3422 3423 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3424 if (rc) { 3425 return rc; 3426 } 3427 3428 /* restore PCI configuration space */ 3429 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3430 3431 regs = (struct spdk_nvme_registers *)&migr_state.bar0; 3432 doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl; 3433 /* restore doorbells from saved registers */ 3434 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3435 3436 /* restore controller registers after ADMIN queue connection */ 3437 ctrlr->vcprop.csts.raw = regs->csts.raw; 3438 ctrlr->vcprop.cap.raw = regs->cap.raw; 3439 ctrlr->vcprop.vs.raw = regs->vs.raw; 3440 ctrlr->vcprop.cc.raw = regs->cc.raw; 3441 ctrlr->vcprop.aqa.raw = regs->aqa.raw; 3442 ctrlr->vcprop.asq = regs->asq; 3443 ctrlr->vcprop.acq = regs->acq; 3444 3445 /* restore nvmf controller data */ 3446 rc = nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3447 if (rc) { 3448 return rc; 3449 } 3450 3451 /* resubmit pending AERs */ 3452 for (i = 0; i < migr_state.ctrlr_header.nr_aers; i++) { 3453 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n",
ctrlr_id(vu_ctrlr), 3454 migr_state.ctrlr_header.aer_cids[i]); 3455 memset(&cmd, 0, sizeof(cmd)); 3456 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3457 cmd.cid = migr_state.ctrlr_header.aer_cids[i]; 3458 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3459 if (rc) { 3460 break; 3461 } 3462 } 3463 3464 return rc; 3465 } 3466 3467 static void 3468 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3469 { 3470 uint32_t i; 3471 struct nvmf_vfio_user_sq *sq; 3472 3473 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 3474 3475 if (vu_ctrlr->sqs[0] != NULL) { 3476 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3477 queue_index(0, false); 3478 } 3479 3480 if (vu_ctrlr->cqs[0] != NULL) { 3481 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3482 queue_index(0, true); 3483 } 3484 3485 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3486 3487 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3488 sq = vu_ctrlr->sqs[i]; 3489 if (!sq || !sq->size) { 3490 continue; 3491 } 3492 3493 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3494 /* ADMIN queue pair is always in the poll group, just enable it */ 3495 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3496 } else { 3497 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3498 } 3499 } 3500 } 3501 3502 static int 3503 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3504 { 3505 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3506 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3507 struct nvmf_vfio_user_sq *sq; 3508 int ret = 0; 3509 3510 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3511 vu_ctrlr->state, state); 3512 3513 switch (state) { 3514 case VFU_MIGR_STATE_STOP_AND_COPY: 3515 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3516 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3517 break; 3518 case VFU_MIGR_STATE_STOP: 3519 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3520 break; 3521 case VFU_MIGR_STATE_PRE_COPY: 3522 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3523 vu_ctrlr->migr_reg.pending_bytes = vfio_user_migr_data_len(); 3524 vu_ctrlr->migr_reg.last_data_offset = 0; 3525 vu_ctrlr->in_source_vm = true; 3526 break; 3527 case VFU_MIGR_STATE_RESUME: 3528 /* 3529 * Destination ADMIN queue pair is connected when starting the VM, 3530 * but the ADMIN queue pair isn't enabled in destination VM, the poll 3531 * group will do nothing to ADMIN queue pair for now. 3532 */ 3533 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3534 break; 3535 } 3536 3537 assert(!vu_ctrlr->in_source_vm); 3538 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3539 3540 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3541 assert(sq != NULL); 3542 assert(sq->qpair.qid == 0); 3543 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3544 3545 /* Free ADMIN SQ resources first, SQ resources will be 3546 * allocated based on queue size from source VM. 
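 * They are reallocated by vfio_user_migr_ctrlr_construct_qps() once the
 * saved state has been read back from the migration region.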
3547 */ 3548 free_sq_reqs(sq); 3549 sq->size = 0; 3550 break; 3551 case VFU_MIGR_STATE_RUNNING: 3552 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3553 break; 3554 } 3555 3556 if (!vu_ctrlr->in_source_vm) { 3557 /* Restore destination VM from BAR9 */ 3558 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3559 if (ret) { 3560 break; 3561 } 3562 3563 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3564 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3565 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3566 } else { 3567 /* Rollback source VM */ 3568 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3569 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3570 vfio_user_endpoint_resume_done, endpoint); 3571 if (ret < 0) { 3572 /* TODO: fail controller with CFS bit set */ 3573 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3574 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3575 break; 3576 } 3577 } 3578 break; 3579 3580 default: 3581 return -EINVAL; 3582 } 3583 3584 return ret; 3585 } 3586 3587 static uint64_t 3588 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3589 { 3590 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3591 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3592 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 3593 3594 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u, pending bytes 0x%"PRIx64"\n", endpoint_id(endpoint), 3595 ctrlr->state, migr_reg->pending_bytes); 3596 3597 return migr_reg->pending_bytes; 3598 } 3599 3600 static int 3601 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 3602 { 3603 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3604 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3605 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 3606 3607 if (migr_reg->last_data_offset == vfio_user_migr_data_len()) { 3608 *offset = vfio_user_migr_data_len(); 3609 if (size) { 3610 *size = 0; 3611 } 3612 migr_reg->pending_bytes = 0; 3613 } else { 3614 *offset = 0; 3615 if (size) { 3616 *size = vfio_user_migr_data_len(); 3617 if (ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3618 vfio_user_migr_ctrlr_save_data(ctrlr); 3619 migr_reg->last_data_offset = vfio_user_migr_data_len(); 3620 } 3621 } 3622 } 3623 3624 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 3625 3626 return 0; 3627 } 3628 3629 static ssize_t 3630 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset) 3631 { 3632 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3633 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3634 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 3635 3636 memcpy(buf, endpoint->migr_data, count); 3637 migr_reg->pending_bytes = 0; 3638 3639 return 0; 3640 } 3641 3642 static ssize_t 3643 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset) 3644 { 3645 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3646 3647 memcpy(endpoint->migr_data, buf, count); 3648 3649 return 0; 3650 } 3651 3652 static int 3653 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx, uint64_t count) 3654 { 3655 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 3656 3657 return 0; 3658 } 3659 3660 static int 3661 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 3662 struct nvmf_vfio_user_endpoint *endpoint) 3663 { 3664 int ret; 3665 ssize_t 
cap_offset; 3666 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 3667 struct iovec migr_sparse_mmap = {}; 3668 3669 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 3670 struct pxcap pxcap = { 3671 .hdr.id = PCI_CAP_ID_EXP, 3672 .pxcaps.ver = 0x2, 3673 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 3674 .pxdcap2.ctds = 0x1 3675 }; 3676 3677 struct msixcap msixcap = { 3678 .hdr.id = PCI_CAP_ID_MSIX, 3679 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 3680 .mtab = {.tbir = 0x4, .to = 0x0}, 3681 .mpba = {.pbir = 0x5, .pbao = 0x0} 3682 }; 3683 3684 struct iovec sparse_mmap[] = { 3685 { 3686 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 3687 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 3688 }, 3689 }; 3690 3691 const vfu_migration_callbacks_t migr_callbacks = { 3692 .version = VFU_MIGR_CALLBACKS_VERS, 3693 .transition = &vfio_user_migration_device_state_transition, 3694 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 3695 .prepare_data = &vfio_user_migration_prepare_data, 3696 .read_data = &vfio_user_migration_read_data, 3697 .data_written = &vfio_user_migration_data_written, 3698 .write_data = &vfio_user_migration_write_data 3699 }; 3700 3701 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 3702 if (ret < 0) { 3703 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 3704 return ret; 3705 } 3706 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 3707 /* 3708 * 0x02, controller uses the NVM Express programming interface 3709 * 0x08, non-volatile memory controller 3710 * 0x01, mass storage controller 3711 */ 3712 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 3713 3714 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 3715 if (cap_offset < 0) { 3716 SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx); 3717 return cap_offset; 3718 } 3719 3720 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 3721 if (cap_offset < 0) { 3722 SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx); 3723 return cap_offset; 3724 } 3725 3726 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 3727 if (cap_offset < 0) { 3728 SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx); 3729 return cap_offset; 3730 } 3731 3732 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 3733 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3734 if (ret < 0) { 3735 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 3736 return ret; 3737 } 3738 3739 if (vu_transport->transport_opts.disable_mappable_bar0) { 3740 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3741 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3742 NULL, 0, -1, 0); 3743 } else { 3744 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3745 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3746 sparse_mmap, 1, endpoint->devmem_fd, 0); 3747 } 3748 3749 if (ret < 0) { 3750 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 3751 return ret; 3752 } 3753 3754 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 3755 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3756 if (ret < 0) { 3757 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 3758 return ret; 3759 } 3760 3761 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 3762 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3763 if (ret < 0) { 3764 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 3765 return ret; 3766 } 3767 3768 ret = vfu_setup_device_dma(vfu_ctx,
memory_region_add_cb, memory_region_remove_cb); 3769 if (ret < 0) { 3770 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 3771 return ret; 3772 } 3773 3774 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 3775 if (ret < 0) { 3776 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 3777 return ret; 3778 } 3779 3780 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 3781 if (ret < 0) { 3782 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 3783 return ret; 3784 } 3785 3786 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 3787 if (ret < 0) { 3788 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 3789 return ret; 3790 } 3791 3792 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 3793 3794 migr_sparse_mmap.iov_base = (void *)4096; 3795 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 3796 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 3797 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 3798 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 3799 1, endpoint->migr_fd, 0); 3800 if (ret < 0) { 3801 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 3802 return ret; 3803 } 3804 3805 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 3806 vfu_get_migr_register_area_size()); 3807 if (ret < 0) { 3808 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 3809 return ret; 3810 } 3811 3812 ret = vfu_realize_ctx(vfu_ctx); 3813 if (ret < 0) { 3814 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 3815 return ret; 3816 } 3817 3818 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 3819 assert(endpoint->pci_config_space != NULL); 3820 init_pci_config_space(endpoint->pci_config_space); 3821 3822 assert(cap_offset != 0); 3823 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 3824 3825 return 0; 3826 } 3827 3828 static int nvmf_vfio_user_accept(void *ctx); 3829 3830 static void 3831 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 3832 { 3833 /* Nothing for us to do here. */ 3834 } 3835 3836 /* 3837 * Register an "accept" poller: this is polling for incoming vfio-user socket 3838 * connections (on the listening socket). 3839 * 3840 * We need to do this on first listening, and also after destroying a 3841 * controller, so we can accept another connection. 
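 * In interrupt mode we additionally register the endpoint's vfu poll fd, so
 * incoming connections wake us up rather than relying on the poller alone.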
3842 */ 3843 static int 3844 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 3845 { 3846 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 3847 3848 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 3849 3850 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 3851 endpoint, poll_rate_us); 3852 3853 if (!endpoint->accept_poller) { 3854 return -1; 3855 } 3856 3857 endpoint->accept_thread = spdk_get_thread(); 3858 3859 if (!spdk_interrupt_mode_is_enabled()) { 3860 return 0; 3861 } 3862 3863 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 3864 assert(endpoint->accept_intr_fd != -1); 3865 3866 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 3867 nvmf_vfio_user_accept, endpoint); 3868 3869 assert(endpoint->accept_intr != NULL); 3870 3871 spdk_poller_register_interrupt(endpoint->accept_poller, 3872 set_intr_mode_noop, NULL); 3873 return 0; 3874 } 3875 3876 static void 3877 _vfio_user_relisten(void *ctx) 3878 { 3879 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3880 3881 vfio_user_register_accept_poller(endpoint); 3882 } 3883 3884 static void 3885 _free_ctrlr(void *ctx) 3886 { 3887 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 3888 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 3889 3890 free_sdbl(ctrlr->endpoint->vfu_ctx, ctrlr->sdbl); 3891 3892 spdk_interrupt_unregister(&ctrlr->intr); 3893 ctrlr->intr_fd = -1; 3894 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 3895 3896 free(ctrlr); 3897 3898 if (endpoint == NULL) { 3899 return; 3900 } 3901 3902 if (endpoint->need_async_destroy) { 3903 nvmf_vfio_user_destroy_endpoint(endpoint); 3904 } else { 3905 spdk_thread_send_msg(endpoint->accept_thread, 3906 _vfio_user_relisten, endpoint); 3907 } 3908 } 3909 3910 static void 3911 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 3912 { 3913 int i; 3914 assert(ctrlr != NULL); 3915 3916 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 3917 3918 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3919 free_qp(ctrlr, i); 3920 } 3921 3922 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 3923 } 3924 3925 static int 3926 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 3927 struct nvmf_vfio_user_endpoint *endpoint) 3928 { 3929 struct nvmf_vfio_user_ctrlr *ctrlr; 3930 int err = 0; 3931 3932 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 3933 3934 /* First, construct a vfio-user CUSTOM transport controller */ 3935 ctrlr = calloc(1, sizeof(*ctrlr)); 3936 if (ctrlr == NULL) { 3937 err = -ENOMEM; 3938 goto out; 3939 } 3940 /* We can only support one connection for now */ 3941 ctrlr->cntlid = 0x1; 3942 ctrlr->intr_fd = -1; 3943 ctrlr->transport = transport; 3944 ctrlr->endpoint = endpoint; 3945 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 3946 TAILQ_INIT(&ctrlr->connected_sqs); 3947 3948 /* Then, construct an admin queue pair */ 3949 err = init_sq(ctrlr, &transport->transport, 0); 3950 if (err != 0) { 3951 free(ctrlr); 3952 goto out; 3953 } 3954 3955 err = init_cq(ctrlr, 0); 3956 if (err != 0) { 3957 free(ctrlr); 3958 goto out; 3959 } 3960 3961 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 3962 3963 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 3964 if (err != 0) { 3965 free(ctrlr); 3966 goto out; 3967 } 3968 endpoint->ctrlr = ctrlr; 3969 3970 /* Notify the generic layer about the new admin queue pair */ 3971 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 3972 3973 out: 3974 if (err != 0) { 3975 
SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 3976 endpoint_id(endpoint), strerror(-err)); 3977 } 3978 3979 return err; 3980 } 3981 3982 static int 3983 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 3984 const struct spdk_nvme_transport_id *trid, 3985 struct spdk_nvmf_listen_opts *listen_opts) 3986 { 3987 struct nvmf_vfio_user_transport *vu_transport; 3988 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 3989 char path[PATH_MAX] = {}; 3990 char uuid[PATH_MAX] = {}; 3991 int ret; 3992 3993 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 3994 transport); 3995 3996 pthread_mutex_lock(&vu_transport->lock); 3997 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 3998 /* Only compare traddr */ 3999 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4000 pthread_mutex_unlock(&vu_transport->lock); 4001 return -EEXIST; 4002 } 4003 } 4004 pthread_mutex_unlock(&vu_transport->lock); 4005 4006 endpoint = calloc(1, sizeof(*endpoint)); 4007 if (!endpoint) { 4008 return -ENOMEM; 4009 } 4010 4011 pthread_mutex_init(&endpoint->lock, NULL); 4012 endpoint->devmem_fd = -1; 4013 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4014 endpoint->transport = vu_transport; 4015 4016 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4017 if (ret < 0 || ret >= PATH_MAX) { 4018 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4019 ret = -1; 4020 goto out; 4021 } 4022 4023 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4024 if (ret == -1) { 4025 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4026 endpoint_id(endpoint), path, spdk_strerror(errno)); 4027 goto out; 4028 } 4029 unlink(path); 4030 4031 endpoint->devmem_fd = ret; 4032 ret = ftruncate(endpoint->devmem_fd, 4033 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4034 if (ret != 0) { 4035 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4036 spdk_strerror(errno)); 4037 goto out; 4038 } 4039 4040 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4041 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4042 if (endpoint->bar0_doorbells == MAP_FAILED) { 4043 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4044 endpoint->bar0_doorbells = NULL; 4045 ret = -1; 4046 goto out; 4047 } 4048 4049 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4050 if (ret < 0 || ret >= PATH_MAX) { 4051 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 4052 spdk_strerror(errno)); 4053 ret = -1; 4054 goto out; 4055 } 4056 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4057 if (ret == -1) { 4058 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4059 endpoint_id(endpoint), path, spdk_strerror(errno)); 4060 goto out; 4061 } 4062 unlink(path); 4063 4064 endpoint->migr_fd = ret; 4065 ret = ftruncate(endpoint->migr_fd, 4066 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4067 if (ret != 0) { 4068 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4069 spdk_strerror(errno)); 4070 goto out; 4071 } 4072 4073 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4074 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4075 if (endpoint->migr_data == MAP_FAILED) { 4076 SPDK_ERRLOG("%s: error to mmap 
file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
		endpoint->migr_data = NULL;
		ret = -1;
		goto out;
	}

	ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
	if (ret < 0 || ret >= PATH_MAX) {
		SPDK_ERRLOG("%s: failed to get ctrlr socket path: %s\n", endpoint_id(endpoint),
			    spdk_strerror(errno));
		ret = -1;
		goto out;
	}

	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
					   endpoint, VFU_DEV_TYPE_PCI);
	if (endpoint->vfu_ctx == NULL) {
		SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
			    endpoint_id(endpoint));
		ret = -1;
		goto out;
	}
	vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, vfio_user_get_log_level());

	ret = vfio_user_dev_info_fill(vu_transport, endpoint);
	if (ret < 0) {
		goto out;
	}

	ret = vfio_user_register_accept_poller(endpoint);
	if (ret != 0) {
		goto out;
	}

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
	pthread_mutex_unlock(&vu_transport->lock);

out:
	if (ret != 0) {
		nvmf_vfio_user_destroy_endpoint(endpoint);
	}

	return ret;
}

static void
nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
			   const struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;

	assert(trid != NULL);
	assert(trid->traddr != NULL);

	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
			/* Defer freeing the endpoint's resources until the controller
			 * itself is freed. There are two ways we can get here:
			 * 1. the nvmf target is killed while a VM is still connected
			 * 2. the listener is removed via an RPC call
			 * In both cases the nvmf library will disconnect all queue pairs.
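			 * Once the last queue pair goes away, free_ctrlr() runs and
			 * _free_ctrlr() destroys this endpoint, because need_async_destroy
			 * is set just below.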
4146 */ 4147 if (endpoint->ctrlr) { 4148 assert(!endpoint->need_async_destroy); 4149 endpoint->need_async_destroy = true; 4150 pthread_mutex_unlock(&vu_transport->lock); 4151 return; 4152 } 4153 4154 nvmf_vfio_user_destroy_endpoint(endpoint); 4155 pthread_mutex_unlock(&vu_transport->lock); 4156 return; 4157 } 4158 } 4159 pthread_mutex_unlock(&vu_transport->lock); 4160 4161 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4162 } 4163 4164 static void 4165 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4166 struct spdk_nvmf_subsystem *subsystem, 4167 struct spdk_nvmf_ctrlr_data *cdata) 4168 { 4169 struct nvmf_vfio_user_transport *vu_transport; 4170 4171 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4172 4173 cdata->vid = SPDK_PCI_VID_NUTANIX; 4174 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4175 cdata->ieee[0] = 0x8d; 4176 cdata->ieee[1] = 0x6b; 4177 cdata->ieee[2] = 0x50; 4178 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4179 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4180 /* libvfio-user can only support 1 connection for now */ 4181 cdata->oncs.reservations = 0; 4182 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4183 } 4184 4185 static int 4186 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4187 const struct spdk_nvmf_subsystem *subsystem, 4188 const struct spdk_nvme_transport_id *trid) 4189 { 4190 struct nvmf_vfio_user_transport *vu_transport; 4191 struct nvmf_vfio_user_endpoint *endpoint; 4192 4193 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4194 4195 pthread_mutex_lock(&vu_transport->lock); 4196 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4197 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4198 break; 4199 } 4200 } 4201 pthread_mutex_unlock(&vu_transport->lock); 4202 4203 if (endpoint == NULL) { 4204 return -ENOENT; 4205 } 4206 4207 endpoint->subsystem = subsystem; 4208 4209 return 0; 4210 } 4211 4212 /* 4213 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4214 * frequency. 4215 * 4216 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4217 * if we don't currently have a controller set up, peek to see if the socket is 4218 * able to accept a new connection. 4219 */ 4220 static int 4221 nvmf_vfio_user_accept(void *ctx) 4222 { 4223 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4224 struct nvmf_vfio_user_transport *vu_transport; 4225 int err; 4226 4227 vu_transport = endpoint->transport; 4228 4229 if (endpoint->ctrlr != NULL) { 4230 return SPDK_POLLER_IDLE; 4231 } 4232 4233 err = vfu_attach_ctx(endpoint->vfu_ctx); 4234 4235 if (err == 0) { 4236 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4237 4238 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4239 4240 if (err == 0) { 4241 /* 4242 * Unregister ourselves: now we've accepted a 4243 * connection, there is nothing for us to poll for, and 4244 * we will poll the connection via vfu_run_ctx() 4245 * instead. 
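			 * The accept poller is re-registered from _free_ctrlr() (via
			 * _vfio_user_relisten()) when this controller is eventually
			 * destroyed.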
4246 */ 4247 spdk_interrupt_unregister(&endpoint->accept_intr); 4248 spdk_poller_unregister(&endpoint->accept_poller); 4249 } 4250 4251 return SPDK_POLLER_BUSY; 4252 } 4253 4254 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4255 return SPDK_POLLER_IDLE; 4256 } 4257 4258 return SPDK_POLLER_BUSY; 4259 } 4260 4261 static void 4262 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4263 struct spdk_nvme_transport_id *trid, 4264 struct spdk_nvmf_discovery_log_page_entry *entry) 4265 { } 4266 4267 static struct spdk_nvmf_transport_poll_group * 4268 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4269 struct spdk_nvmf_poll_group *group) 4270 { 4271 struct nvmf_vfio_user_transport *vu_transport; 4272 struct nvmf_vfio_user_poll_group *vu_group; 4273 4274 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4275 4276 vu_group = calloc(1, sizeof(*vu_group)); 4277 if (vu_group == NULL) { 4278 SPDK_ERRLOG("Error allocating poll group: %m"); 4279 return NULL; 4280 } 4281 4282 TAILQ_INIT(&vu_group->sqs); 4283 4284 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4285 transport); 4286 pthread_mutex_lock(&vu_transport->pg_lock); 4287 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4288 if (vu_transport->next_pg == NULL) { 4289 vu_transport->next_pg = vu_group; 4290 } 4291 pthread_mutex_unlock(&vu_transport->pg_lock); 4292 4293 if (!spdk_interrupt_mode_is_enabled()) { 4294 return &vu_group->group; 4295 } 4296 4297 /* 4298 * Only allow the poll group to work in interrupt mode if the transport 4299 * supports it. It's our responsibility to register the actual interrupt 4300 * later (in handle_queue_connect_rsp()) that processes everything in 4301 * the poll group: for us, that's the libvfio-user context, and the 4302 * actual qpairs. 4303 * 4304 * Note that this only works in the case that nothing else shares the 4305 * spdk_nvmf_poll_group. 4306 * 4307 * If not supported, this will effectively always wake up to poll the 4308 * poll group. 
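	 * The actual interrupt source in that case is the libvfio-user poll fd
	 * (vfu_get_poll_fd()), which handle_queue_connect_rsp() hooks up to
	 * vfio_user_handle_intr().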
4309 */ 4310 4311 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4312 transport); 4313 4314 if (!vu_transport->intr_mode_supported) { 4315 SPDK_WARNLOG("vfio-user interrupt mode not supported\n"); 4316 return &vu_group->group; 4317 } 4318 4319 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4320 NULL); 4321 4322 return &vu_group->group; 4323 } 4324 4325 static bool 4326 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 4327 { 4328 return spdk_interrupt_mode_is_enabled() && 4329 vu_transport->intr_mode_supported; 4330 } 4331 4332 static struct spdk_nvmf_transport_poll_group * 4333 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4334 { 4335 struct nvmf_vfio_user_transport *vu_transport; 4336 struct nvmf_vfio_user_poll_group **vu_group; 4337 struct nvmf_vfio_user_sq *sq; 4338 struct nvmf_vfio_user_cq *cq; 4339 4340 struct spdk_nvmf_transport_poll_group *result = NULL; 4341 4342 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4343 cq = sq->ctrlr->cqs[sq->cqid]; 4344 assert(cq != NULL); 4345 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4346 4347 pthread_mutex_lock(&vu_transport->pg_lock); 4348 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4349 goto out; 4350 } 4351 4352 if (!nvmf_qpair_is_admin_queue(qpair)) { 4353 /* 4354 * If this is shared IO CQ case, just return the used CQ's poll 4355 * group, so I/O completions don't have to use 4356 * spdk_thread_send_msg(). 4357 */ 4358 if (cq->group != NULL) { 4359 result = cq->group; 4360 goto out; 4361 } 4362 4363 /* 4364 * If we're in interrupt mode, align all qpairs for a controller 4365 * on the same poll group, to avoid complications in 4366 * vfio_user_handle_intr(). 
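		 * (vfio_user_handle_intr() only polls sqs[0]->group, so every SQ
		 * for the controller has to live in that poll group.)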
4367 */ 4368 if (in_interrupt_mode(vu_transport)) { 4369 result = sq->ctrlr->sqs[0]->group; 4370 goto out; 4371 } 4372 4373 } 4374 4375 vu_group = &vu_transport->next_pg; 4376 assert(*vu_group != NULL); 4377 4378 result = &(*vu_group)->group; 4379 *vu_group = TAILQ_NEXT(*vu_group, link); 4380 if (*vu_group == NULL) { 4381 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4382 } 4383 4384 out: 4385 if (cq->group == NULL) { 4386 cq->group = result; 4387 } 4388 4389 pthread_mutex_unlock(&vu_transport->pg_lock); 4390 return result; 4391 } 4392 4393 /* called when process exits */ 4394 static void 4395 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4396 { 4397 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup;; 4398 struct nvmf_vfio_user_transport *vu_transport; 4399 4400 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4401 4402 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4403 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4404 transport); 4405 4406 pthread_mutex_lock(&vu_transport->pg_lock); 4407 next_tgroup = TAILQ_NEXT(vu_group, link); 4408 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4409 if (next_tgroup == NULL) { 4410 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4411 } 4412 if (vu_transport->next_pg == vu_group) { 4413 vu_transport->next_pg = next_tgroup; 4414 } 4415 pthread_mutex_unlock(&vu_transport->pg_lock); 4416 4417 free(vu_group); 4418 } 4419 4420 static void 4421 _vfio_user_qpair_disconnect(void *ctx) 4422 { 4423 struct nvmf_vfio_user_sq *sq = ctx; 4424 4425 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4426 } 4427 4428 /* The function is used when socket connection is destroyed */ 4429 static int 4430 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4431 { 4432 struct nvmf_vfio_user_sq *sq; 4433 struct nvmf_vfio_user_endpoint *endpoint; 4434 4435 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4436 4437 endpoint = ctrlr->endpoint; 4438 assert(endpoint != NULL); 4439 4440 pthread_mutex_lock(&endpoint->lock); 4441 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4442 endpoint->ctrlr = NULL; 4443 free_ctrlr(ctrlr); 4444 pthread_mutex_unlock(&endpoint->lock); 4445 return 0; 4446 } 4447 4448 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4449 /* add another round thread poll to avoid recursive endpoint lock */ 4450 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4451 } 4452 pthread_mutex_unlock(&endpoint->lock); 4453 4454 return 0; 4455 } 4456 4457 /* 4458 * Poll for and process any incoming vfio-user messages. 4459 */ 4460 static int 4461 vfio_user_poll_vfu_ctx(void *ctx) 4462 { 4463 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4464 int ret; 4465 4466 assert(ctrlr != NULL); 4467 4468 /* This will call access_bar0_fn() if there are any writes 4469 * to the portion of the BAR that is not mmap'd */ 4470 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4471 if (spdk_unlikely(ret == -1)) { 4472 if (errno == EBUSY) { 4473 return SPDK_POLLER_IDLE; 4474 } 4475 4476 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4477 4478 /* 4479 * We lost the client; the reset callback will already have 4480 * unregistered the interrupt. 4481 */ 4482 if (errno == ENOTCONN) { 4483 vfio_user_destroy_ctrlr(ctrlr); 4484 return SPDK_POLLER_BUSY; 4485 } 4486 4487 /* 4488 * We might not have got a reset callback in this case, so 4489 * explicitly unregister the interrupt here. 
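		 * (Any vfu_run_ctx() failure other than EBUSY or ENOTCONN ends up
		 * here and fails the controller.)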
4490 */ 4491 spdk_interrupt_unregister(&ctrlr->intr); 4492 ctrlr->intr_fd = -1; 4493 fail_ctrlr(ctrlr); 4494 } 4495 4496 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4497 } 4498 4499 struct vfio_user_post_cpl_ctx { 4500 struct nvmf_vfio_user_ctrlr *ctrlr; 4501 struct nvmf_vfio_user_cq *cq; 4502 struct spdk_nvme_cpl cpl; 4503 }; 4504 4505 static void 4506 _post_completion_msg(void *ctx) 4507 { 4508 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4509 4510 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4511 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4512 free(cpl_ctx); 4513 } 4514 4515 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4516 4517 static int set_ctrlr_intr_mode(struct nvmf_vfio_user_ctrlr *ctrlr); 4518 4519 static int 4520 vfio_user_handle_intr(void *ctx) 4521 { 4522 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4523 int ret = 0; 4524 4525 assert(ctrlr != NULL); 4526 assert(ctrlr->sqs[0] != NULL); 4527 assert(ctrlr->sqs[0]->group != NULL); 4528 4529 ctrlr->self_kick_requested = false; 4530 4531 vfio_user_poll_vfu_ctx(ctrlr); 4532 4533 /* 4534 * See nvmf_vfio_user_get_optimal_poll_group() for why it's OK to only 4535 * poll this poll group. 4536 */ 4537 ret |= nvmf_vfio_user_poll_group_poll(ctrlr->sqs[0]->group); 4538 4539 /* Re-arm the event indexes. */ 4540 ret |= set_ctrlr_intr_mode(ctrlr); 4541 4542 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4543 } 4544 4545 static void 4546 vfio_user_set_intr_mode(struct spdk_poller *poller, void *arg, 4547 bool interrupt_mode) 4548 { 4549 struct nvmf_vfio_user_ctrlr *ctrlr = arg; 4550 assert(ctrlr != NULL); 4551 assert(ctrlr->endpoint != NULL); 4552 4553 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 4554 ctrlr_id(ctrlr), interrupt_mode); 4555 4556 /* 4557 * interrupt_mode needs to persist across controller resets, so store 4558 * it in the endpoint instead. 
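	 * set_ctrlr_intr_mode() then applies the new mode to the running
	 * controller; it is the same helper vfio_user_handle_intr() calls to
	 * re-arm the event indexes.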
4559 */ 4560 ctrlr->endpoint->interrupt_mode = interrupt_mode; 4561 set_ctrlr_intr_mode(ctrlr); 4562 } 4563 4564 static int 4565 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 4566 { 4567 struct nvmf_vfio_user_poll_group *vu_group; 4568 struct nvmf_vfio_user_sq *sq = cb_arg; 4569 struct nvmf_vfio_user_cq *cq; 4570 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 4571 struct nvmf_vfio_user_endpoint *endpoint; 4572 4573 assert(sq != NULL); 4574 assert(req != NULL); 4575 4576 vu_ctrlr = sq->ctrlr; 4577 assert(vu_ctrlr != NULL); 4578 endpoint = vu_ctrlr->endpoint; 4579 assert(endpoint != NULL); 4580 4581 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 4582 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 4583 endpoint->ctrlr = NULL; 4584 free_ctrlr(vu_ctrlr); 4585 return -1; 4586 } 4587 4588 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 4589 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 4590 4591 cq = vu_ctrlr->cqs[0]; 4592 assert(cq != NULL); 4593 4594 pthread_mutex_lock(&endpoint->lock); 4595 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 4596 vu_ctrlr->cntlid = sq->qpair.ctrlr->cntlid; 4597 vu_ctrlr->thread = spdk_get_thread(); 4598 vu_ctrlr->ctrlr = sq->qpair.ctrlr; 4599 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 4600 4601 cq->thread = spdk_get_thread(); 4602 4603 if (in_interrupt_mode(endpoint->transport)) { 4604 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4605 vu_ctrlr, 0); 4606 4607 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 4608 assert(vu_ctrlr->intr_fd != -1); 4609 4610 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 4611 vfio_user_handle_intr, 4612 vu_ctrlr); 4613 4614 assert(vu_ctrlr->intr != NULL); 4615 4616 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 4617 vfio_user_set_intr_mode, 4618 vu_ctrlr); 4619 } else { 4620 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4621 vu_ctrlr, 1000); 4622 } 4623 } else { 4624 /* For I/O queues this command was generated in response to an 4625 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 4626 * been completed. Complete it now. 4627 */ 4628 if (sq->post_create_io_sq_completion) { 4629 assert(cq->thread != NULL); 4630 if (cq->thread != spdk_get_thread()) { 4631 struct vfio_user_post_cpl_ctx *cpl_ctx; 4632 4633 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 4634 if (!cpl_ctx) { 4635 return -ENOMEM; 4636 } 4637 cpl_ctx->ctrlr = vu_ctrlr; 4638 cpl_ctx->cq = cq; 4639 cpl_ctx->cpl.sqid = 0; 4640 cpl_ctx->cpl.cdw0 = 0; 4641 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 4642 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 4643 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 4644 4645 spdk_thread_send_msg(cq->thread, _post_completion_msg, cpl_ctx); 4646 } else { 4647 post_completion(vu_ctrlr, cq, 0, 0, 4648 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 4649 } 4650 sq->post_create_io_sq_completion = false; 4651 } else if (in_interrupt_mode(endpoint->transport)) { 4652 /* 4653 * FIXME self_kick() ends up polling all queues on the 4654 * controller thread, and this will be wrong if we ever 4655 * support interrupt mode with I/O queues in a 4656 * different poll group than the controller's. 
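			 * (get_optimal_poll_group() currently pins all of a
			 * controller's qpairs to the admin queue's poll group in
			 * interrupt mode, so this should not happen today.)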
4657 */ 4658 self_kick(vu_ctrlr); 4659 } 4660 sq->sq_state = VFIO_USER_SQ_ACTIVE; 4661 } 4662 4663 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 4664 pthread_mutex_unlock(&endpoint->lock); 4665 4666 free(req->req.data); 4667 req->req.data = NULL; 4668 4669 return 0; 4670 } 4671 4672 /* 4673 * Add the given qpair to the given poll group. New qpairs are added via 4674 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 4675 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 4676 * nvmf_transport_poll_group_add(). 4677 */ 4678 static int 4679 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 4680 struct spdk_nvmf_qpair *qpair) 4681 { 4682 struct nvmf_vfio_user_sq *sq; 4683 struct nvmf_vfio_user_req *vu_req; 4684 struct nvmf_vfio_user_ctrlr *ctrlr; 4685 struct spdk_nvmf_request *req; 4686 struct spdk_nvmf_fabric_connect_data *data; 4687 bool admin; 4688 4689 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4690 sq->group = group; 4691 ctrlr = sq->ctrlr; 4692 4693 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 4694 ctrlr_id(ctrlr), sq->qpair.qid, 4695 sq, qpair, group); 4696 4697 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 4698 4699 vu_req = get_nvmf_vfio_user_req(sq); 4700 if (vu_req == NULL) { 4701 return -1; 4702 } 4703 4704 req = &vu_req->req; 4705 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 4706 req->cmd->connect_cmd.cid = 0; 4707 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 4708 req->cmd->connect_cmd.recfmt = 0; 4709 req->cmd->connect_cmd.sqsize = sq->size - 1; 4710 req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid; 4711 4712 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 4713 req->data = calloc(1, req->length); 4714 if (req->data == NULL) { 4715 nvmf_vfio_user_req_free(req); 4716 return -ENOMEM; 4717 } 4718 4719 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 4720 data->cntlid = ctrlr->cntlid; 4721 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 4722 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 4723 4724 vu_req->cb_fn = handle_queue_connect_rsp; 4725 vu_req->cb_arg = sq; 4726 4727 SPDK_DEBUGLOG(nvmf_vfio, 4728 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 4729 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 4730 4731 spdk_nvmf_request_exec_fabrics(req); 4732 return 0; 4733 } 4734 4735 static int 4736 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 4737 struct spdk_nvmf_qpair *qpair) 4738 { 4739 struct nvmf_vfio_user_sq *sq; 4740 struct nvmf_vfio_user_poll_group *vu_group; 4741 4742 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4743 4744 SPDK_DEBUGLOG(nvmf_vfio, 4745 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 4746 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 4747 4748 4749 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4750 TAILQ_REMOVE(&vu_group->sqs, sq, link); 4751 4752 return 0; 4753 } 4754 4755 static void 4756 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 4757 { 4758 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 4759 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 4760 vu_req->iovcnt = 0; 4761 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 4762 4763 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 4764 } 4765 4766 static int 4767 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 4768 { 4769 struct nvmf_vfio_user_sq *sq; 4770 struct nvmf_vfio_user_req *vu_req; 4771 4772 
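	/* Return the request to its SQ's free list; no completion is posted here. */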
assert(req != NULL); 4773 4774 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 4775 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 4776 4777 _nvmf_vfio_user_req_free(sq, vu_req); 4778 4779 return 0; 4780 } 4781 4782 static int 4783 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 4784 { 4785 struct nvmf_vfio_user_sq *sq; 4786 struct nvmf_vfio_user_req *vu_req; 4787 4788 assert(req != NULL); 4789 4790 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 4791 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 4792 4793 if (vu_req->cb_fn != NULL) { 4794 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 4795 fail_ctrlr(sq->ctrlr); 4796 } 4797 } 4798 4799 _nvmf_vfio_user_req_free(sq, vu_req); 4800 4801 return 0; 4802 } 4803 4804 static void 4805 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 4806 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 4807 { 4808 struct nvmf_vfio_user_sq *sq; 4809 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 4810 struct nvmf_vfio_user_endpoint *endpoint; 4811 4812 assert(qpair != NULL); 4813 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4814 vu_ctrlr = sq->ctrlr; 4815 endpoint = vu_ctrlr->endpoint; 4816 4817 pthread_mutex_lock(&endpoint->lock); 4818 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 4819 delete_sq_done(vu_ctrlr, sq); 4820 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 4821 endpoint->ctrlr = NULL; 4822 free_ctrlr(vu_ctrlr); 4823 } 4824 pthread_mutex_unlock(&endpoint->lock); 4825 4826 if (cb_fn) { 4827 cb_fn(cb_arg); 4828 } 4829 } 4830 4831 /** 4832 * Returns a preallocated request, or NULL if there isn't one available. 4833 */ 4834 static struct nvmf_vfio_user_req * 4835 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 4836 { 4837 struct nvmf_vfio_user_req *req; 4838 4839 if (sq == NULL) { 4840 return NULL; 4841 } 4842 4843 req = TAILQ_FIRST(&sq->free_reqs); 4844 if (req == NULL) { 4845 return NULL; 4846 } 4847 4848 TAILQ_REMOVE(&sq->free_reqs, req, link); 4849 4850 return req; 4851 } 4852 4853 static int 4854 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 4855 { 4856 uint16_t nr; 4857 uint32_t nlb, nsid; 4858 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 4859 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 4860 struct spdk_nvmf_ns *ns; 4861 4862 nsid = cmd->nsid; 4863 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 4864 if (ns == NULL || ns->bdev == NULL) { 4865 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 4866 return -EINVAL; 4867 } 4868 4869 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 4870 nr = cmd->cdw10_bits.dsm.nr + 1; 4871 return nr * sizeof(struct spdk_nvme_dsm_range); 4872 } 4873 4874 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 4875 return nlb * spdk_bdev_get_block_size(ns->bdev); 4876 } 4877 4878 static int 4879 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 4880 { 4881 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 4882 uint32_t len = 0; 4883 uint8_t fid; 4884 int iovcnt; 4885 4886 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 4887 req->length = 0; 4888 req->data = NULL; 4889 4890 if (req->xfer == SPDK_NVME_DATA_NONE) { 4891 return 0; 4892 } 4893 4894 switch (cmd->opc) { 4895 case SPDK_NVME_OPC_IDENTIFY: 4896 len = 4096; 4897 break; 4898 case SPDK_NVME_OPC_GET_LOG_PAGE: 4899 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 4900 break; 4901 case SPDK_NVME_OPC_GET_FEATURES: 4902 case SPDK_NVME_OPC_SET_FEATURES: 
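		/* Get and Set Features share the FID field; the transfer length
		 * depends on which feature is being accessed. */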
4903 fid = cmd->cdw10_bits.set_features.fid; 4904 switch (fid) { 4905 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 4906 len = 4096; 4907 break; 4908 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 4909 len = 256; 4910 break; 4911 case SPDK_NVME_FEAT_TIMESTAMP: 4912 len = 8; 4913 break; 4914 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 4915 len = 512; 4916 break; 4917 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 4918 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 4919 len = 16; 4920 } else { 4921 len = 8; 4922 } 4923 break; 4924 default: 4925 return 0; 4926 } 4927 break; 4928 default: 4929 return 0; 4930 } 4931 4932 /* ADMIN command will not use SGL */ 4933 if (cmd->psdt != 0) { 4934 return -EINVAL; 4935 } 4936 4937 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 4938 if (iovcnt < 0) { 4939 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 4940 ctrlr_id(ctrlr), cmd->opc); 4941 return -1; 4942 } 4943 req->length = len; 4944 req->data = req->iov[0].iov_base; 4945 req->iovcnt = iovcnt; 4946 4947 return 0; 4948 } 4949 4950 /* 4951 * Map an I/O command's buffers. 4952 * 4953 * Returns 0 on success and -errno on failure. 4954 */ 4955 static int 4956 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 4957 { 4958 int len, iovcnt; 4959 struct spdk_nvme_cmd *cmd; 4960 4961 assert(ctrlr != NULL); 4962 assert(req != NULL); 4963 4964 cmd = &req->cmd->nvme_cmd; 4965 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 4966 req->length = 0; 4967 req->data = NULL; 4968 4969 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 4970 return 0; 4971 } 4972 4973 len = get_nvmf_io_req_length(req); 4974 if (len < 0) { 4975 return -EINVAL; 4976 } 4977 req->length = len; 4978 4979 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 4980 if (iovcnt < 0) { 4981 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 4982 return -EFAULT; 4983 } 4984 req->data = req->iov[0].iov_base; 4985 req->iovcnt = iovcnt; 4986 4987 return 0; 4988 } 4989 4990 static int 4991 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 4992 struct nvmf_vfio_user_sq *sq) 4993 { 4994 int err; 4995 struct nvmf_vfio_user_req *vu_req; 4996 struct spdk_nvmf_request *req; 4997 4998 assert(ctrlr != NULL); 4999 assert(cmd != NULL); 5000 5001 vu_req = get_nvmf_vfio_user_req(sq); 5002 if (spdk_unlikely(vu_req == NULL)) { 5003 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5004 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5005 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5006 5007 } 5008 req = &vu_req->req; 5009 5010 assert(req->qpair != NULL); 5011 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5012 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5013 5014 vu_req->cb_fn = handle_cmd_rsp; 5015 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5016 req->cmd->nvme_cmd = *cmd; 5017 5018 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5019 err = map_admin_cmd_req(ctrlr, req); 5020 } else { 5021 switch (cmd->opc) { 5022 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5023 case SPDK_NVME_OPC_RESERVATION_REPORT: 5024 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5025 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5026 err = -ENOTSUP; 5027 break; 5028 default: 5029 err = map_io_cmd_req(ctrlr, req); 5030 break; 5031 } 5032 } 5033 5034 if (spdk_unlikely(err < 0)) { 5035 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5036 ctrlr_id(ctrlr), cmd->opc); 5037 
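		/* Complete the command with Internal Device Error and recycle the
		 * request ourselves, since it was never handed to the nvmf layer. */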
req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5038 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5039 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5040 _nvmf_vfio_user_req_free(sq, vu_req); 5041 return err; 5042 } 5043 5044 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5045 spdk_nvmf_request_exec(req); 5046 5047 return 0; 5048 } 5049 5050 /* 5051 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5052 * here: if the host isn't up to date, and is apparently not actively processing 5053 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5054 */ 5055 static void 5056 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5057 struct nvmf_vfio_user_sq *sq) 5058 { 5059 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5060 uint32_t cq_head; 5061 uint32_t cq_tail; 5062 5063 if (!cq->ien || !ctrlr_interrupt_enabled(ctrlr) || 5064 !adaptive_irq_enabled(ctrlr, cq)) { 5065 return; 5066 } 5067 5068 cq_tail = *cq_tailp(cq); 5069 5070 /* Already sent? */ 5071 if (cq_tail == cq->last_trigger_irq_tail) { 5072 return; 5073 } 5074 5075 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5076 cq_head = *cq_dbl_headp(cq); 5077 5078 if (cq_head != cq_tail && cq_head == cq->last_head) { 5079 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5080 if (err != 0) { 5081 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5082 ctrlr_id(ctrlr)); 5083 } else { 5084 cq->last_trigger_irq_tail = cq_tail; 5085 } 5086 } 5087 5088 cq->last_head = cq_head; 5089 } 5090 5091 /* Returns the number of commands processed, or a negative value on error. */ 5092 static int 5093 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5094 { 5095 struct nvmf_vfio_user_ctrlr *ctrlr; 5096 uint32_t new_tail; 5097 int count = 0; 5098 5099 assert(sq != NULL); 5100 5101 ctrlr = sq->ctrlr; 5102 5103 handle_suppressed_irq(ctrlr, sq); 5104 5105 /* On aarch64 platforms, doorbells update from guest VM may not be seen 5106 * on SPDK target side. This is because there is memory type mismatch 5107 * situation here. That is on guest VM side, the doorbells are treated as 5108 * device memory while on SPDK target side, it is treated as normal 5109 * memory. And this situation cause problem on ARM platform. 5110 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5111 * Memory-aliasing-and-mismatched-memory-types". Only using spdk_mb() 5112 * cannot fix this. Use "dc civac" to invalidate cache may solve 5113 * this. 5114 */ 5115 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5116 5117 /* Load-Acquire. 
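	 * The host is expected to fill in the SQ entries, issue a wmb() (its
	 * Store-Release), and only then write the tail doorbell read here; the
	 * spdk_rmb() below pairs with that ordering.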
*/ 5118 new_tail = *sq_dbl_tailp(sq); 5119 5120 new_tail = new_tail & 0xffffu; 5121 if (spdk_unlikely(new_tail >= sq->size)) { 5122 union spdk_nvme_async_event_completion event = {}; 5123 5124 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5125 new_tail); 5126 event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR; 5127 event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE; 5128 nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event); 5129 5130 return -1; 5131 } 5132 5133 if (*sq_headp(sq) == new_tail) { 5134 return 0; 5135 } 5136 5137 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5138 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5139 if (ctrlr->sdbl != NULL) { 5140 SPDK_DEBUGLOG(nvmf_vfio, 5141 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5142 ctrlr_id(ctrlr), sq->qid, 5143 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5144 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5145 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5146 } 5147 5148 /* 5149 * Ensure that changes to the queue are visible to us. 5150 * The host driver should write the queue first, do a wmb(), and then 5151 * update the SQ tail doorbell (their Store-Release). 5152 */ 5153 spdk_rmb(); 5154 5155 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5156 if (count < 0) { 5157 fail_ctrlr(ctrlr); 5158 } 5159 5160 return count; 5161 } 5162 5163 /* 5164 * vfio-user transport poll handler. Note that the library context is polled in 5165 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the 5166 * active SQs. 5167 * 5168 * Returns the number of commands processed, or a negative value on error. 5169 */ 5170 static int 5171 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 5172 { 5173 struct nvmf_vfio_user_poll_group *vu_group; 5174 struct nvmf_vfio_user_sq *sq, *tmp; 5175 int count = 0; 5176 5177 assert(group != NULL); 5178 5179 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5180 5181 SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n"); 5182 5183 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) { 5184 int ret; 5185 5186 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 5187 continue; 5188 } 5189 5190 ret = nvmf_vfio_user_sq_poll(sq); 5191 5192 if (ret < 0) { 5193 return ret; 5194 } 5195 5196 count += ret; 5197 } 5198 5199 return count; 5200 } 5201 5202 static int 5203 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5204 struct spdk_nvme_transport_id *trid) 5205 { 5206 struct nvmf_vfio_user_sq *sq; 5207 struct nvmf_vfio_user_ctrlr *ctrlr; 5208 5209 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5210 ctrlr = sq->ctrlr; 5211 5212 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5213 return 0; 5214 } 5215 5216 static int 5217 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 5218 struct spdk_nvme_transport_id *trid) 5219 { 5220 return 0; 5221 } 5222 5223 static int 5224 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5225 struct spdk_nvme_transport_id *trid) 5226 { 5227 struct nvmf_vfio_user_sq *sq; 5228 struct nvmf_vfio_user_ctrlr *ctrlr; 5229 5230 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5231 ctrlr = sq->ctrlr; 5232 5233 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5234 return 0; 5235 } 5236 5237 static void 5238 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 5239 struct 
spdk_nvmf_request *req) 5240 { 5241 struct spdk_nvmf_request *req_to_abort = NULL; 5242 struct spdk_nvmf_request *temp_req = NULL; 5243 uint16_t cid; 5244 5245 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5246 5247 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) { 5248 struct nvmf_vfio_user_req *vu_req; 5249 5250 vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req); 5251 5252 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 5253 req_to_abort = temp_req; 5254 break; 5255 } 5256 } 5257 5258 if (req_to_abort == NULL) { 5259 spdk_nvmf_request_complete(req); 5260 return; 5261 } 5262 5263 req->req_to_abort = req_to_abort; 5264 nvmf_ctrlr_abort_request(req); 5265 } 5266 5267 static void 5268 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 5269 { 5270 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 5271 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 5272 opts->in_capsule_data_size = 0; 5273 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 5274 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 5275 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 5276 opts->num_shared_buffers = 0; 5277 opts->buf_cache_size = 0; 5278 opts->association_timeout = 0; 5279 opts->transport_specific = NULL; 5280 } 5281 5282 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 5283 .name = "VFIOUSER", 5284 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 5285 .opts_init = nvmf_vfio_user_opts_init, 5286 .create = nvmf_vfio_user_create, 5287 .destroy = nvmf_vfio_user_destroy, 5288 5289 .listen = nvmf_vfio_user_listen, 5290 .stop_listen = nvmf_vfio_user_stop_listen, 5291 .cdata_init = nvmf_vfio_user_cdata_init, 5292 .listen_associate = nvmf_vfio_user_listen_associate, 5293 5294 .listener_discover = nvmf_vfio_user_discover, 5295 5296 .poll_group_create = nvmf_vfio_user_poll_group_create, 5297 .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group, 5298 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 5299 .poll_group_add = nvmf_vfio_user_poll_group_add, 5300 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 5301 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 5302 5303 .req_free = nvmf_vfio_user_req_free, 5304 .req_complete = nvmf_vfio_user_req_complete, 5305 5306 .qpair_fini = nvmf_vfio_user_close_qpair, 5307 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 5308 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 5309 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 5310 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 5311 }; 5312 5313 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 5314 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 5315 SPDK_LOG_REGISTER_COMPONENT(vfio_user_db) 5316
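
/*
 * Illustrative usage sketch (not part of this file): the transport is
 * registered above under the name "VFIOUSER", so a target is typically wired
 * up with RPCs along these lines (NQN, bdev name and socket directory are
 * placeholders):
 *
 *   rpc.py nvmf_create_transport -t VFIOUSER
 *   rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a -s SPDK0
 *   rpc.py nvmf_subsystem_add_ns nqn.2019-07.io.spdk:cnode0 Malloc0
 *   rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 \
 *       -t VFIOUSER -a /var/run/vfio-user -s 0
 *
 * The listen address (-a) is the directory in which nvmf_vfio_user_listen()
 * creates the bar0, migr and cntrl files backing the vfio-user endpoint.
 */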