1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved. 3 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 */ 5 6 /* 7 * NVMe over vfio-user transport 8 */ 9 10 #include <vfio-user/libvfio-user.h> 11 #include <vfio-user/pci_defs.h> 12 13 #include "spdk/barrier.h" 14 #include "spdk/stdinc.h" 15 #include "spdk/assert.h" 16 #include "spdk/thread.h" 17 #include "spdk/nvmf_transport.h" 18 #include "spdk/sock.h" 19 #include "spdk/string.h" 20 #include "spdk/util.h" 21 #include "spdk/log.h" 22 23 #include "transport.h" 24 25 #include "nvmf_internal.h" 26 27 #define SWAP(x, y) \ 28 do \ 29 { \ 30 typeof(x) _tmp = x; \ 31 x = y; \ 32 y = _tmp; \ 33 } while (0) 34 35 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 36 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 37 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 38 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 39 40 #define NVME_DOORBELLS_OFFSET 0x1000 41 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000 42 #define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2 43 #define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3 44 #define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX 45 46 /* 47 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 48 * available on PCI-X 2.0 and PCI Express buses 49 */ 50 #define NVME_REG_CFG_SIZE 0x1000 51 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 52 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8) 53 #define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 54 /* MSIX Table Size */ 55 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 56 /* MSIX Pending Bit Array Size */ 57 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000) 58 59 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 60 
61 struct nvmf_vfio_user_req; 62 63 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 64 65 /* 1 more for PRP2 list itself */ 66 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 67 68 enum nvmf_vfio_user_req_state { 69 VFIO_USER_REQUEST_STATE_FREE = 0, 70 VFIO_USER_REQUEST_STATE_EXECUTING, 71 }; 72 73 /* NVMe device state representation */ 74 struct nvme_migr_sq_state { 75 uint16_t sqid; 76 uint16_t cqid; 77 uint32_t head; 78 uint32_t size; 79 uint32_t reserved; 80 uint64_t dma_addr; 81 }; 82 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 83 84 struct nvme_migr_cq_state { 85 uint16_t cqid; 86 uint16_t phase; 87 uint32_t tail; 88 uint32_t size; 89 uint32_t iv; 90 uint32_t ien; 91 uint32_t reserved; 92 uint64_t dma_addr; 93 }; 94 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 95 96 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 97 98 /* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned. 99 * 100 * NVMe device migration region is defined as below: 101 * ------------------------------------------------------------------------- 102 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs | 103 * ------------------------------------------------------------------------- 104 * 105 * Keep vfio_user_nvme_migr_header as a fixed 0x1000 length, all new added fields 106 * can use the reserved space at the end of the data structure. 107 */ 108 struct vfio_user_nvme_migr_header { 109 /* Magic value to validate migration data */ 110 uint32_t magic; 111 /* Version to check the data is same from source to destination */ 112 uint32_t version; 113 114 /* The library uses this field to know how many fields in this 115 * structure are valid, starting at the beginning of this data 116 * structure. New added fields in future use `unused` memory 117 * spaces. 
118 */ 119 uint32_t opts_size; 120 uint32_t reserved0; 121 122 /* BARs information */ 123 uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS]; 124 uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS]; 125 126 /* Queue pair start offset, starting at the beginning of this 127 * data structure. 128 */ 129 uint64_t qp_offset; 130 uint64_t qp_len; 131 132 /* Controller data structure */ 133 uint32_t num_io_queues; 134 uint32_t reserved1; 135 136 /* TODO: this part will be moved to common nvmf controller data */ 137 uint16_t reserved2[3]; 138 uint16_t nr_aers; 139 uint16_t aer_cids[NVMF_MIGR_MAX_PENDING_AERS]; 140 141 /* NVMf controller data offset and length if exist, starting at 142 * the beginning of this data structure. 143 */ 144 uint64_t nvmf_data_offset; 145 uint64_t nvmf_data_len; 146 147 /* 148 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA 149 * address. 150 */ 151 bool sdbl; 152 153 /* Shadow doorbell DMA addresses. */ 154 uint64_t shadow_doorbell_buffer; 155 uint64_t eventidx_buffer; 156 157 /* Reserved memory space for new added fields, the 158 * field is always at the end of this data structure. 
159 */ 160 uint8_t unused[3336]; 161 }; 162 SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size"); 163 164 struct vfio_user_nvme_migr_qp { 165 struct nvme_migr_sq_state sq; 166 struct nvme_migr_cq_state cq; 167 }; 168 169 /* NVMe state definition used to load/restore from/to NVMe migration BAR region */ 170 struct vfio_user_nvme_migr_state { 171 struct vfio_user_nvme_migr_header ctrlr_header; 172 struct nvmf_ctrlr_migr_data nvmf_data; 173 struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 174 uint8_t bar0[NVME_REG_BAR0_SIZE]; 175 uint8_t cfg[NVME_REG_CFG_SIZE]; 176 }; 177 178 struct nvmf_vfio_user_req { 179 struct spdk_nvmf_request req; 180 struct spdk_nvme_cpl rsp; 181 struct spdk_nvme_cmd cmd; 182 183 enum nvmf_vfio_user_req_state state; 184 nvmf_vfio_user_req_cb_fn cb_fn; 185 void *cb_arg; 186 187 /* old CC before prop_set_cc fabric command */ 188 union spdk_nvme_cc_register cc; 189 190 TAILQ_ENTRY(nvmf_vfio_user_req) link; 191 192 struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS]; 193 uint8_t iovcnt; 194 195 /* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */ 196 uint8_t sg[]; 197 }; 198 199 /* 200 * Mapping of an NVMe queue. 201 * 202 * This holds the information tracking a local process mapping of an NVMe queue 203 * shared by the client. 204 */ 205 struct nvme_q_mapping { 206 /* iov of local process mapping. */ 207 struct iovec iov; 208 /* Stored sg, needed for unmap. */ 209 dma_sg_t *sg; 210 /* Client PRP of queue. 
*/ 211 uint64_t prp1; 212 }; 213 214 enum nvmf_vfio_user_sq_state { 215 VFIO_USER_SQ_UNUSED = 0, 216 VFIO_USER_SQ_CREATED, 217 VFIO_USER_SQ_DELETED, 218 VFIO_USER_SQ_ACTIVE, 219 VFIO_USER_SQ_INACTIVE 220 }; 221 222 enum nvmf_vfio_user_cq_state { 223 VFIO_USER_CQ_UNUSED = 0, 224 VFIO_USER_CQ_CREATED, 225 VFIO_USER_CQ_DELETED, 226 }; 227 228 enum nvmf_vfio_user_ctrlr_state { 229 VFIO_USER_CTRLR_CREATING = 0, 230 VFIO_USER_CTRLR_RUNNING, 231 /* Quiesce requested by libvfio-user */ 232 VFIO_USER_CTRLR_PAUSING, 233 /* NVMf subsystem is paused, it's safe to do PCI reset, memory register, 234 * memory unergister, and vfio migration state transition in this state. 235 */ 236 VFIO_USER_CTRLR_PAUSED, 237 /* 238 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI 239 * reset, memory register and unregister, controller in destination VM has 240 * been restored). NVMf subsystem resume has been requested. 241 */ 242 VFIO_USER_CTRLR_RESUMING, 243 /* 244 * Implies that the NVMf subsystem is paused. Both controller in source VM and 245 * destinatiom VM is in this state when doing live migration. 246 */ 247 VFIO_USER_CTRLR_MIGRATING 248 }; 249 250 /* Migration region to record NVMe device state data structure */ 251 struct vfio_user_migration_region { 252 uint64_t last_data_offset; 253 uint64_t pending_bytes; 254 }; 255 256 struct nvmf_vfio_user_sq { 257 struct spdk_nvmf_qpair qpair; 258 struct spdk_nvmf_transport_poll_group *group; 259 struct nvmf_vfio_user_ctrlr *ctrlr; 260 261 uint32_t qid; 262 /* Number of entries in queue. */ 263 uint32_t size; 264 struct nvme_q_mapping mapping; 265 enum nvmf_vfio_user_sq_state sq_state; 266 267 uint32_t head; 268 volatile uint32_t *dbl_tailp; 269 270 /* Whether a shadow doorbell eventidx needs setting. 
*/ 271 bool need_rearm; 272 273 /* multiple SQs can be mapped to the same CQ */ 274 uint16_t cqid; 275 276 /* handle_queue_connect_rsp() can be used both for CREATE IO SQ response 277 * and SQ re-connect response in the destination VM, for the prior case, 278 * we will post a NVMe completion to VM, we will not set this flag when 279 * re-connecting SQs in the destination VM. 280 */ 281 bool post_create_io_sq_completion; 282 /* Copy of Create IO SQ command, this field is used together with 283 * `post_create_io_sq_completion` flag. 284 */ 285 struct spdk_nvme_cmd create_io_sq_cmd; 286 287 /* Currently unallocated reqs. */ 288 TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs; 289 /* Poll group entry */ 290 TAILQ_ENTRY(nvmf_vfio_user_sq) link; 291 /* Connected SQ entry */ 292 TAILQ_ENTRY(nvmf_vfio_user_sq) tailq; 293 }; 294 295 struct nvmf_vfio_user_cq { 296 struct spdk_nvmf_transport_poll_group *group; 297 struct spdk_thread *thread; 298 uint32_t cq_ref; 299 300 uint32_t qid; 301 /* Number of entries in queue. 
*/ 302 uint32_t size; 303 struct nvme_q_mapping mapping; 304 enum nvmf_vfio_user_cq_state cq_state; 305 306 uint32_t tail; 307 volatile uint32_t *dbl_headp; 308 309 bool phase; 310 311 uint16_t iv; 312 bool ien; 313 314 uint32_t last_head; 315 uint32_t last_trigger_irq_tail; 316 }; 317 318 struct nvmf_vfio_user_poll_group { 319 struct spdk_nvmf_transport_poll_group group; 320 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 321 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 322 }; 323 324 struct nvmf_vfio_user_shadow_doorbells { 325 volatile uint32_t *shadow_doorbells; 326 volatile uint32_t *eventidxs; 327 dma_sg_t *sgs; 328 struct iovec *iovs; 329 }; 330 331 struct nvmf_vfio_user_ctrlr { 332 struct nvmf_vfio_user_endpoint *endpoint; 333 struct nvmf_vfio_user_transport *transport; 334 335 /* Connected SQs list */ 336 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 337 enum nvmf_vfio_user_ctrlr_state state; 338 339 struct vfio_user_migration_region migr_reg; 340 /* Controller is in source VM when doing live migration */ 341 bool in_source_vm; 342 343 struct spdk_thread *thread; 344 struct spdk_poller *vfu_ctx_poller; 345 struct spdk_interrupt *intr; 346 int intr_fd; 347 348 bool queued_quiesce; 349 350 bool reset_shn; 351 352 uint16_t cntlid; 353 struct spdk_nvmf_ctrlr *ctrlr; 354 355 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 356 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 357 358 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 359 360 volatile uint32_t *bar0_doorbells; 361 struct nvmf_vfio_user_shadow_doorbells *sdbl; 362 /* 363 * Shadow doorbells PRPs to provide during the stop-and-copy state. 364 */ 365 uint64_t shadow_doorbell_buffer; 366 uint64_t eventidx_buffer; 367 368 bool adaptive_irqs_enabled; 369 bool kick_requested; 370 }; 371 372 /* Endpoint in vfio-user is associated with a socket file, which 373 * is the representative of a PCI endpoint. 
374 */ 375 struct nvmf_vfio_user_endpoint { 376 struct nvmf_vfio_user_transport *transport; 377 vfu_ctx_t *vfu_ctx; 378 struct spdk_poller *accept_poller; 379 struct spdk_thread *accept_thread; 380 bool interrupt_mode; 381 struct msixcap *msix; 382 vfu_pci_config_space_t *pci_config_space; 383 int devmem_fd; 384 int accept_intr_fd; 385 struct spdk_interrupt *accept_intr; 386 387 volatile uint32_t *bar0_doorbells; 388 389 int migr_fd; 390 void *migr_data; 391 392 struct spdk_nvme_transport_id trid; 393 struct spdk_nvmf_subsystem *subsystem; 394 395 /* Controller is associated with an active socket connection, 396 * the lifecycle of the controller is same as the VM. 397 * Currently we only support one active connection, as the NVMe 398 * specification defines, we may support multiple controllers in 399 * future, so that it can support e.g: RESERVATION. 400 */ 401 struct nvmf_vfio_user_ctrlr *ctrlr; 402 pthread_mutex_t lock; 403 404 bool need_async_destroy; 405 /* The subsystem is in PAUSED state and need to be resumed, TRUE 406 * only when migration is done successfully and the controller is 407 * in source VM. 
408 */ 409 bool need_resume; 410 411 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 412 }; 413 414 struct nvmf_vfio_user_transport_opts { 415 bool disable_mappable_bar0; 416 bool disable_adaptive_irq; 417 bool disable_shadow_doorbells; 418 bool disable_compare; 419 }; 420 421 struct nvmf_vfio_user_transport { 422 struct spdk_nvmf_transport transport; 423 struct nvmf_vfio_user_transport_opts transport_opts; 424 bool intr_mode_supported; 425 pthread_mutex_t lock; 426 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 427 428 pthread_mutex_t pg_lock; 429 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 430 struct nvmf_vfio_user_poll_group *next_pg; 431 }; 432 433 /* 434 * function prototypes 435 */ 436 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 437 438 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 439 440 /* 441 * Local process virtual address of a queue. 442 */ 443 static inline void * 444 q_addr(struct nvme_q_mapping *mapping) 445 { 446 return mapping->iov.iov_base; 447 } 448 449 static inline int 450 queue_index(uint16_t qid, bool is_cq) 451 { 452 return (qid * 2) + is_cq; 453 } 454 455 static inline volatile uint32_t * 456 sq_headp(struct nvmf_vfio_user_sq *sq) 457 { 458 assert(sq != NULL); 459 return &sq->head; 460 } 461 462 static inline volatile uint32_t * 463 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 464 { 465 assert(sq != NULL); 466 return sq->dbl_tailp; 467 } 468 469 static inline volatile uint32_t * 470 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 471 { 472 assert(cq != NULL); 473 return cq->dbl_headp; 474 } 475 476 static inline volatile uint32_t * 477 cq_tailp(struct nvmf_vfio_user_cq *cq) 478 { 479 assert(cq != NULL); 480 return &cq->tail; 481 } 482 483 static inline void 484 sq_head_advance(struct nvmf_vfio_user_sq *sq) 485 { 486 assert(sq != NULL); 487 488 assert(*sq_headp(sq) < sq->size); 489 (*sq_headp(sq))++; 490 491 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 492 *sq_headp(sq) = 0; 
493 } 494 } 495 496 static inline void 497 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 498 { 499 assert(cq != NULL); 500 501 assert(*cq_tailp(cq) < cq->size); 502 (*cq_tailp(cq))++; 503 504 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 505 *cq_tailp(cq) = 0; 506 cq->phase = !cq->phase; 507 } 508 } 509 510 /* 511 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 512 * control: if there is no space in the CQ, we should wait until there is. 513 * 514 * In practice, we just fail the controller instead: as it happens, all host 515 * implementations we care about right-size the CQ: this is required anyway for 516 * NVMEoF support (see 3.3.2.8). 517 * 518 * Since reading the head doorbell is relatively expensive, we use the cached 519 * value, so we only have to read it for real if it appears that we are full. 520 */ 521 static inline bool 522 cq_is_full(struct nvmf_vfio_user_cq *cq) 523 { 524 uint32_t qindex; 525 526 assert(cq != NULL); 527 528 qindex = *cq_tailp(cq) + 1; 529 if (spdk_unlikely(qindex == cq->size)) { 530 qindex = 0; 531 } 532 533 if (qindex != cq->last_head) { 534 return false; 535 } 536 537 cq->last_head = *cq_dbl_headp(cq); 538 539 return qindex == cq->last_head; 540 } 541 542 static bool 543 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 544 { 545 assert(vu_ctrlr != NULL); 546 547 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 548 return false; 549 } 550 551 if (is_cq) { 552 if (vu_ctrlr->cqs[qid] == NULL) { 553 return false; 554 } 555 556 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 557 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 558 } 559 560 if (vu_ctrlr->sqs[qid] == NULL) { 561 return false; 562 } 563 564 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 565 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 566 } 567 568 /* Return the poll group for the admin queue of the controller. 
*/ 569 static inline struct nvmf_vfio_user_poll_group * 570 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 571 { 572 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 573 struct nvmf_vfio_user_poll_group, 574 group); 575 } 576 577 static inline struct spdk_thread * 578 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 579 { 580 return vu_pg->group.group->thread; 581 } 582 583 static dma_sg_t * 584 index_to_sg_t(void *arr, size_t i) 585 { 586 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 587 } 588 589 static inline size_t 590 vfio_user_migr_data_len(void) 591 { 592 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 593 } 594 595 static int vfio_user_ctrlr_intr(void *ctx); 596 597 /* 598 * Wrap vfio_user_ctrlr_intr() such that it can be used with 599 * spdk_thread_send_msg(). 600 * Pollers have type int (*)(void *) while message functions should have type 601 * void (*)(void *), so simply discard the returned value. 602 */ 603 static void 604 vfio_user_ctrlr_intr_wrapper(void *ctx) 605 { 606 vfio_user_ctrlr_intr(ctx); 607 } 608 609 /* 610 * Arrange for this controller to immediately wake up and process everything. 611 */ 612 static inline int 613 ctrlr_kick(struct nvmf_vfio_user_ctrlr *ctrlr) 614 { 615 assert(ctrlr != NULL); 616 assert(ctrlr->thread != NULL); 617 618 if (ctrlr->kick_requested) { 619 return 0; 620 } 621 622 ctrlr->kick_requested = true; 623 624 return spdk_thread_send_msg(ctrlr->thread, 625 vfio_user_ctrlr_intr_wrapper, 626 ctrlr); 627 } 628 629 /* 630 * Make the given DMA address and length available (locally mapped) via iov. 
631 */ 632 static void * 633 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 634 struct iovec *iov, int prot) 635 { 636 int ret; 637 638 assert(ctx != NULL); 639 assert(sg != NULL); 640 assert(iov != NULL); 641 642 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 643 if (ret < 0) { 644 return NULL; 645 } 646 647 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 648 if (ret != 0) { 649 return NULL; 650 } 651 652 assert(iov->iov_base != NULL); 653 return iov->iov_base; 654 } 655 656 static int 657 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 658 uint32_t max_iovcnt, uint32_t len, size_t mps, 659 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 660 { 661 uint64_t prp1, prp2; 662 void *vva; 663 uint32_t i; 664 uint32_t residue_len, nents; 665 uint64_t *prp_list; 666 uint32_t iovcnt; 667 668 assert(max_iovcnt > 0); 669 670 prp1 = cmd->dptr.prp.prp1; 671 prp2 = cmd->dptr.prp.prp2; 672 673 /* PRP1 may started with unaligned page address */ 674 residue_len = mps - (prp1 % mps); 675 residue_len = spdk_min(len, residue_len); 676 677 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 678 if (spdk_unlikely(vva == NULL)) { 679 SPDK_ERRLOG("GPA to VVA failed\n"); 680 return -EINVAL; 681 } 682 len -= residue_len; 683 if (len && max_iovcnt < 2) { 684 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 685 return -ERANGE; 686 } 687 iovs[0].iov_base = vva; 688 iovs[0].iov_len = residue_len; 689 690 if (len) { 691 if (spdk_unlikely(prp2 == 0)) { 692 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 693 return -EINVAL; 694 } 695 696 if (len <= mps) { 697 /* 2 PRP used */ 698 iovcnt = 2; 699 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 700 if (spdk_unlikely(vva == NULL)) { 701 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 702 prp2, len); 703 return -EINVAL; 704 } 705 iovs[1].iov_base = vva; 706 iovs[1].iov_len = len; 707 } else { 708 /* PRP list used */ 709 
nents = (len + mps - 1) / mps; 710 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 711 SPDK_ERRLOG("Too many page entries\n"); 712 return -ERANGE; 713 } 714 715 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 716 if (spdk_unlikely(vva == NULL)) { 717 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 718 prp2, nents); 719 return -EINVAL; 720 } 721 prp_list = vva; 722 i = 0; 723 while (len != 0) { 724 residue_len = spdk_min(len, mps); 725 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 726 if (spdk_unlikely(vva == NULL)) { 727 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 728 prp_list[i], residue_len); 729 return -EINVAL; 730 } 731 iovs[i + 1].iov_base = vva; 732 iovs[i + 1].iov_len = residue_len; 733 len -= residue_len; 734 i++; 735 } 736 iovcnt = i + 1; 737 } 738 } else { 739 /* 1 PRP used */ 740 iovcnt = 1; 741 } 742 743 assert(iovcnt <= max_iovcnt); 744 return iovcnt; 745 } 746 747 static int 748 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 749 struct iovec *iovs, uint32_t max_iovcnt, 750 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 751 { 752 uint32_t i; 753 void *vva; 754 755 if (spdk_unlikely(max_iovcnt < num_sgls)) { 756 return -ERANGE; 757 } 758 759 for (i = 0; i < num_sgls; i++) { 760 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 761 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 762 return -EINVAL; 763 } 764 vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 765 if (spdk_unlikely(vva == NULL)) { 766 SPDK_ERRLOG("GPA to VVA failed\n"); 767 return -EINVAL; 768 } 769 iovs[i].iov_base = vva; 770 iovs[i].iov_len = sgls[i].unkeyed.length; 771 } 772 773 return num_sgls; 774 } 775 776 static int 777 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 778 uint32_t len, size_t mps, 779 void *(*gpa_to_vva)(void *prv, 
uint64_t addr, uint64_t len, int prot)) 780 { 781 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 782 uint32_t num_sgls, seg_len; 783 void *vva; 784 int ret; 785 uint32_t total_iovcnt = 0; 786 787 /* SGL cases */ 788 sgl = &cmd->dptr.sgl1; 789 790 /* only one SGL segment */ 791 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 792 assert(max_iovcnt > 0); 793 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 794 if (spdk_unlikely(vva == NULL)) { 795 SPDK_ERRLOG("GPA to VVA failed\n"); 796 return -EINVAL; 797 } 798 iovs[0].iov_base = vva; 799 iovs[0].iov_len = sgl->unkeyed.length; 800 assert(sgl->unkeyed.length == len); 801 802 return 1; 803 } 804 805 for (;;) { 806 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 807 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 808 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 809 return -EINVAL; 810 } 811 812 seg_len = sgl->unkeyed.length; 813 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 814 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 815 return -EINVAL; 816 } 817 818 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 819 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 820 if (spdk_unlikely(vva == NULL)) { 821 SPDK_ERRLOG("GPA to VVA failed\n"); 822 return -EINVAL; 823 } 824 825 /* sgl point to the first segment */ 826 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 827 last_sgl = &sgl[num_sgls - 1]; 828 829 /* we are done */ 830 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 831 /* map whole sgl list */ 832 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 833 max_iovcnt - total_iovcnt, gpa_to_vva); 834 if (spdk_unlikely(ret < 0)) { 835 return ret; 836 } 837 total_iovcnt += ret; 838 839 return total_iovcnt; 840 } 841 842 if (num_sgls > 1) { 843 /* map whole sgl exclude last_sgl */ 844 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, 
&iovs[total_iovcnt], 845 max_iovcnt - total_iovcnt, gpa_to_vva); 846 if (spdk_unlikely(ret < 0)) { 847 return ret; 848 } 849 total_iovcnt += ret; 850 } 851 852 /* move to next level's segments */ 853 sgl = last_sgl; 854 } 855 856 return 0; 857 } 858 859 static int 860 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 861 uint32_t len, size_t mps, 862 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 863 { 864 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 865 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 866 } 867 868 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 869 } 870 871 static char * 872 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 873 { 874 return endpoint->trid.traddr; 875 } 876 877 static char * 878 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 879 { 880 if (!ctrlr || !ctrlr->endpoint) { 881 return "Null Ctrlr"; 882 } 883 884 return endpoint_id(ctrlr->endpoint); 885 } 886 887 /* 888 * For each queue, update the location of its doorbell to the correct location: 889 * either our own BAR0, or the guest's configured shadow doorbell area. 890 * 891 * The Admin queue (qid: 0) does not ever use shadow doorbells. 892 */ 893 static void 894 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 895 { 896 volatile uint32_t *doorbells = shadow ? 
ctrlr->sdbl->shadow_doorbells : 897 ctrlr->bar0_doorbells; 898 899 assert(doorbells != NULL); 900 901 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 902 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 903 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 904 905 if (sq != NULL) { 906 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 907 } 908 909 if (cq != NULL) { 910 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 911 } 912 } 913 } 914 915 static void 916 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 917 { 918 assert(vfu_ctx != NULL); 919 assert(sdbl != NULL); 920 921 /* 922 * An allocation error would result in only one of the two being 923 * non-NULL. If that is the case, no memory should have been mapped. 924 */ 925 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 926 return; 927 } 928 929 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 930 struct iovec *iov; 931 dma_sg_t *sg; 932 933 if (!sdbl->iovs[i].iov_len) { 934 continue; 935 } 936 937 sg = index_to_sg_t(sdbl->sgs, i); 938 iov = sdbl->iovs + i; 939 940 vfu_sgl_put(vfu_ctx, sg, iov, 1); 941 } 942 } 943 944 static void 945 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 946 { 947 if (sdbl == NULL) { 948 return; 949 } 950 951 unmap_sdbl(vfu_ctx, sdbl); 952 953 /* 954 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 955 * not allocated, so don't free() them. 
956 */ 957 free(sdbl->sgs); 958 free(sdbl->iovs); 959 free(sdbl); 960 } 961 962 static struct nvmf_vfio_user_shadow_doorbells * 963 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 964 { 965 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 966 dma_sg_t *sg2 = NULL; 967 void *p; 968 969 assert(vfu_ctx != NULL); 970 971 sdbl = calloc(1, sizeof(*sdbl)); 972 if (sdbl == NULL) { 973 goto err; 974 } 975 976 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 977 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 978 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 979 goto err; 980 } 981 982 /* Map shadow doorbell buffer (PRP1). */ 983 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, 984 PROT_READ | PROT_WRITE); 985 986 if (p == NULL) { 987 goto err; 988 } 989 990 /* 991 * Map eventidx buffer (PRP2). 992 * Should only be written to by the controller. 993 */ 994 995 sg2 = index_to_sg_t(sdbl->sgs, 1); 996 997 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, 998 PROT_READ | PROT_WRITE); 999 1000 if (p == NULL) { 1001 goto err; 1002 } 1003 1004 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1005 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1006 1007 return sdbl; 1008 1009 err: 1010 free_sdbl(vfu_ctx, sdbl); 1011 return NULL; 1012 } 1013 1014 /* 1015 * Copy doorbells from one buffer to the other, during switches betweeen BAR0 1016 * doorbells and shadow doorbells. 1017 */ 1018 static void 1019 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1020 const volatile uint32_t *from, volatile uint32_t *to) 1021 { 1022 assert(ctrlr != NULL); 1023 assert(from != NULL); 1024 assert(to != NULL); 1025 1026 SPDK_DEBUGLOG(vfio_user_db, 1027 "%s: migrating shadow doorbells from %p to %p\n", 1028 ctrlr_id(ctrlr), from, to); 1029 1030 /* Can't use memcpy because it doesn't respect volatile semantics. 
*/ 1031 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1032 if (ctrlr->sqs[i] != NULL) { 1033 to[queue_index(i, false)] = from[queue_index(i, false)]; 1034 } 1035 1036 if (ctrlr->cqs[i] != NULL) { 1037 to[queue_index(i, true)] = from[queue_index(i, true)]; 1038 } 1039 } 1040 } 1041 1042 static void 1043 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1044 { 1045 const struct spdk_nvmf_registers *regs; 1046 1047 assert(vu_ctrlr != NULL); 1048 assert(vu_ctrlr->ctrlr != NULL); 1049 1050 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1051 if (regs->csts.bits.cfs == 0) { 1052 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1053 } 1054 1055 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1056 } 1057 1058 static inline bool 1059 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1060 { 1061 assert(vu_ctrlr != NULL); 1062 assert(vu_ctrlr->endpoint != NULL); 1063 1064 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1065 1066 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1067 } 1068 1069 static void 1070 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1071 { 1072 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1073 1074 spdk_interrupt_unregister(&endpoint->accept_intr); 1075 spdk_poller_unregister(&endpoint->accept_poller); 1076 1077 if (endpoint->bar0_doorbells) { 1078 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1079 } 1080 1081 if (endpoint->devmem_fd > 0) { 1082 close(endpoint->devmem_fd); 1083 } 1084 1085 if (endpoint->migr_data) { 1086 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1087 } 1088 1089 if (endpoint->migr_fd > 0) { 1090 close(endpoint->migr_fd); 1091 } 1092 1093 if (endpoint->vfu_ctx) { 1094 vfu_destroy_ctx(endpoint->vfu_ctx); 1095 } 1096 1097 pthread_mutex_destroy(&endpoint->lock); 1098 free(endpoint); 1099 } 1100 1101 /* called when process exits */ 1102 static int 1103 
nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1104 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1105 { 1106 struct nvmf_vfio_user_transport *vu_transport; 1107 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1108 1109 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1110 1111 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1112 transport); 1113 1114 pthread_mutex_destroy(&vu_transport->lock); 1115 pthread_mutex_destroy(&vu_transport->pg_lock); 1116 1117 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1118 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1119 nvmf_vfio_user_destroy_endpoint(endpoint); 1120 } 1121 1122 free(vu_transport); 1123 1124 if (cb_fn) { 1125 cb_fn(cb_arg); 1126 } 1127 1128 return 0; 1129 } 1130 1131 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1132 { 1133 "disable_mappable_bar0", 1134 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1135 spdk_json_decode_bool, true 1136 }, 1137 { 1138 "disable_adaptive_irq", 1139 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1140 spdk_json_decode_bool, true 1141 }, 1142 { 1143 "disable_shadow_doorbells", 1144 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1145 spdk_json_decode_bool, true 1146 }, 1147 { 1148 "disable_compare", 1149 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1150 spdk_json_decode_bool, true 1151 }, 1152 }; 1153 1154 static struct spdk_nvmf_transport * 1155 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1156 { 1157 struct nvmf_vfio_user_transport *vu_transport; 1158 int err; 1159 1160 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1161 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1162 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1163 return NULL; 
1164 } 1165 1166 vu_transport = calloc(1, sizeof(*vu_transport)); 1167 if (vu_transport == NULL) { 1168 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1169 return NULL; 1170 } 1171 1172 err = pthread_mutex_init(&vu_transport->lock, NULL); 1173 if (err != 0) { 1174 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1175 goto err; 1176 } 1177 TAILQ_INIT(&vu_transport->endpoints); 1178 1179 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1180 if (err != 0) { 1181 pthread_mutex_destroy(&vu_transport->lock); 1182 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1183 goto err; 1184 } 1185 TAILQ_INIT(&vu_transport->poll_groups); 1186 1187 if (opts->transport_specific != NULL && 1188 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1189 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1190 vu_transport)) { 1191 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1192 goto cleanup; 1193 } 1194 1195 /* 1196 * To support interrupt mode, the transport must be configured with 1197 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1198 * when a client writes new doorbell values to BAR0, via the 1199 * libvfio-user socket fd. 1200 */ 1201 vu_transport->intr_mode_supported = 1202 vu_transport->transport_opts.disable_mappable_bar0; 1203 1204 /* 1205 * If BAR0 is mappable, it doesn't make sense to support shadow 1206 * doorbells, so explicitly turn it off. 1207 */ 1208 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1209 vu_transport->transport_opts.disable_shadow_doorbells = true; 1210 } 1211 1212 /* 1213 * If we are in interrupt mode, we cannot support adaptive IRQs, as 1214 * there is no guarantee the SQ poller will run subsequently to send 1215 * pending IRQs. 
1216 */ 1217 if (spdk_interrupt_mode_is_enabled()) { 1218 vu_transport->transport_opts.disable_adaptive_irq = true; 1219 } 1220 1221 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1222 vu_transport->transport_opts.disable_mappable_bar0); 1223 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1224 vu_transport->transport_opts.disable_adaptive_irq); 1225 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1226 vu_transport->transport_opts.disable_shadow_doorbells); 1227 1228 return &vu_transport->transport; 1229 1230 cleanup: 1231 pthread_mutex_destroy(&vu_transport->lock); 1232 pthread_mutex_destroy(&vu_transport->pg_lock); 1233 err: 1234 free(vu_transport); 1235 return NULL; 1236 } 1237 1238 static uint32_t 1239 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1240 { 1241 assert(vu_ctrlr != NULL); 1242 assert(vu_ctrlr->ctrlr != NULL); 1243 1244 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1245 } 1246 1247 static uint32_t 1248 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1249 { 1250 assert(vu_ctrlr != NULL); 1251 assert(vu_ctrlr->ctrlr != NULL); 1252 1253 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1254 } 1255 1256 static uintptr_t 1257 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1258 { 1259 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1260 return 1ul << memory_page_shift; 1261 } 1262 1263 static uintptr_t 1264 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1265 { 1266 return ~(memory_page_size(ctrlr) - 1); 1267 } 1268 1269 static int 1270 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1271 uint32_t q_size, bool is_cq, bool unmap) 1272 { 1273 uint64_t len; 1274 void *ret; 1275 1276 assert(q_size); 1277 assert(q_addr(mapping) == NULL); 1278 1279 if (is_cq) { 1280 len = q_size * sizeof(struct spdk_nvme_cpl); 1281 } else { 1282 len = q_size * sizeof(struct 
spdk_nvme_cmd); 1283 } 1284 1285 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1286 mapping->sg, &mapping->iov, 1287 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1288 if (ret == NULL) { 1289 return -EFAULT; 1290 } 1291 1292 if (unmap) { 1293 memset(q_addr(mapping), 0, len); 1294 } 1295 1296 return 0; 1297 } 1298 1299 static inline void 1300 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1301 { 1302 if (q_addr(mapping) != NULL) { 1303 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1304 &mapping->iov, 1); 1305 mapping->iov.iov_base = NULL; 1306 } 1307 } 1308 1309 static int 1310 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1311 { 1312 struct nvmf_vfio_user_sq *sq; 1313 const struct spdk_nvmf_registers *regs; 1314 int ret; 1315 1316 assert(ctrlr != NULL); 1317 1318 sq = ctrlr->sqs[0]; 1319 1320 assert(sq != NULL); 1321 assert(q_addr(&sq->mapping) == NULL); 1322 /* XXX ctrlr->asq == 0 is a valid memory address */ 1323 1324 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1325 sq->qid = 0; 1326 sq->size = regs->aqa.bits.asqs + 1; 1327 sq->mapping.prp1 = regs->asq; 1328 *sq_headp(sq) = 0; 1329 sq->cqid = 0; 1330 1331 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1332 if (ret) { 1333 return ret; 1334 } 1335 1336 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1337 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1338 1339 *sq_dbl_tailp(sq) = 0; 1340 1341 return 0; 1342 } 1343 1344 /* 1345 * Updates eventidx to set an SQ into interrupt or polling mode. 1346 * 1347 * Returns false if the current SQ tail does not match the SQ head, as 1348 * this means that the host has submitted more items to the queue while we were 1349 * not looking - or during the event index update. In that case, we must retry, 1350 * or otherwise make sure we are going to wake up again. 
 */
static bool
set_sq_eventidx(struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;
	volatile uint32_t *sq_tail_eidx;
	uint32_t old_tail, new_tail;

	assert(sq != NULL);
	assert(sq->ctrlr != NULL);
	assert(sq->ctrlr->sdbl != NULL);
	assert(sq->need_rearm);

	ctrlr = sq->ctrlr;

	SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n",
		      ctrlr_id(ctrlr), sq->qid);

	/* Event index slot that corresponds to this SQ's tail doorbell. */
	sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false);

	assert(ctrlr->endpoint != NULL);

	if (!ctrlr->endpoint->interrupt_mode) {
		/* No synchronisation necessary. */
		*sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL;
		return true;
	}

	/* Publish the tail we have seen so far as the event index. */
	old_tail = *sq_dbl_tailp(sq);
	*sq_tail_eidx = old_tail;

	/*
	 * Ensure that the event index is updated before re-reading the tail
	 * doorbell. If it's not, then the host might race us and update the
	 * tail after the second read but before the event index is written, so
	 * it won't write to BAR0 and we'll miss the update.
	 *
	 * The driver should provide similar ordering with an mb().
	 */
	spdk_mb();

	/*
	 * Check if the host has updated the tail doorbell after we've read it
	 * for the first time, but before the event index was written. If that's
	 * the case, then we've lost the race and we need to update the event
	 * index again (after polling the queue, since the host won't write to
	 * BAR0).
	 */
	new_tail = *sq_dbl_tailp(sq);

	/*
	 * We might poll the queue straight after this function returns if the
	 * tail has been updated, so we need to ensure that any changes to the
	 * queue will be visible to us if the doorbell has been updated.
	 *
	 * The driver should provide similar ordering with a wmb() to ensure
	 * that the queue is written before it updates the tail doorbell.
	 */
	spdk_rmb();

	SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, "
		      "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail,
		      new_tail, *sq_headp(sq));

	/* Queue is fully drained: the event index is armed, nothing to retry. */
	if (new_tail == *sq_headp(sq)) {
		sq->need_rearm = false;
		return true;
	}

	/*
	 * We've lost the race: the tail was updated since we last polled,
	 * including if it happened within this routine.
	 *
	 * The caller should retry after polling (think of this as a cmpxchg
	 * loop); if we go to sleep while the SQ is not empty, then we won't
	 * process the remaining events.
	 */
	return false;
}

static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq);

/*
 * Arrange for an SQ to interrupt us if written. Returns non-zero if we
 * processed some SQ entries.
 *
 * Makes up to NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS attempts, polling the
 * SQ after each lost race; if all attempts lose, the controller is kicked so
 * that we wake up again. A poll error is counted as one processed entry.
 */
static int
vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr,
		   struct nvmf_vfio_user_sq *sq)
{
	int count = 0;
	size_t i;

	assert(sq->need_rearm);

	for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) {
		int ret;

		if (set_sq_eventidx(sq)) {
			/* We won the race and set eventidx; done. */
			return count;
		}

		ret = nvmf_vfio_user_sq_poll(sq);

		count += (ret < 0) ? 1 : ret;

		/*
		 * set_sq_eventidx() hit the race, so we expected
		 * to process at least one command from this queue.
		 * If there were no new commands waiting for us, then
		 * we must have hit an unexpected race condition.
		 */
		if (ret == 0) {
			SPDK_ERRLOG("%s: unexpected race condition detected "
				    "while updating the shadow doorbell buffer\n",
				    ctrlr_id(ctrlr));

			fail_ctrlr(ctrlr);
			return count;
		}
	}

	SPDK_DEBUGLOG(vfio_user_db,
		      "%s: set_sq_eventidx() lost the race %zu times\n",
		      ctrlr_id(ctrlr), i);

	/*
	 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as
	 * we raced with the producer too many times; force ourselves to wake up
	 * instead. We'll process all queues at that point.
	 */
	ctrlr_kick(ctrlr);

	return count;
}

/*
 * We're in interrupt mode, and potentially about to go to sleep. We need to
 * make sure any further I/O submissions are guaranteed to wake us up: for
 * shadow doorbells that means we may need to go through set_sq_eventidx() for
 * every SQ that needs re-arming.
 *
 * Returns non-zero if we processed something.
 */
static int
vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group)
{
	struct nvmf_vfio_user_sq *sq;
	int count = 0;

	TAILQ_FOREACH(sq, &vu_group->sqs, link) {
		/* Skip SQs that are not active or have no size set. */
		if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
			continue;
		}

		if (sq->need_rearm) {
			count += vfio_user_sq_rearm(sq->ctrlr, sq);
		}
	}

	return count;
}

/*
 * Sets up the admin completion queue (cqs[0]) from the AQA and ACQ registers
 * and maps it. Returns 0 on success or the map_q() error.
 */
static int
acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr)
{
	struct nvmf_vfio_user_cq *cq;
	const struct spdk_nvmf_registers *regs;
	int ret;

	assert(ctrlr != NULL);

	cq = ctrlr->cqs[0];

	assert(cq != NULL);

	assert(q_addr(&cq->mapping) == NULL);

	regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
	assert(regs != NULL);
	cq->qid = 0;
	cq->size = regs->aqa.bits.acqs + 1;
	cq->mapping.prp1 = regs->acq;
	*cq_tailp(cq) = 0;
	cq->ien = true;
	cq->phase = true;

	ret = map_q(ctrlr, &cq->mapping, cq->size, true, true);
	if (ret) {
		return ret;
	}

	/* The Admin queue (qid: 0) does not ever use shadow doorbells. */
	cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true);

	*cq_dbl_headp(cq) = 0;

	return 0;
}

/*
 * Mapping callback handed to nvme_map_cmd() by vfio_user_map_cmd(): maps one
 * guest region into the next free iovec/sg slot of the request passed as prv,
 * consuming that slot only if the mapping succeeded. Returns the map_one()
 * result (NULL on failure).
 */
static void *
_map_one(void *prv, uint64_t addr, uint64_t len, int prot)
{
	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
	struct spdk_nvmf_qpair *qpair;
	struct nvmf_vfio_user_req *vu_req;
	struct nvmf_vfio_user_sq *sq;
	void *ret;

	assert(req != NULL);
	qpair = req->qpair;
	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);

	/* NOTE(review): the slot bound is only enforced by this assertion. */
	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
	ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len,
		      index_to_sg_t(vu_req->sg, vu_req->iovcnt),
		      &vu_req->iov[vu_req->iovcnt], prot);
	if (spdk_likely(ret != NULL)) {
		vu_req->iovcnt++;
	}
	return ret;
}

/*
 * Maps the data buffers described by a request's NVMe command into iov, using
 * at most NVMF_REQ_MAX_BUFFERS entries and a fixed 4096-byte page size.
 * Returns the nvme_map_cmd() result. The ctrlr argument is not used here.
 */
static int
vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
		  struct iovec *iov, uint32_t length)
{
	/* Map the PRP list from guest physical memory to
	 * virtual memory addresses.
	 */
	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
			    length, 4096, _map_one);
}

static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
			  struct nvmf_vfio_user_sq *sq);

/*
 * Posts a CQE in the completion queue.
1592 * 1593 * @ctrlr: the vfio-user controller 1594 * @cq: the completion queue 1595 * @cdw0: cdw0 as reported by NVMf 1596 * @sqid: submission queue ID 1597 * @cid: command identifier in NVMe command 1598 * @sc: the NVMe CQE status code 1599 * @sct: the NVMe CQE status code type 1600 */ 1601 static int 1602 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1603 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1604 { 1605 struct spdk_nvme_status cpl_status = { 0 }; 1606 struct spdk_nvme_cpl *cpl; 1607 int err; 1608 1609 assert(ctrlr != NULL); 1610 1611 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1612 return 0; 1613 } 1614 1615 if (cq_is_full(cq)) { 1616 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1617 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1618 *cq_dbl_headp(cq)); 1619 return -1; 1620 } 1621 1622 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1623 1624 assert(ctrlr->sqs[sqid] != NULL); 1625 SPDK_DEBUGLOG(nvmf_vfio, 1626 "%s: request complete sqid:%d cid=%d status=%#x " 1627 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1628 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1629 1630 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1631 cpl->sqid = sqid; 1632 cpl->cid = cid; 1633 cpl->cdw0 = cdw0; 1634 1635 /* 1636 * This is a bitfield: instead of setting the individual bits we need 1637 * directly in cpl->status, which would cause a read-modify-write cycle, 1638 * we'll avoid reading from the CPL altogether by filling in a local 1639 * cpl_status variable, then writing the whole thing. 1640 */ 1641 cpl_status.sct = sct; 1642 cpl_status.sc = sc; 1643 cpl_status.p = cq->phase; 1644 cpl->status = cpl_status; 1645 1646 /* Ensure the Completion Queue Entry is visible. 
*/ 1647 spdk_wmb(); 1648 cq_tail_advance(cq); 1649 1650 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1651 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1652 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1653 if (err != 0) { 1654 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1655 ctrlr_id(ctrlr)); 1656 return err; 1657 } 1658 } 1659 1660 return 0; 1661 } 1662 1663 static void 1664 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1665 { 1666 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1667 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1668 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1669 free(vu_req); 1670 } 1671 } 1672 1673 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1674 * and the controller is being shut down or reset, then the CQ is 1675 * also deleted. 1676 */ 1677 static void 1678 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1679 { 1680 struct nvmf_vfio_user_cq *cq; 1681 uint16_t cqid; 1682 1683 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1684 sq->qid, sq); 1685 1686 /* Free SQ resources */ 1687 unmap_q(vu_ctrlr, &sq->mapping); 1688 1689 free_sq_reqs(sq); 1690 1691 sq->size = 0; 1692 1693 sq->sq_state = VFIO_USER_SQ_DELETED; 1694 1695 /* Controller RESET and SHUTDOWN are special cases, 1696 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1697 * will disconnect IO queue pairs. 
1698 */ 1699 if (vu_ctrlr->reset_shn) { 1700 cqid = sq->cqid; 1701 cq = vu_ctrlr->cqs[cqid]; 1702 1703 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1704 cq->qid, cq); 1705 1706 if (cq->cq_ref) { 1707 cq->cq_ref--; 1708 } 1709 if (cq->cq_ref == 0) { 1710 unmap_q(vu_ctrlr, &cq->mapping); 1711 cq->size = 0; 1712 cq->cq_state = VFIO_USER_CQ_DELETED; 1713 cq->group = NULL; 1714 } 1715 } 1716 } 1717 1718 static void 1719 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1720 { 1721 struct nvmf_vfio_user_sq *sq; 1722 struct nvmf_vfio_user_cq *cq; 1723 1724 if (ctrlr == NULL) { 1725 return; 1726 } 1727 1728 sq = ctrlr->sqs[qid]; 1729 if (sq) { 1730 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid); 1731 unmap_q(ctrlr, &sq->mapping); 1732 1733 free_sq_reqs(sq); 1734 1735 free(sq->mapping.sg); 1736 free(sq); 1737 ctrlr->sqs[qid] = NULL; 1738 } 1739 1740 cq = ctrlr->cqs[qid]; 1741 if (cq) { 1742 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1743 unmap_q(ctrlr, &cq->mapping); 1744 free(cq->mapping.sg); 1745 free(cq); 1746 ctrlr->cqs[qid] = NULL; 1747 } 1748 } 1749 1750 static int 1751 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1752 const uint16_t id) 1753 { 1754 struct nvmf_vfio_user_sq *sq; 1755 1756 assert(ctrlr != NULL); 1757 assert(transport != NULL); 1758 assert(ctrlr->sqs[id] == NULL); 1759 1760 sq = calloc(1, sizeof(*sq)); 1761 if (sq == NULL) { 1762 return -ENOMEM; 1763 } 1764 sq->mapping.sg = calloc(1, dma_sg_size()); 1765 if (sq->mapping.sg == NULL) { 1766 free(sq); 1767 return -ENOMEM; 1768 } 1769 1770 sq->qid = id; 1771 sq->qpair.qid = id; 1772 sq->qpair.transport = transport; 1773 sq->ctrlr = ctrlr; 1774 ctrlr->sqs[id] = sq; 1775 1776 TAILQ_INIT(&sq->free_reqs); 1777 1778 return 0; 1779 } 1780 1781 static int 1782 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1783 { 1784 struct nvmf_vfio_user_cq *cq; 1785 1786 
assert(vu_ctrlr != NULL); 1787 assert(vu_ctrlr->cqs[id] == NULL); 1788 1789 cq = calloc(1, sizeof(*cq)); 1790 if (cq == NULL) { 1791 return -ENOMEM; 1792 } 1793 cq->mapping.sg = calloc(1, dma_sg_size()); 1794 if (cq->mapping.sg == NULL) { 1795 free(cq); 1796 return -ENOMEM; 1797 } 1798 1799 cq->qid = id; 1800 vu_ctrlr->cqs[id] = cq; 1801 1802 return 0; 1803 } 1804 1805 static int 1806 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1807 { 1808 struct nvmf_vfio_user_req *vu_req, *tmp; 1809 size_t req_size; 1810 uint32_t i; 1811 1812 req_size = sizeof(struct nvmf_vfio_user_req) + 1813 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1814 1815 for (i = 0; i < sq->size; i++) { 1816 struct spdk_nvmf_request *req; 1817 1818 vu_req = calloc(1, req_size); 1819 if (vu_req == NULL) { 1820 goto err; 1821 } 1822 1823 req = &vu_req->req; 1824 req->qpair = &sq->qpair; 1825 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1826 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1827 req->stripped_data = NULL; 1828 1829 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1830 } 1831 1832 return 0; 1833 1834 err: 1835 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1836 free(vu_req); 1837 } 1838 return -ENOMEM; 1839 } 1840 1841 static volatile uint32_t * 1842 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 1843 { 1844 return ctrlr->sdbl != NULL ? 
1845 ctrlr->sdbl->shadow_doorbells : 1846 ctrlr->bar0_doorbells; 1847 } 1848 1849 static uint16_t 1850 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 1851 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1852 { 1853 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1854 struct nvmf_vfio_user_sq *sq; 1855 uint32_t qsize; 1856 uint16_t cqid; 1857 uint16_t qid; 1858 int err; 1859 1860 qid = cmd->cdw10_bits.create_io_q.qid; 1861 cqid = cmd->cdw11_bits.create_io_sq.cqid; 1862 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1863 1864 if (ctrlr->sqs[qid] == NULL) { 1865 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 1866 if (err != 0) { 1867 *sct = SPDK_NVME_SCT_GENERIC; 1868 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1869 } 1870 } 1871 1872 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1873 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 1874 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1875 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1876 } 1877 1878 /* CQ must be created before SQ. 
*/ 1879 if (!io_q_exists(ctrlr, cqid, true)) { 1880 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 1881 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1882 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 1883 } 1884 1885 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 1886 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 1887 *sct = SPDK_NVME_SCT_GENERIC; 1888 return SPDK_NVME_SC_INVALID_FIELD; 1889 } 1890 1891 sq = ctrlr->sqs[qid]; 1892 sq->size = qsize; 1893 1894 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 1895 qid, cqid); 1896 1897 sq->mapping.prp1 = cmd->dptr.prp.prp1; 1898 1899 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1900 if (err) { 1901 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1902 *sct = SPDK_NVME_SCT_GENERIC; 1903 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1904 } 1905 1906 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 1907 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1908 q_addr(&sq->mapping)); 1909 1910 err = alloc_sq_reqs(ctrlr, sq); 1911 if (err < 0) { 1912 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 1913 *sct = SPDK_NVME_SCT_GENERIC; 1914 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1915 } 1916 1917 sq->cqid = cqid; 1918 ctrlr->cqs[sq->cqid]->cq_ref++; 1919 sq->sq_state = VFIO_USER_SQ_CREATED; 1920 *sq_headp(sq) = 0; 1921 1922 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 1923 1924 /* 1925 * We should always reset the doorbells. 1926 * 1927 * The Specification prohibits the controller from writing to the shadow 1928 * doorbell buffer, however older versions of the Linux NVMe driver 1929 * don't reset the shadow doorbell buffer after a Queue-Level or 1930 * Controller-Level reset, which means that we're left with garbage 1931 * doorbell values. 
1932 */ 1933 *sq_dbl_tailp(sq) = 0; 1934 1935 if (ctrlr->sdbl != NULL) { 1936 sq->need_rearm = true; 1937 1938 if (!set_sq_eventidx(sq)) { 1939 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 1940 "sqid:%hu was initialized\n", 1941 ctrlr_id(ctrlr), qid); 1942 fail_ctrlr(ctrlr); 1943 *sct = SPDK_NVME_SCT_GENERIC; 1944 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1945 } 1946 } 1947 1948 /* 1949 * Create our new I/O qpair. This asynchronously invokes, on a suitable 1950 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 1951 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 1952 * connect command. This command is then eventually completed via 1953 * handle_queue_connect_rsp(). 1954 */ 1955 sq->create_io_sq_cmd = *cmd; 1956 sq->post_create_io_sq_completion = true; 1957 1958 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 1959 &sq->qpair); 1960 1961 *sct = SPDK_NVME_SCT_GENERIC; 1962 return SPDK_NVME_SC_SUCCESS; 1963 } 1964 1965 static uint16_t 1966 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 1967 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1968 { 1969 struct nvmf_vfio_user_cq *cq; 1970 uint32_t qsize; 1971 uint16_t qid; 1972 int err; 1973 1974 qid = cmd->cdw10_bits.create_io_q.qid; 1975 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1976 1977 if (ctrlr->cqs[qid] == NULL) { 1978 err = init_cq(ctrlr, qid); 1979 if (err != 0) { 1980 *sct = SPDK_NVME_SCT_GENERIC; 1981 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1982 } 1983 } 1984 1985 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 1986 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 1987 *sct = SPDK_NVME_SCT_GENERIC; 1988 return SPDK_NVME_SC_INVALID_FIELD; 1989 } 1990 1991 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 1992 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 1993 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1994 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 1995 } 1996 1997 cq = ctrlr->cqs[qid]; 1998 cq->size = qsize; 1999 
2000 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2001 2002 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2003 2004 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2005 if (err) { 2006 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2007 *sct = SPDK_NVME_SCT_GENERIC; 2008 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2009 } 2010 2011 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2012 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2013 q_addr(&cq->mapping)); 2014 2015 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2016 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2017 cq->phase = true; 2018 cq->cq_state = VFIO_USER_CQ_CREATED; 2019 2020 *cq_tailp(cq) = 0; 2021 2022 /* 2023 * We should always reset the doorbells. 2024 * 2025 * The Specification prohibits the controller from writing to the shadow 2026 * doorbell buffer, however older versions of the Linux NVMe driver 2027 * don't reset the shadow doorbell buffer after a Queue-Level or 2028 * Controller-Level reset, which means that we're left with garbage 2029 * doorbell values. 2030 */ 2031 *cq_dbl_headp(cq) = 0; 2032 2033 *sct = SPDK_NVME_SCT_GENERIC; 2034 return SPDK_NVME_SC_SUCCESS; 2035 } 2036 2037 /* 2038 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2039 * on error. 
2040 */ 2041 static int 2042 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2043 struct spdk_nvme_cmd *cmd, const bool is_cq) 2044 { 2045 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2046 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2047 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2048 uint32_t qsize; 2049 uint16_t qid; 2050 2051 assert(ctrlr != NULL); 2052 assert(cmd != NULL); 2053 2054 qid = cmd->cdw10_bits.create_io_q.qid; 2055 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2056 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2057 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2058 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2059 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2060 goto out; 2061 } 2062 2063 if (io_q_exists(ctrlr, qid, is_cq)) { 2064 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2065 is_cq ? 'c' : 's', qid); 2066 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2067 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2068 goto out; 2069 } 2070 2071 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2072 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2073 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2074 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2075 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2076 goto out; 2077 } 2078 2079 if (is_cq) { 2080 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2081 } else { 2082 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2083 2084 if (sct == SPDK_NVME_SCT_GENERIC && 2085 sc == SPDK_NVME_SC_SUCCESS) { 2086 /* Completion posted asynchronously. */ 2087 return 0; 2088 } 2089 } 2090 2091 out: 2092 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2093 } 2094 2095 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2096 * queue pair, so save the command in a context. 
2097 */ 2098 struct vfio_user_delete_sq_ctx { 2099 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2100 struct spdk_nvme_cmd delete_io_sq_cmd; 2101 }; 2102 2103 static void 2104 vfio_user_qpair_delete_cb(void *cb_arg) 2105 { 2106 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2107 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2108 2109 post_completion(vu_ctrlr, vu_ctrlr->cqs[0], 0, 0, ctx->delete_io_sq_cmd.cid, 2110 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2111 free(ctx); 2112 } 2113 2114 /* 2115 * Deletes a completion or submission I/O queue. 2116 */ 2117 static int 2118 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2119 struct spdk_nvme_cmd *cmd, const bool is_cq) 2120 { 2121 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2122 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2123 struct nvmf_vfio_user_sq *sq; 2124 struct nvmf_vfio_user_cq *cq; 2125 struct vfio_user_delete_sq_ctx *ctx; 2126 2127 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2128 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2129 cmd->cdw10_bits.delete_io_q.qid); 2130 2131 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2132 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2133 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2134 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2135 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2136 goto out; 2137 } 2138 2139 if (is_cq) { 2140 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2141 if (cq->cq_ref) { 2142 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2143 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2144 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2145 goto out; 2146 } 2147 2148 unmap_q(ctrlr, &cq->mapping); 2149 cq->size = 0; 2150 cq->cq_state = VFIO_USER_CQ_DELETED; 2151 cq->group = NULL; 2152 } else { 2153 ctx = calloc(1, sizeof(*ctx)); 2154 if (!ctx) { 2155 sct = SPDK_NVME_SCT_GENERIC; 2156 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2157 goto out; 2158 } 2159 ctx->vu_ctrlr = ctrlr; 2160 ctx->delete_io_sq_cmd = *cmd; 2161 2162 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2163 sq->sq_state = VFIO_USER_SQ_DELETED; 2164 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2165 ctrlr->cqs[sq->cqid]->cq_ref--; 2166 2167 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 2168 return 0; 2169 } 2170 2171 out: 2172 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2173 } 2174 2175 /* 2176 * Configures Shadow Doorbells. 2177 */ 2178 static int 2179 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2180 { 2181 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2182 uint32_t dstrd; 2183 uintptr_t page_size, page_mask; 2184 uint64_t prp1, prp2; 2185 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2186 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2187 2188 assert(ctrlr != NULL); 2189 assert(ctrlr->endpoint != NULL); 2190 assert(cmd != NULL); 2191 2192 dstrd = doorbell_stride(ctrlr); 2193 page_size = memory_page_size(ctrlr); 2194 page_mask = memory_page_mask(ctrlr); 2195 2196 /* FIXME: we don't check doorbell stride when setting queue doorbells. 
*/ 2197 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2198 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2199 ctrlr_id(ctrlr)); 2200 2201 goto out; 2202 } 2203 2204 /* Verify guest physical addresses passed as PRPs. */ 2205 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2206 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2207 ctrlr_id(ctrlr)); 2208 2209 goto out; 2210 } 2211 2212 prp1 = cmd->dptr.prp.prp1; 2213 prp2 = cmd->dptr.prp.prp2; 2214 2215 SPDK_DEBUGLOG(nvmf_vfio, 2216 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2217 ctrlr_id(ctrlr), prp1, prp2); 2218 2219 if (prp1 == prp2 2220 || prp1 != (prp1 & page_mask) 2221 || prp2 != (prp2 & page_mask)) { 2222 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2223 ctrlr_id(ctrlr)); 2224 2225 goto out; 2226 } 2227 2228 /* Map guest physical addresses to our virtual address space. */ 2229 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2230 if (sdbl == NULL) { 2231 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2232 ctrlr_id(ctrlr)); 2233 2234 goto out; 2235 } 2236 2237 ctrlr->shadow_doorbell_buffer = prp1; 2238 ctrlr->eventidx_buffer = prp2; 2239 2240 SPDK_DEBUGLOG(nvmf_vfio, 2241 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2242 ctrlr_id(ctrlr), 2243 sdbl->iovs[0].iov_base, 2244 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2245 sdbl->iovs[1].iov_base, 2246 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2247 2248 2249 /* 2250 * Set all possible CQ head doorbells to polling mode now, such that we 2251 * don't have to worry about it later if the host creates more queues. 2252 * 2253 * We only ever want interrupts for writes to the SQ tail doorbells 2254 * (which are initialised in set_ctrlr_intr_mode() below). 
2255 */ 2256 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2257 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2258 if (ctrlr->sqs[i] != NULL) { 2259 ctrlr->sqs[i]->need_rearm = true; 2260 } 2261 } 2262 2263 /* Update controller. */ 2264 SWAP(ctrlr->sdbl, sdbl); 2265 2266 /* 2267 * Copy doorbells from either the previous shadow doorbell buffer or the 2268 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2269 * 2270 * This needs to account for older versions of the Linux NVMe driver, 2271 * which don't clear out the buffer after a controller reset. 2272 */ 2273 copy_doorbells(ctrlr, sdbl != NULL ? 2274 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2275 ctrlr->sdbl->shadow_doorbells); 2276 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2277 2278 /* Update event index buffer and poll queues if necessary. */ 2279 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 2280 2281 sc = SPDK_NVME_SC_SUCCESS; 2282 2283 out: 2284 /* 2285 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2286 * more than once (pointless, but not prohibited by the spec), or 2287 * in case of an error. 2288 * 2289 * If this is the first time Doorbell Buffer Config was processed, 2290 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2291 * free_sdbl() becomes a noop. 2292 */ 2293 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2294 2295 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2296 } 2297 2298 /* Returns 0 on success and -errno on error. */ 2299 static int 2300 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2301 { 2302 assert(ctrlr != NULL); 2303 assert(cmd != NULL); 2304 2305 if (cmd->fuse != 0) { 2306 /* Fused admin commands are not supported. 
*/ 2307 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2308 SPDK_NVME_SC_INVALID_FIELD, 2309 SPDK_NVME_SCT_GENERIC); 2310 } 2311 2312 switch (cmd->opc) { 2313 case SPDK_NVME_OPC_CREATE_IO_CQ: 2314 case SPDK_NVME_OPC_CREATE_IO_SQ: 2315 return handle_create_io_q(ctrlr, cmd, 2316 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2317 case SPDK_NVME_OPC_DELETE_IO_SQ: 2318 case SPDK_NVME_OPC_DELETE_IO_CQ: 2319 return handle_del_io_q(ctrlr, cmd, 2320 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2321 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2322 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2323 return handle_doorbell_buffer_config(ctrlr, cmd); 2324 } 2325 /* FALLTHROUGH */ 2326 default: 2327 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2328 } 2329 } 2330 2331 static int 2332 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2333 { 2334 struct nvmf_vfio_user_sq *sq = cb_arg; 2335 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2336 uint16_t sqid, cqid; 2337 2338 assert(sq != NULL); 2339 assert(vu_req != NULL); 2340 assert(vu_ctrlr != NULL); 2341 2342 if (spdk_likely(vu_req->iovcnt)) { 2343 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2344 index_to_sg_t(vu_req->sg, 0), 2345 vu_req->iov, vu_req->iovcnt); 2346 } 2347 sqid = sq->qid; 2348 cqid = sq->cqid; 2349 2350 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2351 vu_req->req.rsp->nvme_cpl.cdw0, 2352 sqid, 2353 vu_req->req.cmd->nvme_cmd.cid, 2354 vu_req->req.rsp->nvme_cpl.status.sc, 2355 vu_req->req.rsp->nvme_cpl.status.sct); 2356 } 2357 2358 static int 2359 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2360 struct spdk_nvme_cmd *cmd) 2361 { 2362 assert(sq != NULL); 2363 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 2364 return consume_admin_cmd(ctrlr, cmd); 2365 } 2366 2367 return handle_cmd_req(ctrlr, cmd, sq); 2368 } 2369 2370 /* Returns the number of commands processed, or a negative value on error. 
*/ 2371 static int 2372 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2373 struct nvmf_vfio_user_sq *sq) 2374 { 2375 struct spdk_nvme_cmd *queue; 2376 int count = 0; 2377 2378 assert(ctrlr != NULL); 2379 assert(sq != NULL); 2380 2381 if (ctrlr->sdbl != NULL) { 2382 /* 2383 * Submission queue index has moved past the event index, so it 2384 * needs to be re-armed before we go to sleep. 2385 */ 2386 sq->need_rearm = true; 2387 } 2388 2389 queue = q_addr(&sq->mapping); 2390 while (*sq_headp(sq) != new_tail) { 2391 int err; 2392 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 2393 2394 count++; 2395 2396 /* 2397 * SQHD must contain the new head pointer, so we must increase 2398 * it before we generate a completion. 2399 */ 2400 sq_head_advance(sq); 2401 2402 err = consume_cmd(ctrlr, sq, cmd); 2403 if (err != 0) { 2404 return err; 2405 } 2406 } 2407 2408 return count; 2409 } 2410 2411 static void 2412 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2413 { 2414 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2415 struct nvmf_vfio_user_ctrlr *ctrlr; 2416 struct nvmf_vfio_user_sq *sq; 2417 struct nvmf_vfio_user_cq *cq; 2418 void *map_start, *map_end; 2419 int ret; 2420 2421 /* 2422 * We're not interested in any DMA regions that aren't mappable (we don't 2423 * support clients that don't share their memory). 
2424 */ 2425 if (!info->vaddr) { 2426 return; 2427 } 2428 2429 map_start = info->mapping.iov_base; 2430 map_end = info->mapping.iov_base + info->mapping.iov_len; 2431 2432 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2433 (info->mapping.iov_len & MASK_2MB)) { 2434 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2435 info->vaddr, map_start, map_end); 2436 return; 2437 } 2438 2439 assert(endpoint != NULL); 2440 if (endpoint->ctrlr == NULL) { 2441 return; 2442 } 2443 ctrlr = endpoint->ctrlr; 2444 2445 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2446 map_start, map_end); 2447 2448 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2449 * check the protection bits before registering. 2450 */ 2451 if (info->prot == (PROT_WRITE | PROT_READ)) { 2452 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2453 if (ret) { 2454 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2455 map_start, map_end, ret); 2456 } 2457 } 2458 2459 pthread_mutex_lock(&endpoint->lock); 2460 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2461 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2462 continue; 2463 } 2464 2465 cq = ctrlr->cqs[sq->cqid]; 2466 2467 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2468 if (cq->size && q_addr(&cq->mapping) == NULL) { 2469 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2470 if (ret) { 2471 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2472 cq->qid, cq->mapping.prp1, 2473 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2474 continue; 2475 } 2476 } 2477 2478 if (sq->size) { 2479 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2480 if (ret) { 2481 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2482 sq->qid, sq->mapping.prp1, 2483 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2484 
continue; 2485 } 2486 } 2487 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2488 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2489 } 2490 pthread_mutex_unlock(&endpoint->lock); 2491 } 2492 2493 static void 2494 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2495 { 2496 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2497 struct nvmf_vfio_user_sq *sq; 2498 struct nvmf_vfio_user_cq *cq; 2499 void *map_start, *map_end; 2500 int ret = 0; 2501 2502 if (!info->vaddr) { 2503 return; 2504 } 2505 2506 map_start = info->mapping.iov_base; 2507 map_end = info->mapping.iov_base + info->mapping.iov_len; 2508 2509 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2510 (info->mapping.iov_len & MASK_2MB)) { 2511 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2512 info->vaddr, map_start, map_end); 2513 return; 2514 } 2515 2516 assert(endpoint != NULL); 2517 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2518 map_start, map_end); 2519 2520 if (endpoint->ctrlr != NULL) { 2521 struct nvmf_vfio_user_ctrlr *ctrlr; 2522 ctrlr = endpoint->ctrlr; 2523 2524 pthread_mutex_lock(&endpoint->lock); 2525 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2526 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2527 unmap_q(ctrlr, &sq->mapping); 2528 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2529 } 2530 2531 cq = ctrlr->cqs[sq->cqid]; 2532 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2533 unmap_q(ctrlr, &cq->mapping); 2534 } 2535 } 2536 2537 if (ctrlr->sdbl != NULL) { 2538 size_t i; 2539 2540 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2541 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2542 2543 if (iov_base >= map_start && iov_base < map_end) { 2544 copy_doorbells(ctrlr, 2545 ctrlr->sdbl->shadow_doorbells, 2546 ctrlr->bar0_doorbells); 2547 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2548 
free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2549 ctrlr->sdbl = NULL; 2550 break; 2551 } 2552 } 2553 } 2554 2555 pthread_mutex_unlock(&endpoint->lock); 2556 } 2557 2558 if (info->prot == (PROT_WRITE | PROT_READ)) { 2559 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2560 if (ret) { 2561 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2562 map_start, map_end, ret); 2563 } 2564 } 2565 } 2566 2567 /* Used to initiate a controller-level reset or a controller shutdown. */ 2568 static void 2569 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2570 { 2571 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2572 ctrlr_id(vu_ctrlr)); 2573 2574 /* Unmap Admin queue. */ 2575 2576 assert(vu_ctrlr->sqs[0] != NULL); 2577 assert(vu_ctrlr->cqs[0] != NULL); 2578 2579 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2580 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2581 2582 vu_ctrlr->sqs[0]->size = 0; 2583 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2584 2585 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2586 2587 vu_ctrlr->cqs[0]->size = 0; 2588 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2589 2590 /* 2591 * For PCIe controller reset or shutdown, we will drop all AER 2592 * responses. 2593 */ 2594 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2595 2596 /* Free the shadow doorbell buffer. */ 2597 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2598 vu_ctrlr->sdbl = NULL; 2599 } 2600 2601 /* Used to re-enable the controller after a controller-level reset. 
*/ 2602 static int 2603 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2604 { 2605 int err; 2606 2607 assert(vu_ctrlr != NULL); 2608 2609 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2610 ctrlr_id(vu_ctrlr)); 2611 2612 err = acq_setup(vu_ctrlr); 2613 if (err != 0) { 2614 return err; 2615 } 2616 2617 err = asq_setup(vu_ctrlr); 2618 if (err != 0) { 2619 return err; 2620 } 2621 2622 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2623 2624 return 0; 2625 } 2626 2627 static int 2628 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2629 { 2630 struct nvmf_vfio_user_sq *sq = cb_arg; 2631 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2632 int ret; 2633 2634 assert(sq != NULL); 2635 assert(req != NULL); 2636 2637 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2638 assert(sq->ctrlr != NULL); 2639 assert(req != NULL); 2640 2641 memcpy(req->req.data, 2642 &req->req.rsp->prop_get_rsp.value.u64, 2643 req->req.length); 2644 } else { 2645 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2646 assert(sq->ctrlr != NULL); 2647 vu_ctrlr = sq->ctrlr; 2648 2649 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 2650 union spdk_nvme_cc_register cc, diff; 2651 2652 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2653 diff.raw = cc.raw ^ req->cc.raw; 2654 2655 if (diff.bits.en) { 2656 if (cc.bits.en) { 2657 ret = enable_ctrlr(vu_ctrlr); 2658 if (ret) { 2659 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2660 return ret; 2661 } 2662 vu_ctrlr->reset_shn = false; 2663 } else { 2664 vu_ctrlr->reset_shn = true; 2665 } 2666 } 2667 2668 if (diff.bits.shn) { 2669 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2670 vu_ctrlr->reset_shn = true; 2671 } 2672 } 2673 2674 if (vu_ctrlr->reset_shn) { 2675 disable_ctrlr(vu_ctrlr); 2676 } 2677 } 2678 } 2679 2680 return 0; 2681 } 2682 2683 /* 2684 * Handles a write at offset 
0x1000 or more; this is the non-mapped path when a 2685 * doorbell is written via access_bar0_fn(). 2686 * 2687 * DSTRD is set to fixed value 0 for NVMf. 2688 * 2689 */ 2690 static int 2691 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2692 const size_t count, loff_t pos, const bool is_write) 2693 { 2694 assert(ctrlr != NULL); 2695 assert(buf != NULL); 2696 2697 if (!is_write) { 2698 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2699 ctrlr_id(ctrlr), pos); 2700 errno = EPERM; 2701 return -1; 2702 } 2703 2704 if (count != sizeof(uint32_t)) { 2705 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2706 ctrlr_id(ctrlr), count); 2707 errno = EINVAL; 2708 return -1; 2709 } 2710 2711 pos -= NVME_DOORBELLS_OFFSET; 2712 2713 /* pos must be dword aligned */ 2714 if ((pos & 0x3) != 0) { 2715 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2716 errno = EINVAL; 2717 return -1; 2718 } 2719 2720 /* convert byte offset to array index */ 2721 pos >>= 2; 2722 2723 if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) { 2724 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2725 errno = EINVAL; 2726 return -1; 2727 } 2728 2729 ctrlr->bar0_doorbells[pos] = *buf; 2730 spdk_wmb(); 2731 2732 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2733 ctrlr_id(ctrlr), (pos & 1) ? 
"cqid" : "sqid", 2734 pos / 2, *buf); 2735 2736 2737 return 0; 2738 } 2739 2740 static size_t 2741 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2742 char *buf, size_t count, loff_t pos, 2743 bool is_write) 2744 { 2745 struct nvmf_vfio_user_req *req; 2746 const struct spdk_nvmf_registers *regs; 2747 2748 if ((count != 4) && (count != 8)) { 2749 errno = EINVAL; 2750 return -1; 2751 } 2752 2753 /* Construct a Fabric Property Get/Set command and send it */ 2754 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2755 if (req == NULL) { 2756 errno = ENOBUFS; 2757 return -1; 2758 } 2759 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2760 req->cc.raw = regs->cc.raw; 2761 2762 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2763 req->cb_arg = vu_ctrlr->sqs[0]; 2764 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2765 req->req.cmd->prop_set_cmd.cid = 0; 2766 if (count == 4) { 2767 req->req.cmd->prop_set_cmd.attrib.size = 0; 2768 } else { 2769 req->req.cmd->prop_set_cmd.attrib.size = 1; 2770 } 2771 req->req.cmd->prop_set_cmd.ofst = pos; 2772 if (is_write) { 2773 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2774 if (req->req.cmd->prop_set_cmd.attrib.size) { 2775 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2776 } else { 2777 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2778 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2779 } 2780 } else { 2781 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2782 } 2783 req->req.length = count; 2784 req->req.data = buf; 2785 2786 spdk_nvmf_request_exec_fabrics(&req->req); 2787 2788 return count; 2789 } 2790 2791 static ssize_t 2792 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2793 bool is_write) 2794 { 2795 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2796 struct nvmf_vfio_user_ctrlr *ctrlr; 2797 int ret; 2798 2799 ctrlr = endpoint->ctrlr; 2800 if (endpoint->need_async_destroy || 
!ctrlr) { 2801 errno = EIO; 2802 return -1; 2803 } 2804 2805 if (pos >= NVME_DOORBELLS_OFFSET) { 2806 /* 2807 * The fact that the doorbells can be memory mapped doesn't mean 2808 * that the client (VFIO in QEMU) is obliged to memory map them, 2809 * it might still elect to access them via regular read/write; 2810 * we might also have had disable_mappable_bar0 set. 2811 */ 2812 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2813 pos, is_write); 2814 if (ret == 0) { 2815 return count; 2816 } 2817 return ret; 2818 } 2819 2820 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 2821 } 2822 2823 static ssize_t 2824 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 2825 bool is_write) 2826 { 2827 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2828 2829 if (is_write) { 2830 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 2831 endpoint_id(endpoint), offset, offset + count); 2832 errno = EINVAL; 2833 return -1; 2834 } 2835 2836 if (offset + count > NVME_REG_CFG_SIZE) { 2837 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 2838 endpoint_id(endpoint), offset, count, 2839 NVME_REG_CFG_SIZE); 2840 errno = ERANGE; 2841 return -1; 2842 } 2843 2844 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 2845 2846 return count; 2847 } 2848 2849 static void 2850 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 2851 { 2852 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2853 2854 if (level >= LOG_DEBUG) { 2855 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2856 } else if (level >= LOG_INFO) { 2857 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2858 } else if (level >= LOG_NOTICE) { 2859 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 2860 } else if (level >= LOG_WARNING) { 2861 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 2862 } else { 2863 SPDK_ERRLOG("%s: 
/*
 * Picks the syslog-style log level for libvfio-user.
 *
 * Returns LOG_DEBUG when the "nvmf_vfio" debug log flag is enabled; otherwise
 * the current SPDK log level converted with spdk_log_to_syslog_level(), or
 * LOG_ERR if that conversion yields a negative value.
 */
static int
vfio_user_get_log_level(void)
{
	int syslog_level;

	if (!SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) {
		syslog_level = spdk_log_to_syslog_level(spdk_log_get_level());

		return syslog_level >= 0 ? syslog_level : LOG_ERR;
	}

	return LOG_DEBUG;
}
However, because the NVMf subsytem is an asynchronous 2939 * operation, this quiesce might come _before_ the NVMf subsystem has 2940 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 2941 * need to check whether a quiesce was requested. 2942 */ 2943 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 2944 ctrlr_id(vu_ctrlr)); 2945 ctrlr_quiesce(vu_ctrlr); 2946 } 2947 2948 static void 2949 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 2950 void *cb_arg, int status) 2951 { 2952 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 2953 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2954 2955 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status); 2956 2957 if (!vu_ctrlr) { 2958 return; 2959 } 2960 2961 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 2962 } 2963 2964 static void 2965 vfio_user_quiesce_done(void *ctx) 2966 { 2967 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 2968 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 2969 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2970 int ret; 2971 2972 if (!vu_ctrlr) { 2973 free(quiesce_ctx); 2974 return; 2975 } 2976 2977 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 2978 2979 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 2980 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 2981 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 2982 vu_ctrlr->queued_quiesce = false; 2983 free(quiesce_ctx); 2984 2985 /* `vfu_device_quiesced` can change the migration state, 2986 * so we need to re-check `vu_ctrlr->state`. 
2987 */ 2988 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 2989 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 2990 return; 2991 } 2992 2993 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 2994 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 2995 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 2996 vfio_user_endpoint_resume_done, endpoint); 2997 if (ret < 0) { 2998 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 2999 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3000 } 3001 } 3002 3003 static void 3004 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3005 void *ctx, int status) 3006 { 3007 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3008 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3009 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3010 3011 if (!vu_ctrlr) { 3012 free(quiesce_ctx); 3013 return; 3014 } 3015 3016 quiesce_ctx->status = status; 3017 3018 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3019 ctrlr_id(vu_ctrlr), status); 3020 3021 spdk_thread_send_msg(vu_ctrlr->thread, 3022 vfio_user_quiesce_done, ctx); 3023 } 3024 3025 /* 3026 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3027 * we've already set ctrlr->state, so we won't process new entries, but we need 3028 * to ensure that this PG is quiesced. This only works because there's no 3029 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3030 * 3031 * Once we've walked all PGs, we need to pause any submitted I/O via 3032 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 
3033 */ 3034 static void 3035 vfio_user_quiesce_pg(void *ctx) 3036 { 3037 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3038 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3039 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3040 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3041 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3042 int ret; 3043 3044 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3045 3046 if (!vu_ctrlr) { 3047 free(quiesce_ctx); 3048 return; 3049 } 3050 3051 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3052 if (quiesce_ctx->group != NULL) { 3053 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3054 vfio_user_quiesce_pg, quiesce_ctx); 3055 return; 3056 } 3057 3058 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3059 vfio_user_pause_done, quiesce_ctx); 3060 if (ret < 0) { 3061 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3062 endpoint_id(endpoint), ret); 3063 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3064 fail_ctrlr(vu_ctrlr); 3065 free(quiesce_ctx); 3066 } 3067 } 3068 3069 static void 3070 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3071 { 3072 struct ctrlr_quiesce_ctx *quiesce_ctx; 3073 3074 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3075 3076 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3077 if (!quiesce_ctx) { 3078 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3079 assert(false); 3080 return; 3081 } 3082 3083 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3084 quiesce_ctx->status = 0; 3085 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3086 3087 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3088 vfio_user_quiesce_pg, quiesce_ctx); 3089 } 3090 3091 static int 3092 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3093 { 3094 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3095 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3096 struct 
nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3097 3098 if (!vu_ctrlr) { 3099 return 0; 3100 } 3101 3102 /* NVMf library will destruct controller when no 3103 * connected queue pairs. 3104 */ 3105 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3106 return 0; 3107 } 3108 3109 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3110 3111 /* There is no race condition here as device quiesce callback 3112 * and nvmf_prop_set_cc() are running in the same thread context. 3113 */ 3114 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3115 return 0; 3116 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3117 return 0; 3118 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3119 return 0; 3120 } 3121 3122 switch (vu_ctrlr->state) { 3123 case VFIO_USER_CTRLR_PAUSED: 3124 case VFIO_USER_CTRLR_MIGRATING: 3125 return 0; 3126 case VFIO_USER_CTRLR_RUNNING: 3127 ctrlr_quiesce(vu_ctrlr); 3128 break; 3129 case VFIO_USER_CTRLR_RESUMING: 3130 vu_ctrlr->queued_quiesce = true; 3131 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3132 vu_ctrlr->state); 3133 break; 3134 default: 3135 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3136 break; 3137 } 3138 3139 errno = EBUSY; 3140 return -1; 3141 } 3142 3143 static void 3144 vfio_user_ctrlr_dump_migr_data(const char *name, 3145 struct vfio_user_nvme_migr_state *migr_data, 3146 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3147 { 3148 struct spdk_nvme_registers *regs; 3149 struct nvme_migr_sq_state *sq; 3150 struct nvme_migr_cq_state *cq; 3151 uint32_t *doorbell_base; 3152 uint32_t i; 3153 3154 SPDK_NOTICELOG("Dump %s\n", name); 3155 3156 regs = (struct spdk_nvme_registers *)migr_data->bar0; 3157 doorbell_base = (uint32_t *)®s->doorbell[0].sq_tdbl; 3158 3159 SPDK_NOTICELOG("Registers\n"); 3160 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3161 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3162 SPDK_NOTICELOG("VS 0x%x\n", 
regs->vs.raw); 3163 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3164 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3165 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3166 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3167 3168 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3169 3170 if (sdbl != NULL) { 3171 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3172 migr_data->ctrlr_header.shadow_doorbell_buffer); 3173 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3174 migr_data->ctrlr_header.eventidx_buffer); 3175 } 3176 3177 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3178 sq = &migr_data->qps[i].sq; 3179 cq = &migr_data->qps[i].cq; 3180 3181 if (sq->size) { 3182 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3183 if (i > 0 && sdbl != NULL) { 3184 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3185 sq->sqid, 3186 sdbl->shadow_doorbells[queue_index(i, false)], 3187 sdbl->eventidxs[queue_index(i, false)]); 3188 } 3189 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3190 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3191 } 3192 3193 if (cq->size) { 3194 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3195 if (i > 0 && sdbl != NULL) { 3196 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3197 cq->cqid, 3198 sdbl->shadow_doorbells[queue_index(i, true)], 3199 sdbl->eventidxs[queue_index(i, true)]); 3200 } 3201 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3202 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3203 } 3204 } 3205 3206 SPDK_NOTICELOG("%s Dump Done\n", name); 3207 } 3208 3209 /* Read region 9 content and restore it to migration data structures */ 3210 static int 3211 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3212 struct vfio_user_nvme_migr_state *migr_state) 3213 { 3214 void 
*data_ptr = endpoint->migr_data; 3215 3216 /* Load vfio_user_nvme_migr_header first */ 3217 memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3218 /* TODO: version check */ 3219 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3220 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3221 return -EINVAL; 3222 } 3223 3224 /* Load nvmf controller data */ 3225 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3226 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3227 3228 /* Load queue pairs */ 3229 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3230 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3231 3232 /* Load BAR0 */ 3233 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3234 memcpy(&migr_state->bar0, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3235 3236 /* Load CFG */ 3237 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3238 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3239 3240 return 0; 3241 } 3242 3243 3244 static void 3245 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3246 { 3247 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3248 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3249 struct nvmf_vfio_user_sq *sq; 3250 struct nvmf_vfio_user_cq *cq; 3251 struct vfio_user_nvme_migr_state migr_state = {}; 3252 uint64_t data_offset; 3253 void *data_ptr; 3254 int num_aers; 3255 struct spdk_nvme_registers *regs; 3256 uint32_t *doorbell_base; 3257 uint32_t i = 0; 3258 uint16_t sqid, cqid; 3259 3260 /* Save all data to vfio_user_nvme_migr_state first, then we will 3261 * copy it to device migration region at last. 
3262 */ 3263 3264 /* save magic number */ 3265 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3266 3267 /* save controller data */ 3268 num_aers = nvmf_ctrlr_save_aers(ctrlr, migr_state.ctrlr_header.aer_cids, 3269 256); 3270 assert(num_aers >= 0); 3271 migr_state.ctrlr_header.nr_aers = num_aers; 3272 3273 /* save nvmf controller data */ 3274 nvmf_ctrlr_save_migr_data(ctrlr, (struct nvmf_ctrlr_migr_data *)&migr_state.nvmf_data); 3275 3276 /* save connected queue pairs */ 3277 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3278 /* save sq */ 3279 sqid = sq->qid; 3280 migr_state.qps[sqid].sq.sqid = sq->qid; 3281 migr_state.qps[sqid].sq.cqid = sq->cqid; 3282 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3283 migr_state.qps[sqid].sq.size = sq->size; 3284 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3285 3286 /* save cq, for shared cq case, cq may be saved multiple times */ 3287 cqid = sq->cqid; 3288 cq = vu_ctrlr->cqs[cqid]; 3289 migr_state.qps[cqid].cq.cqid = cqid; 3290 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3291 migr_state.qps[cqid].cq.ien = cq->ien; 3292 migr_state.qps[cqid].cq.iv = cq->iv; 3293 migr_state.qps[cqid].cq.size = cq->size; 3294 migr_state.qps[cqid].cq.phase = cq->phase; 3295 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3296 i++; 3297 } 3298 3299 assert(i > 0); 3300 migr_state.ctrlr_header.num_io_queues = i - 1; 3301 3302 regs = (struct spdk_nvme_registers *)&migr_state.bar0; 3303 /* Save mandarory registers to bar0 */ 3304 regs->csts.raw = ctrlr->vcprop.csts.raw; 3305 regs->cap.raw = ctrlr->vcprop.cap.raw; 3306 regs->vs.raw = ctrlr->vcprop.vs.raw; 3307 regs->cc.raw = ctrlr->vcprop.cc.raw; 3308 regs->aqa.raw = ctrlr->vcprop.aqa.raw; 3309 regs->asq = ctrlr->vcprop.asq; 3310 regs->acq = ctrlr->vcprop.acq; 3311 /* Save doorbells */ 3312 doorbell_base = (uint32_t *)®s->doorbell[0].sq_tdbl; 3313 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3314 3315 /* Save PCI configuration 
space */ 3316 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3317 3318 /* Save all data to device migration region */ 3319 data_ptr = endpoint->migr_data; 3320 3321 /* Copy nvmf controller data */ 3322 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3323 data_ptr += data_offset; 3324 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3325 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct nvmf_ctrlr_migr_data); 3326 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct nvmf_ctrlr_migr_data)); 3327 3328 /* Copy queue pairs */ 3329 data_offset += sizeof(struct nvmf_ctrlr_migr_data); 3330 data_ptr += sizeof(struct nvmf_ctrlr_migr_data); 3331 migr_state.ctrlr_header.qp_offset = data_offset; 3332 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3333 struct nvme_migr_cq_state)); 3334 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3335 3336 /* Copy BAR0 */ 3337 data_offset += migr_state.ctrlr_header.qp_len; 3338 data_ptr += migr_state.ctrlr_header.qp_len; 3339 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3340 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVME_REG_BAR0_SIZE; 3341 memcpy(data_ptr, &migr_state.bar0, NVME_REG_BAR0_SIZE); 3342 3343 /* Copy CFG */ 3344 data_offset += NVME_REG_BAR0_SIZE; 3345 data_ptr += NVME_REG_BAR0_SIZE; 3346 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3347 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3348 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3349 3350 /* copy shadow doorbells */ 3351 if (vu_ctrlr->sdbl != NULL) { 3352 migr_state.ctrlr_header.sdbl = true; 3353 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3354 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3355 } 3356 3357 /* Copy nvme migration header finally */ 3358 memcpy(endpoint->migr_data, 
&migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3359 3360 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3361 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3362 } 3363 } 3364 3365 /* 3366 * If we are about to close the connection, we need to unregister the interrupt, 3367 * as the library will subsequently close the file descriptor we registered. 3368 */ 3369 static int 3370 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3371 { 3372 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3373 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3374 3375 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3376 3377 if (type == VFU_RESET_LOST_CONN) { 3378 if (ctrlr != NULL) { 3379 spdk_interrupt_unregister(&ctrlr->intr); 3380 ctrlr->intr_fd = -1; 3381 } 3382 return 0; 3383 } 3384 3385 /* FIXME: LOST_CONN case ? */ 3386 if (ctrlr->sdbl != NULL) { 3387 free_sdbl(vfu_ctx, ctrlr->sdbl); 3388 ctrlr->sdbl = NULL; 3389 } 3390 3391 /* FIXME: much more needed here. 
*/ 3392 3393 return 0; 3394 } 3395 3396 static int 3397 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3398 struct vfio_user_nvme_migr_state *migr_state) 3399 { 3400 uint32_t i, qsize = 0; 3401 uint16_t sqid, cqid; 3402 struct vfio_user_nvme_migr_qp migr_qp; 3403 void *addr; 3404 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3405 int ret; 3406 3407 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3408 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3409 } 3410 3411 /* restore submission queues */ 3412 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3413 migr_qp = migr_state->qps[i]; 3414 3415 qsize = migr_qp.sq.size; 3416 if (qsize) { 3417 struct nvmf_vfio_user_sq *sq; 3418 3419 sqid = migr_qp.sq.sqid; 3420 if (sqid != i) { 3421 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3422 return -EINVAL; 3423 } 3424 3425 /* allocate sq if necessary */ 3426 if (vu_ctrlr->sqs[sqid] == NULL) { 3427 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3428 if (ret) { 3429 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3430 return -EFAULT; 3431 } 3432 } 3433 3434 sq = vu_ctrlr->sqs[sqid]; 3435 sq->size = qsize; 3436 3437 ret = alloc_sq_reqs(vu_ctrlr, sq); 3438 if (ret) { 3439 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3440 return -EFAULT; 3441 } 3442 3443 /* restore sq */ 3444 sq->sq_state = VFIO_USER_SQ_CREATED; 3445 sq->cqid = migr_qp.sq.cqid; 3446 *sq_headp(sq) = migr_qp.sq.head; 3447 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3448 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3449 sq->mapping.prp1, sq->size * 64, 3450 sq->mapping.sg, &sq->mapping.iov, 3451 PROT_READ); 3452 if (addr == NULL) { 3453 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3454 sqid, sq->mapping.prp1, sq->size); 3455 return -EFAULT; 3456 } 3457 cqs_ref[sq->cqid]++; 3458 } 3459 } 3460 3461 /* restore completion queues */ 3462 for (i = 0; i < 
NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3463 migr_qp = migr_state->qps[i]; 3464 3465 qsize = migr_qp.cq.size; 3466 if (qsize) { 3467 struct nvmf_vfio_user_cq *cq; 3468 3469 /* restore cq */ 3470 cqid = migr_qp.sq.cqid; 3471 assert(cqid == i); 3472 3473 /* allocate cq if necessary */ 3474 if (vu_ctrlr->cqs[cqid] == NULL) { 3475 ret = init_cq(vu_ctrlr, cqid); 3476 if (ret) { 3477 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3478 return -EFAULT; 3479 } 3480 } 3481 3482 cq = vu_ctrlr->cqs[cqid]; 3483 3484 cq->size = qsize; 3485 3486 cq->cq_state = VFIO_USER_CQ_CREATED; 3487 cq->cq_ref = cqs_ref[cqid]; 3488 *cq_tailp(cq) = migr_qp.cq.tail; 3489 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3490 cq->ien = migr_qp.cq.ien; 3491 cq->iv = migr_qp.cq.iv; 3492 cq->phase = migr_qp.cq.phase; 3493 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3494 cq->mapping.prp1, cq->size * 16, 3495 cq->mapping.sg, &cq->mapping.iov, 3496 PROT_READ | PROT_WRITE); 3497 if (addr == NULL) { 3498 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3499 cqid, cq->mapping.prp1, cq->size); 3500 return -EFAULT; 3501 } 3502 } 3503 } 3504 3505 return 0; 3506 } 3507 3508 static int 3509 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3510 { 3511 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3512 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3513 uint32_t *doorbell_base; 3514 struct vfio_user_nvme_migr_state migr_state = {}; 3515 struct spdk_nvme_registers *regs; 3516 struct spdk_nvme_cmd cmd; 3517 uint16_t i; 3518 int rc = 0; 3519 3520 assert(endpoint->migr_data != NULL); 3521 assert(ctrlr != NULL); 3522 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3523 if (rc) { 3524 return rc; 3525 } 3526 3527 /* restore shadow doorbells */ 3528 if (migr_state.ctrlr_header.sdbl) { 3529 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3530 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3531 migr_state.ctrlr_header.shadow_doorbell_buffer, 
3532 migr_state.ctrlr_header.eventidx_buffer, 3533 memory_page_size(vu_ctrlr)); 3534 if (sdbl == NULL) { 3535 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3536 ctrlr_id(vu_ctrlr)); 3537 return -1; 3538 } 3539 3540 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3541 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3542 3543 SWAP(vu_ctrlr->sdbl, sdbl); 3544 } 3545 3546 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3547 if (rc) { 3548 return rc; 3549 } 3550 3551 /* restore PCI configuration space */ 3552 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3553 3554 regs = (struct spdk_nvme_registers *)&migr_state.bar0; 3555 doorbell_base = (uint32_t *)®s->doorbell[0].sq_tdbl; 3556 /* restore doorbells from saved registers */ 3557 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3558 3559 /* restore controller registers after ADMIN queue connection */ 3560 ctrlr->vcprop.csts.raw = regs->csts.raw; 3561 ctrlr->vcprop.cap.raw = regs->cap.raw; 3562 ctrlr->vcprop.vs.raw = regs->vs.raw; 3563 ctrlr->vcprop.cc.raw = regs->cc.raw; 3564 ctrlr->vcprop.aqa.raw = regs->aqa.raw; 3565 ctrlr->vcprop.asq = regs->asq; 3566 ctrlr->vcprop.acq = regs->acq; 3567 3568 /* restore nvmf controller data */ 3569 rc = nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3570 if (rc) { 3571 return rc; 3572 } 3573 3574 /* resubmit pending AERs */ 3575 for (i = 0; i < migr_state.ctrlr_header.nr_aers; i++) { 3576 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3577 migr_state.ctrlr_header.aer_cids[i]); 3578 memset(&cmd, 0, sizeof(cmd)); 3579 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3580 cmd.cid = migr_state.ctrlr_header.aer_cids[i]; 3581 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3582 if (rc) { 3583 break; 3584 } 3585 } 3586 3587 return rc; 3588 } 3589 3590 static void 3591 
vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3592 { 3593 uint32_t i; 3594 struct nvmf_vfio_user_sq *sq; 3595 3596 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 3597 3598 if (vu_ctrlr->sqs[0] != NULL) { 3599 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3600 queue_index(0, false); 3601 } 3602 3603 if (vu_ctrlr->cqs[0] != NULL) { 3604 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3605 queue_index(0, true); 3606 } 3607 3608 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3609 3610 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3611 sq = vu_ctrlr->sqs[i]; 3612 if (!sq || !sq->size) { 3613 continue; 3614 } 3615 3616 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3617 /* ADMIN queue pair is always in the poll group, just enable it */ 3618 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3619 } else { 3620 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3621 } 3622 } 3623 } 3624 3625 /* 3626 * We are in stop-and-copy state, but still potentially have some current dirty 3627 * sgls: while we're quiesced and thus should have no active requests, we still 3628 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3629 * mapped read only). 3630 * 3631 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3632 * mark them dirty now. 
3633 */ 3634 static void 3635 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3636 { 3637 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3638 3639 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3640 3641 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3642 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3643 3644 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3645 continue; 3646 } 3647 3648 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3649 } 3650 3651 if (vu_ctrlr->sdbl != NULL) { 3652 dma_sg_t *sg; 3653 size_t i; 3654 3655 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3656 ++i) { 3657 3658 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3659 continue; 3660 } 3661 3662 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3663 3664 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3665 } 3666 } 3667 } 3668 3669 static int 3670 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3671 { 3672 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3673 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3674 struct nvmf_vfio_user_sq *sq; 3675 int ret = 0; 3676 3677 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3678 vu_ctrlr->state, state); 3679 3680 switch (state) { 3681 case VFU_MIGR_STATE_STOP_AND_COPY: 3682 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3683 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3684 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3685 break; 3686 case VFU_MIGR_STATE_STOP: 3687 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3688 /* The controller associates with source VM is dead now, we will resume 3689 * the subsystem after destroying the controller data structure, then the 3690 * subsystem can be re-used for another new client. 
3691 */ 3692 if (vu_ctrlr->in_source_vm) { 3693 endpoint->need_resume = true; 3694 } 3695 break; 3696 case VFU_MIGR_STATE_PRE_COPY: 3697 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3698 vu_ctrlr->migr_reg.pending_bytes = vfio_user_migr_data_len(); 3699 vu_ctrlr->migr_reg.last_data_offset = 0; 3700 vu_ctrlr->in_source_vm = true; 3701 break; 3702 case VFU_MIGR_STATE_RESUME: 3703 /* 3704 * Destination ADMIN queue pair is connected when starting the VM, 3705 * but the ADMIN queue pair isn't enabled in destination VM, the poll 3706 * group will do nothing to ADMIN queue pair for now. 3707 */ 3708 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3709 break; 3710 } 3711 3712 assert(!vu_ctrlr->in_source_vm); 3713 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3714 3715 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3716 assert(sq != NULL); 3717 assert(sq->qpair.qid == 0); 3718 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3719 3720 /* Free ADMIN SQ resources first, SQ resources will be 3721 * allocated based on queue size from source VM. 
3722 */ 3723 free_sq_reqs(sq); 3724 sq->size = 0; 3725 break; 3726 case VFU_MIGR_STATE_RUNNING: 3727 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3728 break; 3729 } 3730 3731 if (!vu_ctrlr->in_source_vm) { 3732 /* Restore destination VM from BAR9 */ 3733 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3734 if (ret) { 3735 break; 3736 } 3737 3738 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3739 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3740 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3741 } else { 3742 /* Rollback source VM */ 3743 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3744 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3745 vfio_user_endpoint_resume_done, endpoint); 3746 if (ret < 0) { 3747 /* TODO: fail controller with CFS bit set */ 3748 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3749 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3750 break; 3751 } 3752 } 3753 break; 3754 3755 default: 3756 return -EINVAL; 3757 } 3758 3759 return ret; 3760 } 3761 3762 static uint64_t 3763 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3764 { 3765 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3766 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3767 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 3768 3769 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u, pending bytes 0x%"PRIx64"\n", endpoint_id(endpoint), 3770 ctrlr->state, migr_reg->pending_bytes); 3771 3772 return migr_reg->pending_bytes; 3773 } 3774 3775 static int 3776 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 3777 { 3778 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3779 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3780 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 3781 3782 if (migr_reg->last_data_offset == vfio_user_migr_data_len()) { 3783 *offset = vfio_user_migr_data_len(); 3784 if (size) { 
3785 *size = 0; 3786 } 3787 migr_reg->pending_bytes = 0; 3788 } else { 3789 *offset = 0; 3790 if (size) { 3791 *size = vfio_user_migr_data_len(); 3792 if (ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3793 vfio_user_migr_ctrlr_save_data(ctrlr); 3794 migr_reg->last_data_offset = vfio_user_migr_data_len(); 3795 } 3796 } 3797 } 3798 3799 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 3800 3801 return 0; 3802 } 3803 3804 static ssize_t 3805 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset) 3806 { 3807 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3808 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3809 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 3810 3811 memcpy(buf, endpoint->migr_data, count); 3812 migr_reg->pending_bytes = 0; 3813 3814 return 0; 3815 } 3816 3817 static ssize_t 3818 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset) 3819 { 3820 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3821 3822 memcpy(endpoint->migr_data, buf, count); 3823 3824 return 0; 3825 } 3826 3827 static int 3828 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx, uint64_t count) 3829 { 3830 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 3831 3832 return 0; 3833 } 3834 3835 static int 3836 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 3837 struct nvmf_vfio_user_endpoint *endpoint) 3838 { 3839 int ret; 3840 ssize_t cap_offset; 3841 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 3842 struct iovec migr_sparse_mmap = {}; 3843 3844 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 3845 struct pxcap pxcap = { 3846 .hdr.id = PCI_CAP_ID_EXP, 3847 .pxcaps.ver = 0x2, 3848 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 3849 .pxdcap2.ctds = 0x1 3850 }; 3851 3852 struct msixcap msixcap = { 3853 .hdr.id = PCI_CAP_ID_MSIX, 3854 .mxc.ts = NVME_IRQ_MSIX_NUM - 
1, 3855 .mtab = {.tbir = 0x4, .to = 0x0}, 3856 .mpba = {.pbir = 0x5, .pbao = 0x0} 3857 }; 3858 3859 struct iovec sparse_mmap[] = { 3860 { 3861 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 3862 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 3863 }, 3864 }; 3865 3866 const vfu_migration_callbacks_t migr_callbacks = { 3867 .version = VFU_MIGR_CALLBACKS_VERS, 3868 .transition = &vfio_user_migration_device_state_transition, 3869 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 3870 .prepare_data = &vfio_user_migration_prepare_data, 3871 .read_data = &vfio_user_migration_read_data, 3872 .data_written = &vfio_user_migration_data_written, 3873 .write_data = &vfio_user_migration_write_data 3874 }; 3875 3876 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 3877 if (ret < 0) { 3878 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 3879 return ret; 3880 } 3881 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 3882 /* 3883 * 0x02, controller uses the NVM Express programming interface 3884 * 0x08, non-volatile memory controller 3885 * 0x01, mass storage controller 3886 */ 3887 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 3888 3889 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 3890 if (cap_offset < 0) { 3891 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 3892 return ret; 3893 } 3894 3895 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 3896 if (cap_offset < 0) { 3897 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 3898 return ret; 3899 } 3900 3901 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 3902 if (cap_offset < 0) { 3903 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 3904 return ret; 3905 } 3906 3907 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 3908 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3909 if (ret < 0) { 3910 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 3911 return ret; 
3912 } 3913 3914 if (vu_transport->transport_opts.disable_mappable_bar0) { 3915 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3916 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3917 NULL, 0, -1, 0); 3918 } else { 3919 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3920 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3921 sparse_mmap, 1, endpoint->devmem_fd, 0); 3922 } 3923 3924 if (ret < 0) { 3925 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 3926 return ret; 3927 } 3928 3929 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 3930 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3931 if (ret < 0) { 3932 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 3933 return ret; 3934 } 3935 3936 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 3937 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3938 if (ret < 0) { 3939 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 3940 return ret; 3941 } 3942 3943 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 3944 if (ret < 0) { 3945 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 3946 return ret; 3947 } 3948 3949 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 3950 if (ret < 0) { 3951 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 3952 return ret; 3953 } 3954 3955 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 3956 if (ret < 0) { 3957 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 3958 return ret; 3959 } 3960 3961 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 3962 if (ret < 0) { 3963 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 3964 return ret; 3965 } 3966 3967 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 3968 3969 migr_sparse_mmap.iov_base = (void *)4096; 3970 migr_sparse_mmap.iov_len 
= vfio_user_migr_data_len(); 3971 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 3972 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 3973 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 3974 1, endpoint->migr_fd, 0); 3975 if (ret < 0) { 3976 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 3977 return ret; 3978 } 3979 3980 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 3981 vfu_get_migr_register_area_size()); 3982 if (ret < 0) { 3983 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 3984 return ret; 3985 } 3986 3987 ret = vfu_realize_ctx(vfu_ctx); 3988 if (ret < 0) { 3989 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 3990 return ret; 3991 } 3992 3993 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 3994 assert(endpoint->pci_config_space != NULL); 3995 init_pci_config_space(endpoint->pci_config_space); 3996 3997 assert(cap_offset != 0); 3998 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 3999 4000 return 0; 4001 } 4002 4003 static int nvmf_vfio_user_accept(void *ctx); 4004 4005 static void 4006 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4007 { 4008 /* Nothing for us to do here. */ 4009 } 4010 4011 /* 4012 * Register an "accept" poller: this is polling for incoming vfio-user socket 4013 * connections (on the listening socket). 4014 * 4015 * We need to do this on first listening, and also after destroying a 4016 * controller, so we can accept another connection. 
4017 */ 4018 static int 4019 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4020 { 4021 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4022 4023 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4024 4025 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4026 endpoint, poll_rate_us); 4027 4028 if (!endpoint->accept_poller) { 4029 return -1; 4030 } 4031 4032 endpoint->accept_thread = spdk_get_thread(); 4033 4034 if (!spdk_interrupt_mode_is_enabled()) { 4035 return 0; 4036 } 4037 4038 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4039 assert(endpoint->accept_intr_fd != -1); 4040 4041 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4042 nvmf_vfio_user_accept, endpoint); 4043 4044 assert(endpoint->accept_intr != NULL); 4045 4046 spdk_poller_register_interrupt(endpoint->accept_poller, 4047 set_intr_mode_noop, NULL); 4048 return 0; 4049 } 4050 4051 static void 4052 _vfio_user_relisten(void *ctx) 4053 { 4054 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4055 4056 vfio_user_register_accept_poller(endpoint); 4057 } 4058 4059 static void 4060 _free_ctrlr(void *ctx) 4061 { 4062 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4063 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4064 4065 free_sdbl(ctrlr->endpoint->vfu_ctx, ctrlr->sdbl); 4066 4067 spdk_interrupt_unregister(&ctrlr->intr); 4068 ctrlr->intr_fd = -1; 4069 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4070 4071 free(ctrlr); 4072 4073 if (endpoint == NULL) { 4074 return; 4075 } 4076 4077 if (endpoint->need_async_destroy) { 4078 nvmf_vfio_user_destroy_endpoint(endpoint); 4079 } else { 4080 spdk_thread_send_msg(endpoint->accept_thread, 4081 _vfio_user_relisten, endpoint); 4082 } 4083 } 4084 4085 static void 4086 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4087 { 4088 int i; 4089 assert(ctrlr != NULL); 4090 4091 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4092 4093 
for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4094 free_qp(ctrlr, i); 4095 } 4096 4097 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 4098 } 4099 4100 static int 4101 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4102 struct nvmf_vfio_user_endpoint *endpoint) 4103 { 4104 struct nvmf_vfio_user_ctrlr *ctrlr; 4105 int err = 0; 4106 4107 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4108 4109 /* First, construct a vfio-user CUSTOM transport controller */ 4110 ctrlr = calloc(1, sizeof(*ctrlr)); 4111 if (ctrlr == NULL) { 4112 err = -ENOMEM; 4113 goto out; 4114 } 4115 /* We can only support one connection for now */ 4116 ctrlr->cntlid = 0x1; 4117 ctrlr->intr_fd = -1; 4118 ctrlr->transport = transport; 4119 ctrlr->endpoint = endpoint; 4120 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4121 TAILQ_INIT(&ctrlr->connected_sqs); 4122 4123 ctrlr->adaptive_irqs_enabled = 4124 !transport->transport_opts.disable_adaptive_irq; 4125 4126 /* Then, construct an admin queue pair */ 4127 err = init_sq(ctrlr, &transport->transport, 0); 4128 if (err != 0) { 4129 free(ctrlr); 4130 goto out; 4131 } 4132 4133 err = init_cq(ctrlr, 0); 4134 if (err != 0) { 4135 free(ctrlr); 4136 goto out; 4137 } 4138 4139 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4140 4141 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4142 if (err != 0) { 4143 free(ctrlr); 4144 goto out; 4145 } 4146 endpoint->ctrlr = ctrlr; 4147 4148 /* Notify the generic layer about the new admin queue pair */ 4149 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4150 4151 out: 4152 if (err != 0) { 4153 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4154 endpoint_id(endpoint), strerror(-err)); 4155 } 4156 4157 return err; 4158 } 4159 4160 static int 4161 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4162 const struct spdk_nvme_transport_id *trid, 4163 struct spdk_nvmf_listen_opts *listen_opts) 4164 { 4165 struct 
nvmf_vfio_user_transport *vu_transport; 4166 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4167 char path[PATH_MAX] = {}; 4168 char uuid[PATH_MAX] = {}; 4169 int ret; 4170 4171 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4172 transport); 4173 4174 pthread_mutex_lock(&vu_transport->lock); 4175 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4176 /* Only compare traddr */ 4177 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4178 pthread_mutex_unlock(&vu_transport->lock); 4179 return -EEXIST; 4180 } 4181 } 4182 pthread_mutex_unlock(&vu_transport->lock); 4183 4184 endpoint = calloc(1, sizeof(*endpoint)); 4185 if (!endpoint) { 4186 return -ENOMEM; 4187 } 4188 4189 pthread_mutex_init(&endpoint->lock, NULL); 4190 endpoint->devmem_fd = -1; 4191 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4192 endpoint->transport = vu_transport; 4193 4194 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4195 if (ret < 0 || ret >= PATH_MAX) { 4196 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4197 ret = -1; 4198 goto out; 4199 } 4200 4201 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4202 if (ret == -1) { 4203 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4204 endpoint_id(endpoint), path, spdk_strerror(errno)); 4205 goto out; 4206 } 4207 unlink(path); 4208 4209 endpoint->devmem_fd = ret; 4210 ret = ftruncate(endpoint->devmem_fd, 4211 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4212 if (ret != 0) { 4213 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4214 spdk_strerror(errno)); 4215 goto out; 4216 } 4217 4218 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4219 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4220 if (endpoint->bar0_doorbells == MAP_FAILED) { 4221 SPDK_ERRLOG("%s: error to mmap file 
%s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4222 endpoint->bar0_doorbells = NULL; 4223 ret = -1; 4224 goto out; 4225 } 4226 4227 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4228 if (ret < 0 || ret >= PATH_MAX) { 4229 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 4230 spdk_strerror(errno)); 4231 ret = -1; 4232 goto out; 4233 } 4234 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4235 if (ret == -1) { 4236 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4237 endpoint_id(endpoint), path, spdk_strerror(errno)); 4238 goto out; 4239 } 4240 unlink(path); 4241 4242 endpoint->migr_fd = ret; 4243 ret = ftruncate(endpoint->migr_fd, 4244 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4245 if (ret != 0) { 4246 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4247 spdk_strerror(errno)); 4248 goto out; 4249 } 4250 4251 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4252 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4253 if (endpoint->migr_data == MAP_FAILED) { 4254 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4255 endpoint->migr_data = NULL; 4256 ret = -1; 4257 goto out; 4258 } 4259 4260 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 4261 if (ret < 0 || ret >= PATH_MAX) { 4262 SPDK_ERRLOG("%s: error to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 4263 ret = -1; 4264 goto out; 4265 } 4266 4267 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 4268 endpoint, VFU_DEV_TYPE_PCI); 4269 if (endpoint->vfu_ctx == NULL) { 4270 SPDK_ERRLOG("%s: error creating libmuser context: %m\n", 4271 endpoint_id(endpoint)); 4272 ret = -1; 4273 goto out; 4274 } 4275 4276 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, 4277 
vfio_user_get_log_level()); 4278 if (ret < 0) { 4279 goto out; 4280 } 4281 4282 4283 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 4284 if (ret < 0) { 4285 goto out; 4286 } 4287 4288 ret = vfio_user_register_accept_poller(endpoint); 4289 4290 if (ret != 0) { 4291 goto out; 4292 } 4293 4294 pthread_mutex_lock(&vu_transport->lock); 4295 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 4296 pthread_mutex_unlock(&vu_transport->lock); 4297 4298 out: 4299 if (ret != 0) { 4300 nvmf_vfio_user_destroy_endpoint(endpoint); 4301 } 4302 4303 return ret; 4304 } 4305 4306 static void 4307 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 4308 const struct spdk_nvme_transport_id *trid) 4309 { 4310 struct nvmf_vfio_user_transport *vu_transport; 4311 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4312 4313 assert(trid != NULL); 4314 assert(trid->traddr != NULL); 4315 4316 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 4317 4318 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4319 transport); 4320 4321 pthread_mutex_lock(&vu_transport->lock); 4322 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4323 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 4324 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 4325 /* Defer to free endpoint resources until the controller 4326 * is freed. There are two cases when running here: 4327 * 1. kill nvmf target while VM is connected 4328 * 2. remove listener via RPC call 4329 * nvmf library will disconnect all queue paris. 
4330 */ 4331 if (endpoint->ctrlr) { 4332 assert(!endpoint->need_async_destroy); 4333 endpoint->need_async_destroy = true; 4334 pthread_mutex_unlock(&vu_transport->lock); 4335 return; 4336 } 4337 4338 nvmf_vfio_user_destroy_endpoint(endpoint); 4339 pthread_mutex_unlock(&vu_transport->lock); 4340 return; 4341 } 4342 } 4343 pthread_mutex_unlock(&vu_transport->lock); 4344 4345 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4346 } 4347 4348 static void 4349 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4350 struct spdk_nvmf_subsystem *subsystem, 4351 struct spdk_nvmf_ctrlr_data *cdata) 4352 { 4353 struct nvmf_vfio_user_transport *vu_transport; 4354 4355 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4356 4357 cdata->vid = SPDK_PCI_VID_NUTANIX; 4358 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4359 cdata->ieee[0] = 0x8d; 4360 cdata->ieee[1] = 0x6b; 4361 cdata->ieee[2] = 0x50; 4362 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4363 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4364 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4365 /* libvfio-user can only support 1 connection for now */ 4366 cdata->oncs.reservations = 0; 4367 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4368 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4369 } 4370 4371 static int 4372 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4373 const struct spdk_nvmf_subsystem *subsystem, 4374 const struct spdk_nvme_transport_id *trid) 4375 { 4376 struct nvmf_vfio_user_transport *vu_transport; 4377 struct nvmf_vfio_user_endpoint *endpoint; 4378 4379 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4380 4381 pthread_mutex_lock(&vu_transport->lock); 4382 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4383 if (strncmp(endpoint->trid.traddr, 
trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4384 break; 4385 } 4386 } 4387 pthread_mutex_unlock(&vu_transport->lock); 4388 4389 if (endpoint == NULL) { 4390 return -ENOENT; 4391 } 4392 4393 /* Drop const - we will later need to pause/unpause. */ 4394 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4395 4396 return 0; 4397 } 4398 4399 /* 4400 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4401 * frequency. 4402 * 4403 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4404 * if we don't currently have a controller set up, peek to see if the socket is 4405 * able to accept a new connection. 4406 */ 4407 static int 4408 nvmf_vfio_user_accept(void *ctx) 4409 { 4410 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4411 struct nvmf_vfio_user_transport *vu_transport; 4412 int err; 4413 4414 vu_transport = endpoint->transport; 4415 4416 if (endpoint->ctrlr != NULL) { 4417 return SPDK_POLLER_IDLE; 4418 } 4419 4420 /* While we're here, the controller is already destroyed, 4421 * subsystem may still be in RESUMING state, we will wait 4422 * until the subsystem is in RUNNING state. 4423 */ 4424 if (endpoint->need_resume) { 4425 return SPDK_POLLER_IDLE; 4426 } 4427 4428 err = vfu_attach_ctx(endpoint->vfu_ctx); 4429 if (err == 0) { 4430 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4431 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4432 if (err == 0) { 4433 /* 4434 * Unregister ourselves: now we've accepted a 4435 * connection, there is nothing for us to poll for, and 4436 * we will poll the connection via vfu_run_ctx() 4437 * instead. 
4438 */ 4439 spdk_interrupt_unregister(&endpoint->accept_intr); 4440 spdk_poller_unregister(&endpoint->accept_poller); 4441 } 4442 return SPDK_POLLER_BUSY; 4443 } 4444 4445 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4446 return SPDK_POLLER_IDLE; 4447 } 4448 4449 return SPDK_POLLER_BUSY; 4450 } 4451 4452 static void 4453 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4454 struct spdk_nvme_transport_id *trid, 4455 struct spdk_nvmf_discovery_log_page_entry *entry) 4456 { } 4457 4458 static struct spdk_nvmf_transport_poll_group * 4459 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4460 struct spdk_nvmf_poll_group *group) 4461 { 4462 struct nvmf_vfio_user_transport *vu_transport; 4463 struct nvmf_vfio_user_poll_group *vu_group; 4464 4465 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4466 4467 vu_group = calloc(1, sizeof(*vu_group)); 4468 if (vu_group == NULL) { 4469 SPDK_ERRLOG("Error allocating poll group: %m"); 4470 return NULL; 4471 } 4472 4473 TAILQ_INIT(&vu_group->sqs); 4474 4475 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4476 transport); 4477 pthread_mutex_lock(&vu_transport->pg_lock); 4478 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4479 if (vu_transport->next_pg == NULL) { 4480 vu_transport->next_pg = vu_group; 4481 } 4482 pthread_mutex_unlock(&vu_transport->pg_lock); 4483 4484 if (!spdk_interrupt_mode_is_enabled()) { 4485 return &vu_group->group; 4486 } 4487 4488 /* 4489 * Only allow the poll group to work in interrupt mode if the transport 4490 * supports it. It's our responsibility to register the actual interrupt 4491 * later (in handle_queue_connect_rsp()) that processes everything in 4492 * the poll group: for us, that's the libvfio-user context, and the 4493 * actual qpairs. 4494 * 4495 * Note that this only works in the case that nothing else shares the 4496 * spdk_nvmf_poll_group. 
4497 * 4498 * If not supported, this will effectively always wake up to poll the 4499 * poll group. 4500 */ 4501 4502 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4503 transport); 4504 4505 if (!vu_transport->intr_mode_supported) { 4506 SPDK_WARNLOG("vfio-user interrupt mode not supported\n"); 4507 return &vu_group->group; 4508 } 4509 4510 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4511 NULL); 4512 4513 return &vu_group->group; 4514 } 4515 4516 static bool 4517 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 4518 { 4519 return spdk_interrupt_mode_is_enabled() && 4520 vu_transport->intr_mode_supported; 4521 } 4522 4523 static struct spdk_nvmf_transport_poll_group * 4524 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4525 { 4526 struct nvmf_vfio_user_transport *vu_transport; 4527 struct nvmf_vfio_user_poll_group **vu_group; 4528 struct nvmf_vfio_user_sq *sq; 4529 struct nvmf_vfio_user_cq *cq; 4530 4531 struct spdk_nvmf_transport_poll_group *result = NULL; 4532 4533 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4534 cq = sq->ctrlr->cqs[sq->cqid]; 4535 assert(cq != NULL); 4536 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4537 4538 pthread_mutex_lock(&vu_transport->pg_lock); 4539 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4540 goto out; 4541 } 4542 4543 if (!nvmf_qpair_is_admin_queue(qpair)) { 4544 /* 4545 * If this is shared IO CQ case, just return the used CQ's poll 4546 * group, so I/O completions don't have to use 4547 * spdk_thread_send_msg(). 4548 */ 4549 if (cq->group != NULL) { 4550 result = cq->group; 4551 goto out; 4552 } 4553 4554 /* 4555 * If we're in interrupt mode, align all qpairs for a controller 4556 * on the same poll group, to avoid complications in 4557 * vfio_user_ctrlr_intr(). 
4558 */ 4559 if (in_interrupt_mode(vu_transport)) { 4560 result = sq->ctrlr->sqs[0]->group; 4561 goto out; 4562 } 4563 4564 } 4565 4566 vu_group = &vu_transport->next_pg; 4567 assert(*vu_group != NULL); 4568 4569 result = &(*vu_group)->group; 4570 *vu_group = TAILQ_NEXT(*vu_group, link); 4571 if (*vu_group == NULL) { 4572 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4573 } 4574 4575 out: 4576 if (cq->group == NULL) { 4577 cq->group = result; 4578 } 4579 4580 pthread_mutex_unlock(&vu_transport->pg_lock); 4581 return result; 4582 } 4583 4584 /* called when process exits */ 4585 static void 4586 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4587 { 4588 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup;; 4589 struct nvmf_vfio_user_transport *vu_transport; 4590 4591 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4592 4593 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4594 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4595 transport); 4596 4597 pthread_mutex_lock(&vu_transport->pg_lock); 4598 next_tgroup = TAILQ_NEXT(vu_group, link); 4599 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4600 if (next_tgroup == NULL) { 4601 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4602 } 4603 if (vu_transport->next_pg == vu_group) { 4604 vu_transport->next_pg = next_tgroup; 4605 } 4606 pthread_mutex_unlock(&vu_transport->pg_lock); 4607 4608 free(vu_group); 4609 } 4610 4611 static void 4612 _vfio_user_qpair_disconnect(void *ctx) 4613 { 4614 struct nvmf_vfio_user_sq *sq = ctx; 4615 4616 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4617 } 4618 4619 /* The function is used when socket connection is destroyed */ 4620 static int 4621 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4622 { 4623 struct nvmf_vfio_user_sq *sq; 4624 struct nvmf_vfio_user_endpoint *endpoint; 4625 4626 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", 
ctrlr_id(ctrlr)); 4627 4628 endpoint = ctrlr->endpoint; 4629 assert(endpoint != NULL); 4630 4631 pthread_mutex_lock(&endpoint->lock); 4632 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4633 endpoint->ctrlr = NULL; 4634 free_ctrlr(ctrlr); 4635 pthread_mutex_unlock(&endpoint->lock); 4636 return 0; 4637 } 4638 4639 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4640 /* add another round thread poll to avoid recursive endpoint lock */ 4641 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4642 } 4643 pthread_mutex_unlock(&endpoint->lock); 4644 4645 return 0; 4646 } 4647 4648 /* 4649 * Poll for and process any incoming vfio-user messages. 4650 */ 4651 static int 4652 vfio_user_poll_vfu_ctx(void *ctx) 4653 { 4654 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4655 int ret; 4656 4657 assert(ctrlr != NULL); 4658 4659 /* This will call access_bar0_fn() if there are any writes 4660 * to the portion of the BAR that is not mmap'd */ 4661 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4662 if (spdk_unlikely(ret == -1)) { 4663 if (errno == EBUSY) { 4664 return SPDK_POLLER_IDLE; 4665 } 4666 4667 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4668 4669 /* 4670 * We lost the client; the reset callback will already have 4671 * unregistered the interrupt. 4672 */ 4673 if (errno == ENOTCONN) { 4674 vfio_user_destroy_ctrlr(ctrlr); 4675 return SPDK_POLLER_BUSY; 4676 } 4677 4678 /* 4679 * We might not have got a reset callback in this case, so 4680 * explicitly unregister the interrupt here. 4681 */ 4682 spdk_interrupt_unregister(&ctrlr->intr); 4683 ctrlr->intr_fd = -1; 4684 fail_ctrlr(ctrlr); 4685 } 4686 4687 return ret != 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4688 } 4689 4690 struct vfio_user_post_cpl_ctx { 4691 struct nvmf_vfio_user_ctrlr *ctrlr; 4692 struct nvmf_vfio_user_cq *cq; 4693 struct spdk_nvme_cpl cpl; 4694 }; 4695 4696 static void 4697 _post_completion_msg(void *ctx) 4698 { 4699 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4700 4701 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4702 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4703 free(cpl_ctx); 4704 } 4705 4706 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4707 4708 static int vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group); 4709 4710 /* 4711 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4712 * the SQs assigned to our poll group. 4713 */ 4714 static int 4715 vfio_user_ctrlr_intr(void *ctx) 4716 { 4717 struct nvmf_vfio_user_poll_group *vu_group; 4718 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4719 int ret = 0; 4720 4721 assert(ctrlr != NULL); 4722 assert(ctrlr->sqs[0] != NULL); 4723 assert(ctrlr->sqs[0]->group != NULL); 4724 4725 ctrlr->kick_requested = false; 4726 4727 /* 4728 * Poll vfio-user for this controller. 4729 */ 4730 ret = vfio_user_poll_vfu_ctx(ctrlr); 4731 4732 vu_group = ctrlr_to_poll_group(ctrlr); 4733 4734 /* 4735 * See nvmf_vfio_user_get_optimal_poll_group() for why it's OK to only 4736 * poll this poll group. 4737 * 4738 * Note that this could end up polling other controller's SQs as well 4739 * (since a single poll group can have SQs from multiple separate 4740 * controllers). 4741 */ 4742 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4743 4744 /* 4745 * Re-arm the event indexes. NB: this also could rearm other 4746 * controller's SQs. 4747 */ 4748 ret |= vfio_user_poll_group_rearm(vu_group); 4749 4750 return ret != 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4751 } 4752 4753 static void 4754 vfio_user_set_intr_mode(struct spdk_poller *poller, void *arg, 4755 bool interrupt_mode) 4756 { 4757 struct nvmf_vfio_user_ctrlr *ctrlr = arg; 4758 assert(ctrlr != NULL); 4759 assert(ctrlr->endpoint != NULL); 4760 4761 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 4762 ctrlr_id(ctrlr), interrupt_mode); 4763 4764 /* 4765 * interrupt_mode needs to persist across controller resets, so store 4766 * it in the endpoint instead. 4767 */ 4768 ctrlr->endpoint->interrupt_mode = interrupt_mode; 4769 4770 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 4771 } 4772 4773 /* 4774 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 4775 * set up and we can start operating on this controller. 4776 */ 4777 static void 4778 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 4779 struct spdk_nvmf_ctrlr *ctrlr) 4780 { 4781 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 4782 4783 vu_ctrlr->ctrlr = ctrlr; 4784 vu_ctrlr->cntlid = ctrlr->cntlid; 4785 vu_ctrlr->thread = spdk_get_thread(); 4786 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 4787 4788 if (!in_interrupt_mode(endpoint->transport)) { 4789 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4790 vu_ctrlr, 1000); 4791 return; 4792 } 4793 4794 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4795 vu_ctrlr, 0); 4796 4797 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 4798 assert(vu_ctrlr->intr_fd != -1); 4799 4800 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 4801 vfio_user_ctrlr_intr, vu_ctrlr); 4802 4803 assert(vu_ctrlr->intr != NULL); 4804 4805 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 4806 vfio_user_set_intr_mode, 4807 vu_ctrlr); 4808 } 4809 4810 static int 4811 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 4812 { 4813 struct nvmf_vfio_user_poll_group *vu_group; 4814 struct 
nvmf_vfio_user_sq *sq = cb_arg; 4815 struct nvmf_vfio_user_cq *admin_cq; 4816 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 4817 struct nvmf_vfio_user_endpoint *endpoint; 4818 4819 assert(sq != NULL); 4820 assert(req != NULL); 4821 4822 vu_ctrlr = sq->ctrlr; 4823 assert(vu_ctrlr != NULL); 4824 endpoint = vu_ctrlr->endpoint; 4825 assert(endpoint != NULL); 4826 4827 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 4828 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 4829 endpoint->ctrlr = NULL; 4830 free_ctrlr(vu_ctrlr); 4831 return -1; 4832 } 4833 4834 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 4835 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 4836 4837 admin_cq = vu_ctrlr->cqs[0]; 4838 assert(admin_cq != NULL); 4839 4840 pthread_mutex_lock(&endpoint->lock); 4841 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 4842 admin_cq->thread = spdk_get_thread(); 4843 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 4844 } else { 4845 /* For I/O queues this command was generated in response to an 4846 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 4847 * been completed. Complete it now. 
4848 */ 4849 if (sq->post_create_io_sq_completion) { 4850 assert(admin_cq->thread != NULL); 4851 if (admin_cq->thread != spdk_get_thread()) { 4852 struct vfio_user_post_cpl_ctx *cpl_ctx; 4853 4854 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 4855 if (!cpl_ctx) { 4856 return -ENOMEM; 4857 } 4858 cpl_ctx->ctrlr = vu_ctrlr; 4859 cpl_ctx->cq = admin_cq; 4860 cpl_ctx->cpl.sqid = 0; 4861 cpl_ctx->cpl.cdw0 = 0; 4862 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 4863 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 4864 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 4865 4866 spdk_thread_send_msg(admin_cq->thread, _post_completion_msg, 4867 cpl_ctx); 4868 } else { 4869 post_completion(vu_ctrlr, admin_cq, 0, 0, 4870 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 4871 } 4872 sq->post_create_io_sq_completion = false; 4873 } else if (in_interrupt_mode(endpoint->transport)) { 4874 /* 4875 * If we're live migrating a guest, there is a window 4876 * where the I/O queues haven't been set up but the 4877 * device is in running state, during which the guest 4878 * might write to a doorbell. This doorbell write will 4879 * go unnoticed, so let's poll the whole controller to 4880 * pick that up. 4881 */ 4882 ctrlr_kick(vu_ctrlr); 4883 } 4884 sq->sq_state = VFIO_USER_SQ_ACTIVE; 4885 } 4886 4887 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 4888 pthread_mutex_unlock(&endpoint->lock); 4889 4890 free(req->req.data); 4891 req->req.data = NULL; 4892 4893 return 0; 4894 } 4895 4896 /* 4897 * Add the given qpair to the given poll group. New qpairs are added via 4898 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 4899 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 4900 * nvmf_transport_poll_group_add(). 
4901 */ 4902 static int 4903 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 4904 struct spdk_nvmf_qpair *qpair) 4905 { 4906 struct nvmf_vfio_user_sq *sq; 4907 struct nvmf_vfio_user_req *vu_req; 4908 struct nvmf_vfio_user_ctrlr *ctrlr; 4909 struct spdk_nvmf_request *req; 4910 struct spdk_nvmf_fabric_connect_data *data; 4911 bool admin; 4912 4913 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4914 sq->group = group; 4915 ctrlr = sq->ctrlr; 4916 4917 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 4918 ctrlr_id(ctrlr), sq->qpair.qid, 4919 sq, qpair, group); 4920 4921 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 4922 4923 vu_req = get_nvmf_vfio_user_req(sq); 4924 if (vu_req == NULL) { 4925 return -1; 4926 } 4927 4928 req = &vu_req->req; 4929 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 4930 req->cmd->connect_cmd.cid = 0; 4931 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 4932 req->cmd->connect_cmd.recfmt = 0; 4933 req->cmd->connect_cmd.sqsize = sq->size - 1; 4934 req->cmd->connect_cmd.qid = admin ? 
0 : qpair->qid; 4935 4936 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 4937 req->data = calloc(1, req->length); 4938 if (req->data == NULL) { 4939 nvmf_vfio_user_req_free(req); 4940 return -ENOMEM; 4941 } 4942 4943 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 4944 data->cntlid = ctrlr->cntlid; 4945 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 4946 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 4947 4948 vu_req->cb_fn = handle_queue_connect_rsp; 4949 vu_req->cb_arg = sq; 4950 4951 SPDK_DEBUGLOG(nvmf_vfio, 4952 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 4953 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 4954 4955 spdk_nvmf_request_exec_fabrics(req); 4956 return 0; 4957 } 4958 4959 static int 4960 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 4961 struct spdk_nvmf_qpair *qpair) 4962 { 4963 struct nvmf_vfio_user_sq *sq; 4964 struct nvmf_vfio_user_poll_group *vu_group; 4965 4966 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4967 4968 SPDK_DEBUGLOG(nvmf_vfio, 4969 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 4970 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 4971 4972 4973 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4974 TAILQ_REMOVE(&vu_group->sqs, sq, link); 4975 4976 return 0; 4977 } 4978 4979 static void 4980 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 4981 { 4982 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 4983 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 4984 vu_req->iovcnt = 0; 4985 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 4986 4987 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 4988 } 4989 4990 static int 4991 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 4992 { 4993 struct nvmf_vfio_user_sq *sq; 4994 struct nvmf_vfio_user_req *vu_req; 4995 4996 assert(req != NULL); 4997 4998 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 4999 
sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5000 5001 _nvmf_vfio_user_req_free(sq, vu_req); 5002 5003 return 0; 5004 } 5005 5006 static int 5007 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5008 { 5009 struct nvmf_vfio_user_sq *sq; 5010 struct nvmf_vfio_user_req *vu_req; 5011 5012 assert(req != NULL); 5013 5014 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5015 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5016 5017 if (vu_req->cb_fn != NULL) { 5018 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5019 fail_ctrlr(sq->ctrlr); 5020 } 5021 } 5022 5023 _nvmf_vfio_user_req_free(sq, vu_req); 5024 5025 return 0; 5026 } 5027 5028 static void 5029 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5030 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5031 { 5032 struct nvmf_vfio_user_sq *sq; 5033 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5034 struct nvmf_vfio_user_endpoint *endpoint; 5035 5036 assert(qpair != NULL); 5037 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5038 vu_ctrlr = sq->ctrlr; 5039 endpoint = vu_ctrlr->endpoint; 5040 5041 pthread_mutex_lock(&endpoint->lock); 5042 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5043 delete_sq_done(vu_ctrlr, sq); 5044 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5045 endpoint->ctrlr = NULL; 5046 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5047 /* The controller will be freed, we can resume the subsystem 5048 * now so that the endpoint can be ready to accept another 5049 * new connection. 5050 */ 5051 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5052 vfio_user_endpoint_resume_done, endpoint); 5053 } 5054 free_ctrlr(vu_ctrlr); 5055 } 5056 pthread_mutex_unlock(&endpoint->lock); 5057 5058 if (cb_fn) { 5059 cb_fn(cb_arg); 5060 } 5061 } 5062 5063 /** 5064 * Returns a preallocated request, or NULL if there isn't one available. 
5065 */ 5066 static struct nvmf_vfio_user_req * 5067 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5068 { 5069 struct nvmf_vfio_user_req *req; 5070 5071 if (sq == NULL) { 5072 return NULL; 5073 } 5074 5075 req = TAILQ_FIRST(&sq->free_reqs); 5076 if (req == NULL) { 5077 return NULL; 5078 } 5079 5080 TAILQ_REMOVE(&sq->free_reqs, req, link); 5081 5082 return req; 5083 } 5084 5085 static int 5086 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5087 { 5088 uint16_t nr; 5089 uint32_t nlb, nsid; 5090 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5091 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5092 struct spdk_nvmf_ns *ns; 5093 5094 nsid = cmd->nsid; 5095 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5096 if (ns == NULL || ns->bdev == NULL) { 5097 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5098 return -EINVAL; 5099 } 5100 5101 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5102 nr = cmd->cdw10_bits.dsm.nr + 1; 5103 return nr * sizeof(struct spdk_nvme_dsm_range); 5104 } 5105 5106 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5107 return nlb * spdk_bdev_get_block_size(ns->bdev); 5108 } 5109 5110 static int 5111 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5112 { 5113 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5114 uint32_t len = 0; 5115 uint8_t fid; 5116 int iovcnt; 5117 5118 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5119 req->length = 0; 5120 req->data = NULL; 5121 5122 if (req->xfer == SPDK_NVME_DATA_NONE) { 5123 return 0; 5124 } 5125 5126 switch (cmd->opc) { 5127 case SPDK_NVME_OPC_IDENTIFY: 5128 len = 4096; 5129 break; 5130 case SPDK_NVME_OPC_GET_LOG_PAGE: 5131 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 5132 break; 5133 case SPDK_NVME_OPC_GET_FEATURES: 5134 case SPDK_NVME_OPC_SET_FEATURES: 5135 fid = cmd->cdw10_bits.set_features.fid; 5136 switch (fid) { 5137 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5138 len = 4096; 5139 
break; 5140 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5141 len = 256; 5142 break; 5143 case SPDK_NVME_FEAT_TIMESTAMP: 5144 len = 8; 5145 break; 5146 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5147 len = 512; 5148 break; 5149 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5150 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5151 len = 16; 5152 } else { 5153 len = 8; 5154 } 5155 break; 5156 default: 5157 return 0; 5158 } 5159 break; 5160 default: 5161 return 0; 5162 } 5163 5164 /* ADMIN command will not use SGL */ 5165 if (cmd->psdt != 0) { 5166 return -EINVAL; 5167 } 5168 5169 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5170 if (iovcnt < 0) { 5171 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5172 ctrlr_id(ctrlr), cmd->opc); 5173 return -1; 5174 } 5175 req->length = len; 5176 req->data = req->iov[0].iov_base; 5177 req->iovcnt = iovcnt; 5178 5179 return 0; 5180 } 5181 5182 /* 5183 * Map an I/O command's buffers. 5184 * 5185 * Returns 0 on success and -errno on failure. 
5186 */ 5187 static int 5188 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5189 { 5190 int len, iovcnt; 5191 struct spdk_nvme_cmd *cmd; 5192 5193 assert(ctrlr != NULL); 5194 assert(req != NULL); 5195 5196 cmd = &req->cmd->nvme_cmd; 5197 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5198 req->length = 0; 5199 req->data = NULL; 5200 5201 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5202 return 0; 5203 } 5204 5205 len = get_nvmf_io_req_length(req); 5206 if (len < 0) { 5207 return -EINVAL; 5208 } 5209 req->length = len; 5210 5211 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5212 if (iovcnt < 0) { 5213 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5214 return -EFAULT; 5215 } 5216 req->data = req->iov[0].iov_base; 5217 req->iovcnt = iovcnt; 5218 5219 return 0; 5220 } 5221 5222 static int 5223 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5224 struct nvmf_vfio_user_sq *sq) 5225 { 5226 int err; 5227 struct nvmf_vfio_user_req *vu_req; 5228 struct spdk_nvmf_request *req; 5229 5230 assert(ctrlr != NULL); 5231 assert(cmd != NULL); 5232 5233 vu_req = get_nvmf_vfio_user_req(sq); 5234 if (spdk_unlikely(vu_req == NULL)) { 5235 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5236 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5237 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5238 5239 } 5240 req = &vu_req->req; 5241 5242 assert(req->qpair != NULL); 5243 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5244 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5245 5246 vu_req->cb_fn = handle_cmd_rsp; 5247 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5248 req->cmd->nvme_cmd = *cmd; 5249 5250 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5251 err = map_admin_cmd_req(ctrlr, req); 5252 } else { 5253 switch (cmd->opc) { 5254 case 
SPDK_NVME_OPC_RESERVATION_REGISTER: 5255 case SPDK_NVME_OPC_RESERVATION_REPORT: 5256 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5257 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5258 err = -ENOTSUP; 5259 break; 5260 default: 5261 err = map_io_cmd_req(ctrlr, req); 5262 break; 5263 } 5264 } 5265 5266 if (spdk_unlikely(err < 0)) { 5267 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5268 ctrlr_id(ctrlr), cmd->opc); 5269 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5270 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5271 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5272 _nvmf_vfio_user_req_free(sq, vu_req); 5273 return err; 5274 } 5275 5276 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5277 spdk_nvmf_request_exec(req); 5278 5279 return 0; 5280 } 5281 5282 /* 5283 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5284 * here: if the host isn't up to date, and is apparently not actively processing 5285 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5286 */ 5287 static void 5288 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5289 struct nvmf_vfio_user_sq *sq) 5290 { 5291 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5292 uint32_t cq_head; 5293 uint32_t cq_tail; 5294 5295 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5296 return; 5297 } 5298 5299 cq_tail = *cq_tailp(cq); 5300 5301 /* Already sent? 
*/ 5302 if (cq_tail == cq->last_trigger_irq_tail) { 5303 return; 5304 } 5305 5306 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5307 cq_head = *cq_dbl_headp(cq); 5308 5309 if (cq_head != cq_tail && cq_head == cq->last_head) { 5310 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5311 if (err != 0) { 5312 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5313 ctrlr_id(ctrlr)); 5314 } else { 5315 cq->last_trigger_irq_tail = cq_tail; 5316 } 5317 } 5318 5319 cq->last_head = cq_head; 5320 } 5321 5322 /* Returns the number of commands processed, or a negative value on error. */ 5323 static int 5324 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5325 { 5326 struct nvmf_vfio_user_ctrlr *ctrlr; 5327 uint32_t new_tail; 5328 int count = 0; 5329 5330 assert(sq != NULL); 5331 5332 ctrlr = sq->ctrlr; 5333 5334 /* 5335 * A quiesced, or migrating, controller should never process new 5336 * commands. 5337 */ 5338 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 5339 return SPDK_POLLER_IDLE; 5340 } 5341 5342 if (ctrlr->adaptive_irqs_enabled) { 5343 handle_suppressed_irq(ctrlr, sq); 5344 } 5345 5346 /* On aarch64 platforms, doorbells update from guest VM may not be seen 5347 * on SPDK target side. This is because there is memory type mismatch 5348 * situation here. That is on guest VM side, the doorbells are treated as 5349 * device memory while on SPDK target side, it is treated as normal 5350 * memory. And this situation cause problem on ARM platform. 5351 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5352 * Memory-aliasing-and-mismatched-memory-types". Only using spdk_mb() 5353 * cannot fix this. Use "dc civac" to invalidate cache may solve 5354 * this. 5355 */ 5356 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5357 5358 /* Load-Acquire. 
*/ 5359 new_tail = *sq_dbl_tailp(sq); 5360 5361 new_tail = new_tail & 0xffffu; 5362 if (spdk_unlikely(new_tail >= sq->size)) { 5363 union spdk_nvme_async_event_completion event = {}; 5364 5365 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5366 new_tail); 5367 event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR; 5368 event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE; 5369 nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event); 5370 5371 return -1; 5372 } 5373 5374 if (*sq_headp(sq) == new_tail) { 5375 return 0; 5376 } 5377 5378 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5379 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5380 if (ctrlr->sdbl != NULL) { 5381 SPDK_DEBUGLOG(nvmf_vfio, 5382 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5383 ctrlr_id(ctrlr), sq->qid, 5384 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5385 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5386 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5387 } 5388 5389 /* 5390 * Ensure that changes to the queue are visible to us. 5391 * The host driver should write the queue first, do a wmb(), and then 5392 * update the SQ tail doorbell (their Store-Release). 5393 */ 5394 spdk_rmb(); 5395 5396 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5397 if (count < 0) { 5398 fail_ctrlr(ctrlr); 5399 } 5400 5401 return count; 5402 } 5403 5404 /* 5405 * vfio-user transport poll handler. Note that the library context is polled in 5406 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the 5407 * active SQs. 5408 * 5409 * Returns the number of commands processed, or a negative value on error. 
5410 */ 5411 static int 5412 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 5413 { 5414 struct nvmf_vfio_user_poll_group *vu_group; 5415 struct nvmf_vfio_user_sq *sq, *tmp; 5416 int count = 0; 5417 5418 assert(group != NULL); 5419 5420 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5421 5422 SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n"); 5423 5424 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) { 5425 int ret; 5426 5427 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 5428 continue; 5429 } 5430 5431 ret = nvmf_vfio_user_sq_poll(sq); 5432 5433 if (ret < 0) { 5434 return ret; 5435 } 5436 5437 count += ret; 5438 } 5439 5440 return count; 5441 } 5442 5443 static int 5444 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5445 struct spdk_nvme_transport_id *trid) 5446 { 5447 struct nvmf_vfio_user_sq *sq; 5448 struct nvmf_vfio_user_ctrlr *ctrlr; 5449 5450 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5451 ctrlr = sq->ctrlr; 5452 5453 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5454 return 0; 5455 } 5456 5457 static int 5458 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 5459 struct spdk_nvme_transport_id *trid) 5460 { 5461 return 0; 5462 } 5463 5464 static int 5465 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5466 struct spdk_nvme_transport_id *trid) 5467 { 5468 struct nvmf_vfio_user_sq *sq; 5469 struct nvmf_vfio_user_ctrlr *ctrlr; 5470 5471 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5472 ctrlr = sq->ctrlr; 5473 5474 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5475 return 0; 5476 } 5477 5478 static void 5479 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 5480 struct spdk_nvmf_request *req) 5481 { 5482 struct spdk_nvmf_request *req_to_abort = NULL; 5483 struct spdk_nvmf_request *temp_req = NULL; 5484 uint16_t cid; 5485 5486 cid = 
req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5487 5488 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) { 5489 struct nvmf_vfio_user_req *vu_req; 5490 5491 vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req); 5492 5493 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 5494 req_to_abort = temp_req; 5495 break; 5496 } 5497 } 5498 5499 if (req_to_abort == NULL) { 5500 spdk_nvmf_request_complete(req); 5501 return; 5502 } 5503 5504 req->req_to_abort = req_to_abort; 5505 nvmf_ctrlr_abort_request(req); 5506 } 5507 5508 static void 5509 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 5510 { 5511 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 5512 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 5513 opts->in_capsule_data_size = 0; 5514 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 5515 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 5516 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 5517 opts->num_shared_buffers = 0; 5518 opts->buf_cache_size = 0; 5519 opts->association_timeout = 0; 5520 opts->transport_specific = NULL; 5521 } 5522 5523 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 5524 .name = "VFIOUSER", 5525 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 5526 .opts_init = nvmf_vfio_user_opts_init, 5527 .create = nvmf_vfio_user_create, 5528 .destroy = nvmf_vfio_user_destroy, 5529 5530 .listen = nvmf_vfio_user_listen, 5531 .stop_listen = nvmf_vfio_user_stop_listen, 5532 .cdata_init = nvmf_vfio_user_cdata_init, 5533 .listen_associate = nvmf_vfio_user_listen_associate, 5534 5535 .listener_discover = nvmf_vfio_user_discover, 5536 5537 .poll_group_create = nvmf_vfio_user_poll_group_create, 5538 .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group, 5539 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 5540 .poll_group_add = nvmf_vfio_user_poll_group_add, 5541 .poll_group_remove = 
nvmf_vfio_user_poll_group_remove, 5542 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 5543 5544 .req_free = nvmf_vfio_user_req_free, 5545 .req_complete = nvmf_vfio_user_req_complete, 5546 5547 .qpair_fini = nvmf_vfio_user_close_qpair, 5548 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 5549 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 5550 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 5551 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 5552 }; 5553 5554 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 5555 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 5556 SPDK_LOG_REGISTER_COMPONENT(vfio_user_db) 5557