1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2020 Intel Corporation. 3 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved. 4 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 /* 8 * NVMe over vfio-user transport 9 */ 10 11 #include <sys/param.h> 12 13 #include <vfio-user/libvfio-user.h> 14 #include <vfio-user/pci_defs.h> 15 16 #include "spdk/barrier.h" 17 #include "spdk/stdinc.h" 18 #include "spdk/assert.h" 19 #include "spdk/thread.h" 20 #include "spdk/nvmf_transport.h" 21 #include "spdk/sock.h" 22 #include "spdk/string.h" 23 #include "spdk/util.h" 24 #include "spdk/log.h" 25 26 #include "transport.h" 27 28 #include "nvmf_internal.h" 29 30 #define SWAP(x, y) \ 31 do \ 32 { \ 33 typeof(x) _tmp = x; \ 34 x = y; \ 35 y = _tmp; \ 36 } while (0) 37 38 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 39 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 40 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 41 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 42 43 #define NVME_DOORBELLS_OFFSET 0x1000 44 #define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2 45 #define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3 46 #define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX 47 48 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 512 49 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 50 51 /* NVMe spec 1.4, section 5.21.1.7 */ 52 SPDK_STATIC_ASSERT(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR >= 2 && 53 NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR <= 65535, 54 "bad number of queues"); 55 56 /* 57 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 58 * available on PCI-X 2.0 and PCI Express buses 59 */ 60 #define NVME_REG_CFG_SIZE 0x1000 61 62 /* 63 * Doorbells must be page aligned so that they can memory mapped. 64 * 65 * TODO does the NVMe spec also require this? Document it. 66 */ 67 #define NVMF_VFIO_USER_DOORBELLS_SIZE \ 68 SPDK_ALIGN_CEIL( \ 69 (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2 * SPDK_NVME_DOORBELL_REGISTER_SIZE), \ 70 0x1000) 71 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 72 73 /* 74 * TODO check the PCI spec whether BAR4 and BAR5 really have to be at least one 75 * page and a multiple of page size (maybe QEMU also needs this?). Document all 76 * this. 77 */ 78 79 /* 80 * MSI-X Pending Bit Array Size 81 * 82 * TODO according to the PCI spec we need one bit per vector, document the 83 * relevant section. 84 * 85 * If the first argument to SPDK_ALIGN_CEIL is 0 then the result is 0, so we 86 * would end up with a 0-size BAR5. 
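 *
 * As a worked example of the sizing below (illustrative arithmetic only):
 * with NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR == 512, NVME_IRQ_MSIX_NUM is 512
 * vectors, so the PBA needs 512 / CHAR_BIT == 64 bytes and BAR5 rounds up to
 * a single 0x1000 page, while the MSI-X table in BAR4 needs 512 * 16 == 8192
 * bytes, i.e. two pages.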
 */
#define NVME_IRQ_MSIX_NUM MAX(CHAR_BIT, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR)
#define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / CHAR_BIT), 0x1000)
SPDK_STATIC_ASSERT(NVME_BAR5_SIZE > 0, "Incorrect size");

/* MSI-X Table Size */
#define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000)
SPDK_STATIC_ASSERT(NVME_BAR4_SIZE > 0, "Incorrect size");

struct nvmf_vfio_user_req;

typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);

/* 1 more for PRP2 list itself */
#define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1)

enum nvmf_vfio_user_req_state {
	VFIO_USER_REQUEST_STATE_FREE = 0,
	VFIO_USER_REQUEST_STATE_EXECUTING,
};

/*
 * Support for live migration in NVMf/vfio-user: live migration is implemented
 * by stopping the NVMf subsystem when the device is instructed to enter the
 * stop-and-copy state and then trivially, and most importantly safely,
 * collecting migration state and providing it to the vfio-user client. We
 * don't provide any migration state at the pre-copy state as that's too
 * complicated to do; we might support this in the future.
 */


/* NVMe device state representation */
struct nvme_migr_sq_state {
	uint16_t sqid;
	uint16_t cqid;
	uint32_t head;
	uint32_t size;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size");

struct nvme_migr_cq_state {
	uint16_t cqid;
	uint16_t phase;
	uint32_t tail;
	uint32_t size;
	uint32_t iv;
	uint32_t ien;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size");

#define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23

/* The device state is in the VFIO MIGRATION BAR(9) region; keep the device state page aligned.
 *
 * The NVMe device migration region is defined as below:
 * -------------------------------------------------------------------------
 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs |
 * -------------------------------------------------------------------------
 *
 * Keep vfio_user_nvme_migr_header at a fixed length of 0x1000; any newly added
 * fields should use the reserved space at the end of the data structure.
 */
struct vfio_user_nvme_migr_header {
	/* Magic value to validate migration data */
	uint32_t magic;
	/* Version used to check that the data is the same from source to destination */
	uint32_t version;

	/* The library uses this field to know how many fields in this
	 * structure are valid, starting at the beginning of this data
	 * structure. Newly added fields should use the `unused` memory
	 * space.
	 */
	uint32_t opts_size;
	uint32_t reserved0;

	/* BARs information */
	uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS];
	uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS];

	/* Queue pair start offset, starting at the beginning of this
	 * data structure.
	 */
	uint64_t qp_offset;
	uint64_t qp_len;

	/* Controller data structure */
	uint32_t num_io_queues;
	uint32_t reserved1;

	/* NVMf controller data offset and length, if present, starting at
	 * the beginning of this data structure.
	 */
	uint64_t nvmf_data_offset;
	uint64_t nvmf_data_len;

	/*
	 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA
	 * address.
	 */
	uint32_t sdbl;

	/* Shadow doorbell DMA addresses. */
	uint64_t shadow_doorbell_buffer;
	uint64_t eventidx_buffer;

	/* Reserved memory space for newly added fields; this field is
	 * always at the end of this data structure.
	 */
	uint8_t unused[3856];
};
SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");

struct vfio_user_nvme_migr_qp {
	struct nvme_migr_sq_state sq;
	struct nvme_migr_cq_state cq;
};

/* NVMe state definition used to load/restore from/to the NVMe migration BAR region */
struct vfio_user_nvme_migr_state {
	struct vfio_user_nvme_migr_header ctrlr_header;
	struct spdk_nvmf_ctrlr_migr_data nvmf_data;
	struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	uint8_t doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE];
	uint8_t cfg[NVME_REG_CFG_SIZE];
};

struct nvmf_vfio_user_req {
	struct spdk_nvmf_request req;
	struct spdk_nvme_cpl rsp;
	struct spdk_nvme_cmd cmd;

	enum nvmf_vfio_user_req_state state;
	nvmf_vfio_user_req_cb_fn cb_fn;
	void *cb_arg;

	/* old CC before prop_set_cc fabric command */
	union spdk_nvme_cc_register cc;

	TAILQ_ENTRY(nvmf_vfio_user_req) link;

	struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS];
	uint8_t iovcnt;

	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
	uint8_t sg[];
};

/*
 * Mapping of an NVMe queue.
 *
 * This holds the information tracking a local process mapping of an NVMe queue
 * shared by the client.
 */
struct nvme_q_mapping {
	/* iov of local process mapping. */
	struct iovec iov;
	/* Stored sg, needed for unmap. */
	dma_sg_t *sg;
	/* Client PRP of queue. */
	uint64_t prp1;
};

enum nvmf_vfio_user_sq_state {
	VFIO_USER_SQ_UNUSED = 0,
	VFIO_USER_SQ_CREATED,
	VFIO_USER_SQ_DELETED,
	VFIO_USER_SQ_ACTIVE,
	VFIO_USER_SQ_INACTIVE
};

enum nvmf_vfio_user_cq_state {
	VFIO_USER_CQ_UNUSED = 0,
	VFIO_USER_CQ_CREATED,
	VFIO_USER_CQ_DELETED,
};

enum nvmf_vfio_user_ctrlr_state {
	VFIO_USER_CTRLR_CREATING = 0,
	VFIO_USER_CTRLR_RUNNING,
	/* Quiesce requested by libvfio-user */
	VFIO_USER_CTRLR_PAUSING,
	/* NVMf subsystem is paused; it's safe to do PCI reset, memory register,
	 * memory unregister, and vfio migration state transition in this state.
	 */
	VFIO_USER_CTRLR_PAUSED,
	/*
	 * Implies that the NVMf subsystem is paused. The device will be unquiesced (PCI
	 * reset, memory register and unregister; the controller in the destination VM
	 * has been restored). NVMf subsystem resume has been requested.
	 */
	VFIO_USER_CTRLR_RESUMING,
	/*
	 * Implies that the NVMf subsystem is paused. Both the controller in the source
	 * VM and the one in the destination VM are in this state while doing live
	 * migration.
	 */
	VFIO_USER_CTRLR_MIGRATING
};

struct nvmf_vfio_user_sq {
	struct spdk_nvmf_qpair qpair;
	struct spdk_nvmf_transport_poll_group *group;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_sq_state sq_state;

	uint32_t head;
	volatile uint32_t *dbl_tailp;

	/* Whether a shadow doorbell eventidx needs setting.
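	 * It is set when the SQ is switched over to shadow doorbells (see
	 * vfio_user_ctrlr_switch_doorbells() and handle_create_io_sq()) and
	 * cleared once set_sq_eventidx() observes a tail equal to the SQ head.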
*/ 305 bool need_rearm; 306 307 /* multiple SQs can be mapped to the same CQ */ 308 uint16_t cqid; 309 310 /* handle_queue_connect_rsp() can be used both for CREATE IO SQ response 311 * and SQ re-connect response in the destination VM, for the prior case, 312 * we will post a NVMe completion to VM, we will not set this flag when 313 * re-connecting SQs in the destination VM. 314 */ 315 bool post_create_io_sq_completion; 316 /* Copy of Create IO SQ command, this field is used together with 317 * `post_create_io_sq_completion` flag. 318 */ 319 struct spdk_nvme_cmd create_io_sq_cmd; 320 321 /* Currently unallocated reqs. */ 322 TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs; 323 /* Poll group entry */ 324 TAILQ_ENTRY(nvmf_vfio_user_sq) link; 325 /* Connected SQ entry */ 326 TAILQ_ENTRY(nvmf_vfio_user_sq) tailq; 327 }; 328 329 struct nvmf_vfio_user_cq { 330 struct spdk_nvmf_transport_poll_group *group; 331 struct spdk_thread *thread; 332 int cq_ref; 333 334 uint32_t qid; 335 /* Number of entries in queue. */ 336 uint32_t size; 337 struct nvme_q_mapping mapping; 338 enum nvmf_vfio_user_cq_state cq_state; 339 340 uint32_t tail; 341 volatile uint32_t *dbl_headp; 342 343 bool phase; 344 345 uint16_t iv; 346 bool ien; 347 348 uint32_t last_head; 349 uint32_t last_trigger_irq_tail; 350 }; 351 352 struct nvmf_vfio_user_poll_group { 353 struct spdk_nvmf_transport_poll_group group; 354 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 355 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 356 struct spdk_interrupt *intr; 357 int intr_fd; 358 }; 359 360 struct nvmf_vfio_user_shadow_doorbells { 361 volatile uint32_t *shadow_doorbells; 362 volatile uint32_t *eventidxs; 363 dma_sg_t *sgs; 364 struct iovec *iovs; 365 }; 366 367 struct nvmf_vfio_user_ctrlr { 368 struct nvmf_vfio_user_endpoint *endpoint; 369 struct nvmf_vfio_user_transport *transport; 370 371 /* Connected SQs list */ 372 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 373 enum nvmf_vfio_user_ctrlr_state state; 374 375 /* 376 * Tells whether live migration data have been prepared. This is used 377 * by the get_pending_bytes callback to tell whether or not the 378 * previous iteration finished. 379 */ 380 bool migr_data_prepared; 381 382 /* Controller is in source VM when doing live migration */ 383 bool in_source_vm; 384 385 struct spdk_thread *thread; 386 struct spdk_poller *vfu_ctx_poller; 387 struct spdk_interrupt *intr; 388 int intr_fd; 389 390 bool queued_quiesce; 391 392 bool reset_shn; 393 bool disconnect; 394 395 uint16_t cntlid; 396 struct spdk_nvmf_ctrlr *ctrlr; 397 398 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 399 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 400 401 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 402 403 volatile uint32_t *bar0_doorbells; 404 struct nvmf_vfio_user_shadow_doorbells *sdbl; 405 /* 406 * Shadow doorbells PRPs to provide during the stop-and-copy state. 407 */ 408 uint64_t shadow_doorbell_buffer; 409 uint64_t eventidx_buffer; 410 411 bool adaptive_irqs_enabled; 412 }; 413 414 /* Endpoint in vfio-user is associated with a socket file, which 415 * is the representative of a PCI endpoint. 
416 */ 417 struct nvmf_vfio_user_endpoint { 418 struct nvmf_vfio_user_transport *transport; 419 vfu_ctx_t *vfu_ctx; 420 struct spdk_poller *accept_poller; 421 struct spdk_thread *accept_thread; 422 bool interrupt_mode; 423 struct msixcap *msix; 424 vfu_pci_config_space_t *pci_config_space; 425 int devmem_fd; 426 int accept_intr_fd; 427 struct spdk_interrupt *accept_intr; 428 429 volatile uint32_t *bar0_doorbells; 430 431 int migr_fd; 432 void *migr_data; 433 434 struct spdk_nvme_transport_id trid; 435 struct spdk_nvmf_subsystem *subsystem; 436 437 /* Controller is associated with an active socket connection, 438 * the lifecycle of the controller is same as the VM. 439 * Currently we only support one active connection, as the NVMe 440 * specification defines, we may support multiple controllers in 441 * future, so that it can support e.g: RESERVATION. 442 */ 443 struct nvmf_vfio_user_ctrlr *ctrlr; 444 pthread_mutex_t lock; 445 446 bool need_async_destroy; 447 /* The subsystem is in PAUSED state and need to be resumed, TRUE 448 * only when migration is done successfully and the controller is 449 * in source VM. 450 */ 451 bool need_resume; 452 /* Start the accept poller again after destroying the controller */ 453 bool need_relisten; 454 455 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 456 }; 457 458 struct nvmf_vfio_user_transport_opts { 459 bool disable_mappable_bar0; 460 bool disable_adaptive_irq; 461 bool disable_shadow_doorbells; 462 bool disable_compare; 463 bool enable_intr_mode_sq_spreading; 464 }; 465 466 struct nvmf_vfio_user_transport { 467 struct spdk_nvmf_transport transport; 468 struct nvmf_vfio_user_transport_opts transport_opts; 469 bool intr_mode_supported; 470 pthread_mutex_t lock; 471 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 472 473 pthread_mutex_t pg_lock; 474 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 475 struct nvmf_vfio_user_poll_group *next_pg; 476 }; 477 478 /* 479 * function prototypes 480 */ 481 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 482 483 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 484 485 /* 486 * Local process virtual address of a queue. 
487 */ 488 static inline void * 489 q_addr(struct nvme_q_mapping *mapping) 490 { 491 return mapping->iov.iov_base; 492 } 493 494 static inline int 495 queue_index(uint16_t qid, bool is_cq) 496 { 497 return (qid * 2) + is_cq; 498 } 499 500 static inline volatile uint32_t * 501 sq_headp(struct nvmf_vfio_user_sq *sq) 502 { 503 assert(sq != NULL); 504 return &sq->head; 505 } 506 507 static inline volatile uint32_t * 508 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 509 { 510 assert(sq != NULL); 511 return sq->dbl_tailp; 512 } 513 514 static inline volatile uint32_t * 515 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 516 { 517 assert(cq != NULL); 518 return cq->dbl_headp; 519 } 520 521 static inline volatile uint32_t * 522 cq_tailp(struct nvmf_vfio_user_cq *cq) 523 { 524 assert(cq != NULL); 525 return &cq->tail; 526 } 527 528 static inline void 529 sq_head_advance(struct nvmf_vfio_user_sq *sq) 530 { 531 assert(sq != NULL); 532 533 assert(*sq_headp(sq) < sq->size); 534 (*sq_headp(sq))++; 535 536 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 537 *sq_headp(sq) = 0; 538 } 539 } 540 541 static inline void 542 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 543 { 544 assert(cq != NULL); 545 546 assert(*cq_tailp(cq) < cq->size); 547 (*cq_tailp(cq))++; 548 549 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 550 *cq_tailp(cq) = 0; 551 cq->phase = !cq->phase; 552 } 553 } 554 555 /* 556 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 557 * control: if there is no space in the CQ, we should wait until there is. 558 * 559 * In practice, we just fail the controller instead: as it happens, all host 560 * implementations we care about right-size the CQ: this is required anyway for 561 * NVMEoF support (see 3.3.2.8). 562 * 563 * Since reading the head doorbell is relatively expensive, we use the cached 564 * value, so we only have to read it for real if it appears that we are full. 565 */ 566 static inline bool 567 cq_is_full(struct nvmf_vfio_user_cq *cq) 568 { 569 uint32_t qindex; 570 571 assert(cq != NULL); 572 573 qindex = *cq_tailp(cq) + 1; 574 if (spdk_unlikely(qindex == cq->size)) { 575 qindex = 0; 576 } 577 578 if (qindex != cq->last_head) { 579 return false; 580 } 581 582 cq->last_head = *cq_dbl_headp(cq); 583 584 return qindex == cq->last_head; 585 } 586 587 static bool 588 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 589 { 590 assert(vu_ctrlr != NULL); 591 592 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 593 return false; 594 } 595 596 if (is_cq) { 597 if (vu_ctrlr->cqs[qid] == NULL) { 598 return false; 599 } 600 601 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 602 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 603 } 604 605 if (vu_ctrlr->sqs[qid] == NULL) { 606 return false; 607 } 608 609 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 610 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 611 } 612 613 static char * 614 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 615 { 616 return endpoint->trid.traddr; 617 } 618 619 static char * 620 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 621 { 622 if (!ctrlr || !ctrlr->endpoint) { 623 return "Null Ctrlr"; 624 } 625 626 return endpoint_id(ctrlr->endpoint); 627 } 628 629 /* Return the poll group for the admin queue of the controller. 
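 * Only valid while the admin SQ exists, since vu_ctrlr->sqs[0] is
 * dereferenced without a NULL check.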
*/ 630 static inline struct nvmf_vfio_user_poll_group * 631 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 632 { 633 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 634 struct nvmf_vfio_user_poll_group, 635 group); 636 } 637 638 static inline struct spdk_thread * 639 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 640 { 641 return vu_pg->group.group->thread; 642 } 643 644 static dma_sg_t * 645 index_to_sg_t(void *arr, size_t i) 646 { 647 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 648 } 649 650 static inline size_t 651 vfio_user_migr_data_len(void) 652 { 653 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 654 } 655 656 static inline bool 657 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 658 { 659 return spdk_interrupt_mode_is_enabled() && 660 vu_transport->intr_mode_supported; 661 } 662 663 static int vfio_user_ctrlr_intr(void *ctx); 664 665 static void 666 vfio_user_msg_ctrlr_intr(void *ctx) 667 { 668 vfio_user_ctrlr_intr(ctx); 669 } 670 671 /* 672 * Kick (force a wakeup) of all poll groups for this controller. 673 * vfio_user_ctrlr_intr() itself arranges for kicking other poll groups if 674 * needed. 675 */ 676 static void 677 ctrlr_kick(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 678 { 679 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 680 681 SPDK_DEBUGLOG(vfio_user_db, "%s: kicked\n", ctrlr_id(vu_ctrlr)); 682 683 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 684 685 spdk_thread_send_msg(poll_group_to_thread(vu_ctrlr_group), 686 vfio_user_msg_ctrlr_intr, vu_ctrlr); 687 } 688 689 /* 690 * Make the given DMA address and length available (locally mapped) via iov. 691 */ 692 static void * 693 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 694 struct iovec *iov, int prot) 695 { 696 int ret; 697 698 assert(ctx != NULL); 699 assert(sg != NULL); 700 assert(iov != NULL); 701 702 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 703 if (ret < 0) { 704 return NULL; 705 } 706 707 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 708 if (ret != 0) { 709 return NULL; 710 } 711 712 assert(iov->iov_base != NULL); 713 return iov->iov_base; 714 } 715 716 static int 717 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 718 uint32_t max_iovcnt, uint32_t len, size_t mps, 719 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 720 { 721 uint64_t prp1, prp2; 722 void *vva; 723 uint32_t i; 724 uint32_t residue_len, nents; 725 uint64_t *prp_list; 726 uint32_t iovcnt; 727 728 assert(max_iovcnt > 0); 729 730 prp1 = cmd->dptr.prp.prp1; 731 prp2 = cmd->dptr.prp.prp2; 732 733 /* PRP1 may started with unaligned page address */ 734 residue_len = mps - (prp1 % mps); 735 residue_len = spdk_min(len, residue_len); 736 737 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 738 if (spdk_unlikely(vva == NULL)) { 739 SPDK_ERRLOG("GPA to VVA failed\n"); 740 return -EINVAL; 741 } 742 len -= residue_len; 743 if (len && max_iovcnt < 2) { 744 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 745 return -ERANGE; 746 } 747 iovs[0].iov_base = vva; 748 iovs[0].iov_len = residue_len; 749 750 if (len) { 751 if (spdk_unlikely(prp2 == 0)) { 752 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 753 return -EINVAL; 754 } 755 756 if (len <= mps) { 757 /* 2 PRP used */ 758 iovcnt = 2; 759 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 760 if (spdk_unlikely(vva == NULL)) { 761 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 762 prp2, len); 
763 return -EINVAL; 764 } 765 iovs[1].iov_base = vva; 766 iovs[1].iov_len = len; 767 } else { 768 /* PRP list used */ 769 nents = (len + mps - 1) / mps; 770 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 771 SPDK_ERRLOG("Too many page entries\n"); 772 return -ERANGE; 773 } 774 775 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 776 if (spdk_unlikely(vva == NULL)) { 777 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 778 prp2, nents); 779 return -EINVAL; 780 } 781 prp_list = vva; 782 i = 0; 783 while (len != 0) { 784 residue_len = spdk_min(len, mps); 785 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 786 if (spdk_unlikely(vva == NULL)) { 787 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 788 prp_list[i], residue_len); 789 return -EINVAL; 790 } 791 iovs[i + 1].iov_base = vva; 792 iovs[i + 1].iov_len = residue_len; 793 len -= residue_len; 794 i++; 795 } 796 iovcnt = i + 1; 797 } 798 } else { 799 /* 1 PRP used */ 800 iovcnt = 1; 801 } 802 803 assert(iovcnt <= max_iovcnt); 804 return iovcnt; 805 } 806 807 static int 808 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 809 struct iovec *iovs, uint32_t max_iovcnt, 810 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 811 { 812 uint32_t i; 813 void *vva; 814 815 if (spdk_unlikely(max_iovcnt < num_sgls)) { 816 return -ERANGE; 817 } 818 819 for (i = 0; i < num_sgls; i++) { 820 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 821 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 822 return -EINVAL; 823 } 824 vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 825 if (spdk_unlikely(vva == NULL)) { 826 SPDK_ERRLOG("GPA to VVA failed\n"); 827 return -EINVAL; 828 } 829 iovs[i].iov_base = vva; 830 iovs[i].iov_len = sgls[i].unkeyed.length; 831 } 832 833 return num_sgls; 834 } 835 836 static int 837 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 838 uint32_t len, size_t mps, 839 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 840 { 841 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 842 uint32_t num_sgls, seg_len; 843 void *vva; 844 int ret; 845 uint32_t total_iovcnt = 0; 846 847 /* SGL cases */ 848 sgl = &cmd->dptr.sgl1; 849 850 /* only one SGL segment */ 851 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 852 assert(max_iovcnt > 0); 853 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 854 if (spdk_unlikely(vva == NULL)) { 855 SPDK_ERRLOG("GPA to VVA failed\n"); 856 return -EINVAL; 857 } 858 iovs[0].iov_base = vva; 859 iovs[0].iov_len = sgl->unkeyed.length; 860 assert(sgl->unkeyed.length == len); 861 862 return 1; 863 } 864 865 for (;;) { 866 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 867 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 868 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 869 return -EINVAL; 870 } 871 872 seg_len = sgl->unkeyed.length; 873 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 874 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 875 return -EINVAL; 876 } 877 878 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 879 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 880 if (spdk_unlikely(vva == NULL)) { 881 SPDK_ERRLOG("GPA to VVA failed\n"); 882 return -EINVAL; 883 } 884 885 /* sgl point to the first segment */ 886 sgl = 
(struct spdk_nvme_sgl_descriptor *)vva; 887 last_sgl = &sgl[num_sgls - 1]; 888 889 /* we are done */ 890 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 891 /* map whole sgl list */ 892 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 893 max_iovcnt - total_iovcnt, gpa_to_vva); 894 if (spdk_unlikely(ret < 0)) { 895 return ret; 896 } 897 total_iovcnt += ret; 898 899 return total_iovcnt; 900 } 901 902 if (num_sgls > 1) { 903 /* map whole sgl exclude last_sgl */ 904 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 905 max_iovcnt - total_iovcnt, gpa_to_vva); 906 if (spdk_unlikely(ret < 0)) { 907 return ret; 908 } 909 total_iovcnt += ret; 910 } 911 912 /* move to next level's segments */ 913 sgl = last_sgl; 914 } 915 916 return 0; 917 } 918 919 static int 920 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 921 uint32_t len, size_t mps, 922 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 923 { 924 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 925 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 926 } 927 928 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 929 } 930 931 /* 932 * For each queue, update the location of its doorbell to the correct location: 933 * either our own BAR0, or the guest's configured shadow doorbell area. 934 * 935 * The Admin queue (qid: 0) does not ever use shadow doorbells. 936 */ 937 static void 938 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 939 { 940 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 941 ctrlr->bar0_doorbells; 942 943 assert(doorbells != NULL); 944 945 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 946 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 947 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 948 949 if (sq != NULL) { 950 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 951 952 ctrlr->sqs[i]->need_rearm = shadow; 953 } 954 955 if (cq != NULL) { 956 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 957 } 958 } 959 } 960 961 static void 962 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 963 { 964 assert(vfu_ctx != NULL); 965 assert(sdbl != NULL); 966 967 /* 968 * An allocation error would result in only one of the two being 969 * non-NULL. If that is the case, no memory should have been mapped. 970 */ 971 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 972 return; 973 } 974 975 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 976 struct iovec *iov; 977 dma_sg_t *sg; 978 979 if (!sdbl->iovs[i].iov_len) { 980 continue; 981 } 982 983 sg = index_to_sg_t(sdbl->sgs, i); 984 iov = sdbl->iovs + i; 985 986 vfu_sgl_put(vfu_ctx, sg, iov, 1); 987 } 988 } 989 990 static void 991 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 992 { 993 if (sdbl == NULL) { 994 return; 995 } 996 997 unmap_sdbl(vfu_ctx, sdbl); 998 999 /* 1000 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 1001 * not allocated, so don't free() them. 
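	 * They point into the iovecs filled in by map_one(), and those
	 * mappings have already been released above via unmap_sdbl() /
	 * vfu_sgl_put().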
	 */
	free(sdbl->sgs);
	free(sdbl->iovs);
	free(sdbl);
}

static struct nvmf_vfio_user_shadow_doorbells *
map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len)
{
	struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL;
	dma_sg_t *sg2 = NULL;
	void *p;

	assert(vfu_ctx != NULL);

	sdbl = calloc(1, sizeof(*sdbl));
	if (sdbl == NULL) {
		goto err;
	}

	sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size());
	sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs));
	if (sdbl->sgs == NULL || sdbl->iovs == NULL) {
		goto err;
	}

	/* Map shadow doorbell buffer (PRP1). */
	p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs,
		    PROT_READ | PROT_WRITE);

	if (p == NULL) {
		goto err;
	}

	/*
	 * Map eventidx buffer (PRP2).
	 * Should only be written to by the controller.
	 */

	sg2 = index_to_sg_t(sdbl->sgs, 1);

	p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1,
		    PROT_READ | PROT_WRITE);

	if (p == NULL) {
		goto err;
	}

	sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base;
	sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base;

	return sdbl;

err:
	free_sdbl(vfu_ctx, sdbl);
	return NULL;
}

/*
 * Copy doorbells from one buffer to the other, during switches between BAR0
 * doorbells and shadow doorbells.
 */
static void
copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr,
	       const volatile uint32_t *from, volatile uint32_t *to)
{
	assert(ctrlr != NULL);
	assert(from != NULL);
	assert(to != NULL);

	SPDK_DEBUGLOG(vfio_user_db,
		      "%s: migrating shadow doorbells from %p to %p\n",
		      ctrlr_id(ctrlr), from, to);

	/* Can't use memcpy because it doesn't respect volatile semantics.
*/ 1077 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1078 if (ctrlr->sqs[i] != NULL) { 1079 to[queue_index(i, false)] = from[queue_index(i, false)]; 1080 } 1081 1082 if (ctrlr->cqs[i] != NULL) { 1083 to[queue_index(i, true)] = from[queue_index(i, true)]; 1084 } 1085 } 1086 } 1087 1088 static void 1089 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1090 { 1091 const struct spdk_nvmf_registers *regs; 1092 1093 assert(vu_ctrlr != NULL); 1094 assert(vu_ctrlr->ctrlr != NULL); 1095 1096 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1097 if (regs->csts.bits.cfs == 0) { 1098 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1099 } 1100 1101 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1102 } 1103 1104 static inline bool 1105 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1106 { 1107 assert(vu_ctrlr != NULL); 1108 assert(vu_ctrlr->endpoint != NULL); 1109 1110 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1111 1112 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1113 } 1114 1115 static void 1116 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1117 { 1118 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1119 1120 spdk_interrupt_unregister(&endpoint->accept_intr); 1121 spdk_poller_unregister(&endpoint->accept_poller); 1122 1123 if (endpoint->bar0_doorbells) { 1124 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1125 } 1126 1127 if (endpoint->devmem_fd > 0) { 1128 close(endpoint->devmem_fd); 1129 } 1130 1131 if (endpoint->migr_data) { 1132 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1133 } 1134 1135 if (endpoint->migr_fd > 0) { 1136 close(endpoint->migr_fd); 1137 } 1138 1139 if (endpoint->vfu_ctx) { 1140 vfu_destroy_ctx(endpoint->vfu_ctx); 1141 } 1142 1143 pthread_mutex_destroy(&endpoint->lock); 1144 free(endpoint); 1145 } 1146 1147 /* called when process exits */ 1148 static int 1149 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1150 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1151 { 1152 struct nvmf_vfio_user_transport *vu_transport; 1153 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1154 1155 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1156 1157 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1158 transport); 1159 1160 pthread_mutex_destroy(&vu_transport->lock); 1161 pthread_mutex_destroy(&vu_transport->pg_lock); 1162 1163 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1164 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1165 nvmf_vfio_user_destroy_endpoint(endpoint); 1166 } 1167 1168 free(vu_transport); 1169 1170 if (cb_fn) { 1171 cb_fn(cb_arg); 1172 } 1173 1174 return 0; 1175 } 1176 1177 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1178 { 1179 "disable_mappable_bar0", 1180 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1181 spdk_json_decode_bool, true 1182 }, 1183 { 1184 "disable_adaptive_irq", 1185 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1186 spdk_json_decode_bool, true 1187 }, 1188 { 1189 "disable_shadow_doorbells", 1190 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1191 spdk_json_decode_bool, true 1192 }, 1193 { 1194 "disable_compare", 1195 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1196 spdk_json_decode_bool, true 1197 }, 1198 { 1199 
"enable_intr_mode_sq_spreading", 1200 offsetof(struct nvmf_vfio_user_transport, transport_opts.enable_intr_mode_sq_spreading), 1201 spdk_json_decode_bool, true 1202 }, 1203 }; 1204 1205 static struct spdk_nvmf_transport * 1206 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1207 { 1208 struct nvmf_vfio_user_transport *vu_transport; 1209 int err; 1210 1211 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1212 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1213 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1214 return NULL; 1215 } 1216 1217 vu_transport = calloc(1, sizeof(*vu_transport)); 1218 if (vu_transport == NULL) { 1219 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1220 return NULL; 1221 } 1222 1223 err = pthread_mutex_init(&vu_transport->lock, NULL); 1224 if (err != 0) { 1225 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1226 goto err; 1227 } 1228 TAILQ_INIT(&vu_transport->endpoints); 1229 1230 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1231 if (err != 0) { 1232 pthread_mutex_destroy(&vu_transport->lock); 1233 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1234 goto err; 1235 } 1236 TAILQ_INIT(&vu_transport->poll_groups); 1237 1238 if (opts->transport_specific != NULL && 1239 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1240 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1241 vu_transport)) { 1242 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1243 goto cleanup; 1244 } 1245 1246 /* 1247 * To support interrupt mode, the transport must be configured with 1248 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1249 * when a client writes new doorbell values to BAR0, via the 1250 * libvfio-user socket fd. 1251 */ 1252 vu_transport->intr_mode_supported = 1253 vu_transport->transport_opts.disable_mappable_bar0; 1254 1255 /* 1256 * If BAR0 is mappable, it doesn't make sense to support shadow 1257 * doorbells, so explicitly turn it off. 1258 */ 1259 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1260 vu_transport->transport_opts.disable_shadow_doorbells = true; 1261 } 1262 1263 if (spdk_interrupt_mode_is_enabled()) { 1264 if (!vu_transport->intr_mode_supported) { 1265 SPDK_ERRLOG("interrupt mode not supported\n"); 1266 goto cleanup; 1267 } 1268 1269 /* 1270 * If we are in interrupt mode, we cannot support adaptive IRQs, 1271 * as there is no guarantee the SQ poller will run subsequently 1272 * to send pending IRQs. 
1273 */ 1274 vu_transport->transport_opts.disable_adaptive_irq = true; 1275 } 1276 1277 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1278 vu_transport->transport_opts.disable_mappable_bar0); 1279 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1280 vu_transport->transport_opts.disable_adaptive_irq); 1281 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1282 vu_transport->transport_opts.disable_shadow_doorbells); 1283 1284 return &vu_transport->transport; 1285 1286 cleanup: 1287 pthread_mutex_destroy(&vu_transport->lock); 1288 pthread_mutex_destroy(&vu_transport->pg_lock); 1289 err: 1290 free(vu_transport); 1291 return NULL; 1292 } 1293 1294 static uint32_t 1295 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1296 { 1297 assert(vu_ctrlr != NULL); 1298 assert(vu_ctrlr->ctrlr != NULL); 1299 1300 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1301 } 1302 1303 static uint32_t 1304 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1305 { 1306 assert(vu_ctrlr != NULL); 1307 assert(vu_ctrlr->ctrlr != NULL); 1308 1309 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1310 } 1311 1312 static uintptr_t 1313 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1314 { 1315 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1316 return 1ul << memory_page_shift; 1317 } 1318 1319 static uintptr_t 1320 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1321 { 1322 return ~(memory_page_size(ctrlr) - 1); 1323 } 1324 1325 static int 1326 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1327 uint32_t q_size, bool is_cq, bool unmap) 1328 { 1329 uint64_t len; 1330 void *ret; 1331 1332 assert(q_size); 1333 assert(q_addr(mapping) == NULL); 1334 1335 if (is_cq) { 1336 len = q_size * sizeof(struct spdk_nvme_cpl); 1337 } else { 1338 len = q_size * sizeof(struct spdk_nvme_cmd); 1339 } 1340 1341 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1342 mapping->sg, &mapping->iov, 1343 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1344 if (ret == NULL) { 1345 return -EFAULT; 1346 } 1347 1348 if (unmap) { 1349 memset(q_addr(mapping), 0, len); 1350 } 1351 1352 return 0; 1353 } 1354 1355 static inline void 1356 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1357 { 1358 if (q_addr(mapping) != NULL) { 1359 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1360 &mapping->iov, 1); 1361 mapping->iov.iov_base = NULL; 1362 } 1363 } 1364 1365 static int 1366 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1367 { 1368 struct nvmf_vfio_user_sq *sq; 1369 const struct spdk_nvmf_registers *regs; 1370 int ret; 1371 1372 assert(ctrlr != NULL); 1373 1374 sq = ctrlr->sqs[0]; 1375 1376 assert(sq != NULL); 1377 assert(q_addr(&sq->mapping) == NULL); 1378 /* XXX ctrlr->asq == 0 is a valid memory address */ 1379 1380 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1381 sq->qid = 0; 1382 sq->size = regs->aqa.bits.asqs + 1; 1383 sq->mapping.prp1 = regs->asq; 1384 *sq_headp(sq) = 0; 1385 sq->cqid = 0; 1386 1387 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1388 if (ret) { 1389 return ret; 1390 } 1391 1392 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1393 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1394 1395 *sq_dbl_tailp(sq) = 0; 1396 1397 return 0; 1398 } 1399 1400 /* 1401 * Updates eventidx to set an SQ into interrupt or polling mode. 
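 *
 * In interrupt mode this follows the usual shadow doorbell handshake; an
 * informal sketch of the code below (not a separate helper):
 *
 *     old_tail = *sq_dbl_tailp(sq);    // shadow tail as last written by host
 *     eventidx = old_tail;             // host must write BAR0 once past this
 *     mb();                            // order eventidx write vs. tail re-read
 *     new_tail = *sq_dbl_tailp(sq);
 *     if (new_tail != sq_head)         // host raced us: caller polls and retries
 *             return false;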
1402 * 1403 * Returns false if the current SQ tail does not match the SQ head, as 1404 * this means that the host has submitted more items to the queue while we were 1405 * not looking - or during the event index update. In that case, we must retry, 1406 * or otherwise make sure we are going to wake up again. 1407 */ 1408 static bool 1409 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1410 { 1411 struct nvmf_vfio_user_ctrlr *ctrlr; 1412 volatile uint32_t *sq_tail_eidx; 1413 uint32_t old_tail, new_tail; 1414 1415 assert(sq != NULL); 1416 assert(sq->ctrlr != NULL); 1417 assert(sq->ctrlr->sdbl != NULL); 1418 assert(sq->need_rearm); 1419 assert(sq->qid != 0); 1420 1421 ctrlr = sq->ctrlr; 1422 1423 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1424 ctrlr_id(ctrlr), sq->qid); 1425 1426 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1427 1428 assert(ctrlr->endpoint != NULL); 1429 1430 if (!ctrlr->endpoint->interrupt_mode) { 1431 /* No synchronisation necessary. */ 1432 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1433 return true; 1434 } 1435 1436 old_tail = *sq_dbl_tailp(sq); 1437 *sq_tail_eidx = old_tail; 1438 1439 /* 1440 * Ensure that the event index is updated before re-reading the tail 1441 * doorbell. If it's not, then the host might race us and update the 1442 * tail after the second read but before the event index is written, so 1443 * it won't write to BAR0 and we'll miss the update. 1444 * 1445 * The driver should provide similar ordering with an mb(). 1446 */ 1447 spdk_mb(); 1448 1449 /* 1450 * Check if the host has updated the tail doorbell after we've read it 1451 * for the first time, but before the event index was written. If that's 1452 * the case, then we've lost the race and we need to update the event 1453 * index again (after polling the queue, since the host won't write to 1454 * BAR0). 1455 */ 1456 new_tail = *sq_dbl_tailp(sq); 1457 1458 /* 1459 * We might poll the queue straight after this function returns if the 1460 * tail has been updated, so we need to ensure that any changes to the 1461 * queue will be visible to us if the doorbell has been updated. 1462 * 1463 * The driver should provide similar ordering with a wmb() to ensure 1464 * that the queue is written before it updates the tail doorbell. 1465 */ 1466 spdk_rmb(); 1467 1468 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1469 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1470 new_tail, *sq_headp(sq)); 1471 1472 if (new_tail == *sq_headp(sq)) { 1473 sq->need_rearm = false; 1474 return true; 1475 } 1476 1477 /* 1478 * We've lost the race: the tail was updated since we last polled, 1479 * including if it happened within this routine. 1480 * 1481 * The caller should retry after polling (think of this as a cmpxchg 1482 * loop); if we go to sleep while the SQ is not empty, then we won't 1483 * process the remaining events. 1484 */ 1485 return false; 1486 } 1487 1488 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1489 1490 /* 1491 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1492 * processed some SQ entries. 1493 */ 1494 static int 1495 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1496 struct nvmf_vfio_user_sq *sq) 1497 { 1498 int count = 0; 1499 size_t i; 1500 1501 assert(sq->need_rearm); 1502 1503 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1504 int ret; 1505 1506 if (set_sq_eventidx(sq)) { 1507 /* We won the race and set eventidx; done. 
*/ 1508 return count; 1509 } 1510 1511 ret = nvmf_vfio_user_sq_poll(sq); 1512 1513 count += (ret < 0) ? 1 : ret; 1514 1515 /* 1516 * set_sq_eventidx() hit the race, so we expected 1517 * to process at least one command from this queue. 1518 * If there were no new commands waiting for us, then 1519 * we must have hit an unexpected race condition. 1520 */ 1521 if (ret == 0) { 1522 SPDK_ERRLOG("%s: unexpected race condition detected " 1523 "while updating the shadow doorbell buffer\n", 1524 ctrlr_id(ctrlr)); 1525 1526 fail_ctrlr(ctrlr); 1527 return count; 1528 } 1529 } 1530 1531 SPDK_DEBUGLOG(vfio_user_db, 1532 "%s: set_sq_eventidx() lost the race %zu times\n", 1533 ctrlr_id(ctrlr), i); 1534 1535 /* 1536 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1537 * we raced with the producer too many times; force ourselves to wake up 1538 * instead. We'll process all queues at that point. 1539 */ 1540 ctrlr_kick(ctrlr); 1541 1542 return count; 1543 } 1544 1545 /* 1546 * We're in interrupt mode, and potentially about to go to sleep. We need to 1547 * make sure any further I/O submissions are guaranteed to wake us up: for 1548 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1549 * every SQ that needs re-arming. 1550 * 1551 * Returns non-zero if we processed something. 1552 */ 1553 static int 1554 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1555 { 1556 struct nvmf_vfio_user_sq *sq; 1557 int count = 0; 1558 1559 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1560 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1561 continue; 1562 } 1563 1564 if (sq->need_rearm) { 1565 count += vfio_user_sq_rearm(sq->ctrlr, sq); 1566 } 1567 } 1568 1569 return count; 1570 } 1571 1572 static int 1573 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1574 { 1575 struct nvmf_vfio_user_cq *cq; 1576 const struct spdk_nvmf_registers *regs; 1577 int ret; 1578 1579 assert(ctrlr != NULL); 1580 1581 cq = ctrlr->cqs[0]; 1582 1583 assert(cq != NULL); 1584 1585 assert(q_addr(&cq->mapping) == NULL); 1586 1587 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1588 assert(regs != NULL); 1589 cq->qid = 0; 1590 cq->size = regs->aqa.bits.acqs + 1; 1591 cq->mapping.prp1 = regs->acq; 1592 *cq_tailp(cq) = 0; 1593 cq->ien = true; 1594 cq->phase = true; 1595 1596 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1597 if (ret) { 1598 return ret; 1599 } 1600 1601 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
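	 * Its head doorbell always lives in BAR0; vfio_user_ctrlr_switch_doorbells()
	 * likewise only touches queues starting from qid 1.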
*/ 1602 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1603 1604 *cq_dbl_headp(cq) = 0; 1605 1606 return 0; 1607 } 1608 1609 static void * 1610 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 1611 { 1612 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1613 struct spdk_nvmf_qpair *qpair; 1614 struct nvmf_vfio_user_req *vu_req; 1615 struct nvmf_vfio_user_sq *sq; 1616 void *ret; 1617 1618 assert(req != NULL); 1619 qpair = req->qpair; 1620 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1621 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1622 1623 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1624 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1625 index_to_sg_t(vu_req->sg, vu_req->iovcnt), 1626 &vu_req->iov[vu_req->iovcnt], prot); 1627 if (spdk_likely(ret != NULL)) { 1628 vu_req->iovcnt++; 1629 } 1630 return ret; 1631 } 1632 1633 static int 1634 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1635 struct iovec *iov, uint32_t length) 1636 { 1637 /* Map PRP list to from Guest physical memory to 1638 * virtual memory address. 1639 */ 1640 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1641 length, 4096, _map_one); 1642 } 1643 1644 static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1645 struct nvmf_vfio_user_sq *sq); 1646 1647 /* 1648 * Posts a CQE in the completion queue. 1649 * 1650 * @ctrlr: the vfio-user controller 1651 * @cq: the completion queue 1652 * @cdw0: cdw0 as reported by NVMf 1653 * @sqid: submission queue ID 1654 * @cid: command identifier in NVMe command 1655 * @sc: the NVMe CQE status code 1656 * @sct: the NVMe CQE status code type 1657 */ 1658 static int 1659 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1660 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1661 { 1662 struct spdk_nvme_status cpl_status = { 0 }; 1663 struct spdk_nvme_cpl *cpl; 1664 int err; 1665 1666 assert(ctrlr != NULL); 1667 1668 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1669 return 0; 1670 } 1671 1672 if (cq->qid == 0) { 1673 assert(spdk_get_thread() == cq->thread); 1674 } 1675 1676 if (cq_is_full(cq)) { 1677 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1678 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1679 *cq_dbl_headp(cq)); 1680 return -1; 1681 } 1682 1683 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1684 1685 assert(ctrlr->sqs[sqid] != NULL); 1686 SPDK_DEBUGLOG(nvmf_vfio, 1687 "%s: request complete sqid:%d cid=%d status=%#x " 1688 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1689 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1690 1691 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1692 cpl->sqid = sqid; 1693 cpl->cid = cid; 1694 cpl->cdw0 = cdw0; 1695 1696 /* 1697 * This is a bitfield: instead of setting the individual bits we need 1698 * directly in cpl->status, which would cause a read-modify-write cycle, 1699 * we'll avoid reading from the CPL altogether by filling in a local 1700 * cpl_status variable, then writing the whole thing. 1701 */ 1702 cpl_status.sct = sct; 1703 cpl_status.sc = sc; 1704 cpl_status.p = cq->phase; 1705 cpl->status = cpl_status; 1706 1707 /* Ensure the Completion Queue Entry is visible. 
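	 * Every CQE field, including the status word carrying the new phase bit,
	 * must reach guest memory before we advance our tail copy and possibly
	 * trigger an interrupt.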
*/ 1708 spdk_wmb(); 1709 cq_tail_advance(cq); 1710 1711 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1712 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1713 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1714 if (err != 0) { 1715 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1716 ctrlr_id(ctrlr)); 1717 return err; 1718 } 1719 } 1720 1721 return 0; 1722 } 1723 1724 static void 1725 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1726 { 1727 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1728 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1729 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1730 free(vu_req); 1731 } 1732 } 1733 1734 static void 1735 delete_cq_done(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq) 1736 { 1737 assert(cq->cq_ref == 0); 1738 unmap_q(ctrlr, &cq->mapping); 1739 cq->size = 0; 1740 cq->cq_state = VFIO_USER_CQ_DELETED; 1741 cq->group = NULL; 1742 } 1743 1744 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1745 * and the controller is being shut down/reset or vfio-user client disconnects, 1746 * then the CQ is also deleted. 1747 */ 1748 static void 1749 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1750 { 1751 struct nvmf_vfio_user_cq *cq; 1752 uint16_t cqid; 1753 1754 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1755 sq->qid, sq); 1756 1757 /* Free SQ resources */ 1758 unmap_q(vu_ctrlr, &sq->mapping); 1759 1760 free_sq_reqs(sq); 1761 1762 sq->size = 0; 1763 1764 sq->sq_state = VFIO_USER_SQ_DELETED; 1765 1766 /* Controller RESET and SHUTDOWN are special cases, 1767 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1768 * will disconnect IO queue pairs. 1769 */ 1770 if (vu_ctrlr->reset_shn || vu_ctrlr->disconnect) { 1771 cqid = sq->cqid; 1772 cq = vu_ctrlr->cqs[cqid]; 1773 1774 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1775 cq->qid, cq); 1776 1777 assert(cq->cq_ref > 0); 1778 if (--cq->cq_ref == 0) { 1779 delete_cq_done(vu_ctrlr, cq); 1780 } 1781 } 1782 } 1783 1784 static void 1785 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1786 { 1787 struct nvmf_vfio_user_sq *sq; 1788 struct nvmf_vfio_user_cq *cq; 1789 1790 if (ctrlr == NULL) { 1791 return; 1792 } 1793 1794 sq = ctrlr->sqs[qid]; 1795 if (sq) { 1796 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid); 1797 unmap_q(ctrlr, &sq->mapping); 1798 1799 free_sq_reqs(sq); 1800 1801 free(sq->mapping.sg); 1802 free(sq); 1803 ctrlr->sqs[qid] = NULL; 1804 } 1805 1806 cq = ctrlr->cqs[qid]; 1807 if (cq) { 1808 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1809 unmap_q(ctrlr, &cq->mapping); 1810 free(cq->mapping.sg); 1811 free(cq); 1812 ctrlr->cqs[qid] = NULL; 1813 } 1814 } 1815 1816 static int 1817 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1818 const uint16_t id) 1819 { 1820 struct nvmf_vfio_user_sq *sq; 1821 1822 assert(ctrlr != NULL); 1823 assert(transport != NULL); 1824 assert(ctrlr->sqs[id] == NULL); 1825 1826 sq = calloc(1, sizeof(*sq)); 1827 if (sq == NULL) { 1828 return -ENOMEM; 1829 } 1830 sq->mapping.sg = calloc(1, dma_sg_size()); 1831 if (sq->mapping.sg == NULL) { 1832 free(sq); 1833 return -ENOMEM; 1834 } 1835 1836 sq->qid = id; 1837 sq->qpair.qid = id; 1838 sq->qpair.transport = transport; 1839 sq->ctrlr = ctrlr; 1840 ctrlr->sqs[id] = sq; 1841 1842 TAILQ_INIT(&sq->free_reqs); 1843 1844 return 0; 1845 } 1846 1847 static int 1848 init_cq(struct 
nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1849 { 1850 struct nvmf_vfio_user_cq *cq; 1851 1852 assert(vu_ctrlr != NULL); 1853 assert(vu_ctrlr->cqs[id] == NULL); 1854 1855 cq = calloc(1, sizeof(*cq)); 1856 if (cq == NULL) { 1857 return -ENOMEM; 1858 } 1859 cq->mapping.sg = calloc(1, dma_sg_size()); 1860 if (cq->mapping.sg == NULL) { 1861 free(cq); 1862 return -ENOMEM; 1863 } 1864 1865 cq->qid = id; 1866 vu_ctrlr->cqs[id] = cq; 1867 1868 return 0; 1869 } 1870 1871 static int 1872 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1873 { 1874 struct nvmf_vfio_user_req *vu_req, *tmp; 1875 size_t req_size; 1876 uint32_t i; 1877 1878 req_size = sizeof(struct nvmf_vfio_user_req) + 1879 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1880 1881 for (i = 0; i < sq->size; i++) { 1882 struct spdk_nvmf_request *req; 1883 1884 vu_req = calloc(1, req_size); 1885 if (vu_req == NULL) { 1886 goto err; 1887 } 1888 1889 req = &vu_req->req; 1890 req->qpair = &sq->qpair; 1891 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1892 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1893 req->stripped_data = NULL; 1894 1895 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1896 } 1897 1898 return 0; 1899 1900 err: 1901 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1902 free(vu_req); 1903 } 1904 return -ENOMEM; 1905 } 1906 1907 static volatile uint32_t * 1908 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 1909 { 1910 return ctrlr->sdbl != NULL ? 1911 ctrlr->sdbl->shadow_doorbells : 1912 ctrlr->bar0_doorbells; 1913 } 1914 1915 static uint16_t 1916 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 1917 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1918 { 1919 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1920 struct nvmf_vfio_user_sq *sq; 1921 uint32_t qsize; 1922 uint16_t cqid; 1923 uint16_t qid; 1924 int err; 1925 1926 qid = cmd->cdw10_bits.create_io_q.qid; 1927 cqid = cmd->cdw11_bits.create_io_sq.cqid; 1928 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1929 1930 if (ctrlr->sqs[qid] == NULL) { 1931 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 1932 if (err != 0) { 1933 *sct = SPDK_NVME_SCT_GENERIC; 1934 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1935 } 1936 } 1937 1938 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1939 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 1940 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1941 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1942 } 1943 1944 /* CQ must be created before SQ. 
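	 * If it hasn't been, fail the command with Completion Queue Invalid
	 * rather than creating the SQ.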
*/ 1945 if (!io_q_exists(ctrlr, cqid, true)) { 1946 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 1947 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1948 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 1949 } 1950 1951 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 1952 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 1953 *sct = SPDK_NVME_SCT_GENERIC; 1954 return SPDK_NVME_SC_INVALID_FIELD; 1955 } 1956 1957 sq = ctrlr->sqs[qid]; 1958 sq->size = qsize; 1959 1960 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 1961 qid, cqid); 1962 1963 sq->mapping.prp1 = cmd->dptr.prp.prp1; 1964 1965 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1966 if (err) { 1967 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1968 *sct = SPDK_NVME_SCT_GENERIC; 1969 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1970 } 1971 1972 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 1973 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1974 q_addr(&sq->mapping)); 1975 1976 err = alloc_sq_reqs(ctrlr, sq); 1977 if (err < 0) { 1978 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 1979 *sct = SPDK_NVME_SCT_GENERIC; 1980 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1981 } 1982 1983 sq->cqid = cqid; 1984 ctrlr->cqs[sq->cqid]->cq_ref++; 1985 sq->sq_state = VFIO_USER_SQ_CREATED; 1986 *sq_headp(sq) = 0; 1987 1988 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 1989 1990 /* 1991 * We should always reset the doorbells. 1992 * 1993 * The Specification prohibits the controller from writing to the shadow 1994 * doorbell buffer, however older versions of the Linux NVMe driver 1995 * don't reset the shadow doorbell buffer after a Queue-Level or 1996 * Controller-Level reset, which means that we're left with garbage 1997 * doorbell values. 1998 */ 1999 *sq_dbl_tailp(sq) = 0; 2000 2001 if (ctrlr->sdbl != NULL) { 2002 sq->need_rearm = true; 2003 2004 if (!set_sq_eventidx(sq)) { 2005 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 2006 "sqid:%hu was initialized\n", 2007 ctrlr_id(ctrlr), qid); 2008 fail_ctrlr(ctrlr); 2009 *sct = SPDK_NVME_SCT_GENERIC; 2010 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2011 } 2012 } 2013 2014 /* 2015 * Create our new I/O qpair. This asynchronously invokes, on a suitable 2016 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 2017 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 2018 * connect command. This command is then eventually completed via 2019 * handle_queue_connect_rsp(). 
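	 * That is why the Create I/O SQ command is stashed and
	 * post_create_io_sq_completion is set just below: the CQE for this
	 * admin command is only posted once the fabrics connect completes.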
2020 */ 2021 sq->create_io_sq_cmd = *cmd; 2022 sq->post_create_io_sq_completion = true; 2023 2024 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 2025 &sq->qpair); 2026 2027 *sct = SPDK_NVME_SCT_GENERIC; 2028 return SPDK_NVME_SC_SUCCESS; 2029 } 2030 2031 static uint16_t 2032 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 2033 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2034 { 2035 struct nvmf_vfio_user_cq *cq; 2036 uint32_t qsize; 2037 uint16_t qid; 2038 int err; 2039 2040 qid = cmd->cdw10_bits.create_io_q.qid; 2041 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2042 2043 if (ctrlr->cqs[qid] == NULL) { 2044 err = init_cq(ctrlr, qid); 2045 if (err != 0) { 2046 *sct = SPDK_NVME_SCT_GENERIC; 2047 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2048 } 2049 } 2050 2051 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2052 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2053 *sct = SPDK_NVME_SCT_GENERIC; 2054 return SPDK_NVME_SC_INVALID_FIELD; 2055 } 2056 2057 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 2058 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2059 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2060 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2061 } 2062 2063 cq = ctrlr->cqs[qid]; 2064 cq->size = qsize; 2065 2066 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2067 2068 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2069 2070 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2071 if (err) { 2072 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2073 *sct = SPDK_NVME_SCT_GENERIC; 2074 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2075 } 2076 2077 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2078 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2079 q_addr(&cq->mapping)); 2080 2081 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2082 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2083 cq->phase = true; 2084 cq->cq_state = VFIO_USER_CQ_CREATED; 2085 2086 *cq_tailp(cq) = 0; 2087 2088 /* 2089 * We should always reset the doorbells. 2090 * 2091 * The Specification prohibits the controller from writing to the shadow 2092 * doorbell buffer, however older versions of the Linux NVMe driver 2093 * don't reset the shadow doorbell buffer after a Queue-Level or 2094 * Controller-Level reset, which means that we're left with garbage 2095 * doorbell values. 2096 */ 2097 *cq_dbl_headp(cq) = 0; 2098 2099 *sct = SPDK_NVME_SCT_GENERIC; 2100 return SPDK_NVME_SC_SUCCESS; 2101 } 2102 2103 /* 2104 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2105 * on error. 2106 */ 2107 static int 2108 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2109 struct spdk_nvme_cmd *cmd, const bool is_cq) 2110 { 2111 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2112 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2113 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2114 uint32_t qsize; 2115 uint16_t qid; 2116 2117 assert(ctrlr != NULL); 2118 assert(cmd != NULL); 2119 2120 qid = cmd->cdw10_bits.create_io_q.qid; 2121 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2122 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2123 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2124 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2125 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2126 goto out; 2127 } 2128 2129 if (io_q_exists(ctrlr, qid, is_cq)) { 2130 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2131 is_cq ? 
'c' : 's', qid); 2132 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2133 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2134 goto out; 2135 } 2136 2137 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2138 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2139 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2140 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2141 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2142 goto out; 2143 } 2144 2145 if (is_cq) { 2146 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2147 } else { 2148 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2149 2150 if (sct == SPDK_NVME_SCT_GENERIC && 2151 sc == SPDK_NVME_SC_SUCCESS) { 2152 /* Completion posted asynchronously. */ 2153 return 0; 2154 } 2155 } 2156 2157 out: 2158 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2159 } 2160 2161 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2162 * queue pair, so save the command in a context. 2163 */ 2164 struct vfio_user_delete_sq_ctx { 2165 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2166 struct spdk_nvme_cmd delete_io_sq_cmd; 2167 }; 2168 2169 static void 2170 vfio_user_qpair_delete_cb(void *cb_arg) 2171 { 2172 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2173 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2174 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2175 2176 if (admin_cq->thread != spdk_get_thread()) { 2177 assert(admin_cq->thread != NULL); 2178 spdk_thread_send_msg(admin_cq->thread, 2179 vfio_user_qpair_delete_cb, 2180 cb_arg); 2181 } else { 2182 post_completion(vu_ctrlr, admin_cq, 0, 0, 2183 ctx->delete_io_sq_cmd.cid, 2184 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2185 free(ctx); 2186 } 2187 } 2188 2189 /* 2190 * Deletes a completion or submission I/O queue. 2191 */ 2192 static int 2193 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2194 struct spdk_nvme_cmd *cmd, const bool is_cq) 2195 { 2196 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2197 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2198 struct nvmf_vfio_user_sq *sq; 2199 struct nvmf_vfio_user_cq *cq; 2200 struct vfio_user_delete_sq_ctx *ctx; 2201 2202 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2203 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2204 cmd->cdw10_bits.delete_io_q.qid); 2205 2206 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2207 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2208 is_cq ? 'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2209 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2210 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2211 goto out; 2212 } 2213 2214 if (is_cq) { 2215 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2216 if (cq->cq_ref) { 2217 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2218 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2219 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2220 goto out; 2221 } 2222 delete_cq_done(ctrlr, cq); 2223 } else { 2224 /* 2225 * Deletion of the CQ is only deferred to delete_sq_done() on 2226 * VM reboot or CC.EN change, so we have to delete it in all 2227 * other cases. 
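 * For the SQ itself, spdk_nvmf_qpair_disconnect() below tears the qpair down
 * asynchronously; the Delete I/O SQ completion is posted later from
 * vfio_user_qpair_delete_cb().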
2228 */ 2229 ctx = calloc(1, sizeof(*ctx)); 2230 if (!ctx) { 2231 sct = SPDK_NVME_SCT_GENERIC; 2232 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2233 goto out; 2234 } 2235 ctx->vu_ctrlr = ctrlr; 2236 ctx->delete_io_sq_cmd = *cmd; 2237 2238 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2239 sq->sq_state = VFIO_USER_SQ_DELETED; 2240 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2241 ctrlr->cqs[sq->cqid]->cq_ref--; 2242 2243 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 2244 return 0; 2245 } 2246 2247 out: 2248 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2249 } 2250 2251 /* 2252 * Configures Shadow Doorbells. 2253 */ 2254 static int 2255 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2256 { 2257 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2258 uint32_t dstrd; 2259 uintptr_t page_size, page_mask; 2260 uint64_t prp1, prp2; 2261 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2262 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2263 2264 assert(ctrlr != NULL); 2265 assert(ctrlr->endpoint != NULL); 2266 assert(cmd != NULL); 2267 2268 dstrd = doorbell_stride(ctrlr); 2269 page_size = memory_page_size(ctrlr); 2270 page_mask = memory_page_mask(ctrlr); 2271 2272 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2273 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2274 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2275 ctrlr_id(ctrlr)); 2276 2277 goto out; 2278 } 2279 2280 /* Verify guest physical addresses passed as PRPs. */ 2281 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2282 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2283 ctrlr_id(ctrlr)); 2284 2285 goto out; 2286 } 2287 2288 prp1 = cmd->dptr.prp.prp1; 2289 prp2 = cmd->dptr.prp.prp2; 2290 2291 SPDK_DEBUGLOG(nvmf_vfio, 2292 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2293 ctrlr_id(ctrlr), prp1, prp2); 2294 2295 if (prp1 == prp2 2296 || prp1 != (prp1 & page_mask) 2297 || prp2 != (prp2 & page_mask)) { 2298 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2299 ctrlr_id(ctrlr)); 2300 2301 goto out; 2302 } 2303 2304 /* Map guest physical addresses to our virtual address space. */ 2305 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2306 if (sdbl == NULL) { 2307 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2308 ctrlr_id(ctrlr)); 2309 2310 goto out; 2311 } 2312 2313 ctrlr->shadow_doorbell_buffer = prp1; 2314 ctrlr->eventidx_buffer = prp2; 2315 2316 SPDK_DEBUGLOG(nvmf_vfio, 2317 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2318 ctrlr_id(ctrlr), 2319 sdbl->iovs[0].iov_base, 2320 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2321 sdbl->iovs[1].iov_base, 2322 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2323 2324 2325 /* 2326 * Set all possible CQ head doorbells to polling mode now, such that we 2327 * don't have to worry about it later if the host creates more queues. 2328 * 2329 * We only ever want interrupts for writes to the SQ tail doorbells 2330 * (which are initialised in set_ctrlr_intr_mode() below). 2331 */ 2332 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2333 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2334 } 2335 2336 /* Update controller. */ 2337 SWAP(ctrlr->sdbl, sdbl); 2338 2339 /* 2340 * Copy doorbells from either the previous shadow doorbell buffer or the 2341 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 
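 * (After the SWAP above, ctrlr->sdbl is the newly mapped buffer and the local
 * sdbl holds the previous mapping, which is NULL the first time Doorbell
 * Buffer Config is processed.)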
2342 * 2343 * This needs to account for older versions of the Linux NVMe driver, 2344 * which don't clear out the buffer after a controller reset. 2345 */ 2346 copy_doorbells(ctrlr, sdbl != NULL ? 2347 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2348 ctrlr->sdbl->shadow_doorbells); 2349 2350 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2351 2352 ctrlr_kick(ctrlr); 2353 2354 sc = SPDK_NVME_SC_SUCCESS; 2355 2356 out: 2357 /* 2358 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2359 * more than once (pointless, but not prohibited by the spec), or 2360 * in case of an error. 2361 * 2362 * If this is the first time Doorbell Buffer Config was processed, 2363 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2364 * free_sdbl() becomes a noop. 2365 */ 2366 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2367 2368 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2369 } 2370 2371 /* Returns 0 on success and -errno on error. */ 2372 static int 2373 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2374 { 2375 assert(ctrlr != NULL); 2376 assert(cmd != NULL); 2377 2378 if (cmd->fuse != 0) { 2379 /* Fused admin commands are not supported. */ 2380 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2381 SPDK_NVME_SC_INVALID_FIELD, 2382 SPDK_NVME_SCT_GENERIC); 2383 } 2384 2385 switch (cmd->opc) { 2386 case SPDK_NVME_OPC_CREATE_IO_CQ: 2387 case SPDK_NVME_OPC_CREATE_IO_SQ: 2388 return handle_create_io_q(ctrlr, cmd, 2389 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2390 case SPDK_NVME_OPC_DELETE_IO_SQ: 2391 case SPDK_NVME_OPC_DELETE_IO_CQ: 2392 return handle_del_io_q(ctrlr, cmd, 2393 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2394 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2395 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2396 return handle_doorbell_buffer_config(ctrlr, cmd); 2397 } 2398 /* FALLTHROUGH */ 2399 default: 2400 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2401 } 2402 } 2403 2404 static int 2405 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2406 { 2407 struct nvmf_vfio_user_sq *sq = cb_arg; 2408 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2409 uint16_t sqid, cqid; 2410 2411 assert(sq != NULL); 2412 assert(vu_req != NULL); 2413 assert(vu_ctrlr != NULL); 2414 2415 if (spdk_likely(vu_req->iovcnt)) { 2416 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2417 index_to_sg_t(vu_req->sg, 0), 2418 vu_req->iov, vu_req->iovcnt); 2419 } 2420 sqid = sq->qid; 2421 cqid = sq->cqid; 2422 2423 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2424 vu_req->req.rsp->nvme_cpl.cdw0, 2425 sqid, 2426 vu_req->req.cmd->nvme_cmd.cid, 2427 vu_req->req.rsp->nvme_cpl.status.sc, 2428 vu_req->req.rsp->nvme_cpl.status.sct); 2429 } 2430 2431 static int 2432 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2433 struct spdk_nvme_cmd *cmd) 2434 { 2435 assert(sq != NULL); 2436 if (spdk_unlikely(nvmf_qpair_is_admin_queue(&sq->qpair))) { 2437 return consume_admin_cmd(ctrlr, cmd); 2438 } 2439 2440 return handle_cmd_req(ctrlr, cmd, sq); 2441 } 2442 2443 /* Returns the number of commands processed, or a negative value on error. 
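 * The new_tail argument is the SQ tail doorbell value written by the host;
 * commands are consumed one at a time until the cached head catches up with it.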
*/ 2444 static int 2445 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2446 struct nvmf_vfio_user_sq *sq) 2447 { 2448 struct spdk_nvme_cmd *queue; 2449 int count = 0; 2450 2451 assert(ctrlr != NULL); 2452 assert(sq != NULL); 2453 2454 if (ctrlr->sdbl != NULL && sq->qid != 0) { 2455 /* 2456 * Submission queue index has moved past the event index, so it 2457 * needs to be re-armed before we go to sleep. 2458 */ 2459 sq->need_rearm = true; 2460 } 2461 2462 queue = q_addr(&sq->mapping); 2463 while (*sq_headp(sq) != new_tail) { 2464 int err; 2465 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 2466 2467 count++; 2468 2469 /* 2470 * SQHD must contain the new head pointer, so we must increase 2471 * it before we generate a completion. 2472 */ 2473 sq_head_advance(sq); 2474 2475 err = consume_cmd(ctrlr, sq, cmd); 2476 if (spdk_unlikely(err != 0)) { 2477 return err; 2478 } 2479 } 2480 2481 return count; 2482 } 2483 2484 /* Checks whether endpoint is connected from the same process */ 2485 static bool 2486 is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint) 2487 { 2488 struct ucred ucred; 2489 socklen_t ucredlen = sizeof(ucred); 2490 2491 if (endpoint == NULL) { 2492 return false; 2493 } 2494 2495 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred, 2496 &ucredlen) < 0) { 2497 SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno)); 2498 return false; 2499 } 2500 2501 return ucred.pid == getpid(); 2502 } 2503 2504 static void 2505 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2506 { 2507 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2508 struct nvmf_vfio_user_ctrlr *ctrlr; 2509 struct nvmf_vfio_user_sq *sq; 2510 struct nvmf_vfio_user_cq *cq; 2511 void *map_start, *map_end; 2512 int ret; 2513 2514 /* 2515 * We're not interested in any DMA regions that aren't mappable (we don't 2516 * support clients that don't share their memory). 2517 */ 2518 if (!info->vaddr) { 2519 return; 2520 } 2521 2522 map_start = info->mapping.iov_base; 2523 map_end = info->mapping.iov_base + info->mapping.iov_len; 2524 2525 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2526 (info->mapping.iov_len & MASK_2MB)) { 2527 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2528 info->vaddr, map_start, map_end); 2529 return; 2530 } 2531 2532 assert(endpoint != NULL); 2533 if (endpoint->ctrlr == NULL) { 2534 return; 2535 } 2536 ctrlr = endpoint->ctrlr; 2537 2538 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2539 map_start, map_end); 2540 2541 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2542 * check the protection bits before registering. When vfio client and server are run in same process 2543 * there is no need to register the same memory again. 
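 * Only regions mapped with both PROT_READ and PROT_WRITE are handed to
 * spdk_mem_register() below.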
2544 */ 2545 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2546 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2547 if (ret) { 2548 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2549 map_start, map_end, ret); 2550 } 2551 } 2552 2553 pthread_mutex_lock(&endpoint->lock); 2554 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2555 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2556 continue; 2557 } 2558 2559 cq = ctrlr->cqs[sq->cqid]; 2560 2561 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2562 if (cq->size && q_addr(&cq->mapping) == NULL) { 2563 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2564 if (ret) { 2565 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2566 cq->qid, cq->mapping.prp1, 2567 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2568 continue; 2569 } 2570 } 2571 2572 if (sq->size) { 2573 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2574 if (ret) { 2575 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2576 sq->qid, sq->mapping.prp1, 2577 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2578 continue; 2579 } 2580 } 2581 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2582 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2583 } 2584 pthread_mutex_unlock(&endpoint->lock); 2585 } 2586 2587 static void 2588 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2589 { 2590 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2591 struct nvmf_vfio_user_sq *sq; 2592 struct nvmf_vfio_user_cq *cq; 2593 void *map_start, *map_end; 2594 int ret = 0; 2595 2596 if (!info->vaddr) { 2597 return; 2598 } 2599 2600 map_start = info->mapping.iov_base; 2601 map_end = info->mapping.iov_base + info->mapping.iov_len; 2602 2603 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2604 (info->mapping.iov_len & MASK_2MB)) { 2605 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2606 info->vaddr, map_start, map_end); 2607 return; 2608 } 2609 2610 assert(endpoint != NULL); 2611 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2612 map_start, map_end); 2613 2614 if (endpoint->ctrlr != NULL) { 2615 struct nvmf_vfio_user_ctrlr *ctrlr; 2616 ctrlr = endpoint->ctrlr; 2617 2618 pthread_mutex_lock(&endpoint->lock); 2619 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2620 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2621 unmap_q(ctrlr, &sq->mapping); 2622 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2623 } 2624 2625 cq = ctrlr->cqs[sq->cqid]; 2626 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2627 unmap_q(ctrlr, &cq->mapping); 2628 } 2629 } 2630 2631 if (ctrlr->sdbl != NULL) { 2632 size_t i; 2633 2634 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2635 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2636 2637 if (iov_base >= map_start && iov_base < map_end) { 2638 copy_doorbells(ctrlr, 2639 ctrlr->sdbl->shadow_doorbells, 2640 ctrlr->bar0_doorbells); 2641 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2642 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2643 ctrlr->sdbl = NULL; 2644 break; 2645 } 2646 } 2647 } 2648 2649 pthread_mutex_unlock(&endpoint->lock); 2650 } 2651 2652 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2653 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2654 if 
(ret) { 2655 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2656 map_start, map_end, ret); 2657 } 2658 } 2659 } 2660 2661 /* Used to initiate a controller-level reset or a controller shutdown. */ 2662 static void 2663 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2664 { 2665 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2666 ctrlr_id(vu_ctrlr)); 2667 2668 /* Unmap Admin queue. */ 2669 2670 assert(vu_ctrlr->sqs[0] != NULL); 2671 assert(vu_ctrlr->cqs[0] != NULL); 2672 2673 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2674 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2675 2676 vu_ctrlr->sqs[0]->size = 0; 2677 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2678 2679 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2680 2681 vu_ctrlr->cqs[0]->size = 0; 2682 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2683 2684 /* 2685 * For PCIe controller reset or shutdown, we will drop all AER 2686 * responses. 2687 */ 2688 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2689 2690 /* Free the shadow doorbell buffer. */ 2691 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 2692 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2693 vu_ctrlr->sdbl = NULL; 2694 } 2695 2696 /* Used to re-enable the controller after a controller-level reset. */ 2697 static int 2698 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2699 { 2700 int err; 2701 2702 assert(vu_ctrlr != NULL); 2703 2704 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2705 ctrlr_id(vu_ctrlr)); 2706 2707 err = acq_setup(vu_ctrlr); 2708 if (err != 0) { 2709 return err; 2710 } 2711 2712 err = asq_setup(vu_ctrlr); 2713 if (err != 0) { 2714 return err; 2715 } 2716 2717 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2718 2719 return 0; 2720 } 2721 2722 static int 2723 nvmf_vfio_user_prop_req_rsp_set(struct nvmf_vfio_user_req *req, 2724 struct nvmf_vfio_user_sq *sq) 2725 { 2726 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2727 union spdk_nvme_cc_register cc, diff; 2728 2729 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2730 assert(sq->ctrlr != NULL); 2731 vu_ctrlr = sq->ctrlr; 2732 2733 if (req->req.cmd->prop_set_cmd.ofst != offsetof(struct spdk_nvme_registers, cc)) { 2734 return 0; 2735 } 2736 2737 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2738 diff.raw = cc.raw ^ req->cc.raw; 2739 2740 if (diff.bits.en) { 2741 if (cc.bits.en) { 2742 int ret = enable_ctrlr(vu_ctrlr); 2743 if (ret) { 2744 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2745 return ret; 2746 } 2747 vu_ctrlr->reset_shn = false; 2748 } else { 2749 vu_ctrlr->reset_shn = true; 2750 } 2751 } 2752 2753 if (diff.bits.shn) { 2754 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2755 vu_ctrlr->reset_shn = true; 2756 } 2757 } 2758 2759 if (vu_ctrlr->reset_shn) { 2760 disable_ctrlr(vu_ctrlr); 2761 } 2762 return 0; 2763 } 2764 2765 static int 2766 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2767 { 2768 struct nvmf_vfio_user_sq *sq = cb_arg; 2769 2770 assert(sq != NULL); 2771 assert(req != NULL); 2772 2773 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2774 assert(sq->ctrlr != NULL); 2775 assert(req != NULL); 2776 2777 memcpy(req->req.data, 2778 &req->req.rsp->prop_get_rsp.value.u64, 2779 req->req.length); 2780 return 0; 2781 } 2782 2783 return nvmf_vfio_user_prop_req_rsp_set(req, sq); 2784 } 2785 2786 /* 2787 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2788 * doorbell is written via access_bar0_fn(). 
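 * For example, a 4-byte write at BAR0 offset 0x1008 lands at doorbell index
 * (0x1008 - 0x1000) / 4 = 2, i.e. the sqid:1 tail doorbell.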
2789 * 2790 * DSTRD is set to fixed value 0 for NVMf. 2791 * 2792 */ 2793 static int 2794 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2795 const size_t count, loff_t pos, const bool is_write) 2796 { 2797 assert(ctrlr != NULL); 2798 assert(buf != NULL); 2799 2800 if (spdk_unlikely(!is_write)) { 2801 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2802 ctrlr_id(ctrlr), pos); 2803 errno = EPERM; 2804 return -1; 2805 } 2806 2807 if (spdk_unlikely(count != sizeof(uint32_t))) { 2808 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2809 ctrlr_id(ctrlr), count); 2810 errno = EINVAL; 2811 return -1; 2812 } 2813 2814 pos -= NVME_DOORBELLS_OFFSET; 2815 2816 /* pos must be dword aligned */ 2817 if (spdk_unlikely((pos & 0x3) != 0)) { 2818 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2819 errno = EINVAL; 2820 return -1; 2821 } 2822 2823 /* convert byte offset to array index */ 2824 pos >>= 2; 2825 2826 if (spdk_unlikely(pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2)) { 2827 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2828 errno = EINVAL; 2829 return -1; 2830 } 2831 2832 ctrlr->bar0_doorbells[pos] = *buf; 2833 spdk_wmb(); 2834 2835 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2836 ctrlr_id(ctrlr), (pos & 1) ? "cqid" : "sqid", 2837 pos / 2, *buf); 2838 2839 2840 return 0; 2841 } 2842 2843 static size_t 2844 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2845 char *buf, size_t count, loff_t pos, 2846 bool is_write) 2847 { 2848 struct nvmf_vfio_user_req *req; 2849 const struct spdk_nvmf_registers *regs; 2850 2851 if ((count != 4) && (count != 8)) { 2852 errno = EINVAL; 2853 return -1; 2854 } 2855 2856 /* Construct a Fabric Property Get/Set command and send it */ 2857 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2858 if (req == NULL) { 2859 errno = ENOBUFS; 2860 return -1; 2861 } 2862 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2863 req->cc.raw = regs->cc.raw; 2864 2865 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2866 req->cb_arg = vu_ctrlr->sqs[0]; 2867 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2868 req->req.cmd->prop_set_cmd.cid = 0; 2869 if (count == 4) { 2870 req->req.cmd->prop_set_cmd.attrib.size = 0; 2871 } else { 2872 req->req.cmd->prop_set_cmd.attrib.size = 1; 2873 } 2874 req->req.cmd->prop_set_cmd.ofst = pos; 2875 if (is_write) { 2876 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2877 if (req->req.cmd->prop_set_cmd.attrib.size) { 2878 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2879 } else { 2880 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2881 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2882 } 2883 } else { 2884 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2885 } 2886 req->req.length = count; 2887 req->req.data = buf; 2888 2889 spdk_nvmf_request_exec_fabrics(&req->req); 2890 2891 return count; 2892 } 2893 2894 static ssize_t 2895 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2896 bool is_write) 2897 { 2898 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2899 struct nvmf_vfio_user_ctrlr *ctrlr; 2900 int ret; 2901 2902 ctrlr = endpoint->ctrlr; 2903 if (spdk_unlikely(endpoint->need_async_destroy || !ctrlr)) { 2904 errno = EIO; 2905 return -1; 2906 } 2907 2908 if (pos >= NVME_DOORBELLS_OFFSET) { 2909 /* 2910 * The fact that the doorbells can be memory mapped doesn't mean 2911 * that the client (VFIO in QEMU) is 
obliged to memory map them, 2912 * it might still elect to access them via regular read/write; 2913 * we might also have had disable_mappable_bar0 set. 2914 */ 2915 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2916 pos, is_write); 2917 if (ret == 0) { 2918 return count; 2919 } 2920 return ret; 2921 } 2922 2923 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 2924 } 2925 2926 static ssize_t 2927 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 2928 bool is_write) 2929 { 2930 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2931 2932 if (is_write) { 2933 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 2934 endpoint_id(endpoint), offset, offset + count); 2935 errno = EINVAL; 2936 return -1; 2937 } 2938 2939 if (offset + count > NVME_REG_CFG_SIZE) { 2940 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 2941 endpoint_id(endpoint), offset, count, 2942 NVME_REG_CFG_SIZE); 2943 errno = ERANGE; 2944 return -1; 2945 } 2946 2947 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 2948 2949 return count; 2950 } 2951 2952 static void 2953 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 2954 { 2955 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2956 2957 if (level >= LOG_DEBUG) { 2958 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2959 } else if (level >= LOG_INFO) { 2960 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2961 } else if (level >= LOG_NOTICE) { 2962 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 2963 } else if (level >= LOG_WARNING) { 2964 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 2965 } else { 2966 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 2967 } 2968 } 2969 2970 static int 2971 vfio_user_get_log_level(void) 2972 { 2973 int level; 2974 2975 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2976 return LOG_DEBUG; 2977 } 2978 2979 level = spdk_log_to_syslog_level(spdk_log_get_level()); 2980 if (level < 0) { 2981 return LOG_ERR; 2982 } 2983 2984 return level; 2985 } 2986 2987 static void 2988 init_pci_config_space(vfu_pci_config_space_t *p) 2989 { 2990 /* MLBAR */ 2991 p->hdr.bars[0].raw = 0x0; 2992 /* MUBAR */ 2993 p->hdr.bars[1].raw = 0x0; 2994 2995 /* vendor specific, let's set them to zero for now */ 2996 p->hdr.bars[3].raw = 0x0; 2997 p->hdr.bars[4].raw = 0x0; 2998 p->hdr.bars[5].raw = 0x0; 2999 3000 /* enable INTx */ 3001 p->hdr.intr.ipin = 0x1; 3002 } 3003 3004 struct ctrlr_quiesce_ctx { 3005 struct nvmf_vfio_user_endpoint *endpoint; 3006 struct nvmf_vfio_user_poll_group *group; 3007 int status; 3008 }; 3009 3010 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 3011 3012 static void 3013 _vfio_user_endpoint_resume_done_msg(void *ctx) 3014 { 3015 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3016 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3017 3018 endpoint->need_resume = false; 3019 3020 if (!vu_ctrlr) { 3021 return; 3022 } 3023 3024 if (!vu_ctrlr->queued_quiesce) { 3025 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3026 3027 /* 3028 * We might have ignored new SQ entries while we were quiesced: 3029 * kick ourselves so we'll definitely check again while in 3030 * VFIO_USER_CTRLR_RUNNING state. 
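 * (This only matters in interrupt mode, which is why the kick below is
 * conditional; in polled mode the SQs are scanned on every poll anyway.)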
3031 */ 3032 if (in_interrupt_mode(endpoint->transport)) { 3033 ctrlr_kick(vu_ctrlr); 3034 } 3035 return; 3036 } 3037 3038 3039 /* 3040 * Basically, once we call `vfu_device_quiesced` the device is 3041 * unquiesced from libvfio-user's perspective so from the moment 3042 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device 3043 * again. However, because resuming the NVMf subsystem is an asynchronous 3044 * operation, this quiesce might arrive _before_ the NVMf subsystem has 3045 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 3046 * need to check whether a quiesce was requested. 3047 */ 3048 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 3049 ctrlr_id(vu_ctrlr)); 3050 ctrlr_quiesce(vu_ctrlr); 3051 } 3052 3053 static void 3054 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 3055 void *cb_arg, int status) 3056 { 3057 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 3058 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3059 3060 SPDK_DEBUGLOG(nvmf_vfio, "%s resume done with status %d\n", endpoint_id(endpoint), status); 3061 3062 if (!vu_ctrlr) { 3063 return; 3064 } 3065 3066 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 3067 } 3068 3069 static void 3070 vfio_user_quiesce_done(void *ctx) 3071 { 3072 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3073 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3074 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3075 int ret; 3076 3077 if (!vu_ctrlr) { 3078 free(quiesce_ctx); 3079 return; 3080 } 3081 3082 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 3083 3084 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 3085 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3086 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 3087 vu_ctrlr->queued_quiesce = false; 3088 free(quiesce_ctx); 3089 3090 /* `vfu_device_quiesced` can change the migration state, 3091 * so we need to re-check `vu_ctrlr->state`. 3092 */ 3093 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3094 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 3095 return; 3096 } 3097 3098 SPDK_DEBUGLOG(nvmf_vfio, "%s starting to resume\n", ctrlr_id(vu_ctrlr)); 3099 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3100 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3101 vfio_user_endpoint_resume_done, endpoint); 3102 if (ret < 0) { 3103 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3104 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3105 } 3106 } 3107 3108 static void 3109 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3110 void *ctx, int status) 3111 { 3112 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3113 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3114 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3115 3116 if (!vu_ctrlr) { 3117 free(quiesce_ctx); 3118 return; 3119 } 3120 3121 quiesce_ctx->status = status; 3122 3123 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3124 ctrlr_id(vu_ctrlr), status); 3125 3126 spdk_thread_send_msg(vu_ctrlr->thread, 3127 vfio_user_quiesce_done, ctx); 3128 } 3129 3130 /* 3131 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3132 * we've already set ctrlr->state, so we won't process new entries, but we need 3133 * to ensure that this PG is quiesced. 
This only works because there's no 3134 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3135 * 3136 * Once we've walked all PGs, we need to pause any submitted I/O via 3137 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3138 */ 3139 static void 3140 vfio_user_quiesce_pg(void *ctx) 3141 { 3142 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3143 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3144 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3145 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3146 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3147 int ret; 3148 3149 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3150 3151 if (!vu_ctrlr) { 3152 free(quiesce_ctx); 3153 return; 3154 } 3155 3156 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3157 if (quiesce_ctx->group != NULL) { 3158 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3159 vfio_user_quiesce_pg, quiesce_ctx); 3160 return; 3161 } 3162 3163 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3164 vfio_user_pause_done, quiesce_ctx); 3165 if (ret < 0) { 3166 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3167 endpoint_id(endpoint), ret); 3168 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3169 fail_ctrlr(vu_ctrlr); 3170 free(quiesce_ctx); 3171 } 3172 } 3173 3174 static void 3175 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3176 { 3177 struct ctrlr_quiesce_ctx *quiesce_ctx; 3178 3179 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3180 3181 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3182 if (!quiesce_ctx) { 3183 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3184 assert(false); 3185 return; 3186 } 3187 3188 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3189 quiesce_ctx->status = 0; 3190 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3191 3192 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3193 vfio_user_quiesce_pg, quiesce_ctx); 3194 } 3195 3196 static int 3197 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3198 { 3199 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3200 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3201 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3202 3203 if (!vu_ctrlr) { 3204 return 0; 3205 } 3206 3207 /* NVMf library will destruct controller when no 3208 * connected queue pairs. 3209 */ 3210 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3211 return 0; 3212 } 3213 3214 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3215 3216 /* There is no race condition here as device quiesce callback 3217 * and nvmf_prop_set_cc() are running in the same thread context. 
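 * A controller that is disabled, not yet ready, or already shut down has
 * nothing to quiesce, so those cases simply return 0 below.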
3218 */ 3219 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3220 return 0; 3221 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3222 return 0; 3223 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3224 return 0; 3225 } 3226 3227 switch (vu_ctrlr->state) { 3228 case VFIO_USER_CTRLR_PAUSED: 3229 case VFIO_USER_CTRLR_MIGRATING: 3230 return 0; 3231 case VFIO_USER_CTRLR_RUNNING: 3232 ctrlr_quiesce(vu_ctrlr); 3233 break; 3234 case VFIO_USER_CTRLR_RESUMING: 3235 vu_ctrlr->queued_quiesce = true; 3236 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3237 vu_ctrlr->state); 3238 break; 3239 default: 3240 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3241 break; 3242 } 3243 3244 errno = EBUSY; 3245 return -1; 3246 } 3247 3248 static void 3249 vfio_user_ctrlr_dump_migr_data(const char *name, 3250 struct vfio_user_nvme_migr_state *migr_data, 3251 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3252 { 3253 struct spdk_nvmf_registers *regs; 3254 struct nvme_migr_sq_state *sq; 3255 struct nvme_migr_cq_state *cq; 3256 uint32_t *doorbell_base; 3257 uint32_t i; 3258 3259 SPDK_NOTICELOG("Dump %s\n", name); 3260 3261 regs = &migr_data->nvmf_data.regs; 3262 doorbell_base = (uint32_t *)&migr_data->doorbells; 3263 3264 SPDK_NOTICELOG("Registers\n"); 3265 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3266 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3267 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3268 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3269 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3270 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3271 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3272 3273 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3274 3275 if (sdbl != NULL) { 3276 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3277 migr_data->ctrlr_header.shadow_doorbell_buffer); 3278 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3279 migr_data->ctrlr_header.eventidx_buffer); 3280 } 3281 3282 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3283 sq = &migr_data->qps[i].sq; 3284 cq = &migr_data->qps[i].cq; 3285 3286 if (sq->size) { 3287 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3288 if (i > 0 && sdbl != NULL) { 3289 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3290 sq->sqid, 3291 sdbl->shadow_doorbells[queue_index(i, false)], 3292 sdbl->eventidxs[queue_index(i, false)]); 3293 } 3294 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3295 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3296 } 3297 3298 if (cq->size) { 3299 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3300 if (i > 0 && sdbl != NULL) { 3301 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3302 cq->cqid, 3303 sdbl->shadow_doorbells[queue_index(i, true)], 3304 sdbl->eventidxs[queue_index(i, true)]); 3305 } 3306 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3307 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3308 } 3309 } 3310 3311 SPDK_NOTICELOG("%s Dump Done\n", name); 3312 } 3313 3314 /* Read region 9 content and restore it to migration data structures */ 3315 static int 3316 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3317 struct vfio_user_nvme_migr_state *migr_state) 3318 { 3319 void *data_ptr = endpoint->migr_data; 3320 3321 /* Load vfio_user_nvme_migr_header first */ 3322 
memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3323 /* TODO: version check */ 3324 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3325 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3326 return -EINVAL; 3327 } 3328 3329 /* Load nvmf controller data */ 3330 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3331 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3332 3333 /* Load queue pairs */ 3334 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3335 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3336 3337 /* Load doorbells */ 3338 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3339 memcpy(&migr_state->doorbells, data_ptr, 3340 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3341 3342 /* Load CFG */ 3343 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3344 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3345 3346 return 0; 3347 } 3348 3349 3350 static void 3351 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3352 { 3353 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3354 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3355 struct nvmf_vfio_user_sq *sq; 3356 struct nvmf_vfio_user_cq *cq; 3357 uint64_t data_offset; 3358 void *data_ptr; 3359 uint32_t *doorbell_base; 3360 uint32_t i = 0; 3361 uint16_t sqid, cqid; 3362 struct vfio_user_nvme_migr_state migr_state = { 3363 .nvmf_data = { 3364 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3365 .regs_size = sizeof(struct spdk_nvmf_registers), 3366 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3367 } 3368 }; 3369 3370 /* Save all data to vfio_user_nvme_migr_state first, then we will 3371 * copy it to device migration region at last. 
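 * The region layout is: header at offset 0, then nvmf controller data, queue
 * pair states, BAR0 doorbells and the PCI config space; the header itself is
 * written last, once all section offsets are known.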
3372 */ 3373 3374 /* save magic number */ 3375 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3376 3377 /* save controller data */ 3378 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3379 3380 /* save connected queue pairs */ 3381 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3382 /* save sq */ 3383 sqid = sq->qid; 3384 migr_state.qps[sqid].sq.sqid = sq->qid; 3385 migr_state.qps[sqid].sq.cqid = sq->cqid; 3386 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3387 migr_state.qps[sqid].sq.size = sq->size; 3388 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3389 3390 /* save cq, for shared cq case, cq may be saved multiple times */ 3391 cqid = sq->cqid; 3392 cq = vu_ctrlr->cqs[cqid]; 3393 migr_state.qps[cqid].cq.cqid = cqid; 3394 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3395 migr_state.qps[cqid].cq.ien = cq->ien; 3396 migr_state.qps[cqid].cq.iv = cq->iv; 3397 migr_state.qps[cqid].cq.size = cq->size; 3398 migr_state.qps[cqid].cq.phase = cq->phase; 3399 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3400 i++; 3401 } 3402 3403 assert(i > 0); 3404 migr_state.ctrlr_header.num_io_queues = i - 1; 3405 3406 /* Save doorbells */ 3407 doorbell_base = (uint32_t *)&migr_state.doorbells; 3408 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3409 3410 /* Save PCI configuration space */ 3411 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3412 3413 /* Save all data to device migration region */ 3414 data_ptr = endpoint->migr_data; 3415 3416 /* Copy nvmf controller data */ 3417 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3418 data_ptr += data_offset; 3419 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3420 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3421 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3422 3423 /* Copy queue pairs */ 3424 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3425 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3426 migr_state.ctrlr_header.qp_offset = data_offset; 3427 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3428 struct nvme_migr_cq_state)); 3429 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3430 3431 /* Copy doorbells */ 3432 data_offset += migr_state.ctrlr_header.qp_len; 3433 data_ptr += migr_state.ctrlr_header.qp_len; 3434 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3435 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3436 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3437 3438 /* Copy CFG */ 3439 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3440 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3441 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3442 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3443 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3444 3445 /* copy shadow doorbells */ 3446 if (vu_ctrlr->sdbl != NULL) { 3447 migr_state.ctrlr_header.sdbl = true; 3448 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3449 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3450 } 3451 3452 /* Copy nvme migration header finally */ 3453 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3454 3455 if 
(SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3456 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3457 } 3458 } 3459 3460 /* 3461 * If we are about to close the connection, we need to unregister the interrupt, 3462 * as the library will subsequently close the file descriptor we registered. 3463 */ 3464 static int 3465 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3466 { 3467 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3468 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3469 3470 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3471 3472 if (type == VFU_RESET_LOST_CONN) { 3473 if (ctrlr != NULL) { 3474 spdk_interrupt_unregister(&ctrlr->intr); 3475 ctrlr->intr_fd = -1; 3476 } 3477 return 0; 3478 } 3479 3480 /* FIXME: LOST_CONN case ? */ 3481 if (ctrlr->sdbl != NULL) { 3482 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 3483 free_sdbl(vfu_ctx, ctrlr->sdbl); 3484 ctrlr->sdbl = NULL; 3485 } 3486 3487 /* FIXME: much more needed here. */ 3488 3489 return 0; 3490 } 3491 3492 static int 3493 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3494 struct vfio_user_nvme_migr_state *migr_state) 3495 { 3496 uint32_t i, qsize = 0; 3497 uint16_t sqid, cqid; 3498 struct vfio_user_nvme_migr_qp migr_qp; 3499 void *addr; 3500 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3501 int ret; 3502 3503 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3504 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3505 } 3506 3507 /* restore submission queues */ 3508 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3509 migr_qp = migr_state->qps[i]; 3510 3511 qsize = migr_qp.sq.size; 3512 if (qsize) { 3513 struct nvmf_vfio_user_sq *sq; 3514 3515 sqid = migr_qp.sq.sqid; 3516 if (sqid != i) { 3517 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3518 return -EINVAL; 3519 } 3520 3521 /* allocate sq if necessary */ 3522 if (vu_ctrlr->sqs[sqid] == NULL) { 3523 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3524 if (ret) { 3525 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3526 return -EFAULT; 3527 } 3528 } 3529 3530 sq = vu_ctrlr->sqs[sqid]; 3531 sq->size = qsize; 3532 3533 ret = alloc_sq_reqs(vu_ctrlr, sq); 3534 if (ret) { 3535 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3536 return -EFAULT; 3537 } 3538 3539 /* restore sq */ 3540 sq->sq_state = VFIO_USER_SQ_CREATED; 3541 sq->cqid = migr_qp.sq.cqid; 3542 *sq_headp(sq) = migr_qp.sq.head; 3543 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3544 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3545 sq->mapping.prp1, sq->size * 64, 3546 sq->mapping.sg, &sq->mapping.iov, 3547 PROT_READ); 3548 if (addr == NULL) { 3549 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3550 sqid, sq->mapping.prp1, sq->size); 3551 return -EFAULT; 3552 } 3553 cqs_ref[sq->cqid]++; 3554 } 3555 } 3556 3557 /* restore completion queues */ 3558 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3559 migr_qp = migr_state->qps[i]; 3560 3561 qsize = migr_qp.cq.size; 3562 if (qsize) { 3563 struct nvmf_vfio_user_cq *cq; 3564 3565 /* restore cq */ 3566 cqid = migr_qp.sq.cqid; 3567 assert(cqid == i); 3568 3569 /* allocate cq if necessary */ 3570 if (vu_ctrlr->cqs[cqid] == NULL) { 3571 ret = init_cq(vu_ctrlr, cqid); 3572 if (ret) { 3573 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3574 return -EFAULT; 3575 } 3576 } 3577 3578 cq = vu_ctrlr->cqs[cqid]; 3579 3580 cq->size = qsize; 3581 3582 
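/* Restore the saved CQ state and remap its guest buffer (16 bytes per completion entry, mapped read-write). */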
cq->cq_state = VFIO_USER_CQ_CREATED; 3583 cq->cq_ref = cqs_ref[cqid]; 3584 *cq_tailp(cq) = migr_qp.cq.tail; 3585 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3586 cq->ien = migr_qp.cq.ien; 3587 cq->iv = migr_qp.cq.iv; 3588 cq->phase = migr_qp.cq.phase; 3589 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3590 cq->mapping.prp1, cq->size * 16, 3591 cq->mapping.sg, &cq->mapping.iov, 3592 PROT_READ | PROT_WRITE); 3593 if (addr == NULL) { 3594 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3595 cqid, cq->mapping.prp1, cq->size); 3596 return -EFAULT; 3597 } 3598 } 3599 } 3600 3601 return 0; 3602 } 3603 3604 static int 3605 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3606 { 3607 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3608 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3609 uint32_t *doorbell_base; 3610 struct spdk_nvme_cmd cmd; 3611 uint16_t i; 3612 int rc = 0; 3613 struct vfio_user_nvme_migr_state migr_state = { 3614 .nvmf_data = { 3615 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3616 .regs_size = sizeof(struct spdk_nvmf_registers), 3617 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3618 } 3619 }; 3620 3621 assert(endpoint->migr_data != NULL); 3622 assert(ctrlr != NULL); 3623 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3624 if (rc) { 3625 return rc; 3626 } 3627 3628 /* restore shadow doorbells */ 3629 if (migr_state.ctrlr_header.sdbl) { 3630 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3631 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3632 migr_state.ctrlr_header.shadow_doorbell_buffer, 3633 migr_state.ctrlr_header.eventidx_buffer, 3634 memory_page_size(vu_ctrlr)); 3635 if (sdbl == NULL) { 3636 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3637 ctrlr_id(vu_ctrlr)); 3638 return -1; 3639 } 3640 3641 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3642 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3643 3644 SWAP(vu_ctrlr->sdbl, sdbl); 3645 } 3646 3647 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3648 if (rc) { 3649 return rc; 3650 } 3651 3652 /* restore PCI configuration space */ 3653 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3654 3655 doorbell_base = (uint32_t *)&migr_state.doorbells; 3656 /* restore doorbells from saved registers */ 3657 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3658 3659 /* restore nvmf controller data */ 3660 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3661 if (rc) { 3662 return rc; 3663 } 3664 3665 /* resubmit pending AERs */ 3666 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) { 3667 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3668 migr_state.nvmf_data.aer_cids[i]); 3669 memset(&cmd, 0, sizeof(cmd)); 3670 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3671 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3672 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3673 if (spdk_unlikely(rc)) { 3674 break; 3675 } 3676 } 3677 3678 return rc; 3679 } 3680 3681 static void 3682 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3683 { 3684 uint32_t i; 3685 struct nvmf_vfio_user_sq *sq; 3686 3687 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
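 * Its tail/head doorbell pointers are therefore reset to BAR0 below, before the
 * remaining queues are switched according to whether a shadow doorbell buffer
 * was restored.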
*/ 3688 3689 if (vu_ctrlr->sqs[0] != NULL) { 3690 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3691 queue_index(0, false); 3692 } 3693 3694 if (vu_ctrlr->cqs[0] != NULL) { 3695 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3696 queue_index(0, true); 3697 } 3698 3699 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3700 3701 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3702 sq = vu_ctrlr->sqs[i]; 3703 if (!sq || !sq->size) { 3704 continue; 3705 } 3706 3707 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3708 /* ADMIN queue pair is always in the poll group, just enable it */ 3709 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3710 } else { 3711 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3712 } 3713 } 3714 } 3715 3716 /* 3717 * We are in stop-and-copy state, but still potentially have some current dirty 3718 * sgls: while we're quiesced and thus should have no active requests, we still 3719 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3720 * mapped read only). 3721 * 3722 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3723 * mark them dirty now. 3724 */ 3725 static void 3726 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3727 { 3728 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3729 3730 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3731 3732 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3733 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3734 3735 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3736 continue; 3737 } 3738 3739 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3740 } 3741 3742 if (vu_ctrlr->sdbl != NULL) { 3743 dma_sg_t *sg; 3744 size_t i; 3745 3746 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3747 ++i) { 3748 3749 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3750 continue; 3751 } 3752 3753 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3754 3755 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3756 } 3757 } 3758 } 3759 3760 static int 3761 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3762 { 3763 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3764 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3765 struct nvmf_vfio_user_sq *sq; 3766 int ret = 0; 3767 3768 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3769 vu_ctrlr->state, state); 3770 3771 switch (state) { 3772 case VFU_MIGR_STATE_STOP_AND_COPY: 3773 vu_ctrlr->in_source_vm = true; 3774 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3775 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3776 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3777 break; 3778 case VFU_MIGR_STATE_STOP: 3779 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3780 /* The controller associated with the source VM is dead now. We will resume 3781 * the subsystem after destroying the controller data structure; the 3782 * subsystem can then be re-used for a new client. 3783 */ 3784 if (vu_ctrlr->in_source_vm) { 3785 endpoint->need_resume = true; 3786 } 3787 break; 3788 case VFU_MIGR_STATE_PRE_COPY: 3789 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3790 break; 3791 case VFU_MIGR_STATE_RESUME: 3792 /* 3793 * The destination ADMIN queue pair is connected when the VM starts, 3794 * but it isn't enabled in the destination VM yet, so the poll 3795 * group will do nothing with the ADMIN queue pair for now. 
3796 */ 3797 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3798 break; 3799 } 3800 3801 assert(!vu_ctrlr->in_source_vm); 3802 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3803 3804 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3805 assert(sq != NULL); 3806 assert(sq->qpair.qid == 0); 3807 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3808 3809 /* Free ADMIN SQ resources first, SQ resources will be 3810 * allocated based on queue size from source VM. 3811 */ 3812 free_sq_reqs(sq); 3813 sq->size = 0; 3814 break; 3815 case VFU_MIGR_STATE_RUNNING: 3816 3817 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3818 break; 3819 } 3820 3821 if (!vu_ctrlr->in_source_vm) { 3822 /* Restore destination VM from BAR9 */ 3823 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3824 if (ret) { 3825 break; 3826 } 3827 3828 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3829 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3830 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3831 /* FIXME where do we resume nvmf? */ 3832 } else { 3833 /* Rollback source VM */ 3834 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3835 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3836 vfio_user_endpoint_resume_done, endpoint); 3837 if (ret < 0) { 3838 /* TODO: fail controller with CFS bit set */ 3839 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3840 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3841 } 3842 } 3843 vu_ctrlr->migr_data_prepared = false; 3844 vu_ctrlr->in_source_vm = false; 3845 break; 3846 3847 default: 3848 return -EINVAL; 3849 } 3850 3851 return ret; 3852 } 3853 3854 static uint64_t 3855 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3856 { 3857 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3858 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3859 uint64_t pending_bytes; 3860 3861 if (ctrlr->migr_data_prepared) { 3862 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3863 pending_bytes = 0; 3864 } else { 3865 pending_bytes = vfio_user_migr_data_len(); 3866 } 3867 3868 SPDK_DEBUGLOG(nvmf_vfio, 3869 "%s current state %u, pending bytes 0x%"PRIx64"\n", 3870 endpoint_id(endpoint), ctrlr->state, pending_bytes); 3871 3872 return pending_bytes; 3873 } 3874 3875 static int 3876 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 3877 { 3878 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3879 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3880 3881 /* 3882 * When transitioning to pre-copy state we set pending_bytes to 0, 3883 * so the vfio-user client shouldn't attempt to read any migration 3884 * data. This is not yet guaranteed by libvfio-user. 
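 * Guard against that here: if we have not reached the MIGRATING state, report
 * a zero-sized data window below instead of returning stale contents.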
3885 */ 3886 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3887 assert(size != NULL); 3888 *offset = 0; 3889 *size = 0; 3890 return 0; 3891 } 3892 3893 if (ctrlr->in_source_vm) { /* migration source */ 3894 assert(size != NULL); 3895 *size = vfio_user_migr_data_len(); 3896 vfio_user_migr_ctrlr_save_data(ctrlr); 3897 } else { /* migration destination */ 3898 assert(size == NULL); 3899 assert(!ctrlr->migr_data_prepared); 3900 } 3901 *offset = 0; 3902 ctrlr->migr_data_prepared = true; 3903 3904 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 3905 3906 return 0; 3907 } 3908 3909 static ssize_t 3910 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3911 void *buf __attribute__((unused)), 3912 uint64_t count __attribute__((unused)), 3913 uint64_t offset __attribute__((unused))) 3914 { 3915 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 3916 endpoint_id(vfu_get_private(vfu_ctx))); 3917 errno = ENOTSUP; 3918 return -1; 3919 } 3920 3921 static ssize_t 3922 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3923 void *buf __attribute__((unused)), 3924 uint64_t count __attribute__((unused)), 3925 uint64_t offset __attribute__((unused))) 3926 { 3927 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 3928 endpoint_id(vfu_get_private(vfu_ctx))); 3929 errno = ENOTSUP; 3930 return -1; 3931 } 3932 3933 static int 3934 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3935 uint64_t count) 3936 { 3937 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 3938 3939 if (count != vfio_user_migr_data_len()) { 3940 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 3941 endpoint_id(vfu_get_private(vfu_ctx)), count); 3942 errno = EINVAL; 3943 return -1; 3944 } 3945 3946 return 0; 3947 } 3948 3949 static int 3950 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 3951 struct nvmf_vfio_user_endpoint *endpoint) 3952 { 3953 int ret; 3954 ssize_t cap_offset; 3955 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 3956 struct iovec migr_sparse_mmap = {}; 3957 3958 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 3959 struct pxcap pxcap = { 3960 .hdr.id = PCI_CAP_ID_EXP, 3961 .pxcaps.ver = 0x2, 3962 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 3963 .pxdcap2.ctds = 0x1 3964 }; 3965 3966 struct msixcap msixcap = { 3967 .hdr.id = PCI_CAP_ID_MSIX, 3968 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 3969 .mtab = {.tbir = 0x4, .to = 0x0}, 3970 .mpba = {.pbir = 0x5, .pbao = 0x0} 3971 }; 3972 3973 struct iovec sparse_mmap[] = { 3974 { 3975 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 3976 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 3977 }, 3978 }; 3979 3980 const vfu_migration_callbacks_t migr_callbacks = { 3981 .version = VFU_MIGR_CALLBACKS_VERS, 3982 .transition = &vfio_user_migration_device_state_transition, 3983 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 3984 .prepare_data = &vfio_user_migration_prepare_data, 3985 .read_data = &vfio_user_migration_read_data, 3986 .data_written = &vfio_user_migration_data_written, 3987 .write_data = &vfio_user_migration_write_data 3988 }; 3989 3990 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 3991 if (ret < 0) { 3992 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 3993 return ret; 3994 } 3995 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 3996 /* 3997 * 0x02, controller uses the NVM Express programming interface 3998 * 0x08, 
non-volatile memory controller 3999 * 0x01, mass storage controller 4000 */ 4001 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 4002 4003 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 4004 if (cap_offset < 0) { 4005 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 4006 return ret; 4007 } 4008 4009 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 4010 if (cap_offset < 0) { 4011 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 4012 return ret; 4013 } 4014 4015 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 4016 if (cap_offset < 0) { 4017 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 4018 return ret; 4019 } 4020 4021 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 4022 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4023 if (ret < 0) { 4024 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 4025 return ret; 4026 } 4027 4028 if (vu_transport->transport_opts.disable_mappable_bar0) { 4029 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4030 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4031 NULL, 0, -1, 0); 4032 } else { 4033 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4034 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4035 sparse_mmap, 1, endpoint->devmem_fd, 0); 4036 } 4037 4038 if (ret < 0) { 4039 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 4040 return ret; 4041 } 4042 4043 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 4044 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4045 if (ret < 0) { 4046 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 4047 return ret; 4048 } 4049 4050 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 4051 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4052 if (ret < 0) { 4053 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 4054 return ret; 4055 } 4056 4057 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 4058 if (ret < 0) { 4059 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 4060 return ret; 4061 } 4062 4063 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 4064 if (ret < 0) { 4065 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 4066 return ret; 4067 } 4068 4069 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 4070 if (ret < 0) { 4071 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 4072 return ret; 4073 } 4074 4075 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 4076 if (ret < 0) { 4077 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 4078 return ret; 4079 } 4080 4081 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 4082 4083 migr_sparse_mmap.iov_base = (void *)4096; 4084 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 4085 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 4086 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 4087 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 4088 1, endpoint->migr_fd, 0); 4089 if (ret < 0) { 4090 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 4091 return ret; 4092 } 4093 4094 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 4095 vfu_get_migr_register_area_size()); 4096 if (ret < 0) { 4097 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 4098 return ret; 4099 } 
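	/*
	 * Summary of what has been registered on this vfu_ctx so far (sizes are
	 * the local macros; the backing fds are the per-endpoint files created
	 * in nvmf_vfio_user_listen()):
	 *
	 *   PCI config space: NVME_REG_CFG_SIZE, handled by access_pci_config()
	 *   BAR0:             NVME_REG_BAR0_SIZE, handled by access_bar0_fn(),
	 *                     doorbells optionally sparse-mmap'd from devmem_fd
	 *   BAR4/BAR5:        MSI-X table and pending bit array
	 *   migration region: register area plus migration data, sparse-mmap'd
	 *                     from migr_fd
	 *
	 * vfu_realize_ctx() below finalizes this layout.
	 */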
4100 4101 ret = vfu_realize_ctx(vfu_ctx); 4102 if (ret < 0) { 4103 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4104 return ret; 4105 } 4106 4107 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4108 assert(endpoint->pci_config_space != NULL); 4109 init_pci_config_space(endpoint->pci_config_space); 4110 4111 assert(cap_offset != 0); 4112 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4113 4114 return 0; 4115 } 4116 4117 static int nvmf_vfio_user_accept(void *ctx); 4118 4119 static void 4120 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4121 { 4122 /* Nothing for us to do here. */ 4123 } 4124 4125 /* 4126 * Register an "accept" poller: this is polling for incoming vfio-user socket 4127 * connections (on the listening socket). 4128 * 4129 * We need to do this on first listening, and also after destroying a 4130 * controller, so we can accept another connection. 4131 */ 4132 static int 4133 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4134 { 4135 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4136 4137 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4138 4139 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4140 endpoint, poll_rate_us); 4141 4142 if (!endpoint->accept_poller) { 4143 return -1; 4144 } 4145 4146 endpoint->accept_thread = spdk_get_thread(); 4147 endpoint->need_relisten = false; 4148 4149 if (!spdk_interrupt_mode_is_enabled()) { 4150 return 0; 4151 } 4152 4153 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4154 assert(endpoint->accept_intr_fd != -1); 4155 4156 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4157 nvmf_vfio_user_accept, endpoint); 4158 4159 assert(endpoint->accept_intr != NULL); 4160 4161 spdk_poller_register_interrupt(endpoint->accept_poller, 4162 set_intr_mode_noop, NULL); 4163 return 0; 4164 } 4165 4166 static void 4167 _vfio_user_relisten(void *ctx) 4168 { 4169 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4170 4171 vfio_user_register_accept_poller(endpoint); 4172 } 4173 4174 static void 4175 _free_ctrlr(void *ctx) 4176 { 4177 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4178 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4179 4180 free_sdbl(ctrlr->endpoint->vfu_ctx, ctrlr->sdbl); 4181 4182 spdk_interrupt_unregister(&ctrlr->intr); 4183 ctrlr->intr_fd = -1; 4184 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4185 4186 free(ctrlr); 4187 4188 if (endpoint == NULL) { 4189 return; 4190 } 4191 4192 if (endpoint->need_async_destroy) { 4193 nvmf_vfio_user_destroy_endpoint(endpoint); 4194 } else if (endpoint->need_relisten) { 4195 spdk_thread_send_msg(endpoint->accept_thread, 4196 _vfio_user_relisten, endpoint); 4197 } 4198 } 4199 4200 static void 4201 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4202 { 4203 int i; 4204 assert(ctrlr != NULL); 4205 4206 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4207 4208 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4209 free_qp(ctrlr, i); 4210 } 4211 4212 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 4213 } 4214 4215 static int 4216 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4217 struct nvmf_vfio_user_endpoint *endpoint) 4218 { 4219 struct nvmf_vfio_user_ctrlr *ctrlr; 4220 int err = 0; 4221 4222 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4223 4224 /* First, construct a vfio-user CUSTOM transport 
controller */ 4225 ctrlr = calloc(1, sizeof(*ctrlr)); 4226 if (ctrlr == NULL) { 4227 err = -ENOMEM; 4228 goto out; 4229 } 4230 /* We can only support one connection for now */ 4231 ctrlr->cntlid = 0x1; 4232 ctrlr->intr_fd = -1; 4233 ctrlr->transport = transport; 4234 ctrlr->endpoint = endpoint; 4235 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4236 TAILQ_INIT(&ctrlr->connected_sqs); 4237 4238 ctrlr->adaptive_irqs_enabled = 4239 !transport->transport_opts.disable_adaptive_irq; 4240 4241 /* Then, construct an admin queue pair */ 4242 err = init_sq(ctrlr, &transport->transport, 0); 4243 if (err != 0) { 4244 free(ctrlr); 4245 goto out; 4246 } 4247 4248 err = init_cq(ctrlr, 0); 4249 if (err != 0) { 4250 free(ctrlr); 4251 goto out; 4252 } 4253 4254 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4255 4256 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4257 if (err != 0) { 4258 free(ctrlr); 4259 goto out; 4260 } 4261 endpoint->ctrlr = ctrlr; 4262 4263 /* Notify the generic layer about the new admin queue pair */ 4264 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4265 4266 out: 4267 if (err != 0) { 4268 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4269 endpoint_id(endpoint), strerror(-err)); 4270 } 4271 4272 return err; 4273 } 4274 4275 static int 4276 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4277 const struct spdk_nvme_transport_id *trid, 4278 struct spdk_nvmf_listen_opts *listen_opts) 4279 { 4280 struct nvmf_vfio_user_transport *vu_transport; 4281 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4282 char path[PATH_MAX] = {}; 4283 char uuid[PATH_MAX] = {}; 4284 int ret; 4285 4286 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4287 transport); 4288 4289 pthread_mutex_lock(&vu_transport->lock); 4290 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4291 /* Only compare traddr */ 4292 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4293 pthread_mutex_unlock(&vu_transport->lock); 4294 return -EEXIST; 4295 } 4296 } 4297 pthread_mutex_unlock(&vu_transport->lock); 4298 4299 endpoint = calloc(1, sizeof(*endpoint)); 4300 if (!endpoint) { 4301 return -ENOMEM; 4302 } 4303 4304 pthread_mutex_init(&endpoint->lock, NULL); 4305 endpoint->devmem_fd = -1; 4306 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4307 endpoint->transport = vu_transport; 4308 4309 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4310 if (ret < 0 || ret >= PATH_MAX) { 4311 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4312 ret = -1; 4313 goto out; 4314 } 4315 4316 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4317 if (ret == -1) { 4318 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4319 endpoint_id(endpoint), path, spdk_strerror(errno)); 4320 goto out; 4321 } 4322 unlink(path); 4323 4324 endpoint->devmem_fd = ret; 4325 ret = ftruncate(endpoint->devmem_fd, 4326 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4327 if (ret != 0) { 4328 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4329 spdk_strerror(errno)); 4330 goto out; 4331 } 4332 4333 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4334 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4335 if (endpoint->bar0_doorbells == MAP_FAILED) { 4336 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, 
spdk_strerror(errno));
4337 		endpoint->bar0_doorbells = NULL;
4338 		ret = -1;
4339 		goto out;
4340 	}
4341
4342 	ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint));
4343 	if (ret < 0 || ret >= PATH_MAX) {
4344 		SPDK_ERRLOG("%s: failed to get migration file path: %s.\n", endpoint_id(endpoint),
4345 			    spdk_strerror(errno));
4346 		ret = -1;
4347 		goto out;
4348 	}
4349 	ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
4350 	if (ret == -1) {
4351 		SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n",
4352 			    endpoint_id(endpoint), path, spdk_strerror(errno));
4353 		goto out;
4354 	}
4355 	unlink(path);
4356
4357 	endpoint->migr_fd = ret;
4358 	ret = ftruncate(endpoint->migr_fd,
4359 			vfu_get_migr_register_area_size() + vfio_user_migr_data_len());
4360 	if (ret != 0) {
4361 		SPDK_ERRLOG("%s: failed to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path,
4362 			    spdk_strerror(errno));
4363 		goto out;
4364 	}
4365
4366 	endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(),
4367 				   PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size());
4368 	if (endpoint->migr_data == MAP_FAILED) {
4369 		SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
4370 		endpoint->migr_data = NULL;
4371 		ret = -1;
4372 		goto out;
4373 	}
4374
4375 	ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
4376 	if (ret < 0 || ret >= PATH_MAX) {
4377 		SPDK_ERRLOG("%s: failed to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno));
4378 		ret = -1;
4379 		goto out;
4380 	}
4381
4382 	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
4383 					   endpoint, VFU_DEV_TYPE_PCI);
4384 	if (endpoint->vfu_ctx == NULL) {
4385 		SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
4386 			    endpoint_id(endpoint));
4387 		ret = -1;
4388 		goto out;
4389 	}
4390
4391 	ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log,
4392 			    vfio_user_get_log_level());
4393 	if (ret < 0) {
4394 		goto out;
4395 	}
4396
4397
4398 	ret = vfio_user_dev_info_fill(vu_transport, endpoint);
4399 	if (ret < 0) {
4400 		goto out;
4401 	}
4402
4403 	ret = vfio_user_register_accept_poller(endpoint);
4404
4405 	if (ret != 0) {
4406 		goto out;
4407 	}
4408
4409 	pthread_mutex_lock(&vu_transport->lock);
4410 	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
4411 	pthread_mutex_unlock(&vu_transport->lock);
4412
4413 out:
4414 	if (ret != 0) {
4415 		nvmf_vfio_user_destroy_endpoint(endpoint);
4416 	}
4417
4418 	return ret;
4419 }
4420
4421 static void
4422 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
4423 			   const struct spdk_nvme_transport_id *trid)
4424 {
4425 	struct nvmf_vfio_user_transport *vu_transport;
4426 	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
4427
4428 	assert(trid != NULL);
4429 	assert(trid->traddr != NULL);
4430
4431 	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
4432
4433 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
4434 					transport);
4435
4436 	pthread_mutex_lock(&vu_transport->lock);
4437 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
4438 		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
4439 			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
4440 			/* Defer freeing endpoint resources until the controller
4441 			 * is freed. There are two cases that can get us here:
4442 			 * 1. the nvmf target is killed while a VM is connected
4443 			 * 2. the listener is removed via an RPC call
4444 			 * In either case the nvmf library will disconnect all queue pairs.
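			 * Once the last connected SQ is torn down,
			 * nvmf_vfio_user_close_qpair() frees the controller, and
			 * _free_ctrlr() then destroys the endpoint because
			 * need_async_destroy is set below.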
4445 */ 4446 if (endpoint->ctrlr) { 4447 assert(!endpoint->need_async_destroy); 4448 endpoint->need_async_destroy = true; 4449 pthread_mutex_unlock(&vu_transport->lock); 4450 return; 4451 } 4452 4453 nvmf_vfio_user_destroy_endpoint(endpoint); 4454 pthread_mutex_unlock(&vu_transport->lock); 4455 return; 4456 } 4457 } 4458 pthread_mutex_unlock(&vu_transport->lock); 4459 4460 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4461 } 4462 4463 static void 4464 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4465 struct spdk_nvmf_subsystem *subsystem, 4466 struct spdk_nvmf_ctrlr_data *cdata) 4467 { 4468 struct nvmf_vfio_user_transport *vu_transport; 4469 4470 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4471 4472 cdata->vid = SPDK_PCI_VID_NUTANIX; 4473 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4474 cdata->ieee[0] = 0x8d; 4475 cdata->ieee[1] = 0x6b; 4476 cdata->ieee[2] = 0x50; 4477 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4478 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4479 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4480 /* libvfio-user can only support 1 connection for now */ 4481 cdata->oncs.reservations = 0; 4482 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4483 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4484 } 4485 4486 static int 4487 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4488 const struct spdk_nvmf_subsystem *subsystem, 4489 const struct spdk_nvme_transport_id *trid) 4490 { 4491 struct nvmf_vfio_user_transport *vu_transport; 4492 struct nvmf_vfio_user_endpoint *endpoint; 4493 4494 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4495 4496 pthread_mutex_lock(&vu_transport->lock); 4497 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4498 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4499 break; 4500 } 4501 } 4502 pthread_mutex_unlock(&vu_transport->lock); 4503 4504 if (endpoint == NULL) { 4505 return -ENOENT; 4506 } 4507 4508 /* Drop const - we will later need to pause/unpause. */ 4509 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4510 4511 return 0; 4512 } 4513 4514 /* 4515 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4516 * frequency. 4517 * 4518 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4519 * if we don't currently have a controller set up, peek to see if the socket is 4520 * able to accept a new connection. 4521 */ 4522 static int 4523 nvmf_vfio_user_accept(void *ctx) 4524 { 4525 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4526 struct nvmf_vfio_user_transport *vu_transport; 4527 int err; 4528 4529 vu_transport = endpoint->transport; 4530 4531 if (endpoint->ctrlr != NULL) { 4532 return SPDK_POLLER_IDLE; 4533 } 4534 4535 /* While we're here, the controller is already destroyed, 4536 * subsystem may still be in RESUMING state, we will wait 4537 * until the subsystem is in RUNNING state. 
4538 */ 4539 if (endpoint->need_resume) { 4540 return SPDK_POLLER_IDLE; 4541 } 4542 4543 err = vfu_attach_ctx(endpoint->vfu_ctx); 4544 if (err == 0) { 4545 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4546 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4547 if (err == 0) { 4548 /* 4549 * Unregister ourselves: now we've accepted a 4550 * connection, there is nothing for us to poll for, and 4551 * we will poll the connection via vfu_run_ctx() 4552 * instead. 4553 */ 4554 spdk_interrupt_unregister(&endpoint->accept_intr); 4555 spdk_poller_unregister(&endpoint->accept_poller); 4556 } 4557 return SPDK_POLLER_BUSY; 4558 } 4559 4560 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4561 return SPDK_POLLER_IDLE; 4562 } 4563 4564 return SPDK_POLLER_BUSY; 4565 } 4566 4567 static void 4568 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4569 struct spdk_nvme_transport_id *trid, 4570 struct spdk_nvmf_discovery_log_page_entry *entry) 4571 { } 4572 4573 static int vfio_user_poll_group_intr(void *ctx); 4574 4575 static void 4576 vfio_user_poll_group_add_intr(struct nvmf_vfio_user_poll_group *vu_group, 4577 struct spdk_nvmf_poll_group *group) 4578 { 4579 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK); 4580 assert(vu_group->intr_fd != -1); 4581 4582 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd, 4583 vfio_user_poll_group_intr, vu_group); 4584 assert(vu_group->intr != NULL); 4585 4586 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4587 vu_group); 4588 } 4589 4590 static struct spdk_nvmf_transport_poll_group * 4591 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4592 struct spdk_nvmf_poll_group *group) 4593 { 4594 struct nvmf_vfio_user_transport *vu_transport; 4595 struct nvmf_vfio_user_poll_group *vu_group; 4596 4597 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4598 transport); 4599 4600 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4601 4602 vu_group = calloc(1, sizeof(*vu_group)); 4603 if (vu_group == NULL) { 4604 SPDK_ERRLOG("Error allocating poll group: %m"); 4605 return NULL; 4606 } 4607 4608 if (in_interrupt_mode(vu_transport)) { 4609 vfio_user_poll_group_add_intr(vu_group, group); 4610 } 4611 4612 TAILQ_INIT(&vu_group->sqs); 4613 4614 pthread_mutex_lock(&vu_transport->pg_lock); 4615 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4616 if (vu_transport->next_pg == NULL) { 4617 vu_transport->next_pg = vu_group; 4618 } 4619 pthread_mutex_unlock(&vu_transport->pg_lock); 4620 4621 return &vu_group->group; 4622 } 4623 4624 static struct spdk_nvmf_transport_poll_group * 4625 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4626 { 4627 struct nvmf_vfio_user_transport *vu_transport; 4628 struct nvmf_vfio_user_poll_group **vu_group; 4629 struct nvmf_vfio_user_sq *sq; 4630 struct nvmf_vfio_user_cq *cq; 4631 4632 struct spdk_nvmf_transport_poll_group *result = NULL; 4633 4634 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4635 cq = sq->ctrlr->cqs[sq->cqid]; 4636 assert(cq != NULL); 4637 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4638 4639 pthread_mutex_lock(&vu_transport->pg_lock); 4640 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4641 goto out; 4642 } 4643 4644 if (!nvmf_qpair_is_admin_queue(qpair)) { 4645 /* 4646 * If this is shared IO CQ case, just return the used CQ's poll 4647 * group, so I/O completions don't have to use 4648 * spdk_thread_send_msg(). 
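		 *
		 * The first SQ that uses a given CQ pins that CQ to the poll
		 * group chosen here (cq->group is recorded at the end of this
		 * function), so later SQs sharing the CQ land on the same poll
		 * group.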
4649 */ 4650 if (cq->group != NULL) { 4651 result = cq->group; 4652 goto out; 4653 } 4654 4655 /* 4656 * If we're in interrupt mode, align all qpairs for a controller 4657 * on the same poll group by default, unless requested. This can 4658 * be lower in performance than running on a single poll group, 4659 * so we disable spreading by default. 4660 */ 4661 if (in_interrupt_mode(vu_transport) && 4662 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) { 4663 result = sq->ctrlr->sqs[0]->group; 4664 goto out; 4665 } 4666 4667 } 4668 4669 vu_group = &vu_transport->next_pg; 4670 assert(*vu_group != NULL); 4671 4672 result = &(*vu_group)->group; 4673 *vu_group = TAILQ_NEXT(*vu_group, link); 4674 if (*vu_group == NULL) { 4675 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4676 } 4677 4678 out: 4679 if (cq->group == NULL) { 4680 cq->group = result; 4681 } 4682 4683 pthread_mutex_unlock(&vu_transport->pg_lock); 4684 return result; 4685 } 4686 4687 static void 4688 vfio_user_poll_group_del_intr(struct nvmf_vfio_user_poll_group *vu_group) 4689 { 4690 assert(vu_group->intr_fd != -1); 4691 4692 spdk_interrupt_unregister(&vu_group->intr); 4693 4694 close(vu_group->intr_fd); 4695 vu_group->intr_fd = -1; 4696 } 4697 4698 /* called when process exits */ 4699 static void 4700 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4701 { 4702 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4703 struct nvmf_vfio_user_transport *vu_transport; 4704 4705 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4706 4707 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4708 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4709 transport); 4710 4711 if (in_interrupt_mode(vu_transport)) { 4712 vfio_user_poll_group_del_intr(vu_group); 4713 } 4714 4715 pthread_mutex_lock(&vu_transport->pg_lock); 4716 next_tgroup = TAILQ_NEXT(vu_group, link); 4717 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4718 if (next_tgroup == NULL) { 4719 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4720 } 4721 if (vu_transport->next_pg == vu_group) { 4722 vu_transport->next_pg = next_tgroup; 4723 } 4724 pthread_mutex_unlock(&vu_transport->pg_lock); 4725 4726 free(vu_group); 4727 } 4728 4729 static void 4730 _vfio_user_qpair_disconnect(void *ctx) 4731 { 4732 struct nvmf_vfio_user_sq *sq = ctx; 4733 4734 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4735 } 4736 4737 /* The function is used when socket connection is destroyed */ 4738 static int 4739 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4740 { 4741 struct nvmf_vfio_user_sq *sq; 4742 struct nvmf_vfio_user_endpoint *endpoint; 4743 4744 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4745 4746 endpoint = ctrlr->endpoint; 4747 assert(endpoint != NULL); 4748 4749 pthread_mutex_lock(&endpoint->lock); 4750 endpoint->need_relisten = true; 4751 ctrlr->disconnect = true; 4752 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4753 endpoint->ctrlr = NULL; 4754 free_ctrlr(ctrlr); 4755 pthread_mutex_unlock(&endpoint->lock); 4756 return 0; 4757 } 4758 4759 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4760 /* add another round thread poll to avoid recursive endpoint lock */ 4761 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4762 } 4763 pthread_mutex_unlock(&endpoint->lock); 4764 4765 return 0; 4766 } 4767 4768 /* 4769 * Poll for and process any incoming vfio-user messages. 
4770 */ 4771 static int 4772 vfio_user_poll_vfu_ctx(void *ctx) 4773 { 4774 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4775 int ret; 4776 4777 assert(ctrlr != NULL); 4778 4779 /* This will call access_bar0_fn() if there are any writes 4780 * to the portion of the BAR that is not mmap'd */ 4781 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4782 if (spdk_unlikely(ret == -1)) { 4783 if (errno == EBUSY) { 4784 return SPDK_POLLER_IDLE; 4785 } 4786 4787 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4788 4789 /* 4790 * We lost the client; the reset callback will already have 4791 * unregistered the interrupt. 4792 */ 4793 if (errno == ENOTCONN) { 4794 vfio_user_destroy_ctrlr(ctrlr); 4795 return SPDK_POLLER_BUSY; 4796 } 4797 4798 /* 4799 * We might not have got a reset callback in this case, so 4800 * explicitly unregister the interrupt here. 4801 */ 4802 spdk_interrupt_unregister(&ctrlr->intr); 4803 ctrlr->intr_fd = -1; 4804 fail_ctrlr(ctrlr); 4805 } 4806 4807 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4808 } 4809 4810 struct vfio_user_post_cpl_ctx { 4811 struct nvmf_vfio_user_ctrlr *ctrlr; 4812 struct nvmf_vfio_user_cq *cq; 4813 struct spdk_nvme_cpl cpl; 4814 }; 4815 4816 static void 4817 _post_completion_msg(void *ctx) 4818 { 4819 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4820 4821 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4822 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4823 free(cpl_ctx); 4824 } 4825 4826 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4827 4828 static int 4829 vfio_user_poll_group_process(void *ctx) 4830 { 4831 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4832 int ret = 0; 4833 4834 SPDK_DEBUGLOG(vfio_user_db, "pg:%p got intr\n", vu_group); 4835 4836 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4837 4838 /* 4839 * Re-arm the event indexes. NB: this also could rearm other 4840 * controller's SQs. 4841 */ 4842 ret |= vfio_user_poll_group_rearm(vu_group); 4843 4844 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4845 } 4846 4847 static int 4848 vfio_user_poll_group_intr(void *ctx) 4849 { 4850 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4851 eventfd_t val; 4852 4853 eventfd_read(vu_group->intr_fd, &val); 4854 return vfio_user_poll_group_process(ctx); 4855 } 4856 4857 /* 4858 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4859 * the SQs assigned to our own poll group. Other poll groups are handled via 4860 * vfio_user_poll_group_intr(). 4861 */ 4862 static int 4863 vfio_user_ctrlr_intr(void *ctx) 4864 { 4865 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 4866 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 4867 struct nvmf_vfio_user_poll_group *vu_group; 4868 int ret = SPDK_POLLER_IDLE; 4869 4870 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 4871 4872 SPDK_DEBUGLOG(vfio_user_db, "ctrlr pg:%p got intr\n", vu_ctrlr_group); 4873 4874 /* 4875 * Poll vfio-user for this controller. We need to do this before polling 4876 * any SQs, as this is where doorbell writes may be handled. 4877 */ 4878 ret = vfio_user_poll_vfu_ctx(vu_ctrlr); 4879 4880 /* 4881 * `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, 4882 * just return for this case. 
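	 * (Typically that happens when the vfu_ctx poll above detects a
	 * disconnect and tears the controller down via
	 * vfio_user_destroy_ctrlr() or fail_ctrlr().)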
4883 */ 4884 if (vu_ctrlr->sqs[0] == NULL) { 4885 return ret; 4886 } 4887 4888 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) { 4889 /* 4890 * We may have just written to a doorbell owned by another 4891 * reactor: we need to prod them to make sure its SQs are polled 4892 * *after* the doorbell value is updated. 4893 */ 4894 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) { 4895 if (vu_group != vu_ctrlr_group) { 4896 SPDK_DEBUGLOG(vfio_user_db, "prodding pg:%p\n", vu_group); 4897 eventfd_write(vu_group->intr_fd, 1); 4898 } 4899 } 4900 } 4901 4902 ret |= vfio_user_poll_group_process(vu_ctrlr_group); 4903 4904 return ret; 4905 } 4906 4907 static void 4908 vfio_user_ctrlr_set_intr_mode(struct spdk_poller *poller, void *ctx, 4909 bool interrupt_mode) 4910 { 4911 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4912 assert(ctrlr != NULL); 4913 assert(ctrlr->endpoint != NULL); 4914 4915 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 4916 ctrlr_id(ctrlr), interrupt_mode); 4917 4918 /* 4919 * interrupt_mode needs to persist across controller resets, so store 4920 * it in the endpoint instead. 4921 */ 4922 ctrlr->endpoint->interrupt_mode = interrupt_mode; 4923 4924 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 4925 } 4926 4927 /* 4928 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 4929 * set up and we can start operating on this controller. 4930 */ 4931 static void 4932 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 4933 struct spdk_nvmf_ctrlr *ctrlr) 4934 { 4935 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 4936 4937 vu_ctrlr->ctrlr = ctrlr; 4938 vu_ctrlr->cntlid = ctrlr->cntlid; 4939 vu_ctrlr->thread = spdk_get_thread(); 4940 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 4941 4942 if (!in_interrupt_mode(endpoint->transport)) { 4943 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4944 vu_ctrlr, 1000); 4945 return; 4946 } 4947 4948 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4949 vu_ctrlr, 0); 4950 4951 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 4952 assert(vu_ctrlr->intr_fd != -1); 4953 4954 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 4955 vfio_user_ctrlr_intr, vu_ctrlr); 4956 4957 assert(vu_ctrlr->intr != NULL); 4958 4959 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 4960 vfio_user_ctrlr_set_intr_mode, 4961 vu_ctrlr); 4962 } 4963 4964 static int 4965 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 4966 { 4967 struct nvmf_vfio_user_poll_group *vu_group; 4968 struct nvmf_vfio_user_sq *sq = cb_arg; 4969 struct nvmf_vfio_user_cq *admin_cq; 4970 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 4971 struct nvmf_vfio_user_endpoint *endpoint; 4972 4973 assert(sq != NULL); 4974 assert(req != NULL); 4975 4976 vu_ctrlr = sq->ctrlr; 4977 assert(vu_ctrlr != NULL); 4978 endpoint = vu_ctrlr->endpoint; 4979 assert(endpoint != NULL); 4980 4981 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 4982 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 4983 endpoint->ctrlr = NULL; 4984 free_ctrlr(vu_ctrlr); 4985 return -1; 4986 } 4987 4988 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 4989 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 4990 4991 admin_cq = vu_ctrlr->cqs[0]; 4992 assert(admin_cq != NULL); 4993 4994 pthread_mutex_lock(&endpoint->lock); 4995 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 4996 
admin_cq->thread = spdk_get_thread(); 4997 /* 4998 * The admin queue is special as SQ0 and CQ0 are created 4999 * together. 5000 */ 5001 admin_cq->cq_ref = 1; 5002 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 5003 } else { 5004 /* For I/O queues this command was generated in response to an 5005 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 5006 * been completed. Complete it now. 5007 */ 5008 if (sq->post_create_io_sq_completion) { 5009 assert(admin_cq->thread != NULL); 5010 if (admin_cq->thread != spdk_get_thread()) { 5011 struct vfio_user_post_cpl_ctx *cpl_ctx; 5012 5013 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 5014 if (!cpl_ctx) { 5015 return -ENOMEM; 5016 } 5017 cpl_ctx->ctrlr = vu_ctrlr; 5018 cpl_ctx->cq = admin_cq; 5019 cpl_ctx->cpl.sqid = 0; 5020 cpl_ctx->cpl.cdw0 = 0; 5021 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 5022 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 5023 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5024 5025 spdk_thread_send_msg(admin_cq->thread, _post_completion_msg, 5026 cpl_ctx); 5027 } else { 5028 post_completion(vu_ctrlr, admin_cq, 0, 0, 5029 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 5030 } 5031 sq->post_create_io_sq_completion = false; 5032 } else if (in_interrupt_mode(endpoint->transport)) { 5033 /* 5034 * If we're live migrating a guest, there is a window 5035 * where the I/O queues haven't been set up but the 5036 * device is in running state, during which the guest 5037 * might write to a doorbell. This doorbell write will 5038 * go unnoticed, so let's poll the whole controller to 5039 * pick that up. 5040 */ 5041 ctrlr_kick(vu_ctrlr); 5042 } 5043 sq->sq_state = VFIO_USER_SQ_ACTIVE; 5044 } 5045 5046 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 5047 pthread_mutex_unlock(&endpoint->lock); 5048 5049 free(req->req.data); 5050 req->req.data = NULL; 5051 5052 return 0; 5053 } 5054 5055 /* 5056 * Add the given qpair to the given poll group. New qpairs are added via 5057 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 5058 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 5059 * nvmf_transport_poll_group_add(). 5060 */ 5061 static int 5062 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 5063 struct spdk_nvmf_qpair *qpair) 5064 { 5065 struct nvmf_vfio_user_sq *sq; 5066 struct nvmf_vfio_user_req *vu_req; 5067 struct nvmf_vfio_user_ctrlr *ctrlr; 5068 struct spdk_nvmf_request *req; 5069 struct spdk_nvmf_fabric_connect_data *data; 5070 bool admin; 5071 5072 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5073 sq->group = group; 5074 ctrlr = sq->ctrlr; 5075 5076 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 5077 ctrlr_id(ctrlr), sq->qpair.qid, 5078 sq, qpair, group); 5079 5080 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 5081 5082 vu_req = get_nvmf_vfio_user_req(sq); 5083 if (vu_req == NULL) { 5084 return -1; 5085 } 5086 5087 req = &vu_req->req; 5088 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 5089 req->cmd->connect_cmd.cid = 0; 5090 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 5091 req->cmd->connect_cmd.recfmt = 0; 5092 req->cmd->connect_cmd.sqsize = sq->size - 1; 5093 req->cmd->connect_cmd.qid = admin ? 
0 : qpair->qid; 5094 5095 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 5096 req->data = calloc(1, req->length); 5097 if (req->data == NULL) { 5098 nvmf_vfio_user_req_free(req); 5099 return -ENOMEM; 5100 } 5101 5102 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 5103 data->cntlid = ctrlr->cntlid; 5104 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5105 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5106 5107 vu_req->cb_fn = handle_queue_connect_rsp; 5108 vu_req->cb_arg = sq; 5109 5110 SPDK_DEBUGLOG(nvmf_vfio, 5111 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5112 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5113 5114 spdk_nvmf_request_exec_fabrics(req); 5115 return 0; 5116 } 5117 5118 static int 5119 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5120 struct spdk_nvmf_qpair *qpair) 5121 { 5122 struct nvmf_vfio_user_sq *sq; 5123 struct nvmf_vfio_user_poll_group *vu_group; 5124 5125 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5126 5127 SPDK_DEBUGLOG(nvmf_vfio, 5128 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5129 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5130 5131 5132 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5133 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5134 5135 return 0; 5136 } 5137 5138 static void 5139 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5140 { 5141 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5142 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5143 vu_req->iovcnt = 0; 5144 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5145 5146 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5147 } 5148 5149 static int 5150 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5151 { 5152 struct nvmf_vfio_user_sq *sq; 5153 struct nvmf_vfio_user_req *vu_req; 5154 5155 assert(req != NULL); 5156 5157 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5158 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5159 5160 _nvmf_vfio_user_req_free(sq, vu_req); 5161 5162 return 0; 5163 } 5164 5165 static int 5166 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5167 { 5168 struct nvmf_vfio_user_sq *sq; 5169 struct nvmf_vfio_user_req *vu_req; 5170 5171 assert(req != NULL); 5172 5173 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5174 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5175 5176 if (vu_req->cb_fn != NULL) { 5177 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5178 fail_ctrlr(sq->ctrlr); 5179 } 5180 } 5181 5182 _nvmf_vfio_user_req_free(sq, vu_req); 5183 5184 return 0; 5185 } 5186 5187 static void 5188 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5189 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5190 { 5191 struct nvmf_vfio_user_sq *sq; 5192 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5193 struct nvmf_vfio_user_endpoint *endpoint; 5194 5195 assert(qpair != NULL); 5196 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5197 vu_ctrlr = sq->ctrlr; 5198 endpoint = vu_ctrlr->endpoint; 5199 5200 pthread_mutex_lock(&endpoint->lock); 5201 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5202 delete_sq_done(vu_ctrlr, sq); 5203 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5204 endpoint->ctrlr = NULL; 5205 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5206 /* The controller will be freed, we can resume the subsystem 5207 * now so that the endpoint can be ready to accept another 5208 * 
new connection. 5209 */ 5210 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5211 vfio_user_endpoint_resume_done, endpoint); 5212 } 5213 free_ctrlr(vu_ctrlr); 5214 } 5215 pthread_mutex_unlock(&endpoint->lock); 5216 5217 if (cb_fn) { 5218 cb_fn(cb_arg); 5219 } 5220 } 5221 5222 /** 5223 * Returns a preallocated request, or NULL if there isn't one available. 5224 */ 5225 static struct nvmf_vfio_user_req * 5226 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5227 { 5228 struct nvmf_vfio_user_req *req; 5229 5230 if (sq == NULL) { 5231 return NULL; 5232 } 5233 5234 req = TAILQ_FIRST(&sq->free_reqs); 5235 if (req == NULL) { 5236 return NULL; 5237 } 5238 5239 TAILQ_REMOVE(&sq->free_reqs, req, link); 5240 5241 return req; 5242 } 5243 5244 static int 5245 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5246 { 5247 uint16_t nr; 5248 uint32_t nlb, nsid; 5249 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5250 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5251 struct spdk_nvmf_ns *ns; 5252 5253 nsid = cmd->nsid; 5254 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5255 if (ns == NULL || ns->bdev == NULL) { 5256 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5257 return -EINVAL; 5258 } 5259 5260 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5261 nr = cmd->cdw10_bits.dsm.nr + 1; 5262 return nr * sizeof(struct spdk_nvme_dsm_range); 5263 } 5264 5265 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5266 return nlb * spdk_bdev_get_block_size(ns->bdev); 5267 } 5268 5269 static int 5270 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5271 { 5272 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5273 uint32_t len = 0; 5274 uint8_t fid; 5275 int iovcnt; 5276 5277 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5278 req->length = 0; 5279 req->data = NULL; 5280 5281 if (req->xfer == SPDK_NVME_DATA_NONE) { 5282 return 0; 5283 } 5284 5285 switch (cmd->opc) { 5286 case SPDK_NVME_OPC_IDENTIFY: 5287 len = 4096; 5288 break; 5289 case SPDK_NVME_OPC_GET_LOG_PAGE: 5290 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 5291 break; 5292 case SPDK_NVME_OPC_GET_FEATURES: 5293 case SPDK_NVME_OPC_SET_FEATURES: 5294 fid = cmd->cdw10_bits.set_features.fid; 5295 switch (fid) { 5296 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5297 len = 4096; 5298 break; 5299 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5300 len = 256; 5301 break; 5302 case SPDK_NVME_FEAT_TIMESTAMP: 5303 len = 8; 5304 break; 5305 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5306 len = 512; 5307 break; 5308 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5309 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5310 len = 16; 5311 } else { 5312 len = 8; 5313 } 5314 break; 5315 default: 5316 return 0; 5317 } 5318 break; 5319 default: 5320 return 0; 5321 } 5322 5323 /* ADMIN command will not use SGL */ 5324 if (cmd->psdt != 0) { 5325 return -EINVAL; 5326 } 5327 5328 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5329 if (iovcnt < 0) { 5330 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5331 ctrlr_id(ctrlr), cmd->opc); 5332 return -1; 5333 } 5334 req->length = len; 5335 req->data = req->iov[0].iov_base; 5336 req->iovcnt = iovcnt; 5337 5338 return 0; 5339 } 5340 5341 /* 5342 * Map an I/O command's buffers. 5343 * 5344 * Returns 0 on success and -errno on failure. 
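 *
 * The transfer length comes from get_nvmf_io_req_length() above (NLB-based
 * for reads and writes, range-count-based for DSM), and vfio_user_map_cmd()
 * translates the guest buffer into local iovecs in req->iov.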
5345 */ 5346 static int 5347 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5348 { 5349 int len, iovcnt; 5350 struct spdk_nvme_cmd *cmd; 5351 5352 assert(ctrlr != NULL); 5353 assert(req != NULL); 5354 5355 cmd = &req->cmd->nvme_cmd; 5356 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5357 req->length = 0; 5358 req->data = NULL; 5359 5360 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5361 return 0; 5362 } 5363 5364 len = get_nvmf_io_req_length(req); 5365 if (len < 0) { 5366 return -EINVAL; 5367 } 5368 req->length = len; 5369 5370 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5371 if (iovcnt < 0) { 5372 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5373 return -EFAULT; 5374 } 5375 req->data = req->iov[0].iov_base; 5376 req->iovcnt = iovcnt; 5377 5378 return 0; 5379 } 5380 5381 static int 5382 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5383 struct nvmf_vfio_user_sq *sq) 5384 { 5385 int err; 5386 struct nvmf_vfio_user_req *vu_req; 5387 struct spdk_nvmf_request *req; 5388 5389 assert(ctrlr != NULL); 5390 assert(cmd != NULL); 5391 5392 vu_req = get_nvmf_vfio_user_req(sq); 5393 if (spdk_unlikely(vu_req == NULL)) { 5394 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5395 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5396 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5397 5398 } 5399 req = &vu_req->req; 5400 5401 assert(req->qpair != NULL); 5402 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5403 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5404 5405 vu_req->cb_fn = handle_cmd_rsp; 5406 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5407 req->cmd->nvme_cmd = *cmd; 5408 5409 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5410 err = map_admin_cmd_req(ctrlr, req); 5411 } else { 5412 switch (cmd->opc) { 5413 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5414 case SPDK_NVME_OPC_RESERVATION_REPORT: 5415 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5416 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5417 err = -ENOTSUP; 5418 break; 5419 default: 5420 err = map_io_cmd_req(ctrlr, req); 5421 break; 5422 } 5423 } 5424 5425 if (spdk_unlikely(err < 0)) { 5426 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5427 ctrlr_id(ctrlr), cmd->opc); 5428 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5429 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5430 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5431 _nvmf_vfio_user_req_free(sq, vu_req); 5432 return err; 5433 } 5434 5435 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5436 spdk_nvmf_request_exec(req); 5437 5438 return 0; 5439 } 5440 5441 /* 5442 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5443 * here: if the host isn't up to date, and is apparently not actively processing 5444 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5445 */ 5446 static void 5447 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5448 struct nvmf_vfio_user_sq *sq) 5449 { 5450 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5451 uint32_t cq_head; 5452 uint32_t cq_tail; 5453 5454 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5455 return; 5456 } 5457 5458 cq_tail = *cq_tailp(cq); 5459 5460 /* Already sent? 
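	 * If the CQ tail hasn't moved since the last interrupt we triggered,
	 * there is nothing new to signal.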
 */
5461 	if (cq_tail == cq->last_trigger_irq_tail) {
5462 		return;
5463 	}
5464
5465 	spdk_ivdt_dcache(cq_dbl_headp(cq));
5466 	cq_head = *cq_dbl_headp(cq);
5467
5468 	if (cq_head != cq_tail && cq_head == cq->last_head) {
5469 		int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
5470 		if (err != 0) {
5471 			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
5472 				    ctrlr_id(ctrlr));
5473 		} else {
5474 			cq->last_trigger_irq_tail = cq_tail;
5475 		}
5476 	}
5477
5478 	cq->last_head = cq_head;
5479 }
5480
5481 /* Returns the number of commands processed, or a negative value on error. */
5482 static int
5483 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq)
5484 {
5485 	struct nvmf_vfio_user_ctrlr *ctrlr;
5486 	uint32_t new_tail;
5487 	int count = 0;
5488
5489 	assert(sq != NULL);
5490
5491 	ctrlr = sq->ctrlr;
5492
5493 	/*
5494 	 * A quiesced, or migrating, controller should never process new
5495 	 * commands.
5496 	 */
5497 	if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
5498 		return SPDK_POLLER_IDLE;
5499 	}
5500
5501 	if (ctrlr->adaptive_irqs_enabled) {
5502 		handle_suppressed_irq(ctrlr, sq);
5503 	}
5504
5505 	/* On aarch64 platforms, a doorbell update from the guest VM may not be
5506 	 * visible on the SPDK target side. This is caused by a memory type
5507 	 * mismatch: on the guest VM side the doorbells are mapped as device
5508 	 * memory, while on the SPDK target side they are treated as normal
5509 	 * memory, which causes problems on ARM platforms.
5510 	 * Refer to "https://developer.arm.com/documentation/102376/0100/
5511 	 * Memory-aliasing-and-mismatched-memory-types". Using spdk_mb() alone
5512 	 * cannot fix this; invalidating the cache line with "dc civac" (what
5513 	 * spdk_ivdt_dcache() does below) works around it.
5514 	 */
5515 	spdk_ivdt_dcache(sq_dbl_tailp(sq));
5516
5517 	/* Load-Acquire. */
5518 	new_tail = *sq_dbl_tailp(sq);
5519
5520 	new_tail = new_tail & 0xffffu;
5521 	if (spdk_unlikely(new_tail >= sq->size)) {
5522 		union spdk_nvme_async_event_completion event = {};
5523
5524 		SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid,
5525 			      new_tail);
5526 		event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR;
5527 		event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE;
5528 		nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event);
5529
5530 		return -1;
5531 	}
5532
5533 	if (*sq_headp(sq) == new_tail) {
5534 		return 0;
5535 	}
5536
5537 	SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n",
5538 		      ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail);
5539 	if (ctrlr->sdbl != NULL) {
5540 		SPDK_DEBUGLOG(nvmf_vfio,
5541 			      "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n",
5542 			      ctrlr_id(ctrlr), sq->qid,
5543 			      ctrlr->bar0_doorbells[queue_index(sq->qid, false)],
5544 			      ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)],
5545 			      ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]);
5546 	}
5547
5548 	/*
5549 	 * Ensure that changes to the queue are visible to us.
5550 	 * The host driver should write the queue first, do a wmb(), and then
5551 	 * update the SQ tail doorbell (their Store-Release).
5552 	 */
5553 	spdk_rmb();
5554
5555 	count = handle_sq_tdbl_write(ctrlr, new_tail, sq);
5556 	if (spdk_unlikely(count < 0)) {
5557 		fail_ctrlr(ctrlr);
5558 	}
5559
5560 	return count;
5561 }
5562
5563 /*
5564  * vfio-user transport poll handler. Note that the library context is polled in
5565  * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
5566  * active SQs.
5567  *
5568  * Returns the number of commands processed, or a negative value on error.
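 *
 * In interrupt mode this is additionally driven from
 * vfio_user_poll_group_process(), which follows it with
 * vfio_user_poll_group_rearm() to re-arm the event indexes.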
5569 */ 5570 static int 5571 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 5572 { 5573 struct nvmf_vfio_user_poll_group *vu_group; 5574 struct nvmf_vfio_user_sq *sq, *tmp; 5575 int count = 0; 5576 5577 assert(group != NULL); 5578 5579 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5580 5581 SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n"); 5582 5583 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) { 5584 int ret; 5585 5586 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 5587 continue; 5588 } 5589 5590 ret = nvmf_vfio_user_sq_poll(sq); 5591 5592 if (spdk_unlikely(ret < 0)) { 5593 return ret; 5594 } 5595 5596 count += ret; 5597 } 5598 5599 return count; 5600 } 5601 5602 static int 5603 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5604 struct spdk_nvme_transport_id *trid) 5605 { 5606 struct nvmf_vfio_user_sq *sq; 5607 struct nvmf_vfio_user_ctrlr *ctrlr; 5608 5609 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5610 ctrlr = sq->ctrlr; 5611 5612 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5613 return 0; 5614 } 5615 5616 static int 5617 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 5618 struct spdk_nvme_transport_id *trid) 5619 { 5620 return 0; 5621 } 5622 5623 static int 5624 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5625 struct spdk_nvme_transport_id *trid) 5626 { 5627 struct nvmf_vfio_user_sq *sq; 5628 struct nvmf_vfio_user_ctrlr *ctrlr; 5629 5630 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5631 ctrlr = sq->ctrlr; 5632 5633 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5634 return 0; 5635 } 5636 5637 static void 5638 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 5639 struct spdk_nvmf_request *req) 5640 { 5641 struct spdk_nvmf_request *req_to_abort = NULL; 5642 struct spdk_nvmf_request *temp_req = NULL; 5643 uint16_t cid; 5644 5645 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5646 5647 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) { 5648 struct nvmf_vfio_user_req *vu_req; 5649 5650 vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req); 5651 5652 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 5653 req_to_abort = temp_req; 5654 break; 5655 } 5656 } 5657 5658 if (req_to_abort == NULL) { 5659 spdk_nvmf_request_complete(req); 5660 return; 5661 } 5662 5663 req->req_to_abort = req_to_abort; 5664 nvmf_ctrlr_abort_request(req); 5665 } 5666 5667 static void 5668 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 5669 { 5670 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 5671 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 5672 opts->in_capsule_data_size = 0; 5673 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 5674 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 5675 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 5676 opts->num_shared_buffers = 0; 5677 opts->buf_cache_size = 0; 5678 opts->association_timeout = 0; 5679 opts->transport_specific = NULL; 5680 } 5681 5682 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 5683 .name = "VFIOUSER", 5684 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 5685 .opts_init = nvmf_vfio_user_opts_init, 5686 .create = nvmf_vfio_user_create, 5687 .destroy = nvmf_vfio_user_destroy, 5688 5689 .listen = nvmf_vfio_user_listen, 5690 .stop_listen = nvmf_vfio_user_stop_listen, 5691 .cdata_init = 
nvmf_vfio_user_cdata_init, 5692 .listen_associate = nvmf_vfio_user_listen_associate, 5693 5694 .listener_discover = nvmf_vfio_user_discover, 5695 5696 .poll_group_create = nvmf_vfio_user_poll_group_create, 5697 .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group, 5698 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 5699 .poll_group_add = nvmf_vfio_user_poll_group_add, 5700 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 5701 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 5702 5703 .req_free = nvmf_vfio_user_req_free, 5704 .req_complete = nvmf_vfio_user_req_complete, 5705 5706 .qpair_fini = nvmf_vfio_user_close_qpair, 5707 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 5708 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 5709 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 5710 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 5711 }; 5712 5713 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 5714 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 5715 SPDK_LOG_REGISTER_COMPONENT(vfio_user_db) 5716