/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2020 Intel Corporation.
 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved.
 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

/*
 * NVMe over vfio-user transport
 */

#include <sys/param.h>

#include <vfio-user/libvfio-user.h>
#include <vfio-user/pci_defs.h>

#include "spdk/barrier.h"
#include "spdk/stdinc.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf_transport.h"
#include "spdk/sock.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"

#include "transport.h"

#include "nvmf_internal.h"

#define SWAP(x, y) \
	do \
	{ \
		typeof(x) _tmp = x; \
		x = y; \
		y = _tmp; \
	} while (0)

#define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
#define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
#define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
#define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE

#define NVME_DOORBELLS_OFFSET 0x1000
#define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2
#define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3
#define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX

#define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 512
#define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4)

/* NVMe spec 1.4, section 5.21.1.7 */
SPDK_STATIC_ASSERT(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR >= 2 &&
		   NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR <= 65535,
		   "bad number of queues");

/*
 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space
 * available on PCI-X 2.0 and PCI Express buses.
 */
#define NVME_REG_CFG_SIZE 0x1000

/*
 * Doorbells must be page aligned so that they can be memory mapped.
 *
 * TODO does the NVMe spec also require this? Document it.
 */
#define NVMF_VFIO_USER_DOORBELLS_SIZE \
	SPDK_ALIGN_CEIL( \
		(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2 * SPDK_NVME_DOORBELL_REGISTER_SIZE), \
		0x1000)
#define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE)

/*
 * TODO check the PCI spec whether BAR4 and BAR5 really have to be at least one
 * page and a multiple of page size (maybe QEMU also needs this?). Document all
 * this.
 */

/*
 * MSI-X Pending Bit Array Size
 *
 * TODO according to the PCI spec we need one bit per vector, document the
 * relevant section.
 *
 * If the first argument to SPDK_ALIGN_CEIL is 0 then the result is 0, so we
 * would end up with a 0-size BAR5.
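 *
 * For example, with NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR = 512 the PBA needs
 * 512 bits (64 bytes), and BAR5 rounds up to a single 0x1000-byte page.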
87 */ 88 #define NVME_IRQ_MSIX_NUM MAX(CHAR_BIT, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) 89 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / CHAR_BIT), 0x1000) 90 SPDK_STATIC_ASSERT(NVME_BAR5_SIZE > 0, "Incorrect size"); 91 92 /* MSI-X Table Size */ 93 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 94 SPDK_STATIC_ASSERT(NVME_BAR4_SIZE > 0, "Incorrect size"); 95 96 struct nvmf_vfio_user_req; 97 98 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 99 100 /* 1 more for PRP2 list itself */ 101 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 102 103 enum nvmf_vfio_user_req_state { 104 VFIO_USER_REQUEST_STATE_FREE = 0, 105 VFIO_USER_REQUEST_STATE_EXECUTING, 106 }; 107 108 /* 109 * Support for live migration in NVMf/vfio-user: live migration is implemented 110 * by stopping the NVMf subsystem when the device is instructed to enter the 111 * stop-and-copy state and then trivially, and most importantly safely, 112 * collecting migration state and providing it to the vfio-user client. We 113 * don't provide any migration state at the pre-copy state as that's too 114 * complicated to do, we might support this in the future. 115 */ 116 117 118 /* NVMe device state representation */ 119 struct nvme_migr_sq_state { 120 uint16_t sqid; 121 uint16_t cqid; 122 uint32_t head; 123 uint32_t size; 124 uint32_t reserved; 125 uint64_t dma_addr; 126 }; 127 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 128 129 struct nvme_migr_cq_state { 130 uint16_t cqid; 131 uint16_t phase; 132 uint32_t tail; 133 uint32_t size; 134 uint32_t iv; 135 uint32_t ien; 136 uint32_t reserved; 137 uint64_t dma_addr; 138 }; 139 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 140 141 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 142 143 /* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned. 144 * 145 * NVMe device migration region is defined as below: 146 * ------------------------------------------------------------------------- 147 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs | 148 * ------------------------------------------------------------------------- 149 * 150 * Keep vfio_user_nvme_migr_header as a fixed 0x1000 length, all new added fields 151 * can use the reserved space at the end of the data structure. 152 */ 153 struct vfio_user_nvme_migr_header { 154 /* Magic value to validate migration data */ 155 uint32_t magic; 156 /* Version to check the data is same from source to destination */ 157 uint32_t version; 158 159 /* The library uses this field to know how many fields in this 160 * structure are valid, starting at the beginning of this data 161 * structure. New added fields in future use `unused` memory 162 * spaces. 163 */ 164 uint32_t opts_size; 165 uint32_t reserved0; 166 167 /* BARs information */ 168 uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS]; 169 uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS]; 170 171 /* Queue pair start offset, starting at the beginning of this 172 * data structure. 173 */ 174 uint64_t qp_offset; 175 uint64_t qp_len; 176 177 /* Controller data structure */ 178 uint32_t num_io_queues; 179 uint32_t reserved1; 180 181 /* NVMf controller data offset and length if exist, starting at 182 * the beginning of this data structure. 183 */ 184 uint64_t nvmf_data_offset; 185 uint64_t nvmf_data_len; 186 187 /* 188 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA 189 * address. 
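	 * Testing shadow_doorbell_buffer for zero is therefore not sufficient;
	 * this flag records the choice explicitly.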
190 */ 191 uint32_t sdbl; 192 193 /* Shadow doorbell DMA addresses. */ 194 uint64_t shadow_doorbell_buffer; 195 uint64_t eventidx_buffer; 196 197 /* Reserved memory space for new added fields, the 198 * field is always at the end of this data structure. 199 */ 200 uint8_t unused[3856]; 201 }; 202 SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size"); 203 204 struct vfio_user_nvme_migr_qp { 205 struct nvme_migr_sq_state sq; 206 struct nvme_migr_cq_state cq; 207 }; 208 209 /* NVMe state definition used to load/restore from/to NVMe migration BAR region */ 210 struct vfio_user_nvme_migr_state { 211 struct vfio_user_nvme_migr_header ctrlr_header; 212 struct spdk_nvmf_ctrlr_migr_data nvmf_data; 213 struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 214 uint8_t doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE]; 215 uint8_t cfg[NVME_REG_CFG_SIZE]; 216 }; 217 218 struct nvmf_vfio_user_req { 219 struct spdk_nvmf_request req; 220 struct spdk_nvme_cpl rsp; 221 struct spdk_nvme_cmd cmd; 222 223 enum nvmf_vfio_user_req_state state; 224 nvmf_vfio_user_req_cb_fn cb_fn; 225 void *cb_arg; 226 227 /* old CC before prop_set_cc fabric command */ 228 union spdk_nvme_cc_register cc; 229 230 TAILQ_ENTRY(nvmf_vfio_user_req) link; 231 232 struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS]; 233 uint8_t iovcnt; 234 235 /* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */ 236 uint8_t sg[]; 237 }; 238 239 /* 240 * Mapping of an NVMe queue. 241 * 242 * This holds the information tracking a local process mapping of an NVMe queue 243 * shared by the client. 244 */ 245 struct nvme_q_mapping { 246 /* iov of local process mapping. */ 247 struct iovec iov; 248 /* Stored sg, needed for unmap. */ 249 dma_sg_t *sg; 250 /* Client PRP of queue. */ 251 uint64_t prp1; 252 }; 253 254 enum nvmf_vfio_user_sq_state { 255 VFIO_USER_SQ_UNUSED = 0, 256 VFIO_USER_SQ_CREATED, 257 VFIO_USER_SQ_DELETED, 258 VFIO_USER_SQ_ACTIVE, 259 VFIO_USER_SQ_INACTIVE 260 }; 261 262 enum nvmf_vfio_user_cq_state { 263 VFIO_USER_CQ_UNUSED = 0, 264 VFIO_USER_CQ_CREATED, 265 VFIO_USER_CQ_DELETED, 266 }; 267 268 enum nvmf_vfio_user_ctrlr_state { 269 VFIO_USER_CTRLR_CREATING = 0, 270 VFIO_USER_CTRLR_RUNNING, 271 /* Quiesce requested by libvfio-user */ 272 VFIO_USER_CTRLR_PAUSING, 273 /* NVMf subsystem is paused, it's safe to do PCI reset, memory register, 274 * memory unergister, and vfio migration state transition in this state. 275 */ 276 VFIO_USER_CTRLR_PAUSED, 277 /* 278 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI 279 * reset, memory register and unregister, controller in destination VM has 280 * been restored). NVMf subsystem resume has been requested. 281 */ 282 VFIO_USER_CTRLR_RESUMING, 283 /* 284 * Implies that the NVMf subsystem is paused. Both controller in source VM and 285 * destinatiom VM is in this state when doing live migration. 286 */ 287 VFIO_USER_CTRLR_MIGRATING 288 }; 289 290 struct nvmf_vfio_user_sq { 291 struct spdk_nvmf_qpair qpair; 292 struct spdk_nvmf_transport_poll_group *group; 293 struct nvmf_vfio_user_ctrlr *ctrlr; 294 295 uint32_t qid; 296 /* Number of entries in queue. */ 297 uint32_t size; 298 struct nvme_q_mapping mapping; 299 enum nvmf_vfio_user_sq_state sq_state; 300 301 uint32_t head; 302 volatile uint32_t *dbl_tailp; 303 304 /* Whether a shadow doorbell eventidx needs setting. 
*/ 305 bool need_rearm; 306 307 /* multiple SQs can be mapped to the same CQ */ 308 uint16_t cqid; 309 310 /* handle_queue_connect_rsp() can be used both for CREATE IO SQ response 311 * and SQ re-connect response in the destination VM, for the prior case, 312 * we will post a NVMe completion to VM, we will not set this flag when 313 * re-connecting SQs in the destination VM. 314 */ 315 bool post_create_io_sq_completion; 316 /* Copy of Create IO SQ command, this field is used together with 317 * `post_create_io_sq_completion` flag. 318 */ 319 struct spdk_nvme_cmd create_io_sq_cmd; 320 321 /* Currently unallocated reqs. */ 322 TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs; 323 /* Poll group entry */ 324 TAILQ_ENTRY(nvmf_vfio_user_sq) link; 325 /* Connected SQ entry */ 326 TAILQ_ENTRY(nvmf_vfio_user_sq) tailq; 327 }; 328 329 struct nvmf_vfio_user_cq { 330 struct spdk_nvmf_transport_poll_group *group; 331 struct spdk_thread *thread; 332 uint32_t cq_ref; 333 334 uint32_t qid; 335 /* Number of entries in queue. */ 336 uint32_t size; 337 struct nvme_q_mapping mapping; 338 enum nvmf_vfio_user_cq_state cq_state; 339 340 uint32_t tail; 341 volatile uint32_t *dbl_headp; 342 343 bool phase; 344 345 uint16_t iv; 346 bool ien; 347 348 uint32_t last_head; 349 uint32_t last_trigger_irq_tail; 350 }; 351 352 struct nvmf_vfio_user_poll_group { 353 struct spdk_nvmf_transport_poll_group group; 354 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 355 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 356 struct spdk_interrupt *intr; 357 int intr_fd; 358 }; 359 360 struct nvmf_vfio_user_shadow_doorbells { 361 volatile uint32_t *shadow_doorbells; 362 volatile uint32_t *eventidxs; 363 dma_sg_t *sgs; 364 struct iovec *iovs; 365 }; 366 367 struct nvmf_vfio_user_ctrlr { 368 struct nvmf_vfio_user_endpoint *endpoint; 369 struct nvmf_vfio_user_transport *transport; 370 371 /* Connected SQs list */ 372 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 373 enum nvmf_vfio_user_ctrlr_state state; 374 375 /* 376 * Tells whether live migration data have been prepared. This is used 377 * by the get_pending_bytes callback to tell whether or not the 378 * previous iteration finished. 379 */ 380 bool migr_data_prepared; 381 382 /* Controller is in source VM when doing live migration */ 383 bool in_source_vm; 384 385 struct spdk_thread *thread; 386 struct spdk_poller *vfu_ctx_poller; 387 struct spdk_interrupt *intr; 388 int intr_fd; 389 390 bool queued_quiesce; 391 392 bool reset_shn; 393 394 uint16_t cntlid; 395 struct spdk_nvmf_ctrlr *ctrlr; 396 397 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 398 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 399 400 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 401 402 volatile uint32_t *bar0_doorbells; 403 struct nvmf_vfio_user_shadow_doorbells *sdbl; 404 /* 405 * Shadow doorbells PRPs to provide during the stop-and-copy state. 406 */ 407 uint64_t shadow_doorbell_buffer; 408 uint64_t eventidx_buffer; 409 410 bool adaptive_irqs_enabled; 411 }; 412 413 /* Endpoint in vfio-user is associated with a socket file, which 414 * is the representative of a PCI endpoint. 
415 */ 416 struct nvmf_vfio_user_endpoint { 417 struct nvmf_vfio_user_transport *transport; 418 vfu_ctx_t *vfu_ctx; 419 struct spdk_poller *accept_poller; 420 struct spdk_thread *accept_thread; 421 bool interrupt_mode; 422 struct msixcap *msix; 423 vfu_pci_config_space_t *pci_config_space; 424 int devmem_fd; 425 int accept_intr_fd; 426 struct spdk_interrupt *accept_intr; 427 428 volatile uint32_t *bar0_doorbells; 429 430 int migr_fd; 431 void *migr_data; 432 433 struct spdk_nvme_transport_id trid; 434 struct spdk_nvmf_subsystem *subsystem; 435 436 /* Controller is associated with an active socket connection, 437 * the lifecycle of the controller is same as the VM. 438 * Currently we only support one active connection, as the NVMe 439 * specification defines, we may support multiple controllers in 440 * future, so that it can support e.g: RESERVATION. 441 */ 442 struct nvmf_vfio_user_ctrlr *ctrlr; 443 pthread_mutex_t lock; 444 445 bool need_async_destroy; 446 /* The subsystem is in PAUSED state and need to be resumed, TRUE 447 * only when migration is done successfully and the controller is 448 * in source VM. 449 */ 450 bool need_resume; 451 /* Start the accept poller again after destroying the controller */ 452 bool need_relisten; 453 454 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 455 }; 456 457 struct nvmf_vfio_user_transport_opts { 458 bool disable_mappable_bar0; 459 bool disable_adaptive_irq; 460 bool disable_shadow_doorbells; 461 bool disable_compare; 462 bool enable_intr_mode_sq_spreading; 463 }; 464 465 struct nvmf_vfio_user_transport { 466 struct spdk_nvmf_transport transport; 467 struct nvmf_vfio_user_transport_opts transport_opts; 468 bool intr_mode_supported; 469 pthread_mutex_t lock; 470 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 471 472 pthread_mutex_t pg_lock; 473 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 474 struct nvmf_vfio_user_poll_group *next_pg; 475 }; 476 477 /* 478 * function prototypes 479 */ 480 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 481 482 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 483 484 /* 485 * Local process virtual address of a queue. 
486 */ 487 static inline void * 488 q_addr(struct nvme_q_mapping *mapping) 489 { 490 return mapping->iov.iov_base; 491 } 492 493 static inline int 494 queue_index(uint16_t qid, bool is_cq) 495 { 496 return (qid * 2) + is_cq; 497 } 498 499 static inline volatile uint32_t * 500 sq_headp(struct nvmf_vfio_user_sq *sq) 501 { 502 assert(sq != NULL); 503 return &sq->head; 504 } 505 506 static inline volatile uint32_t * 507 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 508 { 509 assert(sq != NULL); 510 return sq->dbl_tailp; 511 } 512 513 static inline volatile uint32_t * 514 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 515 { 516 assert(cq != NULL); 517 return cq->dbl_headp; 518 } 519 520 static inline volatile uint32_t * 521 cq_tailp(struct nvmf_vfio_user_cq *cq) 522 { 523 assert(cq != NULL); 524 return &cq->tail; 525 } 526 527 static inline void 528 sq_head_advance(struct nvmf_vfio_user_sq *sq) 529 { 530 assert(sq != NULL); 531 532 assert(*sq_headp(sq) < sq->size); 533 (*sq_headp(sq))++; 534 535 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 536 *sq_headp(sq) = 0; 537 } 538 } 539 540 static inline void 541 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 542 { 543 assert(cq != NULL); 544 545 assert(*cq_tailp(cq) < cq->size); 546 (*cq_tailp(cq))++; 547 548 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 549 *cq_tailp(cq) = 0; 550 cq->phase = !cq->phase; 551 } 552 } 553 554 /* 555 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 556 * control: if there is no space in the CQ, we should wait until there is. 557 * 558 * In practice, we just fail the controller instead: as it happens, all host 559 * implementations we care about right-size the CQ: this is required anyway for 560 * NVMEoF support (see 3.3.2.8). 561 * 562 * Since reading the head doorbell is relatively expensive, we use the cached 563 * value, so we only have to read it for real if it appears that we are full. 564 */ 565 static inline bool 566 cq_is_full(struct nvmf_vfio_user_cq *cq) 567 { 568 uint32_t qindex; 569 570 assert(cq != NULL); 571 572 qindex = *cq_tailp(cq) + 1; 573 if (spdk_unlikely(qindex == cq->size)) { 574 qindex = 0; 575 } 576 577 if (qindex != cq->last_head) { 578 return false; 579 } 580 581 cq->last_head = *cq_dbl_headp(cq); 582 583 return qindex == cq->last_head; 584 } 585 586 static bool 587 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 588 { 589 assert(vu_ctrlr != NULL); 590 591 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 592 return false; 593 } 594 595 if (is_cq) { 596 if (vu_ctrlr->cqs[qid] == NULL) { 597 return false; 598 } 599 600 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 601 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 602 } 603 604 if (vu_ctrlr->sqs[qid] == NULL) { 605 return false; 606 } 607 608 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 609 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 610 } 611 612 static char * 613 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 614 { 615 return endpoint->trid.traddr; 616 } 617 618 static char * 619 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 620 { 621 if (!ctrlr || !ctrlr->endpoint) { 622 return "Null Ctrlr"; 623 } 624 625 return endpoint_id(ctrlr->endpoint); 626 } 627 628 /* Return the poll group for the admin queue of the controller. 
*/ 629 static inline struct nvmf_vfio_user_poll_group * 630 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 631 { 632 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 633 struct nvmf_vfio_user_poll_group, 634 group); 635 } 636 637 static inline struct spdk_thread * 638 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 639 { 640 return vu_pg->group.group->thread; 641 } 642 643 static dma_sg_t * 644 index_to_sg_t(void *arr, size_t i) 645 { 646 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 647 } 648 649 static inline size_t 650 vfio_user_migr_data_len(void) 651 { 652 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 653 } 654 655 static inline bool 656 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 657 { 658 return spdk_interrupt_mode_is_enabled() && 659 vu_transport->intr_mode_supported; 660 } 661 662 static int vfio_user_ctrlr_intr(void *ctx); 663 664 static void 665 vfio_user_msg_ctrlr_intr(void *ctx) 666 { 667 vfio_user_ctrlr_intr(ctx); 668 } 669 670 /* 671 * Kick (force a wakeup) of all poll groups for this controller. 672 * vfio_user_ctrlr_intr() itself arranges for kicking other poll groups if 673 * needed. 674 */ 675 static void 676 ctrlr_kick(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 677 { 678 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 679 680 SPDK_DEBUGLOG(vfio_user_db, "%s: kicked\n", ctrlr_id(vu_ctrlr)); 681 682 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 683 684 spdk_thread_send_msg(poll_group_to_thread(vu_ctrlr_group), 685 vfio_user_msg_ctrlr_intr, vu_ctrlr); 686 } 687 688 /* 689 * Make the given DMA address and length available (locally mapped) via iov. 690 */ 691 static void * 692 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 693 struct iovec *iov, int prot) 694 { 695 int ret; 696 697 assert(ctx != NULL); 698 assert(sg != NULL); 699 assert(iov != NULL); 700 701 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 702 if (ret < 0) { 703 return NULL; 704 } 705 706 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 707 if (ret != 0) { 708 return NULL; 709 } 710 711 assert(iov->iov_base != NULL); 712 return iov->iov_base; 713 } 714 715 static int 716 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 717 uint32_t max_iovcnt, uint32_t len, size_t mps, 718 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 719 { 720 uint64_t prp1, prp2; 721 void *vva; 722 uint32_t i; 723 uint32_t residue_len, nents; 724 uint64_t *prp_list; 725 uint32_t iovcnt; 726 727 assert(max_iovcnt > 0); 728 729 prp1 = cmd->dptr.prp.prp1; 730 prp2 = cmd->dptr.prp.prp2; 731 732 /* PRP1 may started with unaligned page address */ 733 residue_len = mps - (prp1 % mps); 734 residue_len = spdk_min(len, residue_len); 735 736 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 737 if (spdk_unlikely(vva == NULL)) { 738 SPDK_ERRLOG("GPA to VVA failed\n"); 739 return -EINVAL; 740 } 741 len -= residue_len; 742 if (len && max_iovcnt < 2) { 743 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 744 return -ERANGE; 745 } 746 iovs[0].iov_base = vva; 747 iovs[0].iov_len = residue_len; 748 749 if (len) { 750 if (spdk_unlikely(prp2 == 0)) { 751 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 752 return -EINVAL; 753 } 754 755 if (len <= mps) { 756 /* 2 PRP used */ 757 iovcnt = 2; 758 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 759 if (spdk_unlikely(vva == NULL)) { 760 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 761 prp2, len); 
762 return -EINVAL; 763 } 764 iovs[1].iov_base = vva; 765 iovs[1].iov_len = len; 766 } else { 767 /* PRP list used */ 768 nents = (len + mps - 1) / mps; 769 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 770 SPDK_ERRLOG("Too many page entries\n"); 771 return -ERANGE; 772 } 773 774 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 775 if (spdk_unlikely(vva == NULL)) { 776 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 777 prp2, nents); 778 return -EINVAL; 779 } 780 prp_list = vva; 781 i = 0; 782 while (len != 0) { 783 residue_len = spdk_min(len, mps); 784 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 785 if (spdk_unlikely(vva == NULL)) { 786 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 787 prp_list[i], residue_len); 788 return -EINVAL; 789 } 790 iovs[i + 1].iov_base = vva; 791 iovs[i + 1].iov_len = residue_len; 792 len -= residue_len; 793 i++; 794 } 795 iovcnt = i + 1; 796 } 797 } else { 798 /* 1 PRP used */ 799 iovcnt = 1; 800 } 801 802 assert(iovcnt <= max_iovcnt); 803 return iovcnt; 804 } 805 806 static int 807 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 808 struct iovec *iovs, uint32_t max_iovcnt, 809 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 810 { 811 uint32_t i; 812 void *vva; 813 814 if (spdk_unlikely(max_iovcnt < num_sgls)) { 815 return -ERANGE; 816 } 817 818 for (i = 0; i < num_sgls; i++) { 819 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 820 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 821 return -EINVAL; 822 } 823 vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 824 if (spdk_unlikely(vva == NULL)) { 825 SPDK_ERRLOG("GPA to VVA failed\n"); 826 return -EINVAL; 827 } 828 iovs[i].iov_base = vva; 829 iovs[i].iov_len = sgls[i].unkeyed.length; 830 } 831 832 return num_sgls; 833 } 834 835 static int 836 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 837 uint32_t len, size_t mps, 838 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 839 { 840 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 841 uint32_t num_sgls, seg_len; 842 void *vva; 843 int ret; 844 uint32_t total_iovcnt = 0; 845 846 /* SGL cases */ 847 sgl = &cmd->dptr.sgl1; 848 849 /* only one SGL segment */ 850 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 851 assert(max_iovcnt > 0); 852 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 853 if (spdk_unlikely(vva == NULL)) { 854 SPDK_ERRLOG("GPA to VVA failed\n"); 855 return -EINVAL; 856 } 857 iovs[0].iov_base = vva; 858 iovs[0].iov_len = sgl->unkeyed.length; 859 assert(sgl->unkeyed.length == len); 860 861 return 1; 862 } 863 864 for (;;) { 865 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 866 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 867 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 868 return -EINVAL; 869 } 870 871 seg_len = sgl->unkeyed.length; 872 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 873 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 874 return -EINVAL; 875 } 876 877 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 878 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 879 if (spdk_unlikely(vva == NULL)) { 880 SPDK_ERRLOG("GPA to VVA failed\n"); 881 return -EINVAL; 882 } 883 884 /* sgl point to the first segment */ 885 sgl = 
(struct spdk_nvme_sgl_descriptor *)vva; 886 last_sgl = &sgl[num_sgls - 1]; 887 888 /* we are done */ 889 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 890 /* map whole sgl list */ 891 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 892 max_iovcnt - total_iovcnt, gpa_to_vva); 893 if (spdk_unlikely(ret < 0)) { 894 return ret; 895 } 896 total_iovcnt += ret; 897 898 return total_iovcnt; 899 } 900 901 if (num_sgls > 1) { 902 /* map whole sgl exclude last_sgl */ 903 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 904 max_iovcnt - total_iovcnt, gpa_to_vva); 905 if (spdk_unlikely(ret < 0)) { 906 return ret; 907 } 908 total_iovcnt += ret; 909 } 910 911 /* move to next level's segments */ 912 sgl = last_sgl; 913 } 914 915 return 0; 916 } 917 918 static int 919 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 920 uint32_t len, size_t mps, 921 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 922 { 923 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 924 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 925 } 926 927 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 928 } 929 930 /* 931 * For each queue, update the location of its doorbell to the correct location: 932 * either our own BAR0, or the guest's configured shadow doorbell area. 933 * 934 * The Admin queue (qid: 0) does not ever use shadow doorbells. 935 */ 936 static void 937 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 938 { 939 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 940 ctrlr->bar0_doorbells; 941 942 assert(doorbells != NULL); 943 944 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 945 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 946 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 947 948 if (sq != NULL) { 949 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 950 951 ctrlr->sqs[i]->need_rearm = shadow; 952 } 953 954 if (cq != NULL) { 955 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 956 } 957 } 958 } 959 960 static void 961 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 962 { 963 assert(vfu_ctx != NULL); 964 assert(sdbl != NULL); 965 966 /* 967 * An allocation error would result in only one of the two being 968 * non-NULL. If that is the case, no memory should have been mapped. 969 */ 970 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 971 return; 972 } 973 974 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 975 struct iovec *iov; 976 dma_sg_t *sg; 977 978 if (!sdbl->iovs[i].iov_len) { 979 continue; 980 } 981 982 sg = index_to_sg_t(sdbl->sgs, i); 983 iov = sdbl->iovs + i; 984 985 vfu_sgl_put(vfu_ctx, sg, iov, 1); 986 } 987 } 988 989 static void 990 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 991 { 992 if (sdbl == NULL) { 993 return; 994 } 995 996 unmap_sdbl(vfu_ctx, sdbl); 997 998 /* 999 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 1000 * not allocated, so don't free() them. 
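	 * They are released by the unmap_sdbl() call above via vfu_sgl_put().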
	 */
	free(sdbl->sgs);
	free(sdbl->iovs);
	free(sdbl);
}

static struct nvmf_vfio_user_shadow_doorbells *
map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len)
{
	struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL;
	dma_sg_t *sg2 = NULL;
	void *p;

	assert(vfu_ctx != NULL);

	sdbl = calloc(1, sizeof(*sdbl));
	if (sdbl == NULL) {
		goto err;
	}

	sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size());
	sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs));
	if (sdbl->sgs == NULL || sdbl->iovs == NULL) {
		goto err;
	}

	/* Map shadow doorbell buffer (PRP1). */
	p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs,
		    PROT_READ | PROT_WRITE);

	if (p == NULL) {
		goto err;
	}

	/*
	 * Map eventidx buffer (PRP2).
	 * Should only be written to by the controller.
	 */

	sg2 = index_to_sg_t(sdbl->sgs, 1);

	p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1,
		    PROT_READ | PROT_WRITE);

	if (p == NULL) {
		goto err;
	}

	sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base;
	sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base;

	return sdbl;

err:
	free_sdbl(vfu_ctx, sdbl);
	return NULL;
}

/*
 * Copy doorbells from one buffer to the other during switches between BAR0
 * doorbells and shadow doorbells.
 */
static void
copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr,
	       const volatile uint32_t *from, volatile uint32_t *to)
{
	assert(ctrlr != NULL);
	assert(from != NULL);
	assert(to != NULL);

	SPDK_DEBUGLOG(vfio_user_db,
		      "%s: migrating shadow doorbells from %p to %p\n",
		      ctrlr_id(ctrlr), from, to);

	/* Can't use memcpy because it doesn't respect volatile semantics.
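	 * Copy each queue's 32-bit doorbell value individually instead.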
*/ 1076 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1077 if (ctrlr->sqs[i] != NULL) { 1078 to[queue_index(i, false)] = from[queue_index(i, false)]; 1079 } 1080 1081 if (ctrlr->cqs[i] != NULL) { 1082 to[queue_index(i, true)] = from[queue_index(i, true)]; 1083 } 1084 } 1085 } 1086 1087 static void 1088 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1089 { 1090 const struct spdk_nvmf_registers *regs; 1091 1092 assert(vu_ctrlr != NULL); 1093 assert(vu_ctrlr->ctrlr != NULL); 1094 1095 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1096 if (regs->csts.bits.cfs == 0) { 1097 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1098 } 1099 1100 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1101 } 1102 1103 static inline bool 1104 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1105 { 1106 assert(vu_ctrlr != NULL); 1107 assert(vu_ctrlr->endpoint != NULL); 1108 1109 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1110 1111 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1112 } 1113 1114 static void 1115 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1116 { 1117 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1118 1119 spdk_interrupt_unregister(&endpoint->accept_intr); 1120 spdk_poller_unregister(&endpoint->accept_poller); 1121 1122 if (endpoint->bar0_doorbells) { 1123 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1124 } 1125 1126 if (endpoint->devmem_fd > 0) { 1127 close(endpoint->devmem_fd); 1128 } 1129 1130 if (endpoint->migr_data) { 1131 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1132 } 1133 1134 if (endpoint->migr_fd > 0) { 1135 close(endpoint->migr_fd); 1136 } 1137 1138 if (endpoint->vfu_ctx) { 1139 vfu_destroy_ctx(endpoint->vfu_ctx); 1140 } 1141 1142 pthread_mutex_destroy(&endpoint->lock); 1143 free(endpoint); 1144 } 1145 1146 /* called when process exits */ 1147 static int 1148 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1149 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1150 { 1151 struct nvmf_vfio_user_transport *vu_transport; 1152 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1153 1154 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1155 1156 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1157 transport); 1158 1159 pthread_mutex_destroy(&vu_transport->lock); 1160 pthread_mutex_destroy(&vu_transport->pg_lock); 1161 1162 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1163 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1164 nvmf_vfio_user_destroy_endpoint(endpoint); 1165 } 1166 1167 free(vu_transport); 1168 1169 if (cb_fn) { 1170 cb_fn(cb_arg); 1171 } 1172 1173 return 0; 1174 } 1175 1176 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1177 { 1178 "disable_mappable_bar0", 1179 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1180 spdk_json_decode_bool, true 1181 }, 1182 { 1183 "disable_adaptive_irq", 1184 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1185 spdk_json_decode_bool, true 1186 }, 1187 { 1188 "disable_shadow_doorbells", 1189 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1190 spdk_json_decode_bool, true 1191 }, 1192 { 1193 "disable_compare", 1194 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1195 spdk_json_decode_bool, true 1196 }, 1197 { 1198 
"enable_intr_mode_sq_spreading", 1199 offsetof(struct nvmf_vfio_user_transport, transport_opts.enable_intr_mode_sq_spreading), 1200 spdk_json_decode_bool, true 1201 }, 1202 }; 1203 1204 static struct spdk_nvmf_transport * 1205 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1206 { 1207 struct nvmf_vfio_user_transport *vu_transport; 1208 int err; 1209 1210 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1211 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1212 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1213 return NULL; 1214 } 1215 1216 vu_transport = calloc(1, sizeof(*vu_transport)); 1217 if (vu_transport == NULL) { 1218 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1219 return NULL; 1220 } 1221 1222 err = pthread_mutex_init(&vu_transport->lock, NULL); 1223 if (err != 0) { 1224 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1225 goto err; 1226 } 1227 TAILQ_INIT(&vu_transport->endpoints); 1228 1229 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1230 if (err != 0) { 1231 pthread_mutex_destroy(&vu_transport->lock); 1232 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1233 goto err; 1234 } 1235 TAILQ_INIT(&vu_transport->poll_groups); 1236 1237 if (opts->transport_specific != NULL && 1238 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1239 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1240 vu_transport)) { 1241 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1242 goto cleanup; 1243 } 1244 1245 /* 1246 * To support interrupt mode, the transport must be configured with 1247 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1248 * when a client writes new doorbell values to BAR0, via the 1249 * libvfio-user socket fd. 1250 */ 1251 vu_transport->intr_mode_supported = 1252 vu_transport->transport_opts.disable_mappable_bar0; 1253 1254 /* 1255 * If BAR0 is mappable, it doesn't make sense to support shadow 1256 * doorbells, so explicitly turn it off. 1257 */ 1258 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1259 vu_transport->transport_opts.disable_shadow_doorbells = true; 1260 } 1261 1262 if (spdk_interrupt_mode_is_enabled()) { 1263 if (!vu_transport->intr_mode_supported) { 1264 SPDK_ERRLOG("interrupt mode not supported\n"); 1265 goto cleanup; 1266 } 1267 1268 /* 1269 * If we are in interrupt mode, we cannot support adaptive IRQs, 1270 * as there is no guarantee the SQ poller will run subsequently 1271 * to send pending IRQs. 
1272 */ 1273 vu_transport->transport_opts.disable_adaptive_irq = true; 1274 } 1275 1276 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1277 vu_transport->transport_opts.disable_mappable_bar0); 1278 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1279 vu_transport->transport_opts.disable_adaptive_irq); 1280 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1281 vu_transport->transport_opts.disable_shadow_doorbells); 1282 1283 return &vu_transport->transport; 1284 1285 cleanup: 1286 pthread_mutex_destroy(&vu_transport->lock); 1287 pthread_mutex_destroy(&vu_transport->pg_lock); 1288 err: 1289 free(vu_transport); 1290 return NULL; 1291 } 1292 1293 static uint32_t 1294 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1295 { 1296 assert(vu_ctrlr != NULL); 1297 assert(vu_ctrlr->ctrlr != NULL); 1298 1299 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1300 } 1301 1302 static uint32_t 1303 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1304 { 1305 assert(vu_ctrlr != NULL); 1306 assert(vu_ctrlr->ctrlr != NULL); 1307 1308 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1309 } 1310 1311 static uintptr_t 1312 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1313 { 1314 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1315 return 1ul << memory_page_shift; 1316 } 1317 1318 static uintptr_t 1319 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1320 { 1321 return ~(memory_page_size(ctrlr) - 1); 1322 } 1323 1324 static int 1325 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1326 uint32_t q_size, bool is_cq, bool unmap) 1327 { 1328 uint64_t len; 1329 void *ret; 1330 1331 assert(q_size); 1332 assert(q_addr(mapping) == NULL); 1333 1334 if (is_cq) { 1335 len = q_size * sizeof(struct spdk_nvme_cpl); 1336 } else { 1337 len = q_size * sizeof(struct spdk_nvme_cmd); 1338 } 1339 1340 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1341 mapping->sg, &mapping->iov, 1342 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1343 if (ret == NULL) { 1344 return -EFAULT; 1345 } 1346 1347 if (unmap) { 1348 memset(q_addr(mapping), 0, len); 1349 } 1350 1351 return 0; 1352 } 1353 1354 static inline void 1355 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1356 { 1357 if (q_addr(mapping) != NULL) { 1358 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1359 &mapping->iov, 1); 1360 mapping->iov.iov_base = NULL; 1361 } 1362 } 1363 1364 static int 1365 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1366 { 1367 struct nvmf_vfio_user_sq *sq; 1368 const struct spdk_nvmf_registers *regs; 1369 int ret; 1370 1371 assert(ctrlr != NULL); 1372 1373 sq = ctrlr->sqs[0]; 1374 1375 assert(sq != NULL); 1376 assert(q_addr(&sq->mapping) == NULL); 1377 /* XXX ctrlr->asq == 0 is a valid memory address */ 1378 1379 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1380 sq->qid = 0; 1381 sq->size = regs->aqa.bits.asqs + 1; 1382 sq->mapping.prp1 = regs->asq; 1383 *sq_headp(sq) = 0; 1384 sq->cqid = 0; 1385 1386 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1387 if (ret) { 1388 return ret; 1389 } 1390 1391 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1392 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1393 1394 *sq_dbl_tailp(sq) = 0; 1395 1396 return 0; 1397 } 1398 1399 /* 1400 * Updates eventidx to set an SQ into interrupt or polling mode. 
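 * (In polling mode the eventidx is simply set to NVMF_VFIO_USER_EVENTIDX_POLL,
 * so the host is not expected to write the BAR0 doorbell for this SQ.)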
1401 * 1402 * Returns false if the current SQ tail does not match the SQ head, as 1403 * this means that the host has submitted more items to the queue while we were 1404 * not looking - or during the event index update. In that case, we must retry, 1405 * or otherwise make sure we are going to wake up again. 1406 */ 1407 static bool 1408 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1409 { 1410 struct nvmf_vfio_user_ctrlr *ctrlr; 1411 volatile uint32_t *sq_tail_eidx; 1412 uint32_t old_tail, new_tail; 1413 1414 assert(sq != NULL); 1415 assert(sq->ctrlr != NULL); 1416 assert(sq->ctrlr->sdbl != NULL); 1417 assert(sq->need_rearm); 1418 assert(sq->qid != 0); 1419 1420 ctrlr = sq->ctrlr; 1421 1422 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1423 ctrlr_id(ctrlr), sq->qid); 1424 1425 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1426 1427 assert(ctrlr->endpoint != NULL); 1428 1429 if (!ctrlr->endpoint->interrupt_mode) { 1430 /* No synchronisation necessary. */ 1431 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1432 return true; 1433 } 1434 1435 old_tail = *sq_dbl_tailp(sq); 1436 *sq_tail_eidx = old_tail; 1437 1438 /* 1439 * Ensure that the event index is updated before re-reading the tail 1440 * doorbell. If it's not, then the host might race us and update the 1441 * tail after the second read but before the event index is written, so 1442 * it won't write to BAR0 and we'll miss the update. 1443 * 1444 * The driver should provide similar ordering with an mb(). 1445 */ 1446 spdk_mb(); 1447 1448 /* 1449 * Check if the host has updated the tail doorbell after we've read it 1450 * for the first time, but before the event index was written. If that's 1451 * the case, then we've lost the race and we need to update the event 1452 * index again (after polling the queue, since the host won't write to 1453 * BAR0). 1454 */ 1455 new_tail = *sq_dbl_tailp(sq); 1456 1457 /* 1458 * We might poll the queue straight after this function returns if the 1459 * tail has been updated, so we need to ensure that any changes to the 1460 * queue will be visible to us if the doorbell has been updated. 1461 * 1462 * The driver should provide similar ordering with a wmb() to ensure 1463 * that the queue is written before it updates the tail doorbell. 1464 */ 1465 spdk_rmb(); 1466 1467 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1468 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1469 new_tail, *sq_headp(sq)); 1470 1471 if (new_tail == *sq_headp(sq)) { 1472 sq->need_rearm = false; 1473 return true; 1474 } 1475 1476 /* 1477 * We've lost the race: the tail was updated since we last polled, 1478 * including if it happened within this routine. 1479 * 1480 * The caller should retry after polling (think of this as a cmpxchg 1481 * loop); if we go to sleep while the SQ is not empty, then we won't 1482 * process the remaining events. 1483 */ 1484 return false; 1485 } 1486 1487 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1488 1489 /* 1490 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1491 * processed some SQ entries. 1492 */ 1493 static int 1494 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1495 struct nvmf_vfio_user_sq *sq) 1496 { 1497 int count = 0; 1498 size_t i; 1499 1500 assert(sq->need_rearm); 1501 1502 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1503 int ret; 1504 1505 if (set_sq_eventidx(sq)) { 1506 /* We won the race and set eventidx; done. 
*/ 1507 return count; 1508 } 1509 1510 ret = nvmf_vfio_user_sq_poll(sq); 1511 1512 count += (ret < 0) ? 1 : ret; 1513 1514 /* 1515 * set_sq_eventidx() hit the race, so we expected 1516 * to process at least one command from this queue. 1517 * If there were no new commands waiting for us, then 1518 * we must have hit an unexpected race condition. 1519 */ 1520 if (ret == 0) { 1521 SPDK_ERRLOG("%s: unexpected race condition detected " 1522 "while updating the shadow doorbell buffer\n", 1523 ctrlr_id(ctrlr)); 1524 1525 fail_ctrlr(ctrlr); 1526 return count; 1527 } 1528 } 1529 1530 SPDK_DEBUGLOG(vfio_user_db, 1531 "%s: set_sq_eventidx() lost the race %zu times\n", 1532 ctrlr_id(ctrlr), i); 1533 1534 /* 1535 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1536 * we raced with the producer too many times; force ourselves to wake up 1537 * instead. We'll process all queues at that point. 1538 */ 1539 ctrlr_kick(ctrlr); 1540 1541 return count; 1542 } 1543 1544 /* 1545 * We're in interrupt mode, and potentially about to go to sleep. We need to 1546 * make sure any further I/O submissions are guaranteed to wake us up: for 1547 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1548 * every SQ that needs re-arming. 1549 * 1550 * Returns non-zero if we processed something. 1551 */ 1552 static int 1553 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1554 { 1555 struct nvmf_vfio_user_sq *sq; 1556 int count = 0; 1557 1558 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1559 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1560 continue; 1561 } 1562 1563 if (sq->need_rearm) { 1564 count += vfio_user_sq_rearm(sq->ctrlr, sq); 1565 } 1566 } 1567 1568 return count; 1569 } 1570 1571 static int 1572 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1573 { 1574 struct nvmf_vfio_user_cq *cq; 1575 const struct spdk_nvmf_registers *regs; 1576 int ret; 1577 1578 assert(ctrlr != NULL); 1579 1580 cq = ctrlr->cqs[0]; 1581 1582 assert(cq != NULL); 1583 1584 assert(q_addr(&cq->mapping) == NULL); 1585 1586 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1587 assert(regs != NULL); 1588 cq->qid = 0; 1589 cq->size = regs->aqa.bits.acqs + 1; 1590 cq->mapping.prp1 = regs->acq; 1591 *cq_tailp(cq) = 0; 1592 cq->ien = true; 1593 cq->phase = true; 1594 1595 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1596 if (ret) { 1597 return ret; 1598 } 1599 1600 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
	 */
	cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true);

	*cq_dbl_headp(cq) = 0;

	return 0;
}

static void *
_map_one(void *prv, uint64_t addr, uint64_t len, int prot)
{
	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
	struct spdk_nvmf_qpair *qpair;
	struct nvmf_vfio_user_req *vu_req;
	struct nvmf_vfio_user_sq *sq;
	void *ret;

	assert(req != NULL);
	qpair = req->qpair;
	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);

	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
	ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len,
		      index_to_sg_t(vu_req->sg, vu_req->iovcnt),
		      &vu_req->iov[vu_req->iovcnt], prot);
	if (spdk_likely(ret != NULL)) {
		vu_req->iovcnt++;
	}
	return ret;
}

static int
vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
		  struct iovec *iov, uint32_t length)
{
	/* Map the PRP list from guest physical memory to
	 * local virtual memory addresses.
	 */
	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
			    length, 4096, _map_one);
}

static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
			  struct nvmf_vfio_user_sq *sq);

/*
 * Posts a CQE in the completion queue.
 *
 * @ctrlr: the vfio-user controller
 * @cq: the completion queue
 * @cdw0: cdw0 as reported by NVMf
 * @sqid: submission queue ID
 * @cid: command identifier in NVMe command
 * @sc: the NVMe CQE status code
 * @sct: the NVMe CQE status code type
 */
static int
post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq,
		uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct)
{
	struct spdk_nvme_status cpl_status = { 0 };
	struct spdk_nvme_cpl *cpl;
	int err;

	assert(ctrlr != NULL);

	if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) {
		return 0;
	}

	if (cq->qid == 0) {
		assert(spdk_get_thread() == cq->thread);
	}

	if (cq_is_full(cq)) {
		SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n",
			    ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq),
			    *cq_dbl_headp(cq));
		return -1;
	}

	cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq);

	assert(ctrlr->sqs[sqid] != NULL);
	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: request complete sqid:%d cid=%d status=%#x "
		      "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc,
		      *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq));

	cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]);
	cpl->sqid = sqid;
	cpl->cid = cid;
	cpl->cdw0 = cdw0;

	/*
	 * This is a bitfield: instead of setting the individual bits we need
	 * directly in cpl->status, which would cause a read-modify-write cycle,
	 * we'll avoid reading from the CPL altogether by filling in a local
	 * cpl_status variable, then writing the whole thing.
	 */
	cpl_status.sct = sct;
	cpl_status.sc = sc;
	cpl_status.p = cq->phase;
	cpl->status = cpl_status;

	/* Ensure the Completion Queue Entry is visible.
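	 * The barrier below orders the CQE write (including the phase bit)
	 * before any interrupt we may trigger.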
*/ 1707 spdk_wmb(); 1708 cq_tail_advance(cq); 1709 1710 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1711 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1712 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1713 if (err != 0) { 1714 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1715 ctrlr_id(ctrlr)); 1716 return err; 1717 } 1718 } 1719 1720 return 0; 1721 } 1722 1723 static void 1724 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1725 { 1726 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1727 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1728 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1729 free(vu_req); 1730 } 1731 } 1732 1733 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1734 * and the controller is being shut down or reset, then the CQ is 1735 * also deleted. 1736 */ 1737 static void 1738 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1739 { 1740 struct nvmf_vfio_user_cq *cq; 1741 uint16_t cqid; 1742 1743 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1744 sq->qid, sq); 1745 1746 /* Free SQ resources */ 1747 unmap_q(vu_ctrlr, &sq->mapping); 1748 1749 free_sq_reqs(sq); 1750 1751 sq->size = 0; 1752 1753 sq->sq_state = VFIO_USER_SQ_DELETED; 1754 1755 /* Controller RESET and SHUTDOWN are special cases, 1756 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1757 * will disconnect IO queue pairs. 1758 */ 1759 if (vu_ctrlr->reset_shn) { 1760 cqid = sq->cqid; 1761 cq = vu_ctrlr->cqs[cqid]; 1762 1763 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1764 cq->qid, cq); 1765 1766 if (cq->cq_ref) { 1767 cq->cq_ref--; 1768 } 1769 if (cq->cq_ref == 0) { 1770 unmap_q(vu_ctrlr, &cq->mapping); 1771 cq->size = 0; 1772 cq->cq_state = VFIO_USER_CQ_DELETED; 1773 cq->group = NULL; 1774 } 1775 } 1776 } 1777 1778 static void 1779 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1780 { 1781 struct nvmf_vfio_user_sq *sq; 1782 struct nvmf_vfio_user_cq *cq; 1783 1784 if (ctrlr == NULL) { 1785 return; 1786 } 1787 1788 sq = ctrlr->sqs[qid]; 1789 if (sq) { 1790 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid); 1791 unmap_q(ctrlr, &sq->mapping); 1792 1793 free_sq_reqs(sq); 1794 1795 free(sq->mapping.sg); 1796 free(sq); 1797 ctrlr->sqs[qid] = NULL; 1798 } 1799 1800 cq = ctrlr->cqs[qid]; 1801 if (cq) { 1802 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1803 unmap_q(ctrlr, &cq->mapping); 1804 free(cq->mapping.sg); 1805 free(cq); 1806 ctrlr->cqs[qid] = NULL; 1807 } 1808 } 1809 1810 static int 1811 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1812 const uint16_t id) 1813 { 1814 struct nvmf_vfio_user_sq *sq; 1815 1816 assert(ctrlr != NULL); 1817 assert(transport != NULL); 1818 assert(ctrlr->sqs[id] == NULL); 1819 1820 sq = calloc(1, sizeof(*sq)); 1821 if (sq == NULL) { 1822 return -ENOMEM; 1823 } 1824 sq->mapping.sg = calloc(1, dma_sg_size()); 1825 if (sq->mapping.sg == NULL) { 1826 free(sq); 1827 return -ENOMEM; 1828 } 1829 1830 sq->qid = id; 1831 sq->qpair.qid = id; 1832 sq->qpair.transport = transport; 1833 sq->ctrlr = ctrlr; 1834 ctrlr->sqs[id] = sq; 1835 1836 TAILQ_INIT(&sq->free_reqs); 1837 1838 return 0; 1839 } 1840 1841 static int 1842 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1843 { 1844 struct nvmf_vfio_user_cq *cq; 1845 1846 assert(vu_ctrlr != NULL); 1847 assert(vu_ctrlr->cqs[id] == NULL); 1848 1849 cq = calloc(1, sizeof(*cq)); 1850 if (cq == NULL) { 
1851 return -ENOMEM; 1852 } 1853 cq->mapping.sg = calloc(1, dma_sg_size()); 1854 if (cq->mapping.sg == NULL) { 1855 free(cq); 1856 return -ENOMEM; 1857 } 1858 1859 cq->qid = id; 1860 vu_ctrlr->cqs[id] = cq; 1861 1862 return 0; 1863 } 1864 1865 static int 1866 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1867 { 1868 struct nvmf_vfio_user_req *vu_req, *tmp; 1869 size_t req_size; 1870 uint32_t i; 1871 1872 req_size = sizeof(struct nvmf_vfio_user_req) + 1873 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1874 1875 for (i = 0; i < sq->size; i++) { 1876 struct spdk_nvmf_request *req; 1877 1878 vu_req = calloc(1, req_size); 1879 if (vu_req == NULL) { 1880 goto err; 1881 } 1882 1883 req = &vu_req->req; 1884 req->qpair = &sq->qpair; 1885 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1886 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1887 req->stripped_data = NULL; 1888 1889 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1890 } 1891 1892 return 0; 1893 1894 err: 1895 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1896 free(vu_req); 1897 } 1898 return -ENOMEM; 1899 } 1900 1901 static volatile uint32_t * 1902 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 1903 { 1904 return ctrlr->sdbl != NULL ? 1905 ctrlr->sdbl->shadow_doorbells : 1906 ctrlr->bar0_doorbells; 1907 } 1908 1909 static uint16_t 1910 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 1911 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1912 { 1913 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1914 struct nvmf_vfio_user_sq *sq; 1915 uint32_t qsize; 1916 uint16_t cqid; 1917 uint16_t qid; 1918 int err; 1919 1920 qid = cmd->cdw10_bits.create_io_q.qid; 1921 cqid = cmd->cdw11_bits.create_io_sq.cqid; 1922 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1923 1924 if (ctrlr->sqs[qid] == NULL) { 1925 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 1926 if (err != 0) { 1927 *sct = SPDK_NVME_SCT_GENERIC; 1928 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1929 } 1930 } 1931 1932 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1933 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 1934 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1935 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1936 } 1937 1938 /* CQ must be created before SQ. 
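	 * The cqid referenced by a Create I/O SQ command must identify an existing I/O CQ.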
*/ 1939 if (!io_q_exists(ctrlr, cqid, true)) { 1940 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 1941 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1942 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 1943 } 1944 1945 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 1946 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 1947 *sct = SPDK_NVME_SCT_GENERIC; 1948 return SPDK_NVME_SC_INVALID_FIELD; 1949 } 1950 1951 sq = ctrlr->sqs[qid]; 1952 sq->size = qsize; 1953 1954 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 1955 qid, cqid); 1956 1957 sq->mapping.prp1 = cmd->dptr.prp.prp1; 1958 1959 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1960 if (err) { 1961 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1962 *sct = SPDK_NVME_SCT_GENERIC; 1963 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1964 } 1965 1966 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 1967 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1968 q_addr(&sq->mapping)); 1969 1970 err = alloc_sq_reqs(ctrlr, sq); 1971 if (err < 0) { 1972 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 1973 *sct = SPDK_NVME_SCT_GENERIC; 1974 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1975 } 1976 1977 sq->cqid = cqid; 1978 ctrlr->cqs[sq->cqid]->cq_ref++; 1979 sq->sq_state = VFIO_USER_SQ_CREATED; 1980 *sq_headp(sq) = 0; 1981 1982 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 1983 1984 /* 1985 * We should always reset the doorbells. 1986 * 1987 * The Specification prohibits the controller from writing to the shadow 1988 * doorbell buffer, however older versions of the Linux NVMe driver 1989 * don't reset the shadow doorbell buffer after a Queue-Level or 1990 * Controller-Level reset, which means that we're left with garbage 1991 * doorbell values. 1992 */ 1993 *sq_dbl_tailp(sq) = 0; 1994 1995 if (ctrlr->sdbl != NULL) { 1996 sq->need_rearm = true; 1997 1998 if (!set_sq_eventidx(sq)) { 1999 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 2000 "sqid:%hu was initialized\n", 2001 ctrlr_id(ctrlr), qid); 2002 fail_ctrlr(ctrlr); 2003 *sct = SPDK_NVME_SCT_GENERIC; 2004 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2005 } 2006 } 2007 2008 /* 2009 * Create our new I/O qpair. This asynchronously invokes, on a suitable 2010 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 2011 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 2012 * connect command. This command is then eventually completed via 2013 * handle_queue_connect_rsp(). 
2014 */ 2015 sq->create_io_sq_cmd = *cmd; 2016 sq->post_create_io_sq_completion = true; 2017 2018 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 2019 &sq->qpair); 2020 2021 *sct = SPDK_NVME_SCT_GENERIC; 2022 return SPDK_NVME_SC_SUCCESS; 2023 } 2024 2025 static uint16_t 2026 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 2027 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2028 { 2029 struct nvmf_vfio_user_cq *cq; 2030 uint32_t qsize; 2031 uint16_t qid; 2032 int err; 2033 2034 qid = cmd->cdw10_bits.create_io_q.qid; 2035 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2036 2037 if (ctrlr->cqs[qid] == NULL) { 2038 err = init_cq(ctrlr, qid); 2039 if (err != 0) { 2040 *sct = SPDK_NVME_SCT_GENERIC; 2041 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2042 } 2043 } 2044 2045 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2046 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2047 *sct = SPDK_NVME_SCT_GENERIC; 2048 return SPDK_NVME_SC_INVALID_FIELD; 2049 } 2050 2051 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 2052 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2053 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2054 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2055 } 2056 2057 cq = ctrlr->cqs[qid]; 2058 cq->size = qsize; 2059 2060 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2061 2062 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2063 2064 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2065 if (err) { 2066 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2067 *sct = SPDK_NVME_SCT_GENERIC; 2068 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2069 } 2070 2071 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2072 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2073 q_addr(&cq->mapping)); 2074 2075 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2076 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2077 cq->phase = true; 2078 cq->cq_state = VFIO_USER_CQ_CREATED; 2079 2080 *cq_tailp(cq) = 0; 2081 2082 /* 2083 * We should always reset the doorbells. 2084 * 2085 * The Specification prohibits the controller from writing to the shadow 2086 * doorbell buffer, however older versions of the Linux NVMe driver 2087 * don't reset the shadow doorbell buffer after a Queue-Level or 2088 * Controller-Level reset, which means that we're left with garbage 2089 * doorbell values. 2090 */ 2091 *cq_dbl_headp(cq) = 0; 2092 2093 *sct = SPDK_NVME_SCT_GENERIC; 2094 return SPDK_NVME_SC_SUCCESS; 2095 } 2096 2097 /* 2098 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2099 * on error. 2100 */ 2101 static int 2102 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2103 struct spdk_nvme_cmd *cmd, const bool is_cq) 2104 { 2105 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2106 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2107 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2108 uint32_t qsize; 2109 uint16_t qid; 2110 2111 assert(ctrlr != NULL); 2112 assert(cmd != NULL); 2113 2114 qid = cmd->cdw10_bits.create_io_q.qid; 2115 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2116 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2117 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2118 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2119 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2120 goto out; 2121 } 2122 2123 if (io_q_exists(ctrlr, qid, is_cq)) { 2124 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2125 is_cq ? 
'c' : 's', qid); 2126 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2127 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2128 goto out; 2129 } 2130 2131 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2132 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2133 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2134 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2135 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2136 goto out; 2137 } 2138 2139 if (is_cq) { 2140 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2141 } else { 2142 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2143 2144 if (sct == SPDK_NVME_SCT_GENERIC && 2145 sc == SPDK_NVME_SC_SUCCESS) { 2146 /* Completion posted asynchronously. */ 2147 return 0; 2148 } 2149 } 2150 2151 out: 2152 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2153 } 2154 2155 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2156 * queue pair, so save the command in a context. 2157 */ 2158 struct vfio_user_delete_sq_ctx { 2159 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2160 struct spdk_nvme_cmd delete_io_sq_cmd; 2161 }; 2162 2163 static void 2164 vfio_user_qpair_delete_cb(void *cb_arg) 2165 { 2166 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2167 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2168 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2169 2170 if (admin_cq->thread != spdk_get_thread()) { 2171 assert(admin_cq->thread != NULL); 2172 spdk_thread_send_msg(admin_cq->thread, 2173 vfio_user_qpair_delete_cb, 2174 cb_arg); 2175 } else { 2176 post_completion(vu_ctrlr, admin_cq, 0, 0, 2177 ctx->delete_io_sq_cmd.cid, 2178 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2179 free(ctx); 2180 } 2181 } 2182 2183 /* 2184 * Deletes a completion or submission I/O queue. 2185 */ 2186 static int 2187 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2188 struct spdk_nvme_cmd *cmd, const bool is_cq) 2189 { 2190 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2191 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2192 struct nvmf_vfio_user_sq *sq; 2193 struct nvmf_vfio_user_cq *cq; 2194 struct vfio_user_delete_sq_ctx *ctx; 2195 2196 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2197 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2198 cmd->cdw10_bits.delete_io_q.qid); 2199 2200 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2201 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2202 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2203 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2204 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2205 goto out; 2206 } 2207 2208 if (is_cq) { 2209 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2210 if (cq->cq_ref) { 2211 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2212 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2213 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2214 goto out; 2215 } 2216 2217 unmap_q(ctrlr, &cq->mapping); 2218 cq->size = 0; 2219 cq->cq_state = VFIO_USER_CQ_DELETED; 2220 cq->group = NULL; 2221 } else { 2222 ctx = calloc(1, sizeof(*ctx)); 2223 if (!ctx) { 2224 sct = SPDK_NVME_SCT_GENERIC; 2225 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2226 goto out; 2227 } 2228 ctx->vu_ctrlr = ctrlr; 2229 ctx->delete_io_sq_cmd = *cmd; 2230 2231 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2232 sq->sq_state = VFIO_USER_SQ_DELETED; 2233 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2234 ctrlr->cqs[sq->cqid]->cq_ref--; 2235 2236 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 2237 return 0; 2238 } 2239 2240 out: 2241 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2242 } 2243 2244 /* 2245 * Configures Shadow Doorbells. 2246 */ 2247 static int 2248 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2249 { 2250 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2251 uint32_t dstrd; 2252 uintptr_t page_size, page_mask; 2253 uint64_t prp1, prp2; 2254 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2255 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2256 2257 assert(ctrlr != NULL); 2258 assert(ctrlr->endpoint != NULL); 2259 assert(cmd != NULL); 2260 2261 dstrd = doorbell_stride(ctrlr); 2262 page_size = memory_page_size(ctrlr); 2263 page_mask = memory_page_mask(ctrlr); 2264 2265 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2266 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2267 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2268 ctrlr_id(ctrlr)); 2269 2270 goto out; 2271 } 2272 2273 /* Verify guest physical addresses passed as PRPs. */ 2274 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2275 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2276 ctrlr_id(ctrlr)); 2277 2278 goto out; 2279 } 2280 2281 prp1 = cmd->dptr.prp.prp1; 2282 prp2 = cmd->dptr.prp.prp2; 2283 2284 SPDK_DEBUGLOG(nvmf_vfio, 2285 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2286 ctrlr_id(ctrlr), prp1, prp2); 2287 2288 if (prp1 == prp2 2289 || prp1 != (prp1 & page_mask) 2290 || prp2 != (prp2 & page_mask)) { 2291 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2292 ctrlr_id(ctrlr)); 2293 2294 goto out; 2295 } 2296 2297 /* Map guest physical addresses to our virtual address space. 
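 * For Doorbell Buffer Config, PRP1 is the shadow doorbell buffer and PRP2 is
 * the EventIdx buffer; each is a single host page, which is why both GPAs were
 * required to be page-aligned (and distinct) above.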
*/ 2298 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2299 if (sdbl == NULL) { 2300 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2301 ctrlr_id(ctrlr)); 2302 2303 goto out; 2304 } 2305 2306 ctrlr->shadow_doorbell_buffer = prp1; 2307 ctrlr->eventidx_buffer = prp2; 2308 2309 SPDK_DEBUGLOG(nvmf_vfio, 2310 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2311 ctrlr_id(ctrlr), 2312 sdbl->iovs[0].iov_base, 2313 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2314 sdbl->iovs[1].iov_base, 2315 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2316 2317 2318 /* 2319 * Set all possible CQ head doorbells to polling mode now, such that we 2320 * don't have to worry about it later if the host creates more queues. 2321 * 2322 * We only ever want interrupts for writes to the SQ tail doorbells 2323 * (which are initialised in set_ctrlr_intr_mode() below). 2324 */ 2325 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2326 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2327 } 2328 2329 /* Update controller. */ 2330 SWAP(ctrlr->sdbl, sdbl); 2331 2332 /* 2333 * Copy doorbells from either the previous shadow doorbell buffer or the 2334 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2335 * 2336 * This needs to account for older versions of the Linux NVMe driver, 2337 * which don't clear out the buffer after a controller reset. 2338 */ 2339 copy_doorbells(ctrlr, sdbl != NULL ? 2340 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2341 ctrlr->sdbl->shadow_doorbells); 2342 2343 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2344 2345 ctrlr_kick(ctrlr); 2346 2347 sc = SPDK_NVME_SC_SUCCESS; 2348 2349 out: 2350 /* 2351 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2352 * more than once (pointless, but not prohibited by the spec), or 2353 * in case of an error. 2354 * 2355 * If this is the first time Doorbell Buffer Config was processed, 2356 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2357 * free_sdbl() becomes a noop. 2358 */ 2359 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2360 2361 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2362 } 2363 2364 /* Returns 0 on success and -errno on error. */ 2365 static int 2366 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2367 { 2368 assert(ctrlr != NULL); 2369 assert(cmd != NULL); 2370 2371 if (cmd->fuse != 0) { 2372 /* Fused admin commands are not supported. 
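 * (FUSE is a two-bit field in command dword 0; any non-zero value requests a
 * fused operation, which we fail with Invalid Field in Command.)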
*/ 2373 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2374 SPDK_NVME_SC_INVALID_FIELD, 2375 SPDK_NVME_SCT_GENERIC); 2376 } 2377 2378 switch (cmd->opc) { 2379 case SPDK_NVME_OPC_CREATE_IO_CQ: 2380 case SPDK_NVME_OPC_CREATE_IO_SQ: 2381 return handle_create_io_q(ctrlr, cmd, 2382 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2383 case SPDK_NVME_OPC_DELETE_IO_SQ: 2384 case SPDK_NVME_OPC_DELETE_IO_CQ: 2385 return handle_del_io_q(ctrlr, cmd, 2386 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2387 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2388 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2389 return handle_doorbell_buffer_config(ctrlr, cmd); 2390 } 2391 /* FALLTHROUGH */ 2392 default: 2393 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2394 } 2395 } 2396 2397 static int 2398 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2399 { 2400 struct nvmf_vfio_user_sq *sq = cb_arg; 2401 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2402 uint16_t sqid, cqid; 2403 2404 assert(sq != NULL); 2405 assert(vu_req != NULL); 2406 assert(vu_ctrlr != NULL); 2407 2408 if (spdk_likely(vu_req->iovcnt)) { 2409 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2410 index_to_sg_t(vu_req->sg, 0), 2411 vu_req->iov, vu_req->iovcnt); 2412 } 2413 sqid = sq->qid; 2414 cqid = sq->cqid; 2415 2416 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2417 vu_req->req.rsp->nvme_cpl.cdw0, 2418 sqid, 2419 vu_req->req.cmd->nvme_cmd.cid, 2420 vu_req->req.rsp->nvme_cpl.status.sc, 2421 vu_req->req.rsp->nvme_cpl.status.sct); 2422 } 2423 2424 static int 2425 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2426 struct spdk_nvme_cmd *cmd) 2427 { 2428 assert(sq != NULL); 2429 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 2430 return consume_admin_cmd(ctrlr, cmd); 2431 } 2432 2433 return handle_cmd_req(ctrlr, cmd, sq); 2434 } 2435 2436 /* Returns the number of commands processed, or a negative value on error. */ 2437 static int 2438 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2439 struct nvmf_vfio_user_sq *sq) 2440 { 2441 struct spdk_nvme_cmd *queue; 2442 int count = 0; 2443 2444 assert(ctrlr != NULL); 2445 assert(sq != NULL); 2446 2447 if (ctrlr->sdbl != NULL && sq->qid != 0) { 2448 /* 2449 * Submission queue index has moved past the event index, so it 2450 * needs to be re-armed before we go to sleep. 2451 */ 2452 sq->need_rearm = true; 2453 } 2454 2455 queue = q_addr(&sq->mapping); 2456 while (*sq_headp(sq) != new_tail) { 2457 int err; 2458 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 2459 2460 count++; 2461 2462 /* 2463 * SQHD must contain the new head pointer, so we must increase 2464 * it before we generate a completion. 
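 *
 * For example, with a submission queue of 8 entries and the head currently at
 * 7, sq_head_advance() wraps the head back to 0, and that is the value any
 * completion generated while handling this command should report as SQHD.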
2465 */
2466 sq_head_advance(sq);
2467
2468 err = consume_cmd(ctrlr, sq, cmd);
2469 if (err != 0) {
2470 return err;
2471 }
2472 }
2473
2474 return count;
2475 }
2476
2477 /* Checks whether the endpoint is connected from the same process */
2478 static bool
2479 is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint)
2480 {
2481 struct ucred ucred;
2482 socklen_t ucredlen = sizeof(ucred);
2483
2484 if (endpoint == NULL) {
2485 return false;
2486 }
2487
2488 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred,
2489 &ucredlen) < 0) {
2490 SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno));
2491 return false;
2492 }
2493
2494 return ucred.pid == getpid();
2495 }
2496
2497 static void
2498 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
2499 {
2500 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2501 struct nvmf_vfio_user_ctrlr *ctrlr;
2502 struct nvmf_vfio_user_sq *sq;
2503 struct nvmf_vfio_user_cq *cq;
2504 void *map_start, *map_end;
2505 int ret;
2506
2507 /*
2508 * We're not interested in any DMA regions that aren't mappable (we don't
2509 * support clients that don't share their memory).
2510 */
2511 if (!info->vaddr) {
2512 return;
2513 }
2514
2515 map_start = info->mapping.iov_base;
2516 map_end = info->mapping.iov_base + info->mapping.iov_len;
2517
2518 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
2519 (info->mapping.iov_len & MASK_2MB)) {
2520 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n",
2521 info->vaddr, map_start, map_end);
2522 return;
2523 }
2524
2525 assert(endpoint != NULL);
2526 if (endpoint->ctrlr == NULL) {
2527 return;
2528 }
2529 ctrlr = endpoint->ctrlr;
2530
2531 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint),
2532 map_start, map_end);
2533
2534 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering with VFIO, so here we also
2535 * check the protection bits before registering. When the vfio-user client and server run in the same process
2536 * there is no need to register the same memory again.
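 * (spdk_mem_register() works on 2 MiB granularity, which is also why regions
 * whose address or length is not 2 MiB aligned were skipped earlier in this
 * callback.)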
2537 */ 2538 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2539 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2540 if (ret) { 2541 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2542 map_start, map_end, ret); 2543 } 2544 } 2545 2546 pthread_mutex_lock(&endpoint->lock); 2547 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2548 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2549 continue; 2550 } 2551 2552 cq = ctrlr->cqs[sq->cqid]; 2553 2554 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2555 if (cq->size && q_addr(&cq->mapping) == NULL) { 2556 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2557 if (ret) { 2558 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2559 cq->qid, cq->mapping.prp1, 2560 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2561 continue; 2562 } 2563 } 2564 2565 if (sq->size) { 2566 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2567 if (ret) { 2568 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2569 sq->qid, sq->mapping.prp1, 2570 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2571 continue; 2572 } 2573 } 2574 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2575 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2576 } 2577 pthread_mutex_unlock(&endpoint->lock); 2578 } 2579 2580 static void 2581 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2582 { 2583 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2584 struct nvmf_vfio_user_sq *sq; 2585 struct nvmf_vfio_user_cq *cq; 2586 void *map_start, *map_end; 2587 int ret = 0; 2588 2589 if (!info->vaddr) { 2590 return; 2591 } 2592 2593 map_start = info->mapping.iov_base; 2594 map_end = info->mapping.iov_base + info->mapping.iov_len; 2595 2596 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2597 (info->mapping.iov_len & MASK_2MB)) { 2598 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2599 info->vaddr, map_start, map_end); 2600 return; 2601 } 2602 2603 assert(endpoint != NULL); 2604 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2605 map_start, map_end); 2606 2607 if (endpoint->ctrlr != NULL) { 2608 struct nvmf_vfio_user_ctrlr *ctrlr; 2609 ctrlr = endpoint->ctrlr; 2610 2611 pthread_mutex_lock(&endpoint->lock); 2612 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2613 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2614 unmap_q(ctrlr, &sq->mapping); 2615 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2616 } 2617 2618 cq = ctrlr->cqs[sq->cqid]; 2619 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2620 unmap_q(ctrlr, &cq->mapping); 2621 } 2622 } 2623 2624 if (ctrlr->sdbl != NULL) { 2625 size_t i; 2626 2627 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2628 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2629 2630 if (iov_base >= map_start && iov_base < map_end) { 2631 copy_doorbells(ctrlr, 2632 ctrlr->sdbl->shadow_doorbells, 2633 ctrlr->bar0_doorbells); 2634 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2635 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2636 ctrlr->sdbl = NULL; 2637 break; 2638 } 2639 } 2640 } 2641 2642 pthread_mutex_unlock(&endpoint->lock); 2643 } 2644 2645 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2646 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2647 if 
(ret) { 2648 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2649 map_start, map_end, ret); 2650 } 2651 } 2652 } 2653 2654 /* Used to initiate a controller-level reset or a controller shutdown. */ 2655 static void 2656 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2657 { 2658 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2659 ctrlr_id(vu_ctrlr)); 2660 2661 /* Unmap Admin queue. */ 2662 2663 assert(vu_ctrlr->sqs[0] != NULL); 2664 assert(vu_ctrlr->cqs[0] != NULL); 2665 2666 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2667 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2668 2669 vu_ctrlr->sqs[0]->size = 0; 2670 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2671 2672 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2673 2674 vu_ctrlr->cqs[0]->size = 0; 2675 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2676 2677 /* 2678 * For PCIe controller reset or shutdown, we will drop all AER 2679 * responses. 2680 */ 2681 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2682 2683 /* Free the shadow doorbell buffer. */ 2684 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 2685 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2686 vu_ctrlr->sdbl = NULL; 2687 } 2688 2689 /* Used to re-enable the controller after a controller-level reset. */ 2690 static int 2691 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2692 { 2693 int err; 2694 2695 assert(vu_ctrlr != NULL); 2696 2697 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2698 ctrlr_id(vu_ctrlr)); 2699 2700 err = acq_setup(vu_ctrlr); 2701 if (err != 0) { 2702 return err; 2703 } 2704 2705 err = asq_setup(vu_ctrlr); 2706 if (err != 0) { 2707 return err; 2708 } 2709 2710 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2711 2712 return 0; 2713 } 2714 2715 static int 2716 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2717 { 2718 struct nvmf_vfio_user_sq *sq = cb_arg; 2719 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2720 int ret; 2721 2722 assert(sq != NULL); 2723 assert(req != NULL); 2724 2725 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2726 assert(sq->ctrlr != NULL); 2727 assert(req != NULL); 2728 2729 memcpy(req->req.data, 2730 &req->req.rsp->prop_get_rsp.value.u64, 2731 req->req.length); 2732 } else { 2733 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2734 assert(sq->ctrlr != NULL); 2735 vu_ctrlr = sq->ctrlr; 2736 2737 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 2738 union spdk_nvme_cc_register cc, diff; 2739 2740 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2741 diff.raw = cc.raw ^ req->cc.raw; 2742 2743 if (diff.bits.en) { 2744 if (cc.bits.en) { 2745 ret = enable_ctrlr(vu_ctrlr); 2746 if (ret) { 2747 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2748 return ret; 2749 } 2750 vu_ctrlr->reset_shn = false; 2751 } else { 2752 vu_ctrlr->reset_shn = true; 2753 } 2754 } 2755 2756 if (diff.bits.shn) { 2757 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2758 vu_ctrlr->reset_shn = true; 2759 } 2760 } 2761 2762 if (vu_ctrlr->reset_shn) { 2763 disable_ctrlr(vu_ctrlr); 2764 } 2765 } 2766 } 2767 2768 return 0; 2769 } 2770 2771 /* 2772 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2773 * doorbell is written via access_bar0_fn(). 2774 * 2775 * DSTRD is set to fixed value 0 for NVMf. 
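 *
 * With CAP.DSTRD == 0 every doorbell register is 4 bytes wide, so for queue
 * id q the layout handled below is:
 *
 *   SQ tail doorbell: BAR0 offset 0x1000 + (2 * q) * 4
 *   CQ head doorbell: BAR0 offset 0x1000 + (2 * q + 1) * 4
 *
 * which is what the byte-offset-to-index conversion below relies on
 * (pos >>= 2; even index => SQ tail, odd index => CQ head).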
2776 * 2777 */ 2778 static int 2779 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2780 const size_t count, loff_t pos, const bool is_write) 2781 { 2782 assert(ctrlr != NULL); 2783 assert(buf != NULL); 2784 2785 if (!is_write) { 2786 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2787 ctrlr_id(ctrlr), pos); 2788 errno = EPERM; 2789 return -1; 2790 } 2791 2792 if (count != sizeof(uint32_t)) { 2793 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2794 ctrlr_id(ctrlr), count); 2795 errno = EINVAL; 2796 return -1; 2797 } 2798 2799 pos -= NVME_DOORBELLS_OFFSET; 2800 2801 /* pos must be dword aligned */ 2802 if ((pos & 0x3) != 0) { 2803 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2804 errno = EINVAL; 2805 return -1; 2806 } 2807 2808 /* convert byte offset to array index */ 2809 pos >>= 2; 2810 2811 if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) { 2812 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2813 errno = EINVAL; 2814 return -1; 2815 } 2816 2817 ctrlr->bar0_doorbells[pos] = *buf; 2818 spdk_wmb(); 2819 2820 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2821 ctrlr_id(ctrlr), (pos & 1) ? "cqid" : "sqid", 2822 pos / 2, *buf); 2823 2824 2825 return 0; 2826 } 2827 2828 static size_t 2829 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2830 char *buf, size_t count, loff_t pos, 2831 bool is_write) 2832 { 2833 struct nvmf_vfio_user_req *req; 2834 const struct spdk_nvmf_registers *regs; 2835 2836 if ((count != 4) && (count != 8)) { 2837 errno = EINVAL; 2838 return -1; 2839 } 2840 2841 /* Construct a Fabric Property Get/Set command and send it */ 2842 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2843 if (req == NULL) { 2844 errno = ENOBUFS; 2845 return -1; 2846 } 2847 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2848 req->cc.raw = regs->cc.raw; 2849 2850 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2851 req->cb_arg = vu_ctrlr->sqs[0]; 2852 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2853 req->req.cmd->prop_set_cmd.cid = 0; 2854 if (count == 4) { 2855 req->req.cmd->prop_set_cmd.attrib.size = 0; 2856 } else { 2857 req->req.cmd->prop_set_cmd.attrib.size = 1; 2858 } 2859 req->req.cmd->prop_set_cmd.ofst = pos; 2860 if (is_write) { 2861 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2862 if (req->req.cmd->prop_set_cmd.attrib.size) { 2863 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2864 } else { 2865 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2866 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2867 } 2868 } else { 2869 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2870 } 2871 req->req.length = count; 2872 req->req.data = buf; 2873 2874 spdk_nvmf_request_exec_fabrics(&req->req); 2875 2876 return count; 2877 } 2878 2879 static ssize_t 2880 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2881 bool is_write) 2882 { 2883 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2884 struct nvmf_vfio_user_ctrlr *ctrlr; 2885 int ret; 2886 2887 ctrlr = endpoint->ctrlr; 2888 if (endpoint->need_async_destroy || !ctrlr) { 2889 errno = EIO; 2890 return -1; 2891 } 2892 2893 if (pos >= NVME_DOORBELLS_OFFSET) { 2894 /* 2895 * The fact that the doorbells can be memory mapped doesn't mean 2896 * that the client (VFIO in QEMU) is obliged to memory map them, 2897 * it might still elect to access them via regular read/write; 2898 * we might also have had 
disable_mappable_bar0 set. 2899 */ 2900 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2901 pos, is_write); 2902 if (ret == 0) { 2903 return count; 2904 } 2905 return ret; 2906 } 2907 2908 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 2909 } 2910 2911 static ssize_t 2912 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 2913 bool is_write) 2914 { 2915 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2916 2917 if (is_write) { 2918 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 2919 endpoint_id(endpoint), offset, offset + count); 2920 errno = EINVAL; 2921 return -1; 2922 } 2923 2924 if (offset + count > NVME_REG_CFG_SIZE) { 2925 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 2926 endpoint_id(endpoint), offset, count, 2927 NVME_REG_CFG_SIZE); 2928 errno = ERANGE; 2929 return -1; 2930 } 2931 2932 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 2933 2934 return count; 2935 } 2936 2937 static void 2938 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 2939 { 2940 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2941 2942 if (level >= LOG_DEBUG) { 2943 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2944 } else if (level >= LOG_INFO) { 2945 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2946 } else if (level >= LOG_NOTICE) { 2947 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 2948 } else if (level >= LOG_WARNING) { 2949 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 2950 } else { 2951 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 2952 } 2953 } 2954 2955 static int 2956 vfio_user_get_log_level(void) 2957 { 2958 int level; 2959 2960 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2961 return LOG_DEBUG; 2962 } 2963 2964 level = spdk_log_to_syslog_level(spdk_log_get_level()); 2965 if (level < 0) { 2966 return LOG_ERR; 2967 } 2968 2969 return level; 2970 } 2971 2972 static void 2973 init_pci_config_space(vfu_pci_config_space_t *p) 2974 { 2975 /* MLBAR */ 2976 p->hdr.bars[0].raw = 0x0; 2977 /* MUBAR */ 2978 p->hdr.bars[1].raw = 0x0; 2979 2980 /* vendor specific, let's set them to zero for now */ 2981 p->hdr.bars[3].raw = 0x0; 2982 p->hdr.bars[4].raw = 0x0; 2983 p->hdr.bars[5].raw = 0x0; 2984 2985 /* enable INTx */ 2986 p->hdr.intr.ipin = 0x1; 2987 } 2988 2989 struct ctrlr_quiesce_ctx { 2990 struct nvmf_vfio_user_endpoint *endpoint; 2991 struct nvmf_vfio_user_poll_group *group; 2992 int status; 2993 }; 2994 2995 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 2996 2997 static void 2998 _vfio_user_endpoint_resume_done_msg(void *ctx) 2999 { 3000 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3001 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3002 3003 endpoint->need_resume = false; 3004 3005 if (!vu_ctrlr) { 3006 return; 3007 } 3008 3009 if (!vu_ctrlr->queued_quiesce) { 3010 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3011 3012 /* 3013 * We might have ignored new SQ entries while we were quiesced: 3014 * kick ourselves so we'll definitely check again while in 3015 * VFIO_USER_CTRLR_RUNNING state. 
3016 */
3017 if (in_interrupt_mode(endpoint->transport)) {
3018 ctrlr_kick(vu_ctrlr);
3019 }
3020 return;
3021 }
3022
3023
3024 /*
3025 * Basically, once we call `vfu_device_quiesced` the device is
3026 * unquiesced from libvfio-user's perspective, so from the moment
3027 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device
3028 * again. However, because resuming the NVMf subsystem is an asynchronous
3029 * operation, this quiesce might come _before_ the NVMf subsystem has
3030 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we
3031 * need to check whether a quiesce was requested.
3032 */
3033 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n",
3034 ctrlr_id(vu_ctrlr));
3035 ctrlr_quiesce(vu_ctrlr);
3036 }
3037
3038 static void
3039 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem,
3040 void *cb_arg, int status)
3041 {
3042 struct nvmf_vfio_user_endpoint *endpoint = cb_arg;
3043 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3044
3045 SPDK_DEBUGLOG(nvmf_vfio, "%s resume done with status %d\n", endpoint_id(endpoint), status);
3046
3047 if (!vu_ctrlr) {
3048 return;
3049 }
3050
3051 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint);
3052 }
3053
3054 static void
3055 vfio_user_quiesce_done(void *ctx)
3056 {
3057 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx;
3058 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint;
3059 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3060 int ret;
3061
3062 if (!vu_ctrlr) {
3063 free(quiesce_ctx);
3064 return;
3065 }
3066
3067 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr));
3068
3069 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING);
3070 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
3071 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status);
3072 vu_ctrlr->queued_quiesce = false;
3073 free(quiesce_ctx);
3074
3075 /* `vfu_device_quiesced` can change the migration state,
3076 * so we need to re-check `vu_ctrlr->state`.
3077 */
3078 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) {
3079 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr));
3080 return;
3081 }
3082
3083 SPDK_DEBUGLOG(nvmf_vfio, "%s starting to resume\n", ctrlr_id(vu_ctrlr));
3084 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
3085 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
3086 vfio_user_endpoint_resume_done, endpoint);
3087 if (ret < 0) {
3088 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
3089 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret);
3090 }
3091 }
3092
3093 static void
3094 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem,
3095 void *ctx, int status)
3096 {
3097 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx;
3098 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint;
3099 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3100
3101 if (!vu_ctrlr) {
3102 free(quiesce_ctx);
3103 return;
3104 }
3105
3106 quiesce_ctx->status = status;
3107
3108 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n",
3109 ctrlr_id(vu_ctrlr), status);
3110
3111 spdk_thread_send_msg(vu_ctrlr->thread,
3112 vfio_user_quiesce_done, ctx);
3113 }
3114
3115 /*
3116 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll();
3117 * we've already set ctrlr->state, so we won't process new entries, but we need
3118 * to ensure that this PG is quiesced.
This only works because there's no 3119 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3120 * 3121 * Once we've walked all PGs, we need to pause any submitted I/O via 3122 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3123 */ 3124 static void 3125 vfio_user_quiesce_pg(void *ctx) 3126 { 3127 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3128 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3129 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3130 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3131 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3132 int ret; 3133 3134 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3135 3136 if (!vu_ctrlr) { 3137 free(quiesce_ctx); 3138 return; 3139 } 3140 3141 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3142 if (quiesce_ctx->group != NULL) { 3143 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3144 vfio_user_quiesce_pg, quiesce_ctx); 3145 return; 3146 } 3147 3148 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3149 vfio_user_pause_done, quiesce_ctx); 3150 if (ret < 0) { 3151 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3152 endpoint_id(endpoint), ret); 3153 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3154 fail_ctrlr(vu_ctrlr); 3155 free(quiesce_ctx); 3156 } 3157 } 3158 3159 static void 3160 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3161 { 3162 struct ctrlr_quiesce_ctx *quiesce_ctx; 3163 3164 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3165 3166 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3167 if (!quiesce_ctx) { 3168 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3169 assert(false); 3170 return; 3171 } 3172 3173 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3174 quiesce_ctx->status = 0; 3175 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3176 3177 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3178 vfio_user_quiesce_pg, quiesce_ctx); 3179 } 3180 3181 static int 3182 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3183 { 3184 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3185 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3186 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3187 3188 if (!vu_ctrlr) { 3189 return 0; 3190 } 3191 3192 /* NVMf library will destruct controller when no 3193 * connected queue pairs. 3194 */ 3195 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3196 return 0; 3197 } 3198 3199 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3200 3201 /* There is no race condition here as device quiesce callback 3202 * and nvmf_prop_set_cc() are running in the same thread context. 
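 *
 * For reference, the quiesce sequence started by ctrlr_quiesce() below is
 * roughly (all functions are defined above):
 *
 *   ctrlr_quiesce()                      set state to PAUSING
 *     -> vfio_user_quiesce_pg()          walk each poll group on its own thread
 *       -> spdk_nvmf_subsystem_pause()   pause I/O (SPDK_NVME_GLOBAL_NS_TAG)
 *         -> vfio_user_pause_done()
 *           -> vfio_user_quiesce_done()  set state to PAUSED and call
 *                                        vfu_device_quiesced()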
3203 */ 3204 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3205 return 0; 3206 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3207 return 0; 3208 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3209 return 0; 3210 } 3211 3212 switch (vu_ctrlr->state) { 3213 case VFIO_USER_CTRLR_PAUSED: 3214 case VFIO_USER_CTRLR_MIGRATING: 3215 return 0; 3216 case VFIO_USER_CTRLR_RUNNING: 3217 ctrlr_quiesce(vu_ctrlr); 3218 break; 3219 case VFIO_USER_CTRLR_RESUMING: 3220 vu_ctrlr->queued_quiesce = true; 3221 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3222 vu_ctrlr->state); 3223 break; 3224 default: 3225 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3226 break; 3227 } 3228 3229 errno = EBUSY; 3230 return -1; 3231 } 3232 3233 static void 3234 vfio_user_ctrlr_dump_migr_data(const char *name, 3235 struct vfio_user_nvme_migr_state *migr_data, 3236 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3237 { 3238 struct spdk_nvmf_registers *regs; 3239 struct nvme_migr_sq_state *sq; 3240 struct nvme_migr_cq_state *cq; 3241 uint32_t *doorbell_base; 3242 uint32_t i; 3243 3244 SPDK_NOTICELOG("Dump %s\n", name); 3245 3246 regs = &migr_data->nvmf_data.regs; 3247 doorbell_base = (uint32_t *)&migr_data->doorbells; 3248 3249 SPDK_NOTICELOG("Registers\n"); 3250 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3251 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3252 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3253 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3254 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3255 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3256 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3257 3258 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3259 3260 if (sdbl != NULL) { 3261 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3262 migr_data->ctrlr_header.shadow_doorbell_buffer); 3263 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3264 migr_data->ctrlr_header.eventidx_buffer); 3265 } 3266 3267 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3268 sq = &migr_data->qps[i].sq; 3269 cq = &migr_data->qps[i].cq; 3270 3271 if (sq->size) { 3272 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3273 if (i > 0 && sdbl != NULL) { 3274 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3275 sq->sqid, 3276 sdbl->shadow_doorbells[queue_index(i, false)], 3277 sdbl->eventidxs[queue_index(i, false)]); 3278 } 3279 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3280 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3281 } 3282 3283 if (cq->size) { 3284 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3285 if (i > 0 && sdbl != NULL) { 3286 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3287 cq->cqid, 3288 sdbl->shadow_doorbells[queue_index(i, true)], 3289 sdbl->eventidxs[queue_index(i, true)]); 3290 } 3291 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3292 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3293 } 3294 } 3295 3296 SPDK_NOTICELOG("%s Dump Done\n", name); 3297 } 3298 3299 /* Read region 9 content and restore it to migration data structures */ 3300 static int 3301 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3302 struct vfio_user_nvme_migr_state *migr_state) 3303 { 3304 void *data_ptr = endpoint->migr_data; 3305 3306 /* Load vfio_user_nvme_migr_header first */ 3307 
memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3308 /* TODO: version check */ 3309 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3310 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3311 return -EINVAL; 3312 } 3313 3314 /* Load nvmf controller data */ 3315 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3316 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3317 3318 /* Load queue pairs */ 3319 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3320 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3321 3322 /* Load doorbells */ 3323 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3324 memcpy(&migr_state->doorbells, data_ptr, 3325 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3326 3327 /* Load CFG */ 3328 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3329 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3330 3331 return 0; 3332 } 3333 3334 3335 static void 3336 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3337 { 3338 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3339 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3340 struct nvmf_vfio_user_sq *sq; 3341 struct nvmf_vfio_user_cq *cq; 3342 uint64_t data_offset; 3343 void *data_ptr; 3344 uint32_t *doorbell_base; 3345 uint32_t i = 0; 3346 uint16_t sqid, cqid; 3347 struct vfio_user_nvme_migr_state migr_state = { 3348 .nvmf_data = { 3349 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3350 .regs_size = sizeof(struct spdk_nvmf_registers), 3351 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3352 } 3353 }; 3354 3355 /* Save all data to vfio_user_nvme_migr_state first, then we will 3356 * copy it to device migration region at last. 
3357 */ 3358 3359 /* save magic number */ 3360 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3361 3362 /* save controller data */ 3363 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3364 3365 /* save connected queue pairs */ 3366 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3367 /* save sq */ 3368 sqid = sq->qid; 3369 migr_state.qps[sqid].sq.sqid = sq->qid; 3370 migr_state.qps[sqid].sq.cqid = sq->cqid; 3371 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3372 migr_state.qps[sqid].sq.size = sq->size; 3373 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3374 3375 /* save cq, for shared cq case, cq may be saved multiple times */ 3376 cqid = sq->cqid; 3377 cq = vu_ctrlr->cqs[cqid]; 3378 migr_state.qps[cqid].cq.cqid = cqid; 3379 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3380 migr_state.qps[cqid].cq.ien = cq->ien; 3381 migr_state.qps[cqid].cq.iv = cq->iv; 3382 migr_state.qps[cqid].cq.size = cq->size; 3383 migr_state.qps[cqid].cq.phase = cq->phase; 3384 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3385 i++; 3386 } 3387 3388 assert(i > 0); 3389 migr_state.ctrlr_header.num_io_queues = i - 1; 3390 3391 /* Save doorbells */ 3392 doorbell_base = (uint32_t *)&migr_state.doorbells; 3393 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3394 3395 /* Save PCI configuration space */ 3396 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3397 3398 /* Save all data to device migration region */ 3399 data_ptr = endpoint->migr_data; 3400 3401 /* Copy nvmf controller data */ 3402 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3403 data_ptr += data_offset; 3404 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3405 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3406 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3407 3408 /* Copy queue pairs */ 3409 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3410 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3411 migr_state.ctrlr_header.qp_offset = data_offset; 3412 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3413 struct nvme_migr_cq_state)); 3414 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3415 3416 /* Copy doorbells */ 3417 data_offset += migr_state.ctrlr_header.qp_len; 3418 data_ptr += migr_state.ctrlr_header.qp_len; 3419 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3420 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3421 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3422 3423 /* Copy CFG */ 3424 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3425 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3426 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3427 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3428 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3429 3430 /* copy shadow doorbells */ 3431 if (vu_ctrlr->sdbl != NULL) { 3432 migr_state.ctrlr_header.sdbl = true; 3433 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3434 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3435 } 3436 3437 /* Copy nvme migration header finally */ 3438 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3439 3440 if 
(SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3441 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3442 } 3443 } 3444 3445 /* 3446 * If we are about to close the connection, we need to unregister the interrupt, 3447 * as the library will subsequently close the file descriptor we registered. 3448 */ 3449 static int 3450 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3451 { 3452 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3453 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3454 3455 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3456 3457 if (type == VFU_RESET_LOST_CONN) { 3458 if (ctrlr != NULL) { 3459 spdk_interrupt_unregister(&ctrlr->intr); 3460 ctrlr->intr_fd = -1; 3461 } 3462 return 0; 3463 } 3464 3465 /* FIXME: LOST_CONN case ? */ 3466 if (ctrlr->sdbl != NULL) { 3467 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 3468 free_sdbl(vfu_ctx, ctrlr->sdbl); 3469 ctrlr->sdbl = NULL; 3470 } 3471 3472 /* FIXME: much more needed here. */ 3473 3474 return 0; 3475 } 3476 3477 static int 3478 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3479 struct vfio_user_nvme_migr_state *migr_state) 3480 { 3481 uint32_t i, qsize = 0; 3482 uint16_t sqid, cqid; 3483 struct vfio_user_nvme_migr_qp migr_qp; 3484 void *addr; 3485 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3486 int ret; 3487 3488 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3489 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3490 } 3491 3492 /* restore submission queues */ 3493 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3494 migr_qp = migr_state->qps[i]; 3495 3496 qsize = migr_qp.sq.size; 3497 if (qsize) { 3498 struct nvmf_vfio_user_sq *sq; 3499 3500 sqid = migr_qp.sq.sqid; 3501 if (sqid != i) { 3502 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3503 return -EINVAL; 3504 } 3505 3506 /* allocate sq if necessary */ 3507 if (vu_ctrlr->sqs[sqid] == NULL) { 3508 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3509 if (ret) { 3510 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3511 return -EFAULT; 3512 } 3513 } 3514 3515 sq = vu_ctrlr->sqs[sqid]; 3516 sq->size = qsize; 3517 3518 ret = alloc_sq_reqs(vu_ctrlr, sq); 3519 if (ret) { 3520 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3521 return -EFAULT; 3522 } 3523 3524 /* restore sq */ 3525 sq->sq_state = VFIO_USER_SQ_CREATED; 3526 sq->cqid = migr_qp.sq.cqid; 3527 *sq_headp(sq) = migr_qp.sq.head; 3528 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3529 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3530 sq->mapping.prp1, sq->size * 64, 3531 sq->mapping.sg, &sq->mapping.iov, 3532 PROT_READ); 3533 if (addr == NULL) { 3534 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3535 sqid, sq->mapping.prp1, sq->size); 3536 return -EFAULT; 3537 } 3538 cqs_ref[sq->cqid]++; 3539 } 3540 } 3541 3542 /* restore completion queues */ 3543 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3544 migr_qp = migr_state->qps[i]; 3545 3546 qsize = migr_qp.cq.size; 3547 if (qsize) { 3548 struct nvmf_vfio_user_cq *cq; 3549 3550 /* restore cq */ 3551 cqid = migr_qp.sq.cqid; 3552 assert(cqid == i); 3553 3554 /* allocate cq if necessary */ 3555 if (vu_ctrlr->cqs[cqid] == NULL) { 3556 ret = init_cq(vu_ctrlr, cqid); 3557 if (ret) { 3558 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3559 return -EFAULT; 3560 } 3561 } 3562 3563 cq = vu_ctrlr->cqs[cqid]; 3564 3565 cq->size = qsize; 3566 3567 
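/*
 * Note the asymmetry with the SQ restore above: the SQ ring was mapped with
 * PROT_READ only (we merely consume 64-byte spdk_nvme_cmd entries from it),
 * while the CQ below is mapped PROT_READ | PROT_WRITE because the controller
 * writes 16-byte spdk_nvme_cpl entries into it; hence the "* 64" and "* 16"
 * sizes passed to map_one().
 */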
cq->cq_state = VFIO_USER_CQ_CREATED; 3568 cq->cq_ref = cqs_ref[cqid]; 3569 *cq_tailp(cq) = migr_qp.cq.tail; 3570 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3571 cq->ien = migr_qp.cq.ien; 3572 cq->iv = migr_qp.cq.iv; 3573 cq->phase = migr_qp.cq.phase; 3574 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3575 cq->mapping.prp1, cq->size * 16, 3576 cq->mapping.sg, &cq->mapping.iov, 3577 PROT_READ | PROT_WRITE); 3578 if (addr == NULL) { 3579 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3580 cqid, cq->mapping.prp1, cq->size); 3581 return -EFAULT; 3582 } 3583 } 3584 } 3585 3586 return 0; 3587 } 3588 3589 static int 3590 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3591 { 3592 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3593 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3594 uint32_t *doorbell_base; 3595 struct spdk_nvme_cmd cmd; 3596 uint16_t i; 3597 int rc = 0; 3598 struct vfio_user_nvme_migr_state migr_state = { 3599 .nvmf_data = { 3600 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3601 .regs_size = sizeof(struct spdk_nvmf_registers), 3602 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3603 } 3604 }; 3605 3606 assert(endpoint->migr_data != NULL); 3607 assert(ctrlr != NULL); 3608 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3609 if (rc) { 3610 return rc; 3611 } 3612 3613 /* restore shadow doorbells */ 3614 if (migr_state.ctrlr_header.sdbl) { 3615 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3616 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3617 migr_state.ctrlr_header.shadow_doorbell_buffer, 3618 migr_state.ctrlr_header.eventidx_buffer, 3619 memory_page_size(vu_ctrlr)); 3620 if (sdbl == NULL) { 3621 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3622 ctrlr_id(vu_ctrlr)); 3623 return -1; 3624 } 3625 3626 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3627 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3628 3629 SWAP(vu_ctrlr->sdbl, sdbl); 3630 } 3631 3632 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3633 if (rc) { 3634 return rc; 3635 } 3636 3637 /* restore PCI configuration space */ 3638 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3639 3640 doorbell_base = (uint32_t *)&migr_state.doorbells; 3641 /* restore doorbells from saved registers */ 3642 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3643 3644 /* restore nvmf controller data */ 3645 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3646 if (rc) { 3647 return rc; 3648 } 3649 3650 /* resubmit pending AERs */ 3651 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) { 3652 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3653 migr_state.nvmf_data.aer_cids[i]); 3654 memset(&cmd, 0, sizeof(cmd)); 3655 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3656 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3657 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3658 if (rc) { 3659 break; 3660 } 3661 } 3662 3663 return rc; 3664 } 3665 3666 static void 3667 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3668 { 3669 uint32_t i; 3670 struct nvmf_vfio_user_sq *sq; 3671 3672 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
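 * Its doorbell pointers are therefore re-attached directly to BAR0 below,
 * independently of whatever vfio_user_ctrlr_switch_doorbells() does for the
 * I/O queues.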
*/
3673
3674 if (vu_ctrlr->sqs[0] != NULL) {
3675 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells +
3676 queue_index(0, false);
3677 }
3678
3679 if (vu_ctrlr->cqs[0] != NULL) {
3680 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells +
3681 queue_index(0, true);
3682 }
3683
3684 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL);
3685
3686 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
3687 sq = vu_ctrlr->sqs[i];
3688 if (!sq || !sq->size) {
3689 continue;
3690 }
3691
3692 if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
3693 /* The ADMIN queue pair is always in the poll group, just enable it */
3694 sq->sq_state = VFIO_USER_SQ_ACTIVE;
3695 } else {
3696 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair);
3697 }
3698 }
3699 }
3700
3701 /*
3702 * We are in stop-and-copy state, but still potentially have some current dirty
3703 * sgls: while we're quiesced and thus should have no active requests, we still
3704 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are
3705 * mapped read only).
3706 *
3707 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly
3708 * mark them dirty now.
3709 */
3710 static void
3711 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
3712 {
3713 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
3714
3715 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING);
3716
3717 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
3718 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i];
3719
3720 if (cq == NULL || q_addr(&cq->mapping) == NULL) {
3721 continue;
3722 }
3723
3724 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1);
3725 }
3726
3727 if (vu_ctrlr->sdbl != NULL) {
3728 dma_sg_t *sg;
3729 size_t i;
3730
3731 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT;
3732 ++i) {
3733
3734 if (!vu_ctrlr->sdbl->iovs[i].iov_len) {
3735 continue;
3736 }
3737
3738 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i);
3739
3740 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1);
3741 }
3742 }
3743 }
3744
3745 static int
3746 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
3747 {
3748 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
3749 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3750 struct nvmf_vfio_user_sq *sq;
3751 int ret = 0;
3752
3753 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint),
3754 vu_ctrlr->state, state);
3755
3756 switch (state) {
3757 case VFU_MIGR_STATE_STOP_AND_COPY:
3758 vu_ctrlr->in_source_vm = true;
3759 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
3760 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr);
3761 vfio_user_migr_ctrlr_save_data(vu_ctrlr);
3762 break;
3763 case VFU_MIGR_STATE_STOP:
3764 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
3765 /* The controller associated with the source VM is dead now; we will resume
3766 * the subsystem after destroying the controller data structure, so that the
3767 * subsystem can be re-used for a new client.
3768 */
3769 if (vu_ctrlr->in_source_vm) {
3770 endpoint->need_resume = true;
3771 }
3772 break;
3773 case VFU_MIGR_STATE_PRE_COPY:
3774 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED);
3775 break;
3776 case VFU_MIGR_STATE_RESUME:
3777 /*
3778 * The destination ADMIN queue pair is connected when starting the VM,
3779 * but it isn't enabled in the destination VM yet, so the poll
3780 * group will do nothing with the ADMIN queue pair for now.
3781 */ 3782 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3783 break; 3784 } 3785 3786 assert(!vu_ctrlr->in_source_vm); 3787 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3788 3789 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3790 assert(sq != NULL); 3791 assert(sq->qpair.qid == 0); 3792 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3793 3794 /* Free ADMIN SQ resources first, SQ resources will be 3795 * allocated based on queue size from source VM. 3796 */ 3797 free_sq_reqs(sq); 3798 sq->size = 0; 3799 break; 3800 case VFU_MIGR_STATE_RUNNING: 3801 3802 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3803 break; 3804 } 3805 3806 if (!vu_ctrlr->in_source_vm) { 3807 /* Restore destination VM from BAR9 */ 3808 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3809 if (ret) { 3810 break; 3811 } 3812 3813 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3814 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3815 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3816 /* FIXME where do we resume nvmf? */ 3817 } else { 3818 /* Rollback source VM */ 3819 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3820 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3821 vfio_user_endpoint_resume_done, endpoint); 3822 if (ret < 0) { 3823 /* TODO: fail controller with CFS bit set */ 3824 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3825 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3826 } 3827 } 3828 vu_ctrlr->migr_data_prepared = false; 3829 vu_ctrlr->in_source_vm = false; 3830 break; 3831 3832 default: 3833 return -EINVAL; 3834 } 3835 3836 return ret; 3837 } 3838 3839 static uint64_t 3840 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3841 { 3842 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3843 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3844 uint64_t pending_bytes; 3845 3846 if (ctrlr->migr_data_prepared) { 3847 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3848 pending_bytes = 0; 3849 } else { 3850 pending_bytes = vfio_user_migr_data_len(); 3851 } 3852 3853 SPDK_DEBUGLOG(nvmf_vfio, 3854 "%s current state %u, pending bytes 0x%"PRIx64"\n", 3855 endpoint_id(endpoint), ctrlr->state, pending_bytes); 3856 3857 return pending_bytes; 3858 } 3859 3860 static int 3861 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 3862 { 3863 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3864 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3865 3866 /* 3867 * When transitioning to pre-copy state we set pending_bytes to 0, 3868 * so the vfio-user client shouldn't attempt to read any migration 3869 * data. This is not yet guaranteed by libvfio-user. 
3870 */ 3871 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3872 assert(size != NULL); 3873 *offset = 0; 3874 *size = 0; 3875 return 0; 3876 } 3877 3878 if (ctrlr->in_source_vm) { /* migration source */ 3879 assert(size != NULL); 3880 *size = vfio_user_migr_data_len(); 3881 vfio_user_migr_ctrlr_save_data(ctrlr); 3882 } else { /* migration destination */ 3883 assert(size == NULL); 3884 assert(!ctrlr->migr_data_prepared); 3885 } 3886 *offset = 0; 3887 ctrlr->migr_data_prepared = true; 3888 3889 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 3890 3891 return 0; 3892 } 3893 3894 static ssize_t 3895 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3896 void *buf __attribute__((unused)), 3897 uint64_t count __attribute__((unused)), 3898 uint64_t offset __attribute__((unused))) 3899 { 3900 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 3901 endpoint_id(vfu_get_private(vfu_ctx))); 3902 errno = ENOTSUP; 3903 return -1; 3904 } 3905 3906 static ssize_t 3907 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3908 void *buf __attribute__((unused)), 3909 uint64_t count __attribute__((unused)), 3910 uint64_t offset __attribute__((unused))) 3911 { 3912 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 3913 endpoint_id(vfu_get_private(vfu_ctx))); 3914 errno = ENOTSUP; 3915 return -1; 3916 } 3917 3918 static int 3919 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3920 uint64_t count) 3921 { 3922 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 3923 3924 if (count != vfio_user_migr_data_len()) { 3925 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 3926 endpoint_id(vfu_get_private(vfu_ctx)), count); 3927 errno = EINVAL; 3928 return -1; 3929 } 3930 3931 return 0; 3932 } 3933 3934 static int 3935 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 3936 struct nvmf_vfio_user_endpoint *endpoint) 3937 { 3938 int ret; 3939 ssize_t cap_offset; 3940 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 3941 struct iovec migr_sparse_mmap = {}; 3942 3943 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 3944 struct pxcap pxcap = { 3945 .hdr.id = PCI_CAP_ID_EXP, 3946 .pxcaps.ver = 0x2, 3947 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 3948 .pxdcap2.ctds = 0x1 3949 }; 3950 3951 struct msixcap msixcap = { 3952 .hdr.id = PCI_CAP_ID_MSIX, 3953 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 3954 .mtab = {.tbir = 0x4, .to = 0x0}, 3955 .mpba = {.pbir = 0x5, .pbao = 0x0} 3956 }; 3957 3958 struct iovec sparse_mmap[] = { 3959 { 3960 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 3961 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 3962 }, 3963 }; 3964 3965 const vfu_migration_callbacks_t migr_callbacks = { 3966 .version = VFU_MIGR_CALLBACKS_VERS, 3967 .transition = &vfio_user_migration_device_state_transition, 3968 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 3969 .prepare_data = &vfio_user_migration_prepare_data, 3970 .read_data = &vfio_user_migration_read_data, 3971 .data_written = &vfio_user_migration_data_written, 3972 .write_data = &vfio_user_migration_write_data 3973 }; 3974 3975 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 3976 if (ret < 0) { 3977 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 3978 return ret; 3979 } 3980 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 3981 /* 3982 * 0x02, controller uses the NVM Express programming interface 3983 * 0x08, 
non-volatile memory controller
	 * 0x01, mass storage controller
	 */
	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx);
		return cap_offset;
	}

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx);
		return cap_offset;
	}

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx);
		return cap_offset;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
		return ret;
	}

	if (vu_transport->transport_opts.disable_mappable_bar0) {
		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
				       NULL, 0, -1, 0);
	} else {
		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
				       sparse_mmap, 1, endpoint->devmem_fd, 0);
	}

	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE,
			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE,
			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
		return ret;
	}

	vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb);

	migr_sparse_mmap.iov_base = (void *)4096;
	migr_sparse_mmap.iov_len = vfio_user_migr_data_len();
	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX,
			       vfu_get_migr_register_area_size() + vfio_user_migr_data_len(),
			       NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap,
			       1, endpoint->migr_fd, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks,
			vfu_get_migr_register_area_size());
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx);
		return ret;
	}
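	/*
	 * All capabilities, regions, IRQs and callbacks are registered at this
	 * point; realizing the context makes the device usable by a connecting
	 * vfio-user client.
	 */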
4085 4086 ret = vfu_realize_ctx(vfu_ctx); 4087 if (ret < 0) { 4088 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4089 return ret; 4090 } 4091 4092 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4093 assert(endpoint->pci_config_space != NULL); 4094 init_pci_config_space(endpoint->pci_config_space); 4095 4096 assert(cap_offset != 0); 4097 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4098 4099 return 0; 4100 } 4101 4102 static int nvmf_vfio_user_accept(void *ctx); 4103 4104 static void 4105 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4106 { 4107 /* Nothing for us to do here. */ 4108 } 4109 4110 /* 4111 * Register an "accept" poller: this is polling for incoming vfio-user socket 4112 * connections (on the listening socket). 4113 * 4114 * We need to do this on first listening, and also after destroying a 4115 * controller, so we can accept another connection. 4116 */ 4117 static int 4118 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4119 { 4120 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4121 4122 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4123 4124 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4125 endpoint, poll_rate_us); 4126 4127 if (!endpoint->accept_poller) { 4128 return -1; 4129 } 4130 4131 endpoint->accept_thread = spdk_get_thread(); 4132 endpoint->need_relisten = false; 4133 4134 if (!spdk_interrupt_mode_is_enabled()) { 4135 return 0; 4136 } 4137 4138 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4139 assert(endpoint->accept_intr_fd != -1); 4140 4141 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4142 nvmf_vfio_user_accept, endpoint); 4143 4144 assert(endpoint->accept_intr != NULL); 4145 4146 spdk_poller_register_interrupt(endpoint->accept_poller, 4147 set_intr_mode_noop, NULL); 4148 return 0; 4149 } 4150 4151 static void 4152 _vfio_user_relisten(void *ctx) 4153 { 4154 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4155 4156 vfio_user_register_accept_poller(endpoint); 4157 } 4158 4159 static void 4160 _free_ctrlr(void *ctx) 4161 { 4162 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4163 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4164 4165 free_sdbl(ctrlr->endpoint->vfu_ctx, ctrlr->sdbl); 4166 4167 spdk_interrupt_unregister(&ctrlr->intr); 4168 ctrlr->intr_fd = -1; 4169 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4170 4171 free(ctrlr); 4172 4173 if (endpoint == NULL) { 4174 return; 4175 } 4176 4177 if (endpoint->need_async_destroy) { 4178 nvmf_vfio_user_destroy_endpoint(endpoint); 4179 } else if (endpoint->need_relisten) { 4180 spdk_thread_send_msg(endpoint->accept_thread, 4181 _vfio_user_relisten, endpoint); 4182 } 4183 } 4184 4185 static void 4186 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4187 { 4188 int i; 4189 assert(ctrlr != NULL); 4190 4191 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4192 4193 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4194 free_qp(ctrlr, i); 4195 } 4196 4197 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 4198 } 4199 4200 static int 4201 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4202 struct nvmf_vfio_user_endpoint *endpoint) 4203 { 4204 struct nvmf_vfio_user_ctrlr *ctrlr; 4205 int err = 0; 4206 4207 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4208 4209 /* First, construct a vfio-user CUSTOM transport 
controller */ 4210 ctrlr = calloc(1, sizeof(*ctrlr)); 4211 if (ctrlr == NULL) { 4212 err = -ENOMEM; 4213 goto out; 4214 } 4215 /* We can only support one connection for now */ 4216 ctrlr->cntlid = 0x1; 4217 ctrlr->intr_fd = -1; 4218 ctrlr->transport = transport; 4219 ctrlr->endpoint = endpoint; 4220 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4221 TAILQ_INIT(&ctrlr->connected_sqs); 4222 4223 ctrlr->adaptive_irqs_enabled = 4224 !transport->transport_opts.disable_adaptive_irq; 4225 4226 /* Then, construct an admin queue pair */ 4227 err = init_sq(ctrlr, &transport->transport, 0); 4228 if (err != 0) { 4229 free(ctrlr); 4230 goto out; 4231 } 4232 4233 err = init_cq(ctrlr, 0); 4234 if (err != 0) { 4235 free(ctrlr); 4236 goto out; 4237 } 4238 4239 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4240 4241 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4242 if (err != 0) { 4243 free(ctrlr); 4244 goto out; 4245 } 4246 endpoint->ctrlr = ctrlr; 4247 4248 /* Notify the generic layer about the new admin queue pair */ 4249 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4250 4251 out: 4252 if (err != 0) { 4253 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4254 endpoint_id(endpoint), strerror(-err)); 4255 } 4256 4257 return err; 4258 } 4259 4260 static int 4261 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4262 const struct spdk_nvme_transport_id *trid, 4263 struct spdk_nvmf_listen_opts *listen_opts) 4264 { 4265 struct nvmf_vfio_user_transport *vu_transport; 4266 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4267 char path[PATH_MAX] = {}; 4268 char uuid[PATH_MAX] = {}; 4269 int ret; 4270 4271 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4272 transport); 4273 4274 pthread_mutex_lock(&vu_transport->lock); 4275 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4276 /* Only compare traddr */ 4277 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4278 pthread_mutex_unlock(&vu_transport->lock); 4279 return -EEXIST; 4280 } 4281 } 4282 pthread_mutex_unlock(&vu_transport->lock); 4283 4284 endpoint = calloc(1, sizeof(*endpoint)); 4285 if (!endpoint) { 4286 return -ENOMEM; 4287 } 4288 4289 pthread_mutex_init(&endpoint->lock, NULL); 4290 endpoint->devmem_fd = -1; 4291 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4292 endpoint->transport = vu_transport; 4293 4294 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4295 if (ret < 0 || ret >= PATH_MAX) { 4296 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4297 ret = -1; 4298 goto out; 4299 } 4300 4301 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4302 if (ret == -1) { 4303 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4304 endpoint_id(endpoint), path, spdk_strerror(errno)); 4305 goto out; 4306 } 4307 unlink(path); 4308 4309 endpoint->devmem_fd = ret; 4310 ret = ftruncate(endpoint->devmem_fd, 4311 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4312 if (ret != 0) { 4313 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4314 spdk_strerror(errno)); 4315 goto out; 4316 } 4317 4318 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4319 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4320 if (endpoint->bar0_doorbells == MAP_FAILED) { 4321 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, 
spdk_strerror(errno));
		endpoint->bar0_doorbells = NULL;
		ret = -1;
		goto out;
	}

	ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint));
	if (ret < 0 || ret >= PATH_MAX) {
		SPDK_ERRLOG("%s: failed to get migration file path: %s.\n", endpoint_id(endpoint),
			    spdk_strerror(errno));
		ret = -1;
		goto out;
	}
	ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	if (ret == -1) {
		SPDK_ERRLOG("%s: failed to open migration file at %s: %s.\n",
			    endpoint_id(endpoint), path, spdk_strerror(errno));
		goto out;
	}
	unlink(path);

	endpoint->migr_fd = ret;
	ret = ftruncate(endpoint->migr_fd,
			vfu_get_migr_register_area_size() + vfio_user_migr_data_len());
	if (ret != 0) {
		SPDK_ERRLOG("%s: failed to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path,
			    spdk_strerror(errno));
		goto out;
	}

	endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(),
				   PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size());
	if (endpoint->migr_data == MAP_FAILED) {
		SPDK_ERRLOG("%s: failed to mmap migration file %s: %s.\n", endpoint_id(endpoint), path,
			    spdk_strerror(errno));
		endpoint->migr_data = NULL;
		ret = -1;
		goto out;
	}

	ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
	if (ret < 0 || ret >= PATH_MAX) {
		SPDK_ERRLOG("%s: failed to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno));
		ret = -1;
		goto out;
	}

	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
					   endpoint, VFU_DEV_TYPE_PCI);
	if (endpoint->vfu_ctx == NULL) {
		SPDK_ERRLOG("%s: failed to create libvfio-user context: %m\n",
			    endpoint_id(endpoint));
		ret = -1;
		goto out;
	}

	ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log,
			    vfio_user_get_log_level());
	if (ret < 0) {
		goto out;
	}

	ret = vfio_user_dev_info_fill(vu_transport, endpoint);
	if (ret < 0) {
		goto out;
	}

	ret = vfio_user_register_accept_poller(endpoint);
	if (ret != 0) {
		goto out;
	}

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
	pthread_mutex_unlock(&vu_transport->lock);

out:
	if (ret != 0) {
		nvmf_vfio_user_destroy_endpoint(endpoint);
	}

	return ret;
}

static void
nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
			   const struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;

	assert(trid != NULL);
	assert(trid->traddr != NULL);

	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
			/* Defer freeing endpoint resources until the controller
			 * is freed. There are two ways we can get here:
			 * 1. the NVMf target is killed while a VM is connected;
			 * 2. the listener is removed via an RPC call.
			 * The NVMf library will disconnect all queue pairs.
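			 * If a controller is still attached we only mark the
			 * endpoint for asynchronous destruction and let
			 * _free_ctrlr() finish the clean-up; otherwise the
			 * endpoint can be destroyed immediately.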
4430 */ 4431 if (endpoint->ctrlr) { 4432 assert(!endpoint->need_async_destroy); 4433 endpoint->need_async_destroy = true; 4434 pthread_mutex_unlock(&vu_transport->lock); 4435 return; 4436 } 4437 4438 nvmf_vfio_user_destroy_endpoint(endpoint); 4439 pthread_mutex_unlock(&vu_transport->lock); 4440 return; 4441 } 4442 } 4443 pthread_mutex_unlock(&vu_transport->lock); 4444 4445 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4446 } 4447 4448 static void 4449 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4450 struct spdk_nvmf_subsystem *subsystem, 4451 struct spdk_nvmf_ctrlr_data *cdata) 4452 { 4453 struct nvmf_vfio_user_transport *vu_transport; 4454 4455 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4456 4457 cdata->vid = SPDK_PCI_VID_NUTANIX; 4458 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4459 cdata->ieee[0] = 0x8d; 4460 cdata->ieee[1] = 0x6b; 4461 cdata->ieee[2] = 0x50; 4462 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4463 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4464 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4465 /* libvfio-user can only support 1 connection for now */ 4466 cdata->oncs.reservations = 0; 4467 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4468 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4469 } 4470 4471 static int 4472 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4473 const struct spdk_nvmf_subsystem *subsystem, 4474 const struct spdk_nvme_transport_id *trid) 4475 { 4476 struct nvmf_vfio_user_transport *vu_transport; 4477 struct nvmf_vfio_user_endpoint *endpoint; 4478 4479 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4480 4481 pthread_mutex_lock(&vu_transport->lock); 4482 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4483 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4484 break; 4485 } 4486 } 4487 pthread_mutex_unlock(&vu_transport->lock); 4488 4489 if (endpoint == NULL) { 4490 return -ENOENT; 4491 } 4492 4493 /* Drop const - we will later need to pause/unpause. */ 4494 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4495 4496 return 0; 4497 } 4498 4499 /* 4500 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4501 * frequency. 4502 * 4503 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4504 * if we don't currently have a controller set up, peek to see if the socket is 4505 * able to accept a new connection. 4506 */ 4507 static int 4508 nvmf_vfio_user_accept(void *ctx) 4509 { 4510 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4511 struct nvmf_vfio_user_transport *vu_transport; 4512 int err; 4513 4514 vu_transport = endpoint->transport; 4515 4516 if (endpoint->ctrlr != NULL) { 4517 return SPDK_POLLER_IDLE; 4518 } 4519 4520 /* While we're here, the controller is already destroyed, 4521 * subsystem may still be in RESUMING state, we will wait 4522 * until the subsystem is in RUNNING state. 
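	 * Until then we simply return SPDK_POLLER_IDLE below; once need_resume
	 * has been cleared by the resume completion path, new connections can
	 * be accepted again.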
4523 */ 4524 if (endpoint->need_resume) { 4525 return SPDK_POLLER_IDLE; 4526 } 4527 4528 err = vfu_attach_ctx(endpoint->vfu_ctx); 4529 if (err == 0) { 4530 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4531 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4532 if (err == 0) { 4533 /* 4534 * Unregister ourselves: now we've accepted a 4535 * connection, there is nothing for us to poll for, and 4536 * we will poll the connection via vfu_run_ctx() 4537 * instead. 4538 */ 4539 spdk_interrupt_unregister(&endpoint->accept_intr); 4540 spdk_poller_unregister(&endpoint->accept_poller); 4541 } 4542 return SPDK_POLLER_BUSY; 4543 } 4544 4545 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4546 return SPDK_POLLER_IDLE; 4547 } 4548 4549 return SPDK_POLLER_BUSY; 4550 } 4551 4552 static void 4553 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4554 struct spdk_nvme_transport_id *trid, 4555 struct spdk_nvmf_discovery_log_page_entry *entry) 4556 { } 4557 4558 static int vfio_user_poll_group_intr(void *ctx); 4559 4560 static void 4561 vfio_user_poll_group_add_intr(struct nvmf_vfio_user_poll_group *vu_group, 4562 struct spdk_nvmf_poll_group *group) 4563 { 4564 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK); 4565 assert(vu_group->intr_fd != -1); 4566 4567 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd, 4568 vfio_user_poll_group_intr, vu_group); 4569 assert(vu_group->intr != NULL); 4570 4571 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4572 vu_group); 4573 } 4574 4575 static struct spdk_nvmf_transport_poll_group * 4576 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4577 struct spdk_nvmf_poll_group *group) 4578 { 4579 struct nvmf_vfio_user_transport *vu_transport; 4580 struct nvmf_vfio_user_poll_group *vu_group; 4581 4582 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4583 transport); 4584 4585 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4586 4587 vu_group = calloc(1, sizeof(*vu_group)); 4588 if (vu_group == NULL) { 4589 SPDK_ERRLOG("Error allocating poll group: %m"); 4590 return NULL; 4591 } 4592 4593 if (in_interrupt_mode(vu_transport)) { 4594 vfio_user_poll_group_add_intr(vu_group, group); 4595 } 4596 4597 TAILQ_INIT(&vu_group->sqs); 4598 4599 pthread_mutex_lock(&vu_transport->pg_lock); 4600 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4601 if (vu_transport->next_pg == NULL) { 4602 vu_transport->next_pg = vu_group; 4603 } 4604 pthread_mutex_unlock(&vu_transport->pg_lock); 4605 4606 return &vu_group->group; 4607 } 4608 4609 static struct spdk_nvmf_transport_poll_group * 4610 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4611 { 4612 struct nvmf_vfio_user_transport *vu_transport; 4613 struct nvmf_vfio_user_poll_group **vu_group; 4614 struct nvmf_vfio_user_sq *sq; 4615 struct nvmf_vfio_user_cq *cq; 4616 4617 struct spdk_nvmf_transport_poll_group *result = NULL; 4618 4619 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4620 cq = sq->ctrlr->cqs[sq->cqid]; 4621 assert(cq != NULL); 4622 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4623 4624 pthread_mutex_lock(&vu_transport->pg_lock); 4625 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4626 goto out; 4627 } 4628 4629 if (!nvmf_qpair_is_admin_queue(qpair)) { 4630 /* 4631 * If this is shared IO CQ case, just return the used CQ's poll 4632 * group, so I/O completions don't have to use 4633 * spdk_thread_send_msg(). 
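		 * (All SQs that post to a shared CQ thus end up on the poll
		 * group that already owns that CQ, so completions can be
		 * written from the polling thread itself.)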
4634 */ 4635 if (cq->group != NULL) { 4636 result = cq->group; 4637 goto out; 4638 } 4639 4640 /* 4641 * If we're in interrupt mode, align all qpairs for a controller 4642 * on the same poll group by default, unless requested. This can 4643 * be lower in performance than running on a single poll group, 4644 * so we disable spreading by default. 4645 */ 4646 if (in_interrupt_mode(vu_transport) && 4647 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) { 4648 result = sq->ctrlr->sqs[0]->group; 4649 goto out; 4650 } 4651 4652 } 4653 4654 vu_group = &vu_transport->next_pg; 4655 assert(*vu_group != NULL); 4656 4657 result = &(*vu_group)->group; 4658 *vu_group = TAILQ_NEXT(*vu_group, link); 4659 if (*vu_group == NULL) { 4660 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4661 } 4662 4663 out: 4664 if (cq->group == NULL) { 4665 cq->group = result; 4666 } 4667 4668 pthread_mutex_unlock(&vu_transport->pg_lock); 4669 return result; 4670 } 4671 4672 static void 4673 vfio_user_poll_group_del_intr(struct nvmf_vfio_user_poll_group *vu_group) 4674 { 4675 assert(vu_group->intr_fd != -1); 4676 4677 spdk_interrupt_unregister(&vu_group->intr); 4678 4679 close(vu_group->intr_fd); 4680 vu_group->intr_fd = -1; 4681 } 4682 4683 /* called when process exits */ 4684 static void 4685 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4686 { 4687 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4688 struct nvmf_vfio_user_transport *vu_transport; 4689 4690 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4691 4692 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4693 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4694 transport); 4695 4696 if (in_interrupt_mode(vu_transport)) { 4697 vfio_user_poll_group_del_intr(vu_group); 4698 } 4699 4700 pthread_mutex_lock(&vu_transport->pg_lock); 4701 next_tgroup = TAILQ_NEXT(vu_group, link); 4702 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4703 if (next_tgroup == NULL) { 4704 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4705 } 4706 if (vu_transport->next_pg == vu_group) { 4707 vu_transport->next_pg = next_tgroup; 4708 } 4709 pthread_mutex_unlock(&vu_transport->pg_lock); 4710 4711 free(vu_group); 4712 } 4713 4714 static void 4715 _vfio_user_qpair_disconnect(void *ctx) 4716 { 4717 struct nvmf_vfio_user_sq *sq = ctx; 4718 4719 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4720 } 4721 4722 /* The function is used when socket connection is destroyed */ 4723 static int 4724 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4725 { 4726 struct nvmf_vfio_user_sq *sq; 4727 struct nvmf_vfio_user_endpoint *endpoint; 4728 4729 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4730 4731 endpoint = ctrlr->endpoint; 4732 assert(endpoint != NULL); 4733 4734 pthread_mutex_lock(&endpoint->lock); 4735 endpoint->need_relisten = true; 4736 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4737 endpoint->ctrlr = NULL; 4738 free_ctrlr(ctrlr); 4739 pthread_mutex_unlock(&endpoint->lock); 4740 return 0; 4741 } 4742 4743 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4744 /* add another round thread poll to avoid recursive endpoint lock */ 4745 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4746 } 4747 pthread_mutex_unlock(&endpoint->lock); 4748 4749 return 0; 4750 } 4751 4752 /* 4753 * Poll for and process any incoming vfio-user messages. 
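 *
 * Returns SPDK_POLLER_BUSY if any messages were processed or the client
 * connection was torn down, SPDK_POLLER_IDLE otherwise.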
4754 */ 4755 static int 4756 vfio_user_poll_vfu_ctx(void *ctx) 4757 { 4758 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4759 int ret; 4760 4761 assert(ctrlr != NULL); 4762 4763 /* This will call access_bar0_fn() if there are any writes 4764 * to the portion of the BAR that is not mmap'd */ 4765 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4766 if (spdk_unlikely(ret == -1)) { 4767 if (errno == EBUSY) { 4768 return SPDK_POLLER_IDLE; 4769 } 4770 4771 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4772 4773 /* 4774 * We lost the client; the reset callback will already have 4775 * unregistered the interrupt. 4776 */ 4777 if (errno == ENOTCONN) { 4778 vfio_user_destroy_ctrlr(ctrlr); 4779 return SPDK_POLLER_BUSY; 4780 } 4781 4782 /* 4783 * We might not have got a reset callback in this case, so 4784 * explicitly unregister the interrupt here. 4785 */ 4786 spdk_interrupt_unregister(&ctrlr->intr); 4787 ctrlr->intr_fd = -1; 4788 fail_ctrlr(ctrlr); 4789 } 4790 4791 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4792 } 4793 4794 struct vfio_user_post_cpl_ctx { 4795 struct nvmf_vfio_user_ctrlr *ctrlr; 4796 struct nvmf_vfio_user_cq *cq; 4797 struct spdk_nvme_cpl cpl; 4798 }; 4799 4800 static void 4801 _post_completion_msg(void *ctx) 4802 { 4803 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4804 4805 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4806 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4807 free(cpl_ctx); 4808 } 4809 4810 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4811 4812 static int 4813 vfio_user_poll_group_intr(void *ctx) 4814 { 4815 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4816 eventfd_t val; 4817 int ret = 0; 4818 4819 SPDK_DEBUGLOG(vfio_user_db, "pg:%p got intr\n", vu_group); 4820 4821 /* 4822 * NB: this might fail if called from vfio_user_ctrlr_intr(), but it's 4823 * non-blocking, so not an issue. 4824 */ 4825 eventfd_read(vu_group->intr_fd, &val); 4826 4827 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4828 4829 /* 4830 * Re-arm the event indexes. NB: this also could rearm other 4831 * controller's SQs. 4832 */ 4833 ret |= vfio_user_poll_group_rearm(vu_group); 4834 4835 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4836 } 4837 4838 /* 4839 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4840 * the SQs assigned to our own poll group. Other poll groups are handled via 4841 * vfio_user_poll_group_intr(). 4842 */ 4843 static int 4844 vfio_user_ctrlr_intr(void *ctx) 4845 { 4846 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 4847 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 4848 struct nvmf_vfio_user_poll_group *vu_group; 4849 int ret = SPDK_POLLER_IDLE; 4850 4851 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 4852 4853 SPDK_DEBUGLOG(vfio_user_db, "ctrlr pg:%p got intr\n", vu_ctrlr_group); 4854 4855 /* 4856 * Poll vfio-user for this controller. We need to do this before polling 4857 * any SQs, as this is where doorbell writes may be handled. 4858 */ 4859 ret = vfio_user_poll_vfu_ctx(vu_ctrlr); 4860 4861 /* 4862 * `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, 4863 * just return for this case. 
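	 * (This can happen, for example, if the client disconnected and
	 * controller teardown was started from within vfu_run_ctx().)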
4864 */ 4865 if (vu_ctrlr->sqs[0] == NULL) { 4866 return ret; 4867 } 4868 4869 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) { 4870 /* 4871 * We may have just written to a doorbell owned by another 4872 * reactor: we need to prod them to make sure its SQs are polled 4873 * *after* the doorbell value is updated. 4874 */ 4875 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) { 4876 if (vu_group != vu_ctrlr_group) { 4877 SPDK_DEBUGLOG(vfio_user_db, "prodding pg:%p\n", vu_group); 4878 eventfd_write(vu_group->intr_fd, 1); 4879 } 4880 } 4881 } 4882 4883 ret |= vfio_user_poll_group_intr(vu_ctrlr_group); 4884 4885 return ret; 4886 } 4887 4888 static void 4889 vfio_user_ctrlr_set_intr_mode(struct spdk_poller *poller, void *ctx, 4890 bool interrupt_mode) 4891 { 4892 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4893 assert(ctrlr != NULL); 4894 assert(ctrlr->endpoint != NULL); 4895 4896 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 4897 ctrlr_id(ctrlr), interrupt_mode); 4898 4899 /* 4900 * interrupt_mode needs to persist across controller resets, so store 4901 * it in the endpoint instead. 4902 */ 4903 ctrlr->endpoint->interrupt_mode = interrupt_mode; 4904 4905 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 4906 } 4907 4908 /* 4909 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 4910 * set up and we can start operating on this controller. 4911 */ 4912 static void 4913 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 4914 struct spdk_nvmf_ctrlr *ctrlr) 4915 { 4916 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 4917 4918 vu_ctrlr->ctrlr = ctrlr; 4919 vu_ctrlr->cntlid = ctrlr->cntlid; 4920 vu_ctrlr->thread = spdk_get_thread(); 4921 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 4922 4923 if (!in_interrupt_mode(endpoint->transport)) { 4924 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4925 vu_ctrlr, 1000); 4926 return; 4927 } 4928 4929 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4930 vu_ctrlr, 0); 4931 4932 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 4933 assert(vu_ctrlr->intr_fd != -1); 4934 4935 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 4936 vfio_user_ctrlr_intr, vu_ctrlr); 4937 4938 assert(vu_ctrlr->intr != NULL); 4939 4940 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 4941 vfio_user_ctrlr_set_intr_mode, 4942 vu_ctrlr); 4943 } 4944 4945 static int 4946 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 4947 { 4948 struct nvmf_vfio_user_poll_group *vu_group; 4949 struct nvmf_vfio_user_sq *sq = cb_arg; 4950 struct nvmf_vfio_user_cq *admin_cq; 4951 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 4952 struct nvmf_vfio_user_endpoint *endpoint; 4953 4954 assert(sq != NULL); 4955 assert(req != NULL); 4956 4957 vu_ctrlr = sq->ctrlr; 4958 assert(vu_ctrlr != NULL); 4959 endpoint = vu_ctrlr->endpoint; 4960 assert(endpoint != NULL); 4961 4962 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 4963 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 4964 endpoint->ctrlr = NULL; 4965 free_ctrlr(vu_ctrlr); 4966 return -1; 4967 } 4968 4969 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 4970 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 4971 4972 admin_cq = vu_ctrlr->cqs[0]; 4973 assert(admin_cq != NULL); 4974 4975 pthread_mutex_lock(&endpoint->lock); 4976 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 4977 
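		/* The admin queue has connected: record which thread owns the
		 * admin CQ and transition the controller to RUNNING. */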
admin_cq->thread = spdk_get_thread(); 4978 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 4979 } else { 4980 /* For I/O queues this command was generated in response to an 4981 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 4982 * been completed. Complete it now. 4983 */ 4984 if (sq->post_create_io_sq_completion) { 4985 assert(admin_cq->thread != NULL); 4986 if (admin_cq->thread != spdk_get_thread()) { 4987 struct vfio_user_post_cpl_ctx *cpl_ctx; 4988 4989 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 4990 if (!cpl_ctx) { 4991 return -ENOMEM; 4992 } 4993 cpl_ctx->ctrlr = vu_ctrlr; 4994 cpl_ctx->cq = admin_cq; 4995 cpl_ctx->cpl.sqid = 0; 4996 cpl_ctx->cpl.cdw0 = 0; 4997 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 4998 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 4999 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5000 5001 spdk_thread_send_msg(admin_cq->thread, _post_completion_msg, 5002 cpl_ctx); 5003 } else { 5004 post_completion(vu_ctrlr, admin_cq, 0, 0, 5005 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 5006 } 5007 sq->post_create_io_sq_completion = false; 5008 } else if (in_interrupt_mode(endpoint->transport)) { 5009 /* 5010 * If we're live migrating a guest, there is a window 5011 * where the I/O queues haven't been set up but the 5012 * device is in running state, during which the guest 5013 * might write to a doorbell. This doorbell write will 5014 * go unnoticed, so let's poll the whole controller to 5015 * pick that up. 5016 */ 5017 ctrlr_kick(vu_ctrlr); 5018 } 5019 sq->sq_state = VFIO_USER_SQ_ACTIVE; 5020 } 5021 5022 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 5023 pthread_mutex_unlock(&endpoint->lock); 5024 5025 free(req->req.data); 5026 req->req.data = NULL; 5027 5028 return 0; 5029 } 5030 5031 /* 5032 * Add the given qpair to the given poll group. New qpairs are added via 5033 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 5034 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 5035 * nvmf_transport_poll_group_add(). 5036 */ 5037 static int 5038 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 5039 struct spdk_nvmf_qpair *qpair) 5040 { 5041 struct nvmf_vfio_user_sq *sq; 5042 struct nvmf_vfio_user_req *vu_req; 5043 struct nvmf_vfio_user_ctrlr *ctrlr; 5044 struct spdk_nvmf_request *req; 5045 struct spdk_nvmf_fabric_connect_data *data; 5046 bool admin; 5047 5048 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5049 sq->group = group; 5050 ctrlr = sq->ctrlr; 5051 5052 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 5053 ctrlr_id(ctrlr), sq->qpair.qid, 5054 sq, qpair, group); 5055 5056 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 5057 5058 vu_req = get_nvmf_vfio_user_req(sq); 5059 if (vu_req == NULL) { 5060 return -1; 5061 } 5062 5063 req = &vu_req->req; 5064 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 5065 req->cmd->connect_cmd.cid = 0; 5066 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 5067 req->cmd->connect_cmd.recfmt = 0; 5068 req->cmd->connect_cmd.sqsize = sq->size - 1; 5069 req->cmd->connect_cmd.qid = admin ? 
0 : qpair->qid; 5070 5071 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 5072 req->data = calloc(1, req->length); 5073 if (req->data == NULL) { 5074 nvmf_vfio_user_req_free(req); 5075 return -ENOMEM; 5076 } 5077 5078 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 5079 data->cntlid = ctrlr->cntlid; 5080 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5081 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5082 5083 vu_req->cb_fn = handle_queue_connect_rsp; 5084 vu_req->cb_arg = sq; 5085 5086 SPDK_DEBUGLOG(nvmf_vfio, 5087 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5088 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5089 5090 spdk_nvmf_request_exec_fabrics(req); 5091 return 0; 5092 } 5093 5094 static int 5095 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5096 struct spdk_nvmf_qpair *qpair) 5097 { 5098 struct nvmf_vfio_user_sq *sq; 5099 struct nvmf_vfio_user_poll_group *vu_group; 5100 5101 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5102 5103 SPDK_DEBUGLOG(nvmf_vfio, 5104 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5105 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5106 5107 5108 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5109 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5110 5111 return 0; 5112 } 5113 5114 static void 5115 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5116 { 5117 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5118 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5119 vu_req->iovcnt = 0; 5120 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5121 5122 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5123 } 5124 5125 static int 5126 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5127 { 5128 struct nvmf_vfio_user_sq *sq; 5129 struct nvmf_vfio_user_req *vu_req; 5130 5131 assert(req != NULL); 5132 5133 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5134 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5135 5136 _nvmf_vfio_user_req_free(sq, vu_req); 5137 5138 return 0; 5139 } 5140 5141 static int 5142 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5143 { 5144 struct nvmf_vfio_user_sq *sq; 5145 struct nvmf_vfio_user_req *vu_req; 5146 5147 assert(req != NULL); 5148 5149 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5150 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5151 5152 if (vu_req->cb_fn != NULL) { 5153 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5154 fail_ctrlr(sq->ctrlr); 5155 } 5156 } 5157 5158 _nvmf_vfio_user_req_free(sq, vu_req); 5159 5160 return 0; 5161 } 5162 5163 static void 5164 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5165 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5166 { 5167 struct nvmf_vfio_user_sq *sq; 5168 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5169 struct nvmf_vfio_user_endpoint *endpoint; 5170 5171 assert(qpair != NULL); 5172 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5173 vu_ctrlr = sq->ctrlr; 5174 endpoint = vu_ctrlr->endpoint; 5175 5176 pthread_mutex_lock(&endpoint->lock); 5177 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5178 delete_sq_done(vu_ctrlr, sq); 5179 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5180 endpoint->ctrlr = NULL; 5181 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5182 /* The controller will be freed, we can resume the subsystem 5183 * now so that the endpoint can be ready to accept another 5184 * 
new connection. 5185 */ 5186 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5187 vfio_user_endpoint_resume_done, endpoint); 5188 } 5189 free_ctrlr(vu_ctrlr); 5190 } 5191 pthread_mutex_unlock(&endpoint->lock); 5192 5193 if (cb_fn) { 5194 cb_fn(cb_arg); 5195 } 5196 } 5197 5198 /** 5199 * Returns a preallocated request, or NULL if there isn't one available. 5200 */ 5201 static struct nvmf_vfio_user_req * 5202 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5203 { 5204 struct nvmf_vfio_user_req *req; 5205 5206 if (sq == NULL) { 5207 return NULL; 5208 } 5209 5210 req = TAILQ_FIRST(&sq->free_reqs); 5211 if (req == NULL) { 5212 return NULL; 5213 } 5214 5215 TAILQ_REMOVE(&sq->free_reqs, req, link); 5216 5217 return req; 5218 } 5219 5220 static int 5221 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5222 { 5223 uint16_t nr; 5224 uint32_t nlb, nsid; 5225 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5226 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5227 struct spdk_nvmf_ns *ns; 5228 5229 nsid = cmd->nsid; 5230 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5231 if (ns == NULL || ns->bdev == NULL) { 5232 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5233 return -EINVAL; 5234 } 5235 5236 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5237 nr = cmd->cdw10_bits.dsm.nr + 1; 5238 return nr * sizeof(struct spdk_nvme_dsm_range); 5239 } 5240 5241 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5242 return nlb * spdk_bdev_get_block_size(ns->bdev); 5243 } 5244 5245 static int 5246 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5247 { 5248 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5249 uint32_t len = 0; 5250 uint8_t fid; 5251 int iovcnt; 5252 5253 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5254 req->length = 0; 5255 req->data = NULL; 5256 5257 if (req->xfer == SPDK_NVME_DATA_NONE) { 5258 return 0; 5259 } 5260 5261 switch (cmd->opc) { 5262 case SPDK_NVME_OPC_IDENTIFY: 5263 len = 4096; 5264 break; 5265 case SPDK_NVME_OPC_GET_LOG_PAGE: 5266 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 5267 break; 5268 case SPDK_NVME_OPC_GET_FEATURES: 5269 case SPDK_NVME_OPC_SET_FEATURES: 5270 fid = cmd->cdw10_bits.set_features.fid; 5271 switch (fid) { 5272 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5273 len = 4096; 5274 break; 5275 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5276 len = 256; 5277 break; 5278 case SPDK_NVME_FEAT_TIMESTAMP: 5279 len = 8; 5280 break; 5281 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5282 len = 512; 5283 break; 5284 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5285 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5286 len = 16; 5287 } else { 5288 len = 8; 5289 } 5290 break; 5291 default: 5292 return 0; 5293 } 5294 break; 5295 default: 5296 return 0; 5297 } 5298 5299 /* ADMIN command will not use SGL */ 5300 if (cmd->psdt != 0) { 5301 return -EINVAL; 5302 } 5303 5304 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5305 if (iovcnt < 0) { 5306 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5307 ctrlr_id(ctrlr), cmd->opc); 5308 return -1; 5309 } 5310 req->length = len; 5311 req->data = req->iov[0].iov_base; 5312 req->iovcnt = iovcnt; 5313 5314 return 0; 5315 } 5316 5317 /* 5318 * Map an I/O command's buffers. 5319 * 5320 * Returns 0 on success and -errno on failure. 
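 *
 * The transfer length is derived from the command itself (NLB for
 * reads/writes, NR ranges for dataset management) and vfio_user_map_cmd()
 * translates the guest buffer addresses into local iovecs in req->iov.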
5321 */ 5322 static int 5323 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5324 { 5325 int len, iovcnt; 5326 struct spdk_nvme_cmd *cmd; 5327 5328 assert(ctrlr != NULL); 5329 assert(req != NULL); 5330 5331 cmd = &req->cmd->nvme_cmd; 5332 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5333 req->length = 0; 5334 req->data = NULL; 5335 5336 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5337 return 0; 5338 } 5339 5340 len = get_nvmf_io_req_length(req); 5341 if (len < 0) { 5342 return -EINVAL; 5343 } 5344 req->length = len; 5345 5346 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5347 if (iovcnt < 0) { 5348 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5349 return -EFAULT; 5350 } 5351 req->data = req->iov[0].iov_base; 5352 req->iovcnt = iovcnt; 5353 5354 return 0; 5355 } 5356 5357 static int 5358 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5359 struct nvmf_vfio_user_sq *sq) 5360 { 5361 int err; 5362 struct nvmf_vfio_user_req *vu_req; 5363 struct spdk_nvmf_request *req; 5364 5365 assert(ctrlr != NULL); 5366 assert(cmd != NULL); 5367 5368 vu_req = get_nvmf_vfio_user_req(sq); 5369 if (spdk_unlikely(vu_req == NULL)) { 5370 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5371 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5372 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5373 5374 } 5375 req = &vu_req->req; 5376 5377 assert(req->qpair != NULL); 5378 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5379 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5380 5381 vu_req->cb_fn = handle_cmd_rsp; 5382 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5383 req->cmd->nvme_cmd = *cmd; 5384 5385 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5386 err = map_admin_cmd_req(ctrlr, req); 5387 } else { 5388 switch (cmd->opc) { 5389 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5390 case SPDK_NVME_OPC_RESERVATION_REPORT: 5391 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5392 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5393 err = -ENOTSUP; 5394 break; 5395 default: 5396 err = map_io_cmd_req(ctrlr, req); 5397 break; 5398 } 5399 } 5400 5401 if (spdk_unlikely(err < 0)) { 5402 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5403 ctrlr_id(ctrlr), cmd->opc); 5404 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5405 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5406 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5407 _nvmf_vfio_user_req_free(sq, vu_req); 5408 return err; 5409 } 5410 5411 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5412 spdk_nvmf_request_exec(req); 5413 5414 return 0; 5415 } 5416 5417 /* 5418 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5419 * here: if the host isn't up to date, and is apparently not actively processing 5420 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5421 */ 5422 static void 5423 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5424 struct nvmf_vfio_user_sq *sq) 5425 { 5426 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5427 uint32_t cq_head; 5428 uint32_t cq_tail; 5429 5430 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5431 return; 5432 } 5433 5434 cq_tail = *cq_tailp(cq); 5435 5436 /* Already sent? 
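	 * (i.e. we already fired an interrupt for this CQ tail value)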
*/ 5437 if (cq_tail == cq->last_trigger_irq_tail) { 5438 return; 5439 } 5440 5441 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5442 cq_head = *cq_dbl_headp(cq); 5443 5444 if (cq_head != cq_tail && cq_head == cq->last_head) { 5445 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5446 if (err != 0) { 5447 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5448 ctrlr_id(ctrlr)); 5449 } else { 5450 cq->last_trigger_irq_tail = cq_tail; 5451 } 5452 } 5453 5454 cq->last_head = cq_head; 5455 } 5456 5457 /* Returns the number of commands processed, or a negative value on error. */ 5458 static int 5459 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5460 { 5461 struct nvmf_vfio_user_ctrlr *ctrlr; 5462 uint32_t new_tail; 5463 int count = 0; 5464 5465 assert(sq != NULL); 5466 5467 ctrlr = sq->ctrlr; 5468 5469 /* 5470 * A quiesced, or migrating, controller should never process new 5471 * commands. 5472 */ 5473 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 5474 return SPDK_POLLER_IDLE; 5475 } 5476 5477 if (ctrlr->adaptive_irqs_enabled) { 5478 handle_suppressed_irq(ctrlr, sq); 5479 } 5480 5481 /* On aarch64 platforms, doorbells update from guest VM may not be seen 5482 * on SPDK target side. This is because there is memory type mismatch 5483 * situation here. That is on guest VM side, the doorbells are treated as 5484 * device memory while on SPDK target side, it is treated as normal 5485 * memory. And this situation cause problem on ARM platform. 5486 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5487 * Memory-aliasing-and-mismatched-memory-types". Only using spdk_mb() 5488 * cannot fix this. Use "dc civac" to invalidate cache may solve 5489 * this. 5490 */ 5491 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5492 5493 /* Load-Acquire. */ 5494 new_tail = *sq_dbl_tailp(sq); 5495 5496 new_tail = new_tail & 0xffffu; 5497 if (spdk_unlikely(new_tail >= sq->size)) { 5498 union spdk_nvme_async_event_completion event = {}; 5499 5500 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5501 new_tail); 5502 event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR; 5503 event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE; 5504 nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event); 5505 5506 return -1; 5507 } 5508 5509 if (*sq_headp(sq) == new_tail) { 5510 return 0; 5511 } 5512 5513 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5514 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5515 if (ctrlr->sdbl != NULL) { 5516 SPDK_DEBUGLOG(nvmf_vfio, 5517 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5518 ctrlr_id(ctrlr), sq->qid, 5519 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5520 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5521 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5522 } 5523 5524 /* 5525 * Ensure that changes to the queue are visible to us. 5526 * The host driver should write the queue first, do a wmb(), and then 5527 * update the SQ tail doorbell (their Store-Release). 5528 */ 5529 spdk_rmb(); 5530 5531 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5532 if (count < 0) { 5533 fail_ctrlr(ctrlr); 5534 } 5535 5536 return count; 5537 } 5538 5539 /* 5540 * vfio-user transport poll handler. Note that the library context is polled in 5541 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the 5542 * active SQs. 5543 * 5544 * Returns the number of commands processed, or a negative value on error. 
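 *
 * In interrupt mode this is also invoked from vfio_user_poll_group_intr()
 * whenever the poll group's eventfd is signalled.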
5545 */ 5546 static int 5547 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 5548 { 5549 struct nvmf_vfio_user_poll_group *vu_group; 5550 struct nvmf_vfio_user_sq *sq, *tmp; 5551 int count = 0; 5552 5553 assert(group != NULL); 5554 5555 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5556 5557 SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n"); 5558 5559 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) { 5560 int ret; 5561 5562 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 5563 continue; 5564 } 5565 5566 ret = nvmf_vfio_user_sq_poll(sq); 5567 5568 if (ret < 0) { 5569 return ret; 5570 } 5571 5572 count += ret; 5573 } 5574 5575 return count; 5576 } 5577 5578 static int 5579 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5580 struct spdk_nvme_transport_id *trid) 5581 { 5582 struct nvmf_vfio_user_sq *sq; 5583 struct nvmf_vfio_user_ctrlr *ctrlr; 5584 5585 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5586 ctrlr = sq->ctrlr; 5587 5588 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5589 return 0; 5590 } 5591 5592 static int 5593 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 5594 struct spdk_nvme_transport_id *trid) 5595 { 5596 return 0; 5597 } 5598 5599 static int 5600 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5601 struct spdk_nvme_transport_id *trid) 5602 { 5603 struct nvmf_vfio_user_sq *sq; 5604 struct nvmf_vfio_user_ctrlr *ctrlr; 5605 5606 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5607 ctrlr = sq->ctrlr; 5608 5609 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5610 return 0; 5611 } 5612 5613 static void 5614 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 5615 struct spdk_nvmf_request *req) 5616 { 5617 struct spdk_nvmf_request *req_to_abort = NULL; 5618 struct spdk_nvmf_request *temp_req = NULL; 5619 uint16_t cid; 5620 5621 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5622 5623 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) { 5624 struct nvmf_vfio_user_req *vu_req; 5625 5626 vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req); 5627 5628 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 5629 req_to_abort = temp_req; 5630 break; 5631 } 5632 } 5633 5634 if (req_to_abort == NULL) { 5635 spdk_nvmf_request_complete(req); 5636 return; 5637 } 5638 5639 req->req_to_abort = req_to_abort; 5640 nvmf_ctrlr_abort_request(req); 5641 } 5642 5643 static void 5644 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 5645 { 5646 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 5647 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 5648 opts->in_capsule_data_size = 0; 5649 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 5650 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 5651 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 5652 opts->num_shared_buffers = 0; 5653 opts->buf_cache_size = 0; 5654 opts->association_timeout = 0; 5655 opts->transport_specific = NULL; 5656 } 5657 5658 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 5659 .name = "VFIOUSER", 5660 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 5661 .opts_init = nvmf_vfio_user_opts_init, 5662 .create = nvmf_vfio_user_create, 5663 .destroy = nvmf_vfio_user_destroy, 5664 5665 .listen = nvmf_vfio_user_listen, 5666 .stop_listen = nvmf_vfio_user_stop_listen, 5667 .cdata_init = nvmf_vfio_user_cdata_init, 5668 
.listen_associate = nvmf_vfio_user_listen_associate, 5669 5670 .listener_discover = nvmf_vfio_user_discover, 5671 5672 .poll_group_create = nvmf_vfio_user_poll_group_create, 5673 .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group, 5674 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 5675 .poll_group_add = nvmf_vfio_user_poll_group_add, 5676 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 5677 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 5678 5679 .req_free = nvmf_vfio_user_req_free, 5680 .req_complete = nvmf_vfio_user_req_complete, 5681 5682 .qpair_fini = nvmf_vfio_user_close_qpair, 5683 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 5684 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 5685 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 5686 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 5687 }; 5688 5689 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 5690 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 5691 SPDK_LOG_REGISTER_COMPONENT(vfio_user_db) 5692