1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved. 3 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 */ 5 6 /* 7 * NVMe over vfio-user transport 8 */ 9 10 #include <vfio-user/libvfio-user.h> 11 #include <vfio-user/pci_defs.h> 12 13 #include "spdk/barrier.h" 14 #include "spdk/stdinc.h" 15 #include "spdk/assert.h" 16 #include "spdk/thread.h" 17 #include "spdk/nvmf_transport.h" 18 #include "spdk/sock.h" 19 #include "spdk/string.h" 20 #include "spdk/util.h" 21 #include "spdk/log.h" 22 23 #include "transport.h" 24 25 #include "nvmf_internal.h" 26 27 #define SWAP(x, y) \ 28 do \ 29 { \ 30 typeof(x) _tmp = x; \ 31 x = y; \ 32 y = _tmp; \ 33 } while (0) 34 35 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 36 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 37 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 38 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 39 40 #define NVME_DOORBELLS_OFFSET 0x1000 41 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000 42 #define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2 43 #define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3 44 #define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX 45 46 /* 47 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 48 * available on PCI-X 2.0 and PCI Express buses 49 */ 50 #define NVME_REG_CFG_SIZE 0x1000 51 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 52 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8) 53 #define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 54 /* MSIX Table Size */ 55 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 56 /* MSIX Pending Bit Array Size */ 57 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000) 58 59 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 60 61 struct nvmf_vfio_user_req; 62 63 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 64 65 /* 1 more for PRP2 list itself */ 66 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 67 68 enum nvmf_vfio_user_req_state { 69 VFIO_USER_REQUEST_STATE_FREE = 0, 70 VFIO_USER_REQUEST_STATE_EXECUTING, 71 }; 72 73 /* 74 * Support for live migration in NVMf/vfio-user: live migration is implemented 75 * by stopping the NVMf subsystem when the device is instructed to enter the 76 * stop-and-copy state and then trivially, and most importantly safely, 77 * collecting migration state and providing it to the vfio-user client. We 78 * don't provide any migration state at the pre-copy state as that's too 79 * complicated to do, we might support this in the future. 80 */ 81 82 83 /* NVMe device state representation */ 84 struct nvme_migr_sq_state { 85 uint16_t sqid; 86 uint16_t cqid; 87 uint32_t head; 88 uint32_t size; 89 uint32_t reserved; 90 uint64_t dma_addr; 91 }; 92 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 93 94 struct nvme_migr_cq_state { 95 uint16_t cqid; 96 uint16_t phase; 97 uint32_t tail; 98 uint32_t size; 99 uint32_t iv; 100 uint32_t ien; 101 uint32_t reserved; 102 uint64_t dma_addr; 103 }; 104 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 105 106 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 107 108 /* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned. 
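 *
 * (Illustrative aside, not part of the original comment: the "queue pairs"
 * section of this region is an array of the packed structures above. A hedged
 * sketch of how a single SQ might be captured, using a hypothetical helper
 * and the nvmf_vfio_user_sq fields defined later in this file:
 *
 *	static void
 *	migr_capture_sq_sketch(const struct nvmf_vfio_user_sq *sq,
 *			       struct nvme_migr_sq_state *out)
 *	{
 *		out->sqid     = sq->qid;
 *		out->cqid     = sq->cqid;
 *		out->head     = sq->head;
 *		out->size     = sq->size;
 *		out->dma_addr = sq->mapping.prp1;
 *	}
 *
 * The transport's real save/restore path may differ in detail.)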
109 * 110 * NVMe device migration region is defined as below: 111 * ------------------------------------------------------------------------- 112 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs | 113 * ------------------------------------------------------------------------- 114 * 115 * Keep vfio_user_nvme_migr_header as a fixed 0x1000 length, all new added fields 116 * can use the reserved space at the end of the data structure. 117 */ 118 struct vfio_user_nvme_migr_header { 119 /* Magic value to validate migration data */ 120 uint32_t magic; 121 /* Version to check the data is same from source to destination */ 122 uint32_t version; 123 124 /* The library uses this field to know how many fields in this 125 * structure are valid, starting at the beginning of this data 126 * structure. New added fields in future use `unused` memory 127 * spaces. 128 */ 129 uint32_t opts_size; 130 uint32_t reserved0; 131 132 /* BARs information */ 133 uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS]; 134 uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS]; 135 136 /* Queue pair start offset, starting at the beginning of this 137 * data structure. 138 */ 139 uint64_t qp_offset; 140 uint64_t qp_len; 141 142 /* Controller data structure */ 143 uint32_t num_io_queues; 144 uint32_t reserved1; 145 146 /* TODO: this part will be moved to common nvmf controller data */ 147 uint16_t reserved2[3]; 148 uint16_t nr_aers; 149 uint16_t aer_cids[NVMF_MIGR_MAX_PENDING_AERS]; 150 151 /* NVMf controller data offset and length if exist, starting at 152 * the beginning of this data structure. 153 */ 154 uint64_t nvmf_data_offset; 155 uint64_t nvmf_data_len; 156 157 /* 158 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA 159 * address. 160 */ 161 bool sdbl; 162 163 /* Shadow doorbell DMA addresses. */ 164 uint64_t shadow_doorbell_buffer; 165 uint64_t eventidx_buffer; 166 167 /* Reserved memory space for new added fields, the 168 * field is always at the end of this data structure. 169 */ 170 uint8_t unused[3336]; 171 }; 172 SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size"); 173 174 struct vfio_user_nvme_migr_qp { 175 struct nvme_migr_sq_state sq; 176 struct nvme_migr_cq_state cq; 177 }; 178 179 /* NVMe state definition used to load/restore from/to NVMe migration BAR region */ 180 struct vfio_user_nvme_migr_state { 181 struct vfio_user_nvme_migr_header ctrlr_header; 182 struct nvmf_ctrlr_migr_data nvmf_data; 183 struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 184 uint8_t bar0[NVME_REG_BAR0_SIZE]; 185 uint8_t cfg[NVME_REG_CFG_SIZE]; 186 }; 187 188 struct nvmf_vfio_user_req { 189 struct spdk_nvmf_request req; 190 struct spdk_nvme_cpl rsp; 191 struct spdk_nvme_cmd cmd; 192 193 enum nvmf_vfio_user_req_state state; 194 nvmf_vfio_user_req_cb_fn cb_fn; 195 void *cb_arg; 196 197 /* old CC before prop_set_cc fabric command */ 198 union spdk_nvme_cc_register cc; 199 200 TAILQ_ENTRY(nvmf_vfio_user_req) link; 201 202 struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS]; 203 uint8_t iovcnt; 204 205 /* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */ 206 uint8_t sg[]; 207 }; 208 209 /* 210 * Mapping of an NVMe queue. 211 * 212 * This holds the information tracking a local process mapping of an NVMe queue 213 * shared by the client. 214 */ 215 struct nvme_q_mapping { 216 /* iov of local process mapping. */ 217 struct iovec iov; 218 /* Stored sg, needed for unmap. */ 219 dma_sg_t *sg; 220 /* Client PRP of queue. 
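 *
 * (Usage sketch, illustrative only, using map_one() and the unmap pattern
 * defined later in this file:
 *
 *	mapping->prp1 = guest_prp;	// taken from the NVMe command
 *	base = map_one(vfu_ctx, mapping->prp1, queue_bytes,
 *		       mapping->sg, &mapping->iov, prot);
 *	// on success base == mapping->iov.iov_base; when done:
 *	vfu_sgl_put(vfu_ctx, mapping->sg, &mapping->iov, 1);
 *
 * guest_prp, queue_bytes and prot are placeholders, not fields of this
 * structure.)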
 */
	uint64_t prp1;
};

enum nvmf_vfio_user_sq_state {
	VFIO_USER_SQ_UNUSED = 0,
	VFIO_USER_SQ_CREATED,
	VFIO_USER_SQ_DELETED,
	VFIO_USER_SQ_ACTIVE,
	VFIO_USER_SQ_INACTIVE
};

enum nvmf_vfio_user_cq_state {
	VFIO_USER_CQ_UNUSED = 0,
	VFIO_USER_CQ_CREATED,
	VFIO_USER_CQ_DELETED,
};

enum nvmf_vfio_user_ctrlr_state {
	VFIO_USER_CTRLR_CREATING = 0,
	VFIO_USER_CTRLR_RUNNING,
	/* Quiesce requested by libvfio-user */
	VFIO_USER_CTRLR_PAUSING,
	/* NVMf subsystem is paused, it's safe to do PCI reset, memory register,
	 * memory unregister, and vfio migration state transition in this state.
	 */
	VFIO_USER_CTRLR_PAUSED,
	/*
	 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI
	 * reset, memory register and unregister, controller in destination VM has
	 * been restored). NVMf subsystem resume has been requested.
	 */
	VFIO_USER_CTRLR_RESUMING,
	/*
	 * Implies that the NVMf subsystem is paused. Both the controller in the
	 * source VM and the controller in the destination VM are in this state
	 * during live migration.
	 */
	VFIO_USER_CTRLR_MIGRATING
};

struct nvmf_vfio_user_sq {
	struct spdk_nvmf_qpair qpair;
	struct spdk_nvmf_transport_poll_group *group;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_sq_state sq_state;

	uint32_t head;
	volatile uint32_t *dbl_tailp;

	/* Whether a shadow doorbell eventidx needs setting. */
	bool need_rearm;

	/* Multiple SQs can be mapped to the same CQ. */
	uint16_t cqid;

	/* handle_queue_connect_rsp() is used both for the CREATE IO SQ response
	 * and for the SQ re-connect response in the destination VM. In the former
	 * case we post an NVMe completion to the VM; we do not set this flag when
	 * re-connecting SQs in the destination VM.
	 */
	bool post_create_io_sq_completion;
	/* Copy of the Create IO SQ command; this field is used together with the
	 * `post_create_io_sq_completion` flag.
	 */
	struct spdk_nvme_cmd create_io_sq_cmd;

	/* Currently unallocated reqs. */
	TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs;
	/* Poll group entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) link;
	/* Connected SQ entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) tailq;
};

struct nvmf_vfio_user_cq {
	struct spdk_nvmf_transport_poll_group *group;
	struct spdk_thread *thread;
	uint32_t cq_ref;

	uint32_t qid;
	/* Number of entries in queue.
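 *
 * (Aside added for illustration: the tail and phase fields below follow the
 * standard NVMe completion-queue convention, as implemented by
 * cq_tail_advance() further down:
 *
 *	tail++;
 *	if (tail == size) {
 *		tail = 0;
 *		phase = !phase;		// phase bit flips on every wrap
 *	}
 * )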
*/ 306 uint32_t size; 307 struct nvme_q_mapping mapping; 308 enum nvmf_vfio_user_cq_state cq_state; 309 310 uint32_t tail; 311 volatile uint32_t *dbl_headp; 312 313 bool phase; 314 315 uint16_t iv; 316 bool ien; 317 318 uint32_t last_head; 319 uint32_t last_trigger_irq_tail; 320 }; 321 322 struct nvmf_vfio_user_poll_group { 323 struct spdk_nvmf_transport_poll_group group; 324 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 325 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 326 }; 327 328 struct nvmf_vfio_user_shadow_doorbells { 329 volatile uint32_t *shadow_doorbells; 330 volatile uint32_t *eventidxs; 331 dma_sg_t *sgs; 332 struct iovec *iovs; 333 }; 334 335 struct nvmf_vfio_user_ctrlr { 336 struct nvmf_vfio_user_endpoint *endpoint; 337 struct nvmf_vfio_user_transport *transport; 338 339 /* Connected SQs list */ 340 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 341 enum nvmf_vfio_user_ctrlr_state state; 342 343 /* 344 * Tells whether live migration data have been prepared. This is used 345 * by the get_pending_bytes callback to tell whether or not the 346 * previous iteration finished. 347 */ 348 bool migr_data_prepared; 349 350 /* Controller is in source VM when doing live migration */ 351 bool in_source_vm; 352 353 struct spdk_thread *thread; 354 struct spdk_poller *vfu_ctx_poller; 355 struct spdk_interrupt *intr; 356 int intr_fd; 357 358 bool queued_quiesce; 359 360 bool reset_shn; 361 362 uint16_t cntlid; 363 struct spdk_nvmf_ctrlr *ctrlr; 364 365 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 366 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 367 368 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 369 370 volatile uint32_t *bar0_doorbells; 371 struct nvmf_vfio_user_shadow_doorbells *sdbl; 372 /* 373 * Shadow doorbells PRPs to provide during the stop-and-copy state. 374 */ 375 uint64_t shadow_doorbell_buffer; 376 uint64_t eventidx_buffer; 377 378 bool adaptive_irqs_enabled; 379 bool kick_requested; 380 }; 381 382 /* Endpoint in vfio-user is associated with a socket file, which 383 * is the representative of a PCI endpoint. 384 */ 385 struct nvmf_vfio_user_endpoint { 386 struct nvmf_vfio_user_transport *transport; 387 vfu_ctx_t *vfu_ctx; 388 struct spdk_poller *accept_poller; 389 struct spdk_thread *accept_thread; 390 bool interrupt_mode; 391 struct msixcap *msix; 392 vfu_pci_config_space_t *pci_config_space; 393 int devmem_fd; 394 int accept_intr_fd; 395 struct spdk_interrupt *accept_intr; 396 397 volatile uint32_t *bar0_doorbells; 398 399 int migr_fd; 400 void *migr_data; 401 402 struct spdk_nvme_transport_id trid; 403 struct spdk_nvmf_subsystem *subsystem; 404 405 /* Controller is associated with an active socket connection, 406 * the lifecycle of the controller is same as the VM. 407 * Currently we only support one active connection, as the NVMe 408 * specification defines, we may support multiple controllers in 409 * future, so that it can support e.g: RESERVATION. 410 */ 411 struct nvmf_vfio_user_ctrlr *ctrlr; 412 pthread_mutex_t lock; 413 414 bool need_async_destroy; 415 /* The subsystem is in PAUSED state and need to be resumed, TRUE 416 * only when migration is done successfully and the controller is 417 * in source VM. 
418 */ 419 bool need_resume; 420 /* Start the accept poller again after destroying the controller */ 421 bool need_relisten; 422 423 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 424 }; 425 426 struct nvmf_vfio_user_transport_opts { 427 bool disable_mappable_bar0; 428 bool disable_adaptive_irq; 429 bool disable_shadow_doorbells; 430 bool disable_compare; 431 }; 432 433 struct nvmf_vfio_user_transport { 434 struct spdk_nvmf_transport transport; 435 struct nvmf_vfio_user_transport_opts transport_opts; 436 bool intr_mode_supported; 437 pthread_mutex_t lock; 438 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 439 440 pthread_mutex_t pg_lock; 441 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 442 struct nvmf_vfio_user_poll_group *next_pg; 443 }; 444 445 /* 446 * function prototypes 447 */ 448 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 449 450 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 451 452 /* 453 * Local process virtual address of a queue. 454 */ 455 static inline void * 456 q_addr(struct nvme_q_mapping *mapping) 457 { 458 return mapping->iov.iov_base; 459 } 460 461 static inline int 462 queue_index(uint16_t qid, bool is_cq) 463 { 464 return (qid * 2) + is_cq; 465 } 466 467 static inline volatile uint32_t * 468 sq_headp(struct nvmf_vfio_user_sq *sq) 469 { 470 assert(sq != NULL); 471 return &sq->head; 472 } 473 474 static inline volatile uint32_t * 475 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 476 { 477 assert(sq != NULL); 478 return sq->dbl_tailp; 479 } 480 481 static inline volatile uint32_t * 482 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 483 { 484 assert(cq != NULL); 485 return cq->dbl_headp; 486 } 487 488 static inline volatile uint32_t * 489 cq_tailp(struct nvmf_vfio_user_cq *cq) 490 { 491 assert(cq != NULL); 492 return &cq->tail; 493 } 494 495 static inline void 496 sq_head_advance(struct nvmf_vfio_user_sq *sq) 497 { 498 assert(sq != NULL); 499 500 assert(*sq_headp(sq) < sq->size); 501 (*sq_headp(sq))++; 502 503 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 504 *sq_headp(sq) = 0; 505 } 506 } 507 508 static inline void 509 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 510 { 511 assert(cq != NULL); 512 513 assert(*cq_tailp(cq) < cq->size); 514 (*cq_tailp(cq))++; 515 516 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 517 *cq_tailp(cq) = 0; 518 cq->phase = !cq->phase; 519 } 520 } 521 522 /* 523 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 524 * control: if there is no space in the CQ, we should wait until there is. 525 * 526 * In practice, we just fail the controller instead: as it happens, all host 527 * implementations we care about right-size the CQ: this is required anyway for 528 * NVMEoF support (see 3.3.2.8). 529 * 530 * Since reading the head doorbell is relatively expensive, we use the cached 531 * value, so we only have to read it for real if it appears that we are full. 
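 *
 * (Worked example, added for illustration:
 *
 *	cq->size == 4, *cq_tailp(cq) == 3, cached cq->last_head == 0
 *	qindex = 3 + 1 -> wraps to 0 -> matches the cached head, looks full
 *	re-read the doorbell: cq->last_head = *cq_dbl_headp(cq) == 2
 *	0 != 2 -> the queue is not full after all
 * )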
532 */ 533 static inline bool 534 cq_is_full(struct nvmf_vfio_user_cq *cq) 535 { 536 uint32_t qindex; 537 538 assert(cq != NULL); 539 540 qindex = *cq_tailp(cq) + 1; 541 if (spdk_unlikely(qindex == cq->size)) { 542 qindex = 0; 543 } 544 545 if (qindex != cq->last_head) { 546 return false; 547 } 548 549 cq->last_head = *cq_dbl_headp(cq); 550 551 return qindex == cq->last_head; 552 } 553 554 static bool 555 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 556 { 557 assert(vu_ctrlr != NULL); 558 559 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 560 return false; 561 } 562 563 if (is_cq) { 564 if (vu_ctrlr->cqs[qid] == NULL) { 565 return false; 566 } 567 568 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 569 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 570 } 571 572 if (vu_ctrlr->sqs[qid] == NULL) { 573 return false; 574 } 575 576 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 577 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 578 } 579 580 /* Return the poll group for the admin queue of the controller. */ 581 static inline struct nvmf_vfio_user_poll_group * 582 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 583 { 584 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 585 struct nvmf_vfio_user_poll_group, 586 group); 587 } 588 589 static inline struct spdk_thread * 590 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 591 { 592 return vu_pg->group.group->thread; 593 } 594 595 static dma_sg_t * 596 index_to_sg_t(void *arr, size_t i) 597 { 598 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 599 } 600 601 static inline size_t 602 vfio_user_migr_data_len(void) 603 { 604 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 605 } 606 607 static int vfio_user_ctrlr_intr(void *ctx); 608 609 /* 610 * Wrap vfio_user_ctrlr_intr() such that it can be used with 611 * spdk_thread_send_msg(). 612 * Pollers have type int (*)(void *) while message functions should have type 613 * void (*)(void *), so simply discard the returned value. 614 */ 615 static void 616 vfio_user_ctrlr_intr_wrapper(void *ctx) 617 { 618 vfio_user_ctrlr_intr(ctx); 619 } 620 621 /* 622 * Arrange for this controller to immediately wake up and process everything. 623 */ 624 static inline int 625 ctrlr_kick(struct nvmf_vfio_user_ctrlr *ctrlr) 626 { 627 assert(ctrlr != NULL); 628 assert(ctrlr->thread != NULL); 629 630 if (ctrlr->kick_requested) { 631 return 0; 632 } 633 634 ctrlr->kick_requested = true; 635 636 return spdk_thread_send_msg(ctrlr->thread, 637 vfio_user_ctrlr_intr_wrapper, 638 ctrlr); 639 } 640 641 /* 642 * Make the given DMA address and length available (locally mapped) via iov. 
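 *
 * (Usage sketch, illustrative only: the caller supplies dma_sg_size() bytes
 * of dma_sg_t storage plus an iovec, and releases the mapping with
 * vfu_sgl_put() when finished:
 *
 *	struct iovec iov;
 *	dma_sg_t *sg = calloc(1, dma_sg_size());
 *
 *	p = map_one(vfu_ctx, gpa, len, sg, &iov, PROT_READ | PROT_WRITE);
 *	if (p != NULL) {
 *		// ... use p (== iov.iov_base) ...
 *		vfu_sgl_put(vfu_ctx, sg, &iov, 1);
 *	}
 *	free(sg);
 *
 * gpa and len are placeholders for the guest address and size being mapped.)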
643 */ 644 static void * 645 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 646 struct iovec *iov, int prot) 647 { 648 int ret; 649 650 assert(ctx != NULL); 651 assert(sg != NULL); 652 assert(iov != NULL); 653 654 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 655 if (ret < 0) { 656 return NULL; 657 } 658 659 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 660 if (ret != 0) { 661 return NULL; 662 } 663 664 assert(iov->iov_base != NULL); 665 return iov->iov_base; 666 } 667 668 static int 669 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 670 uint32_t max_iovcnt, uint32_t len, size_t mps, 671 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 672 { 673 uint64_t prp1, prp2; 674 void *vva; 675 uint32_t i; 676 uint32_t residue_len, nents; 677 uint64_t *prp_list; 678 uint32_t iovcnt; 679 680 assert(max_iovcnt > 0); 681 682 prp1 = cmd->dptr.prp.prp1; 683 prp2 = cmd->dptr.prp.prp2; 684 685 /* PRP1 may started with unaligned page address */ 686 residue_len = mps - (prp1 % mps); 687 residue_len = spdk_min(len, residue_len); 688 689 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 690 if (spdk_unlikely(vva == NULL)) { 691 SPDK_ERRLOG("GPA to VVA failed\n"); 692 return -EINVAL; 693 } 694 len -= residue_len; 695 if (len && max_iovcnt < 2) { 696 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 697 return -ERANGE; 698 } 699 iovs[0].iov_base = vva; 700 iovs[0].iov_len = residue_len; 701 702 if (len) { 703 if (spdk_unlikely(prp2 == 0)) { 704 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 705 return -EINVAL; 706 } 707 708 if (len <= mps) { 709 /* 2 PRP used */ 710 iovcnt = 2; 711 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 712 if (spdk_unlikely(vva == NULL)) { 713 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 714 prp2, len); 715 return -EINVAL; 716 } 717 iovs[1].iov_base = vva; 718 iovs[1].iov_len = len; 719 } else { 720 /* PRP list used */ 721 nents = (len + mps - 1) / mps; 722 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 723 SPDK_ERRLOG("Too many page entries\n"); 724 return -ERANGE; 725 } 726 727 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 728 if (spdk_unlikely(vva == NULL)) { 729 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 730 prp2, nents); 731 return -EINVAL; 732 } 733 prp_list = vva; 734 i = 0; 735 while (len != 0) { 736 residue_len = spdk_min(len, mps); 737 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 738 if (spdk_unlikely(vva == NULL)) { 739 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 740 prp_list[i], residue_len); 741 return -EINVAL; 742 } 743 iovs[i + 1].iov_base = vva; 744 iovs[i + 1].iov_len = residue_len; 745 len -= residue_len; 746 i++; 747 } 748 iovcnt = i + 1; 749 } 750 } else { 751 /* 1 PRP used */ 752 iovcnt = 1; 753 } 754 755 assert(iovcnt <= max_iovcnt); 756 return iovcnt; 757 } 758 759 static int 760 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 761 struct iovec *iovs, uint32_t max_iovcnt, 762 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 763 { 764 uint32_t i; 765 void *vva; 766 767 if (spdk_unlikely(max_iovcnt < num_sgls)) { 768 return -ERANGE; 769 } 770 771 for (i = 0; i < num_sgls; i++) { 772 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 773 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 774 return -EINVAL; 775 } 776 vva = gpa_to_vva(prv, sgls[i].address, 
sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 777 if (spdk_unlikely(vva == NULL)) { 778 SPDK_ERRLOG("GPA to VVA failed\n"); 779 return -EINVAL; 780 } 781 iovs[i].iov_base = vva; 782 iovs[i].iov_len = sgls[i].unkeyed.length; 783 } 784 785 return num_sgls; 786 } 787 788 static int 789 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 790 uint32_t len, size_t mps, 791 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 792 { 793 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 794 uint32_t num_sgls, seg_len; 795 void *vva; 796 int ret; 797 uint32_t total_iovcnt = 0; 798 799 /* SGL cases */ 800 sgl = &cmd->dptr.sgl1; 801 802 /* only one SGL segment */ 803 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 804 assert(max_iovcnt > 0); 805 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 806 if (spdk_unlikely(vva == NULL)) { 807 SPDK_ERRLOG("GPA to VVA failed\n"); 808 return -EINVAL; 809 } 810 iovs[0].iov_base = vva; 811 iovs[0].iov_len = sgl->unkeyed.length; 812 assert(sgl->unkeyed.length == len); 813 814 return 1; 815 } 816 817 for (;;) { 818 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 819 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 820 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 821 return -EINVAL; 822 } 823 824 seg_len = sgl->unkeyed.length; 825 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 826 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 827 return -EINVAL; 828 } 829 830 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 831 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 832 if (spdk_unlikely(vva == NULL)) { 833 SPDK_ERRLOG("GPA to VVA failed\n"); 834 return -EINVAL; 835 } 836 837 /* sgl point to the first segment */ 838 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 839 last_sgl = &sgl[num_sgls - 1]; 840 841 /* we are done */ 842 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 843 /* map whole sgl list */ 844 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 845 max_iovcnt - total_iovcnt, gpa_to_vva); 846 if (spdk_unlikely(ret < 0)) { 847 return ret; 848 } 849 total_iovcnt += ret; 850 851 return total_iovcnt; 852 } 853 854 if (num_sgls > 1) { 855 /* map whole sgl exclude last_sgl */ 856 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 857 max_iovcnt - total_iovcnt, gpa_to_vva); 858 if (spdk_unlikely(ret < 0)) { 859 return ret; 860 } 861 total_iovcnt += ret; 862 } 863 864 /* move to next level's segments */ 865 sgl = last_sgl; 866 } 867 868 return 0; 869 } 870 871 static int 872 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 873 uint32_t len, size_t mps, 874 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 875 { 876 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 877 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 878 } 879 880 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 881 } 882 883 static char * 884 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 885 { 886 return endpoint->trid.traddr; 887 } 888 889 static char * 890 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 891 { 892 if (!ctrlr || !ctrlr->endpoint) { 893 return "Null Ctrlr"; 894 } 895 896 return endpoint_id(ctrlr->endpoint); 897 } 898 899 /* 900 * For each queue, update the location of its doorbell to the correct location: 901 
* either our own BAR0, or the guest's configured shadow doorbell area. 902 * 903 * The Admin queue (qid: 0) does not ever use shadow doorbells. 904 */ 905 static void 906 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 907 { 908 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 909 ctrlr->bar0_doorbells; 910 911 assert(doorbells != NULL); 912 913 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 914 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 915 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 916 917 if (sq != NULL) { 918 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 919 } 920 921 if (cq != NULL) { 922 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 923 } 924 } 925 } 926 927 static void 928 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 929 { 930 assert(vfu_ctx != NULL); 931 assert(sdbl != NULL); 932 933 /* 934 * An allocation error would result in only one of the two being 935 * non-NULL. If that is the case, no memory should have been mapped. 936 */ 937 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 938 return; 939 } 940 941 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 942 struct iovec *iov; 943 dma_sg_t *sg; 944 945 if (!sdbl->iovs[i].iov_len) { 946 continue; 947 } 948 949 sg = index_to_sg_t(sdbl->sgs, i); 950 iov = sdbl->iovs + i; 951 952 vfu_sgl_put(vfu_ctx, sg, iov, 1); 953 } 954 } 955 956 static void 957 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 958 { 959 if (sdbl == NULL) { 960 return; 961 } 962 963 unmap_sdbl(vfu_ctx, sdbl); 964 965 /* 966 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 967 * not allocated, so don't free() them. 968 */ 969 free(sdbl->sgs); 970 free(sdbl->iovs); 971 free(sdbl); 972 } 973 974 static struct nvmf_vfio_user_shadow_doorbells * 975 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 976 { 977 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 978 dma_sg_t *sg2 = NULL; 979 void *p; 980 981 assert(vfu_ctx != NULL); 982 983 sdbl = calloc(1, sizeof(*sdbl)); 984 if (sdbl == NULL) { 985 goto err; 986 } 987 988 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 989 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 990 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 991 goto err; 992 } 993 994 /* Map shadow doorbell buffer (PRP1). */ 995 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, 996 PROT_READ | PROT_WRITE); 997 998 if (p == NULL) { 999 goto err; 1000 } 1001 1002 /* 1003 * Map eventidx buffer (PRP2). 1004 * Should only be written to by the controller. 1005 */ 1006 1007 sg2 = index_to_sg_t(sdbl->sgs, 1); 1008 1009 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, 1010 PROT_READ | PROT_WRITE); 1011 1012 if (p == NULL) { 1013 goto err; 1014 } 1015 1016 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1017 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1018 1019 return sdbl; 1020 1021 err: 1022 free_sdbl(vfu_ctx, sdbl); 1023 return NULL; 1024 } 1025 1026 /* 1027 * Copy doorbells from one buffer to the other, during switches betweeen BAR0 1028 * doorbells and shadow doorbells. 
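 *
 * (Aside added for illustration: both buffers mapped by map_sdbl() above are
 * arrays of 32-bit slots indexed exactly like BAR0:
 *
 *	shadow tail of SQ n  -> sdbl->shadow_doorbells[queue_index(n, false)]
 *	shadow head of CQ n  -> sdbl->shadow_doorbells[queue_index(n, true)]
 *	eventidx of either   -> sdbl->eventidxs[queue_index(n, is_cq)]
 *
 * which is why the slot-for-slot copy below is sufficient.)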
1029 */ 1030 static void 1031 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1032 const volatile uint32_t *from, volatile uint32_t *to) 1033 { 1034 assert(ctrlr != NULL); 1035 assert(from != NULL); 1036 assert(to != NULL); 1037 1038 SPDK_DEBUGLOG(vfio_user_db, 1039 "%s: migrating shadow doorbells from %p to %p\n", 1040 ctrlr_id(ctrlr), from, to); 1041 1042 /* Can't use memcpy because it doesn't respect volatile semantics. */ 1043 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1044 if (ctrlr->sqs[i] != NULL) { 1045 to[queue_index(i, false)] = from[queue_index(i, false)]; 1046 } 1047 1048 if (ctrlr->cqs[i] != NULL) { 1049 to[queue_index(i, true)] = from[queue_index(i, true)]; 1050 } 1051 } 1052 } 1053 1054 static void 1055 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1056 { 1057 const struct spdk_nvmf_registers *regs; 1058 1059 assert(vu_ctrlr != NULL); 1060 assert(vu_ctrlr->ctrlr != NULL); 1061 1062 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1063 if (regs->csts.bits.cfs == 0) { 1064 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1065 } 1066 1067 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1068 } 1069 1070 static inline bool 1071 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1072 { 1073 assert(vu_ctrlr != NULL); 1074 assert(vu_ctrlr->endpoint != NULL); 1075 1076 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1077 1078 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1079 } 1080 1081 static void 1082 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1083 { 1084 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1085 1086 spdk_interrupt_unregister(&endpoint->accept_intr); 1087 spdk_poller_unregister(&endpoint->accept_poller); 1088 1089 if (endpoint->bar0_doorbells) { 1090 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1091 } 1092 1093 if (endpoint->devmem_fd > 0) { 1094 close(endpoint->devmem_fd); 1095 } 1096 1097 if (endpoint->migr_data) { 1098 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1099 } 1100 1101 if (endpoint->migr_fd > 0) { 1102 close(endpoint->migr_fd); 1103 } 1104 1105 if (endpoint->vfu_ctx) { 1106 vfu_destroy_ctx(endpoint->vfu_ctx); 1107 } 1108 1109 pthread_mutex_destroy(&endpoint->lock); 1110 free(endpoint); 1111 } 1112 1113 /* called when process exits */ 1114 static int 1115 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1116 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1117 { 1118 struct nvmf_vfio_user_transport *vu_transport; 1119 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1120 1121 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1122 1123 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1124 transport); 1125 1126 pthread_mutex_destroy(&vu_transport->lock); 1127 pthread_mutex_destroy(&vu_transport->pg_lock); 1128 1129 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1130 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1131 nvmf_vfio_user_destroy_endpoint(endpoint); 1132 } 1133 1134 free(vu_transport); 1135 1136 if (cb_fn) { 1137 cb_fn(cb_arg); 1138 } 1139 1140 return 0; 1141 } 1142 1143 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1144 { 1145 "disable_mappable_bar0", 1146 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1147 spdk_json_decode_bool, true 1148 }, 1149 { 1150 "disable_adaptive_irq", 1151 offsetof(struct 
nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1152 spdk_json_decode_bool, true 1153 }, 1154 { 1155 "disable_shadow_doorbells", 1156 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1157 spdk_json_decode_bool, true 1158 }, 1159 { 1160 "disable_compare", 1161 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1162 spdk_json_decode_bool, true 1163 }, 1164 }; 1165 1166 static struct spdk_nvmf_transport * 1167 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1168 { 1169 struct nvmf_vfio_user_transport *vu_transport; 1170 int err; 1171 1172 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1173 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1174 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1175 return NULL; 1176 } 1177 1178 vu_transport = calloc(1, sizeof(*vu_transport)); 1179 if (vu_transport == NULL) { 1180 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1181 return NULL; 1182 } 1183 1184 err = pthread_mutex_init(&vu_transport->lock, NULL); 1185 if (err != 0) { 1186 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1187 goto err; 1188 } 1189 TAILQ_INIT(&vu_transport->endpoints); 1190 1191 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1192 if (err != 0) { 1193 pthread_mutex_destroy(&vu_transport->lock); 1194 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1195 goto err; 1196 } 1197 TAILQ_INIT(&vu_transport->poll_groups); 1198 1199 if (opts->transport_specific != NULL && 1200 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1201 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1202 vu_transport)) { 1203 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1204 goto cleanup; 1205 } 1206 1207 /* 1208 * To support interrupt mode, the transport must be configured with 1209 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1210 * when a client writes new doorbell values to BAR0, via the 1211 * libvfio-user socket fd. 1212 */ 1213 vu_transport->intr_mode_supported = 1214 vu_transport->transport_opts.disable_mappable_bar0; 1215 1216 /* 1217 * If BAR0 is mappable, it doesn't make sense to support shadow 1218 * doorbells, so explicitly turn it off. 1219 */ 1220 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1221 vu_transport->transport_opts.disable_shadow_doorbells = true; 1222 } 1223 1224 /* 1225 * If we are in interrupt mode, we cannot support adaptive IRQs, as 1226 * there is no guarantee the SQ poller will run subsequently to send 1227 * pending IRQs. 
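 *
 * (Summary sketch of how the options combine, added for illustration; this
 * mirrors the assignments in this function rather than defining new
 * behaviour:
 *
 *	intr_mode_supported       = disable_mappable_bar0;
 *	disable_shadow_doorbells |= !disable_mappable_bar0;
 *	disable_adaptive_irq     |= spdk_interrupt_mode_is_enabled();
 * )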
1228 */ 1229 if (spdk_interrupt_mode_is_enabled()) { 1230 vu_transport->transport_opts.disable_adaptive_irq = true; 1231 } 1232 1233 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1234 vu_transport->transport_opts.disable_mappable_bar0); 1235 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1236 vu_transport->transport_opts.disable_adaptive_irq); 1237 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1238 vu_transport->transport_opts.disable_shadow_doorbells); 1239 1240 return &vu_transport->transport; 1241 1242 cleanup: 1243 pthread_mutex_destroy(&vu_transport->lock); 1244 pthread_mutex_destroy(&vu_transport->pg_lock); 1245 err: 1246 free(vu_transport); 1247 return NULL; 1248 } 1249 1250 static uint32_t 1251 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1252 { 1253 assert(vu_ctrlr != NULL); 1254 assert(vu_ctrlr->ctrlr != NULL); 1255 1256 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1257 } 1258 1259 static uint32_t 1260 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1261 { 1262 assert(vu_ctrlr != NULL); 1263 assert(vu_ctrlr->ctrlr != NULL); 1264 1265 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1266 } 1267 1268 static uintptr_t 1269 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1270 { 1271 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1272 return 1ul << memory_page_shift; 1273 } 1274 1275 static uintptr_t 1276 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1277 { 1278 return ~(memory_page_size(ctrlr) - 1); 1279 } 1280 1281 static int 1282 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1283 uint32_t q_size, bool is_cq, bool unmap) 1284 { 1285 uint64_t len; 1286 void *ret; 1287 1288 assert(q_size); 1289 assert(q_addr(mapping) == NULL); 1290 1291 if (is_cq) { 1292 len = q_size * sizeof(struct spdk_nvme_cpl); 1293 } else { 1294 len = q_size * sizeof(struct spdk_nvme_cmd); 1295 } 1296 1297 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1298 mapping->sg, &mapping->iov, 1299 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1300 if (ret == NULL) { 1301 return -EFAULT; 1302 } 1303 1304 if (unmap) { 1305 memset(q_addr(mapping), 0, len); 1306 } 1307 1308 return 0; 1309 } 1310 1311 static inline void 1312 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1313 { 1314 if (q_addr(mapping) != NULL) { 1315 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1316 &mapping->iov, 1); 1317 mapping->iov.iov_base = NULL; 1318 } 1319 } 1320 1321 static int 1322 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1323 { 1324 struct nvmf_vfio_user_sq *sq; 1325 const struct spdk_nvmf_registers *regs; 1326 int ret; 1327 1328 assert(ctrlr != NULL); 1329 1330 sq = ctrlr->sqs[0]; 1331 1332 assert(sq != NULL); 1333 assert(q_addr(&sq->mapping) == NULL); 1334 /* XXX ctrlr->asq == 0 is a valid memory address */ 1335 1336 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1337 sq->qid = 0; 1338 sq->size = regs->aqa.bits.asqs + 1; 1339 sq->mapping.prp1 = regs->asq; 1340 *sq_headp(sq) = 0; 1341 sq->cqid = 0; 1342 1343 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1344 if (ret) { 1345 return ret; 1346 } 1347 1348 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1349 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1350 1351 *sq_dbl_tailp(sq) = 0; 1352 1353 return 0; 1354 } 1355 1356 /* 1357 * Updates eventidx to set an SQ into interrupt or polling mode. 
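 *
 * (Background note added for illustration, following the usual NVMe
 * shadow-doorbell convention: the host writes the real BAR0 doorbell only
 * when a shadow update passes the published eventidx, roughly
 *
 *	need_bar0 = (uint16_t)(new - eventidx - 1) < (uint16_t)(new - old);
 *
 * so publishing the current tail as the eventidx forces the very next
 * submission to hit BAR0, while NVMF_VFIO_USER_EVENTIDX_POLL leaves the SQ
 * in polling mode.)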
1358 * 1359 * Returns false if the current SQ tail does not match the SQ head, as 1360 * this means that the host has submitted more items to the queue while we were 1361 * not looking - or during the event index update. In that case, we must retry, 1362 * or otherwise make sure we are going to wake up again. 1363 */ 1364 static bool 1365 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1366 { 1367 struct nvmf_vfio_user_ctrlr *ctrlr; 1368 volatile uint32_t *sq_tail_eidx; 1369 uint32_t old_tail, new_tail; 1370 1371 assert(sq != NULL); 1372 assert(sq->ctrlr != NULL); 1373 assert(sq->ctrlr->sdbl != NULL); 1374 assert(sq->need_rearm); 1375 1376 ctrlr = sq->ctrlr; 1377 1378 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1379 ctrlr_id(ctrlr), sq->qid); 1380 1381 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1382 1383 assert(ctrlr->endpoint != NULL); 1384 1385 if (!ctrlr->endpoint->interrupt_mode) { 1386 /* No synchronisation necessary. */ 1387 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1388 return true; 1389 } 1390 1391 old_tail = *sq_dbl_tailp(sq); 1392 *sq_tail_eidx = old_tail; 1393 1394 /* 1395 * Ensure that the event index is updated before re-reading the tail 1396 * doorbell. If it's not, then the host might race us and update the 1397 * tail after the second read but before the event index is written, so 1398 * it won't write to BAR0 and we'll miss the update. 1399 * 1400 * The driver should provide similar ordering with an mb(). 1401 */ 1402 spdk_mb(); 1403 1404 /* 1405 * Check if the host has updated the tail doorbell after we've read it 1406 * for the first time, but before the event index was written. If that's 1407 * the case, then we've lost the race and we need to update the event 1408 * index again (after polling the queue, since the host won't write to 1409 * BAR0). 1410 */ 1411 new_tail = *sq_dbl_tailp(sq); 1412 1413 /* 1414 * We might poll the queue straight after this function returns if the 1415 * tail has been updated, so we need to ensure that any changes to the 1416 * queue will be visible to us if the doorbell has been updated. 1417 * 1418 * The driver should provide similar ordering with a wmb() to ensure 1419 * that the queue is written before it updates the tail doorbell. 1420 */ 1421 spdk_rmb(); 1422 1423 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1424 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1425 new_tail, *sq_headp(sq)); 1426 1427 if (new_tail == *sq_headp(sq)) { 1428 sq->need_rearm = false; 1429 return true; 1430 } 1431 1432 /* 1433 * We've lost the race: the tail was updated since we last polled, 1434 * including if it happened within this routine. 1435 * 1436 * The caller should retry after polling (think of this as a cmpxchg 1437 * loop); if we go to sleep while the SQ is not empty, then we won't 1438 * process the remaining events. 1439 */ 1440 return false; 1441 } 1442 1443 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1444 1445 /* 1446 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1447 * processed some SQ entries. 1448 */ 1449 static int 1450 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1451 struct nvmf_vfio_user_sq *sq) 1452 { 1453 int count = 0; 1454 size_t i; 1455 1456 assert(sq->need_rearm); 1457 1458 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1459 int ret; 1460 1461 if (set_sq_eventidx(sq)) { 1462 /* We won the race and set eventidx; done. 
*/ 1463 return count; 1464 } 1465 1466 ret = nvmf_vfio_user_sq_poll(sq); 1467 1468 count += (ret < 0) ? 1 : ret; 1469 1470 /* 1471 * set_sq_eventidx() hit the race, so we expected 1472 * to process at least one command from this queue. 1473 * If there were no new commands waiting for us, then 1474 * we must have hit an unexpected race condition. 1475 */ 1476 if (ret == 0) { 1477 SPDK_ERRLOG("%s: unexpected race condition detected " 1478 "while updating the shadow doorbell buffer\n", 1479 ctrlr_id(ctrlr)); 1480 1481 fail_ctrlr(ctrlr); 1482 return count; 1483 } 1484 } 1485 1486 SPDK_DEBUGLOG(vfio_user_db, 1487 "%s: set_sq_eventidx() lost the race %zu times\n", 1488 ctrlr_id(ctrlr), i); 1489 1490 /* 1491 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1492 * we raced with the producer too many times; force ourselves to wake up 1493 * instead. We'll process all queues at that point. 1494 */ 1495 ctrlr_kick(ctrlr); 1496 1497 return count; 1498 } 1499 1500 /* 1501 * We're in interrupt mode, and potentially about to go to sleep. We need to 1502 * make sure any further I/O submissions are guaranteed to wake us up: for 1503 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1504 * every SQ that needs re-arming. 1505 * 1506 * Returns non-zero if we processed something. 1507 */ 1508 static int 1509 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1510 { 1511 struct nvmf_vfio_user_sq *sq; 1512 int count = 0; 1513 1514 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1515 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1516 continue; 1517 } 1518 1519 if (sq->need_rearm) { 1520 count += vfio_user_sq_rearm(sq->ctrlr, sq); 1521 } 1522 } 1523 1524 return count; 1525 } 1526 1527 static int 1528 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1529 { 1530 struct nvmf_vfio_user_cq *cq; 1531 const struct spdk_nvmf_registers *regs; 1532 int ret; 1533 1534 assert(ctrlr != NULL); 1535 1536 cq = ctrlr->cqs[0]; 1537 1538 assert(cq != NULL); 1539 1540 assert(q_addr(&cq->mapping) == NULL); 1541 1542 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1543 assert(regs != NULL); 1544 cq->qid = 0; 1545 cq->size = regs->aqa.bits.acqs + 1; 1546 cq->mapping.prp1 = regs->acq; 1547 *cq_tailp(cq) = 0; 1548 cq->ien = true; 1549 cq->phase = true; 1550 1551 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1552 if (ret) { 1553 return ret; 1554 } 1555 1556 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
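 *
 * (Recap added for illustration: the admin queues are described purely by
 * controller registers rather than by Create I/O Queue commands:
 *
 *	ASQ: size = AQA.ASQS + 1, base = the ASQ register  (asq_setup() above)
 *	ACQ: size = AQA.ACQS + 1, base = the ACQ register  (this function)
 *
 * and their doorbells are always BAR0 slots 0 and 1 respectively.)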
*/ 1557 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1558 1559 *cq_dbl_headp(cq) = 0; 1560 1561 return 0; 1562 } 1563 1564 static void * 1565 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 1566 { 1567 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1568 struct spdk_nvmf_qpair *qpair; 1569 struct nvmf_vfio_user_req *vu_req; 1570 struct nvmf_vfio_user_sq *sq; 1571 void *ret; 1572 1573 assert(req != NULL); 1574 qpair = req->qpair; 1575 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1576 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1577 1578 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1579 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1580 index_to_sg_t(vu_req->sg, vu_req->iovcnt), 1581 &vu_req->iov[vu_req->iovcnt], prot); 1582 if (spdk_likely(ret != NULL)) { 1583 vu_req->iovcnt++; 1584 } 1585 return ret; 1586 } 1587 1588 static int 1589 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1590 struct iovec *iov, uint32_t length) 1591 { 1592 /* Map PRP list to from Guest physical memory to 1593 * virtual memory address. 1594 */ 1595 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1596 length, 4096, _map_one); 1597 } 1598 1599 static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1600 struct nvmf_vfio_user_sq *sq); 1601 1602 /* 1603 * Posts a CQE in the completion queue. 1604 * 1605 * @ctrlr: the vfio-user controller 1606 * @cq: the completion queue 1607 * @cdw0: cdw0 as reported by NVMf 1608 * @sqid: submission queue ID 1609 * @cid: command identifier in NVMe command 1610 * @sc: the NVMe CQE status code 1611 * @sct: the NVMe CQE status code type 1612 */ 1613 static int 1614 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1615 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1616 { 1617 struct spdk_nvme_status cpl_status = { 0 }; 1618 struct spdk_nvme_cpl *cpl; 1619 int err; 1620 1621 assert(ctrlr != NULL); 1622 1623 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1624 return 0; 1625 } 1626 1627 if (cq->qid == 0) { 1628 assert(spdk_get_thread() == cq->thread); 1629 } 1630 1631 if (cq_is_full(cq)) { 1632 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1633 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1634 *cq_dbl_headp(cq)); 1635 return -1; 1636 } 1637 1638 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1639 1640 assert(ctrlr->sqs[sqid] != NULL); 1641 SPDK_DEBUGLOG(nvmf_vfio, 1642 "%s: request complete sqid:%d cid=%d status=%#x " 1643 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1644 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1645 1646 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1647 cpl->sqid = sqid; 1648 cpl->cid = cid; 1649 cpl->cdw0 = cdw0; 1650 1651 /* 1652 * This is a bitfield: instead of setting the individual bits we need 1653 * directly in cpl->status, which would cause a read-modify-write cycle, 1654 * we'll avoid reading from the CPL altogether by filling in a local 1655 * cpl_status variable, then writing the whole thing. 1656 */ 1657 cpl_status.sct = sct; 1658 cpl_status.sc = sc; 1659 cpl_status.p = cq->phase; 1660 cpl->status = cpl_status; 1661 1662 /* Ensure the Completion Queue Entry is visible. 
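 *
 * (Ordering note added for illustration: the guest discovers new entries by
 * polling the phase bit, so the whole CQE, including the status written just
 * above, must be globally visible before any interrupt below can prompt it
 * to look:
 *
 *	fill CQE -> spdk_wmb() -> advance local tail -> vfu_irq_trigger()
 * )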
*/ 1663 spdk_wmb(); 1664 cq_tail_advance(cq); 1665 1666 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1667 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1668 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1669 if (err != 0) { 1670 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1671 ctrlr_id(ctrlr)); 1672 return err; 1673 } 1674 } 1675 1676 return 0; 1677 } 1678 1679 static void 1680 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1681 { 1682 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1683 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1684 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1685 free(vu_req); 1686 } 1687 } 1688 1689 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1690 * and the controller is being shut down or reset, then the CQ is 1691 * also deleted. 1692 */ 1693 static void 1694 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1695 { 1696 struct nvmf_vfio_user_cq *cq; 1697 uint16_t cqid; 1698 1699 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1700 sq->qid, sq); 1701 1702 /* Free SQ resources */ 1703 unmap_q(vu_ctrlr, &sq->mapping); 1704 1705 free_sq_reqs(sq); 1706 1707 sq->size = 0; 1708 1709 sq->sq_state = VFIO_USER_SQ_DELETED; 1710 1711 /* Controller RESET and SHUTDOWN are special cases, 1712 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1713 * will disconnect IO queue pairs. 1714 */ 1715 if (vu_ctrlr->reset_shn) { 1716 cqid = sq->cqid; 1717 cq = vu_ctrlr->cqs[cqid]; 1718 1719 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1720 cq->qid, cq); 1721 1722 if (cq->cq_ref) { 1723 cq->cq_ref--; 1724 } 1725 if (cq->cq_ref == 0) { 1726 unmap_q(vu_ctrlr, &cq->mapping); 1727 cq->size = 0; 1728 cq->cq_state = VFIO_USER_CQ_DELETED; 1729 cq->group = NULL; 1730 } 1731 } 1732 } 1733 1734 static void 1735 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1736 { 1737 struct nvmf_vfio_user_sq *sq; 1738 struct nvmf_vfio_user_cq *cq; 1739 1740 if (ctrlr == NULL) { 1741 return; 1742 } 1743 1744 sq = ctrlr->sqs[qid]; 1745 if (sq) { 1746 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid); 1747 unmap_q(ctrlr, &sq->mapping); 1748 1749 free_sq_reqs(sq); 1750 1751 free(sq->mapping.sg); 1752 free(sq); 1753 ctrlr->sqs[qid] = NULL; 1754 } 1755 1756 cq = ctrlr->cqs[qid]; 1757 if (cq) { 1758 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1759 unmap_q(ctrlr, &cq->mapping); 1760 free(cq->mapping.sg); 1761 free(cq); 1762 ctrlr->cqs[qid] = NULL; 1763 } 1764 } 1765 1766 static int 1767 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1768 const uint16_t id) 1769 { 1770 struct nvmf_vfio_user_sq *sq; 1771 1772 assert(ctrlr != NULL); 1773 assert(transport != NULL); 1774 assert(ctrlr->sqs[id] == NULL); 1775 1776 sq = calloc(1, sizeof(*sq)); 1777 if (sq == NULL) { 1778 return -ENOMEM; 1779 } 1780 sq->mapping.sg = calloc(1, dma_sg_size()); 1781 if (sq->mapping.sg == NULL) { 1782 free(sq); 1783 return -ENOMEM; 1784 } 1785 1786 sq->qid = id; 1787 sq->qpair.qid = id; 1788 sq->qpair.transport = transport; 1789 sq->ctrlr = ctrlr; 1790 ctrlr->sqs[id] = sq; 1791 1792 TAILQ_INIT(&sq->free_reqs); 1793 1794 return 0; 1795 } 1796 1797 static int 1798 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1799 { 1800 struct nvmf_vfio_user_cq *cq; 1801 1802 assert(vu_ctrlr != NULL); 1803 assert(vu_ctrlr->cqs[id] == NULL); 1804 1805 cq = calloc(1, sizeof(*cq)); 1806 if (cq == NULL) { 
1807 return -ENOMEM; 1808 } 1809 cq->mapping.sg = calloc(1, dma_sg_size()); 1810 if (cq->mapping.sg == NULL) { 1811 free(cq); 1812 return -ENOMEM; 1813 } 1814 1815 cq->qid = id; 1816 vu_ctrlr->cqs[id] = cq; 1817 1818 return 0; 1819 } 1820 1821 static int 1822 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1823 { 1824 struct nvmf_vfio_user_req *vu_req, *tmp; 1825 size_t req_size; 1826 uint32_t i; 1827 1828 req_size = sizeof(struct nvmf_vfio_user_req) + 1829 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1830 1831 for (i = 0; i < sq->size; i++) { 1832 struct spdk_nvmf_request *req; 1833 1834 vu_req = calloc(1, req_size); 1835 if (vu_req == NULL) { 1836 goto err; 1837 } 1838 1839 req = &vu_req->req; 1840 req->qpair = &sq->qpair; 1841 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1842 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1843 req->stripped_data = NULL; 1844 1845 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1846 } 1847 1848 return 0; 1849 1850 err: 1851 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1852 free(vu_req); 1853 } 1854 return -ENOMEM; 1855 } 1856 1857 static volatile uint32_t * 1858 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 1859 { 1860 return ctrlr->sdbl != NULL ? 1861 ctrlr->sdbl->shadow_doorbells : 1862 ctrlr->bar0_doorbells; 1863 } 1864 1865 static uint16_t 1866 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 1867 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1868 { 1869 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1870 struct nvmf_vfio_user_sq *sq; 1871 uint32_t qsize; 1872 uint16_t cqid; 1873 uint16_t qid; 1874 int err; 1875 1876 qid = cmd->cdw10_bits.create_io_q.qid; 1877 cqid = cmd->cdw11_bits.create_io_sq.cqid; 1878 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1879 1880 if (ctrlr->sqs[qid] == NULL) { 1881 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 1882 if (err != 0) { 1883 *sct = SPDK_NVME_SCT_GENERIC; 1884 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1885 } 1886 } 1887 1888 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1889 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 1890 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1891 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1892 } 1893 1894 /* CQ must be created before SQ. 
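 *
 * (Lifecycle sketch added for illustration: once created below, the SQ pins
 * its CQ via cq_ref, which is also what enforces the deletion order:
 *
 *	create CQ -> create SQ (cq_ref++) -> delete SQ (cq_ref--)
 *	          -> delete CQ only once cq_ref == 0
 * )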
*/ 1895 if (!io_q_exists(ctrlr, cqid, true)) { 1896 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 1897 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1898 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 1899 } 1900 1901 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 1902 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 1903 *sct = SPDK_NVME_SCT_GENERIC; 1904 return SPDK_NVME_SC_INVALID_FIELD; 1905 } 1906 1907 sq = ctrlr->sqs[qid]; 1908 sq->size = qsize; 1909 1910 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 1911 qid, cqid); 1912 1913 sq->mapping.prp1 = cmd->dptr.prp.prp1; 1914 1915 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1916 if (err) { 1917 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1918 *sct = SPDK_NVME_SCT_GENERIC; 1919 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1920 } 1921 1922 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 1923 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1924 q_addr(&sq->mapping)); 1925 1926 err = alloc_sq_reqs(ctrlr, sq); 1927 if (err < 0) { 1928 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 1929 *sct = SPDK_NVME_SCT_GENERIC; 1930 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1931 } 1932 1933 sq->cqid = cqid; 1934 ctrlr->cqs[sq->cqid]->cq_ref++; 1935 sq->sq_state = VFIO_USER_SQ_CREATED; 1936 *sq_headp(sq) = 0; 1937 1938 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 1939 1940 /* 1941 * We should always reset the doorbells. 1942 * 1943 * The Specification prohibits the controller from writing to the shadow 1944 * doorbell buffer, however older versions of the Linux NVMe driver 1945 * don't reset the shadow doorbell buffer after a Queue-Level or 1946 * Controller-Level reset, which means that we're left with garbage 1947 * doorbell values. 1948 */ 1949 *sq_dbl_tailp(sq) = 0; 1950 1951 if (ctrlr->sdbl != NULL) { 1952 sq->need_rearm = true; 1953 1954 if (!set_sq_eventidx(sq)) { 1955 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 1956 "sqid:%hu was initialized\n", 1957 ctrlr_id(ctrlr), qid); 1958 fail_ctrlr(ctrlr); 1959 *sct = SPDK_NVME_SCT_GENERIC; 1960 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1961 } 1962 } 1963 1964 /* 1965 * Create our new I/O qpair. This asynchronously invokes, on a suitable 1966 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 1967 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 1968 * connect command. This command is then eventually completed via 1969 * handle_queue_connect_rsp(). 
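 *
 * (Flow sketch added for illustration, using only the callbacks named above:
 *
 *	handle_create_io_sq()
 *	  -> spdk_nvmf_tgt_new_qpair()                   // below
 *	    -> nvmf_vfio_user_poll_group_add()           // async, on a poll group
 *	      -> spdk_nvmf_request_exec_fabrics()        // generated CONNECT
 *	        -> handle_queue_connect_rsp()            // posts the CQE
 * )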
1970 */ 1971 sq->create_io_sq_cmd = *cmd; 1972 sq->post_create_io_sq_completion = true; 1973 1974 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 1975 &sq->qpair); 1976 1977 *sct = SPDK_NVME_SCT_GENERIC; 1978 return SPDK_NVME_SC_SUCCESS; 1979 } 1980 1981 static uint16_t 1982 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 1983 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1984 { 1985 struct nvmf_vfio_user_cq *cq; 1986 uint32_t qsize; 1987 uint16_t qid; 1988 int err; 1989 1990 qid = cmd->cdw10_bits.create_io_q.qid; 1991 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1992 1993 if (ctrlr->cqs[qid] == NULL) { 1994 err = init_cq(ctrlr, qid); 1995 if (err != 0) { 1996 *sct = SPDK_NVME_SCT_GENERIC; 1997 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1998 } 1999 } 2000 2001 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2002 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2003 *sct = SPDK_NVME_SCT_GENERIC; 2004 return SPDK_NVME_SC_INVALID_FIELD; 2005 } 2006 2007 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 2008 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2009 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2010 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2011 } 2012 2013 cq = ctrlr->cqs[qid]; 2014 cq->size = qsize; 2015 2016 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2017 2018 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2019 2020 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2021 if (err) { 2022 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2023 *sct = SPDK_NVME_SCT_GENERIC; 2024 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2025 } 2026 2027 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2028 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2029 q_addr(&cq->mapping)); 2030 2031 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2032 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2033 cq->phase = true; 2034 cq->cq_state = VFIO_USER_CQ_CREATED; 2035 2036 *cq_tailp(cq) = 0; 2037 2038 /* 2039 * We should always reset the doorbells. 2040 * 2041 * The Specification prohibits the controller from writing to the shadow 2042 * doorbell buffer, however older versions of the Linux NVMe driver 2043 * don't reset the shadow doorbell buffer after a Queue-Level or 2044 * Controller-Level reset, which means that we're left with garbage 2045 * doorbell values. 2046 */ 2047 *cq_dbl_headp(cq) = 0; 2048 2049 *sct = SPDK_NVME_SCT_GENERIC; 2050 return SPDK_NVME_SC_SUCCESS; 2051 } 2052 2053 /* 2054 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2055 * on error. 2056 */ 2057 static int 2058 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2059 struct spdk_nvme_cmd *cmd, const bool is_cq) 2060 { 2061 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2062 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2063 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2064 uint32_t qsize; 2065 uint16_t qid; 2066 2067 assert(ctrlr != NULL); 2068 assert(cmd != NULL); 2069 2070 qid = cmd->cdw10_bits.create_io_q.qid; 2071 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2072 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2073 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2074 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2075 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2076 goto out; 2077 } 2078 2079 if (io_q_exists(ctrlr, qid, is_cq)) { 2080 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2081 is_cq ? 
'c' : 's', qid); 2082 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2083 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2084 goto out; 2085 } 2086 2087 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2088 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2089 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2090 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2091 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2092 goto out; 2093 } 2094 2095 if (is_cq) { 2096 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2097 } else { 2098 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2099 2100 if (sct == SPDK_NVME_SCT_GENERIC && 2101 sc == SPDK_NVME_SC_SUCCESS) { 2102 /* Completion posted asynchronously. */ 2103 return 0; 2104 } 2105 } 2106 2107 out: 2108 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2109 } 2110 2111 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2112 * queue pair, so save the command in a context. 2113 */ 2114 struct vfio_user_delete_sq_ctx { 2115 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2116 struct spdk_nvme_cmd delete_io_sq_cmd; 2117 }; 2118 2119 static void 2120 vfio_user_qpair_delete_cb(void *cb_arg) 2121 { 2122 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2123 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2124 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2125 2126 if (admin_cq->thread != spdk_get_thread()) { 2127 assert(admin_cq->thread != NULL); 2128 spdk_thread_send_msg(admin_cq->thread, 2129 vfio_user_qpair_delete_cb, 2130 cb_arg); 2131 } else { 2132 post_completion(vu_ctrlr, vu_ctrlr->cqs[0], 0, 0, 2133 ctx->delete_io_sq_cmd.cid, 2134 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2135 free(ctx); 2136 } 2137 } 2138 2139 /* 2140 * Deletes a completion or submission I/O queue. 2141 */ 2142 static int 2143 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2144 struct spdk_nvme_cmd *cmd, const bool is_cq) 2145 { 2146 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2147 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2148 struct nvmf_vfio_user_sq *sq; 2149 struct nvmf_vfio_user_cq *cq; 2150 struct vfio_user_delete_sq_ctx *ctx; 2151 2152 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2153 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2154 cmd->cdw10_bits.delete_io_q.qid); 2155 2156 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2157 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2158 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2159 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2160 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2161 goto out; 2162 } 2163 2164 if (is_cq) { 2165 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2166 if (cq->cq_ref) { 2167 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2168 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2169 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2170 goto out; 2171 } 2172 2173 unmap_q(ctrlr, &cq->mapping); 2174 cq->size = 0; 2175 cq->cq_state = VFIO_USER_CQ_DELETED; 2176 cq->group = NULL; 2177 } else { 2178 ctx = calloc(1, sizeof(*ctx)); 2179 if (!ctx) { 2180 sct = SPDK_NVME_SCT_GENERIC; 2181 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2182 goto out; 2183 } 2184 ctx->vu_ctrlr = ctrlr; 2185 ctx->delete_io_sq_cmd = *cmd; 2186 2187 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2188 sq->sq_state = VFIO_USER_SQ_DELETED; 2189 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2190 ctrlr->cqs[sq->cqid]->cq_ref--; 2191 2192 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 2193 return 0; 2194 } 2195 2196 out: 2197 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2198 } 2199 2200 /* 2201 * Configures Shadow Doorbells. 2202 */ 2203 static int 2204 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2205 { 2206 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2207 uint32_t dstrd; 2208 uintptr_t page_size, page_mask; 2209 uint64_t prp1, prp2; 2210 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2211 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2212 2213 assert(ctrlr != NULL); 2214 assert(ctrlr->endpoint != NULL); 2215 assert(cmd != NULL); 2216 2217 dstrd = doorbell_stride(ctrlr); 2218 page_size = memory_page_size(ctrlr); 2219 page_mask = memory_page_mask(ctrlr); 2220 2221 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2222 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2223 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2224 ctrlr_id(ctrlr)); 2225 2226 goto out; 2227 } 2228 2229 /* Verify guest physical addresses passed as PRPs. */ 2230 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2231 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2232 ctrlr_id(ctrlr)); 2233 2234 goto out; 2235 } 2236 2237 prp1 = cmd->dptr.prp.prp1; 2238 prp2 = cmd->dptr.prp.prp2; 2239 2240 SPDK_DEBUGLOG(nvmf_vfio, 2241 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2242 ctrlr_id(ctrlr), prp1, prp2); 2243 2244 if (prp1 == prp2 2245 || prp1 != (prp1 & page_mask) 2246 || prp2 != (prp2 & page_mask)) { 2247 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2248 ctrlr_id(ctrlr)); 2249 2250 goto out; 2251 } 2252 2253 /* Map guest physical addresses to our virtual address space. 
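 * Per the Doorbell Buffer Config command, PRP1 is the shadow doorbell buffer (written by the host) and PRP2 is the EventIdx buffer (written by us); the check above requires the two to be distinct, page-aligned addresses.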
*/ 2254 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2255 if (sdbl == NULL) { 2256 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2257 ctrlr_id(ctrlr)); 2258 2259 goto out; 2260 } 2261 2262 ctrlr->shadow_doorbell_buffer = prp1; 2263 ctrlr->eventidx_buffer = prp2; 2264 2265 SPDK_DEBUGLOG(nvmf_vfio, 2266 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2267 ctrlr_id(ctrlr), 2268 sdbl->iovs[0].iov_base, 2269 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2270 sdbl->iovs[1].iov_base, 2271 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2272 2273 2274 /* 2275 * Set all possible CQ head doorbells to polling mode now, such that we 2276 * don't have to worry about it later if the host creates more queues. 2277 * 2278 * We only ever want interrupts for writes to the SQ tail doorbells 2279 * (which are initialised in set_ctrlr_intr_mode() below). 2280 */ 2281 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2282 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2283 if (ctrlr->sqs[i] != NULL) { 2284 ctrlr->sqs[i]->need_rearm = true; 2285 } 2286 } 2287 2288 /* Update controller. */ 2289 SWAP(ctrlr->sdbl, sdbl); 2290 2291 /* 2292 * Copy doorbells from either the previous shadow doorbell buffer or the 2293 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2294 * 2295 * This needs to account for older versions of the Linux NVMe driver, 2296 * which don't clear out the buffer after a controller reset. 2297 */ 2298 copy_doorbells(ctrlr, sdbl != NULL ? 2299 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2300 ctrlr->sdbl->shadow_doorbells); 2301 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2302 2303 /* Update event index buffer and poll queues if necessary. */ 2304 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 2305 2306 sc = SPDK_NVME_SC_SUCCESS; 2307 2308 out: 2309 /* 2310 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2311 * more than once (pointless, but not prohibited by the spec), or 2312 * in case of an error. 2313 * 2314 * If this is the first time Doorbell Buffer Config was processed, 2315 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2316 * free_sdbl() becomes a noop. 2317 */ 2318 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2319 2320 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2321 } 2322 2323 /* Returns 0 on success and -errno on error. */ 2324 static int 2325 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2326 { 2327 assert(ctrlr != NULL); 2328 assert(cmd != NULL); 2329 2330 if (cmd->fuse != 0) { 2331 /* Fused admin commands are not supported. 
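 * (A non-zero FUSE field marks the first or second command of a fused pair.)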
*/ 2332 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2333 SPDK_NVME_SC_INVALID_FIELD, 2334 SPDK_NVME_SCT_GENERIC); 2335 } 2336 2337 switch (cmd->opc) { 2338 case SPDK_NVME_OPC_CREATE_IO_CQ: 2339 case SPDK_NVME_OPC_CREATE_IO_SQ: 2340 return handle_create_io_q(ctrlr, cmd, 2341 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2342 case SPDK_NVME_OPC_DELETE_IO_SQ: 2343 case SPDK_NVME_OPC_DELETE_IO_CQ: 2344 return handle_del_io_q(ctrlr, cmd, 2345 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2346 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2347 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2348 return handle_doorbell_buffer_config(ctrlr, cmd); 2349 } 2350 /* FALLTHROUGH */ 2351 default: 2352 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2353 } 2354 } 2355 2356 static int 2357 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2358 { 2359 struct nvmf_vfio_user_sq *sq = cb_arg; 2360 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2361 uint16_t sqid, cqid; 2362 2363 assert(sq != NULL); 2364 assert(vu_req != NULL); 2365 assert(vu_ctrlr != NULL); 2366 2367 if (spdk_likely(vu_req->iovcnt)) { 2368 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2369 index_to_sg_t(vu_req->sg, 0), 2370 vu_req->iov, vu_req->iovcnt); 2371 } 2372 sqid = sq->qid; 2373 cqid = sq->cqid; 2374 2375 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2376 vu_req->req.rsp->nvme_cpl.cdw0, 2377 sqid, 2378 vu_req->req.cmd->nvme_cmd.cid, 2379 vu_req->req.rsp->nvme_cpl.status.sc, 2380 vu_req->req.rsp->nvme_cpl.status.sct); 2381 } 2382 2383 static int 2384 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2385 struct spdk_nvme_cmd *cmd) 2386 { 2387 assert(sq != NULL); 2388 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 2389 return consume_admin_cmd(ctrlr, cmd); 2390 } 2391 2392 return handle_cmd_req(ctrlr, cmd, sq); 2393 } 2394 2395 /* Returns the number of commands processed, or a negative value on error. */ 2396 static int 2397 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2398 struct nvmf_vfio_user_sq *sq) 2399 { 2400 struct spdk_nvme_cmd *queue; 2401 int count = 0; 2402 2403 assert(ctrlr != NULL); 2404 assert(sq != NULL); 2405 2406 if (ctrlr->sdbl != NULL) { 2407 /* 2408 * Submission queue index has moved past the event index, so it 2409 * needs to be re-armed before we go to sleep. 2410 */ 2411 sq->need_rearm = true; 2412 } 2413 2414 queue = q_addr(&sq->mapping); 2415 while (*sq_headp(sq) != new_tail) { 2416 int err; 2417 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 2418 2419 count++; 2420 2421 /* 2422 * SQHD must contain the new head pointer, so we must increase 2423 * it before we generate a completion. 2424 */ 2425 sq_head_advance(sq); 2426 2427 err = consume_cmd(ctrlr, sq, cmd); 2428 if (err != 0) { 2429 return err; 2430 } 2431 } 2432 2433 return count; 2434 } 2435 2436 static void 2437 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2438 { 2439 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2440 struct nvmf_vfio_user_ctrlr *ctrlr; 2441 struct nvmf_vfio_user_sq *sq; 2442 struct nvmf_vfio_user_cq *cq; 2443 void *map_start, *map_end; 2444 int ret; 2445 2446 /* 2447 * We're not interested in any DMA regions that aren't mappable (we don't 2448 * support clients that don't share their memory). 
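 * Such regions are passed to this callback with a NULL vaddr, so we simply ignore them.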
2449 */ 2450 if (!info->vaddr) { 2451 return; 2452 } 2453 2454 map_start = info->mapping.iov_base; 2455 map_end = info->mapping.iov_base + info->mapping.iov_len; 2456 2457 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2458 (info->mapping.iov_len & MASK_2MB)) { 2459 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2460 info->vaddr, map_start, map_end); 2461 return; 2462 } 2463 2464 assert(endpoint != NULL); 2465 if (endpoint->ctrlr == NULL) { 2466 return; 2467 } 2468 ctrlr = endpoint->ctrlr; 2469 2470 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2471 map_start, map_end); 2472 2473 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2474 * check the protection bits before registering. 2475 */ 2476 if (info->prot == (PROT_WRITE | PROT_READ)) { 2477 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2478 if (ret) { 2479 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2480 map_start, map_end, ret); 2481 } 2482 } 2483 2484 pthread_mutex_lock(&endpoint->lock); 2485 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2486 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2487 continue; 2488 } 2489 2490 cq = ctrlr->cqs[sq->cqid]; 2491 2492 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2493 if (cq->size && q_addr(&cq->mapping) == NULL) { 2494 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2495 if (ret) { 2496 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2497 cq->qid, cq->mapping.prp1, 2498 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2499 continue; 2500 } 2501 } 2502 2503 if (sq->size) { 2504 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2505 if (ret) { 2506 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2507 sq->qid, sq->mapping.prp1, 2508 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2509 continue; 2510 } 2511 } 2512 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2513 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2514 } 2515 pthread_mutex_unlock(&endpoint->lock); 2516 } 2517 2518 static void 2519 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2520 { 2521 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2522 struct nvmf_vfio_user_sq *sq; 2523 struct nvmf_vfio_user_cq *cq; 2524 void *map_start, *map_end; 2525 int ret = 0; 2526 2527 if (!info->vaddr) { 2528 return; 2529 } 2530 2531 map_start = info->mapping.iov_base; 2532 map_end = info->mapping.iov_base + info->mapping.iov_len; 2533 2534 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2535 (info->mapping.iov_len & MASK_2MB)) { 2536 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2537 info->vaddr, map_start, map_end); 2538 return; 2539 } 2540 2541 assert(endpoint != NULL); 2542 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2543 map_start, map_end); 2544 2545 if (endpoint->ctrlr != NULL) { 2546 struct nvmf_vfio_user_ctrlr *ctrlr; 2547 ctrlr = endpoint->ctrlr; 2548 2549 pthread_mutex_lock(&endpoint->lock); 2550 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2551 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2552 unmap_q(ctrlr, &sq->mapping); 2553 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2554 } 2555 2556 cq = ctrlr->cqs[sq->cqid]; 2557 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2558 unmap_q(ctrlr, 
&cq->mapping); 2559 } 2560 } 2561 2562 if (ctrlr->sdbl != NULL) { 2563 size_t i; 2564 2565 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2566 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2567 2568 if (iov_base >= map_start && iov_base < map_end) { 2569 copy_doorbells(ctrlr, 2570 ctrlr->sdbl->shadow_doorbells, 2571 ctrlr->bar0_doorbells); 2572 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2573 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2574 ctrlr->sdbl = NULL; 2575 break; 2576 } 2577 } 2578 } 2579 2580 pthread_mutex_unlock(&endpoint->lock); 2581 } 2582 2583 if (info->prot == (PROT_WRITE | PROT_READ)) { 2584 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2585 if (ret) { 2586 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2587 map_start, map_end, ret); 2588 } 2589 } 2590 } 2591 2592 /* Used to initiate a controller-level reset or a controller shutdown. */ 2593 static void 2594 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2595 { 2596 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2597 ctrlr_id(vu_ctrlr)); 2598 2599 /* Unmap Admin queue. */ 2600 2601 assert(vu_ctrlr->sqs[0] != NULL); 2602 assert(vu_ctrlr->cqs[0] != NULL); 2603 2604 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2605 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2606 2607 vu_ctrlr->sqs[0]->size = 0; 2608 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2609 2610 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2611 2612 vu_ctrlr->cqs[0]->size = 0; 2613 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2614 2615 /* 2616 * For PCIe controller reset or shutdown, we will drop all AER 2617 * responses. 2618 */ 2619 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2620 2621 /* Free the shadow doorbell buffer. */ 2622 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2623 vu_ctrlr->sdbl = NULL; 2624 } 2625 2626 /* Used to re-enable the controller after a controller-level reset. 
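 * This re-creates the admin queue mappings (asq_setup()/acq_setup() below) from the admin queue attributes the host has programmed.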
*/ 2627 static int 2628 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2629 { 2630 int err; 2631 2632 assert(vu_ctrlr != NULL); 2633 2634 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2635 ctrlr_id(vu_ctrlr)); 2636 2637 err = acq_setup(vu_ctrlr); 2638 if (err != 0) { 2639 return err; 2640 } 2641 2642 err = asq_setup(vu_ctrlr); 2643 if (err != 0) { 2644 return err; 2645 } 2646 2647 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2648 2649 return 0; 2650 } 2651 2652 static int 2653 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2654 { 2655 struct nvmf_vfio_user_sq *sq = cb_arg; 2656 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2657 int ret; 2658 2659 assert(sq != NULL); 2660 assert(req != NULL); 2661 2662 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2663 assert(sq->ctrlr != NULL); 2664 assert(req != NULL); 2665 2666 memcpy(req->req.data, 2667 &req->req.rsp->prop_get_rsp.value.u64, 2668 req->req.length); 2669 } else { 2670 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2671 assert(sq->ctrlr != NULL); 2672 vu_ctrlr = sq->ctrlr; 2673 2674 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 2675 union spdk_nvme_cc_register cc, diff; 2676 2677 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2678 diff.raw = cc.raw ^ req->cc.raw; 2679 2680 if (diff.bits.en) { 2681 if (cc.bits.en) { 2682 ret = enable_ctrlr(vu_ctrlr); 2683 if (ret) { 2684 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2685 return ret; 2686 } 2687 vu_ctrlr->reset_shn = false; 2688 } else { 2689 vu_ctrlr->reset_shn = true; 2690 } 2691 } 2692 2693 if (diff.bits.shn) { 2694 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2695 vu_ctrlr->reset_shn = true; 2696 } 2697 } 2698 2699 if (vu_ctrlr->reset_shn) { 2700 disable_ctrlr(vu_ctrlr); 2701 } 2702 } 2703 } 2704 2705 return 0; 2706 } 2707 2708 /* 2709 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2710 * doorbell is written via access_bar0_fn(). 2711 * 2712 * DSTRD is set to fixed value 0 for NVMf. 2713 * 2714 */ 2715 static int 2716 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2717 const size_t count, loff_t pos, const bool is_write) 2718 { 2719 assert(ctrlr != NULL); 2720 assert(buf != NULL); 2721 2722 if (!is_write) { 2723 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2724 ctrlr_id(ctrlr), pos); 2725 errno = EPERM; 2726 return -1; 2727 } 2728 2729 if (count != sizeof(uint32_t)) { 2730 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2731 ctrlr_id(ctrlr), count); 2732 errno = EINVAL; 2733 return -1; 2734 } 2735 2736 pos -= NVME_DOORBELLS_OFFSET; 2737 2738 /* pos must be dword aligned */ 2739 if ((pos & 0x3) != 0) { 2740 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2741 errno = EINVAL; 2742 return -1; 2743 } 2744 2745 /* convert byte offset to array index */ 2746 pos >>= 2; 2747 2748 if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) { 2749 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2750 errno = EINVAL; 2751 return -1; 2752 } 2753 2754 ctrlr->bar0_doorbells[pos] = *buf; 2755 spdk_wmb(); 2756 2757 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2758 ctrlr_id(ctrlr), (pos & 1) ? 
"cqid" : "sqid", 2759 pos / 2, *buf); 2760 2761 2762 return 0; 2763 } 2764 2765 static size_t 2766 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2767 char *buf, size_t count, loff_t pos, 2768 bool is_write) 2769 { 2770 struct nvmf_vfio_user_req *req; 2771 const struct spdk_nvmf_registers *regs; 2772 2773 if ((count != 4) && (count != 8)) { 2774 errno = EINVAL; 2775 return -1; 2776 } 2777 2778 /* Construct a Fabric Property Get/Set command and send it */ 2779 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2780 if (req == NULL) { 2781 errno = ENOBUFS; 2782 return -1; 2783 } 2784 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2785 req->cc.raw = regs->cc.raw; 2786 2787 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2788 req->cb_arg = vu_ctrlr->sqs[0]; 2789 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2790 req->req.cmd->prop_set_cmd.cid = 0; 2791 if (count == 4) { 2792 req->req.cmd->prop_set_cmd.attrib.size = 0; 2793 } else { 2794 req->req.cmd->prop_set_cmd.attrib.size = 1; 2795 } 2796 req->req.cmd->prop_set_cmd.ofst = pos; 2797 if (is_write) { 2798 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2799 if (req->req.cmd->prop_set_cmd.attrib.size) { 2800 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2801 } else { 2802 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2803 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2804 } 2805 } else { 2806 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2807 } 2808 req->req.length = count; 2809 req->req.data = buf; 2810 2811 spdk_nvmf_request_exec_fabrics(&req->req); 2812 2813 return count; 2814 } 2815 2816 static ssize_t 2817 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2818 bool is_write) 2819 { 2820 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2821 struct nvmf_vfio_user_ctrlr *ctrlr; 2822 int ret; 2823 2824 ctrlr = endpoint->ctrlr; 2825 if (endpoint->need_async_destroy || !ctrlr) { 2826 errno = EIO; 2827 return -1; 2828 } 2829 2830 if (pos >= NVME_DOORBELLS_OFFSET) { 2831 /* 2832 * The fact that the doorbells can be memory mapped doesn't mean 2833 * that the client (VFIO in QEMU) is obliged to memory map them, 2834 * it might still elect to access them via regular read/write; 2835 * we might also have had disable_mappable_bar0 set. 
2836 */ 2837 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2838 pos, is_write); 2839 if (ret == 0) { 2840 return count; 2841 } 2842 return ret; 2843 } 2844 2845 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 2846 } 2847 2848 static ssize_t 2849 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 2850 bool is_write) 2851 { 2852 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2853 2854 if (is_write) { 2855 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 2856 endpoint_id(endpoint), offset, offset + count); 2857 errno = EINVAL; 2858 return -1; 2859 } 2860 2861 if (offset + count > NVME_REG_CFG_SIZE) { 2862 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 2863 endpoint_id(endpoint), offset, count, 2864 NVME_REG_CFG_SIZE); 2865 errno = ERANGE; 2866 return -1; 2867 } 2868 2869 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 2870 2871 return count; 2872 } 2873 2874 static void 2875 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 2876 { 2877 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2878 2879 if (level >= LOG_DEBUG) { 2880 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2881 } else if (level >= LOG_INFO) { 2882 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2883 } else if (level >= LOG_NOTICE) { 2884 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 2885 } else if (level >= LOG_WARNING) { 2886 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 2887 } else { 2888 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 2889 } 2890 } 2891 2892 static int 2893 vfio_user_get_log_level(void) 2894 { 2895 int level; 2896 2897 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2898 return LOG_DEBUG; 2899 } 2900 2901 level = spdk_log_to_syslog_level(spdk_log_get_level()); 2902 if (level < 0) { 2903 return LOG_ERR; 2904 } 2905 2906 return level; 2907 } 2908 2909 static void 2910 init_pci_config_space(vfu_pci_config_space_t *p) 2911 { 2912 /* MLBAR */ 2913 p->hdr.bars[0].raw = 0x0; 2914 /* MUBAR */ 2915 p->hdr.bars[1].raw = 0x0; 2916 2917 /* vendor specific, let's set them to zero for now */ 2918 p->hdr.bars[3].raw = 0x0; 2919 p->hdr.bars[4].raw = 0x0; 2920 p->hdr.bars[5].raw = 0x0; 2921 2922 /* enable INTx */ 2923 p->hdr.intr.ipin = 0x1; 2924 } 2925 2926 struct ctrlr_quiesce_ctx { 2927 struct nvmf_vfio_user_endpoint *endpoint; 2928 struct nvmf_vfio_user_poll_group *group; 2929 int status; 2930 }; 2931 2932 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 2933 2934 static inline bool 2935 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 2936 { 2937 return spdk_interrupt_mode_is_enabled() && 2938 vu_transport->intr_mode_supported; 2939 } 2940 2941 static void 2942 _vfio_user_endpoint_resume_done_msg(void *ctx) 2943 { 2944 struct nvmf_vfio_user_endpoint *endpoint = ctx; 2945 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2946 2947 endpoint->need_resume = false; 2948 2949 if (!vu_ctrlr) { 2950 return; 2951 } 2952 2953 if (!vu_ctrlr->queued_quiesce) { 2954 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 2955 2956 /* 2957 * We might have ignored new SQ entries while we were quiesced: 2958 * kick ourselves so we'll definitely check again while in 2959 * VFIO_USER_CTRLR_RUNNING state. 
2960 */ 2961 if (in_interrupt_mode(endpoint->transport)) { 2962 ctrlr_kick(vu_ctrlr); 2963 } 2964 return; 2965 } 2966 2967 2968 /* 2969 * Basically, once we call `vfu_device_quiesced` the device is 2970 * unquiesced from libvfio-user's perspective so from the moment 2971 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device 2972 * again. However, because the NVMf subsytem is an asynchronous 2973 * operation, this quiesce might come _before_ the NVMf subsystem has 2974 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 2975 * need to check whether a quiesce was requested. 2976 */ 2977 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 2978 ctrlr_id(vu_ctrlr)); 2979 ctrlr_quiesce(vu_ctrlr); 2980 } 2981 2982 static void 2983 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 2984 void *cb_arg, int status) 2985 { 2986 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 2987 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2988 2989 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status); 2990 2991 if (!vu_ctrlr) { 2992 return; 2993 } 2994 2995 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 2996 } 2997 2998 static void 2999 vfio_user_quiesce_done(void *ctx) 3000 { 3001 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3002 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3003 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3004 int ret; 3005 3006 if (!vu_ctrlr) { 3007 free(quiesce_ctx); 3008 return; 3009 } 3010 3011 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 3012 3013 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 3014 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3015 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 3016 vu_ctrlr->queued_quiesce = false; 3017 free(quiesce_ctx); 3018 3019 /* `vfu_device_quiesced` can change the migration state, 3020 * so we need to re-check `vu_ctrlr->state`. 3021 */ 3022 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3023 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 3024 return; 3025 } 3026 3027 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 3028 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3029 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3030 vfio_user_endpoint_resume_done, endpoint); 3031 if (ret < 0) { 3032 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3033 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3034 } 3035 } 3036 3037 static void 3038 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3039 void *ctx, int status) 3040 { 3041 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3042 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3043 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3044 3045 if (!vu_ctrlr) { 3046 free(quiesce_ctx); 3047 return; 3048 } 3049 3050 quiesce_ctx->status = status; 3051 3052 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3053 ctrlr_id(vu_ctrlr), status); 3054 3055 spdk_thread_send_msg(vu_ctrlr->thread, 3056 vfio_user_quiesce_done, ctx); 3057 } 3058 3059 /* 3060 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3061 * we've already set ctrlr->state, so we won't process new entries, but we need 3062 * to ensure that this PG is quiesced. 
This only works because there's no 3063 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3064 * 3065 * Once we've walked all PGs, we need to pause any submitted I/O via 3066 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3067 */ 3068 static void 3069 vfio_user_quiesce_pg(void *ctx) 3070 { 3071 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3072 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3073 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3074 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3075 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3076 int ret; 3077 3078 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3079 3080 if (!vu_ctrlr) { 3081 free(quiesce_ctx); 3082 return; 3083 } 3084 3085 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3086 if (quiesce_ctx->group != NULL) { 3087 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3088 vfio_user_quiesce_pg, quiesce_ctx); 3089 return; 3090 } 3091 3092 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3093 vfio_user_pause_done, quiesce_ctx); 3094 if (ret < 0) { 3095 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3096 endpoint_id(endpoint), ret); 3097 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3098 fail_ctrlr(vu_ctrlr); 3099 free(quiesce_ctx); 3100 } 3101 } 3102 3103 static void 3104 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3105 { 3106 struct ctrlr_quiesce_ctx *quiesce_ctx; 3107 3108 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3109 3110 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3111 if (!quiesce_ctx) { 3112 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3113 assert(false); 3114 return; 3115 } 3116 3117 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3118 quiesce_ctx->status = 0; 3119 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3120 3121 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3122 vfio_user_quiesce_pg, quiesce_ctx); 3123 } 3124 3125 static int 3126 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3127 { 3128 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3129 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3130 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3131 3132 if (!vu_ctrlr) { 3133 return 0; 3134 } 3135 3136 /* NVMf library will destruct controller when no 3137 * connected queue pairs. 3138 */ 3139 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3140 return 0; 3141 } 3142 3143 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3144 3145 /* There is no race condition here as device quiesce callback 3146 * and nvmf_prop_set_cc() are running in the same thread context. 
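 * If the controller is disabled, not yet ready, or already shut down, there is nothing to quiesce and we return immediately.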
3147 */ 3148 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3149 return 0; 3150 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3151 return 0; 3152 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3153 return 0; 3154 } 3155 3156 switch (vu_ctrlr->state) { 3157 case VFIO_USER_CTRLR_PAUSED: 3158 case VFIO_USER_CTRLR_MIGRATING: 3159 return 0; 3160 case VFIO_USER_CTRLR_RUNNING: 3161 ctrlr_quiesce(vu_ctrlr); 3162 break; 3163 case VFIO_USER_CTRLR_RESUMING: 3164 vu_ctrlr->queued_quiesce = true; 3165 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3166 vu_ctrlr->state); 3167 break; 3168 default: 3169 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3170 break; 3171 } 3172 3173 errno = EBUSY; 3174 return -1; 3175 } 3176 3177 static void 3178 vfio_user_ctrlr_dump_migr_data(const char *name, 3179 struct vfio_user_nvme_migr_state *migr_data, 3180 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3181 { 3182 struct spdk_nvme_registers *regs; 3183 struct nvme_migr_sq_state *sq; 3184 struct nvme_migr_cq_state *cq; 3185 uint32_t *doorbell_base; 3186 uint32_t i; 3187 3188 SPDK_NOTICELOG("Dump %s\n", name); 3189 3190 regs = (struct spdk_nvme_registers *)migr_data->bar0; 3191 doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl; 3192 3193 SPDK_NOTICELOG("Registers\n"); 3194 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3195 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3196 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3197 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3198 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3199 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3200 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3201 3202 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3203 3204 if (sdbl != NULL) { 3205 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3206 migr_data->ctrlr_header.shadow_doorbell_buffer); 3207 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3208 migr_data->ctrlr_header.eventidx_buffer); 3209 } 3210 3211 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3212 sq = &migr_data->qps[i].sq; 3213 cq = &migr_data->qps[i].cq; 3214 3215 if (sq->size) { 3216 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3217 if (i > 0 && sdbl != NULL) { 3218 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3219 sq->sqid, 3220 sdbl->shadow_doorbells[queue_index(i, false)], 3221 sdbl->eventidxs[queue_index(i, false)]); 3222 } 3223 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3224 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3225 } 3226 3227 if (cq->size) { 3228 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3229 if (i > 0 && sdbl != NULL) { 3230 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3231 cq->cqid, 3232 sdbl->shadow_doorbells[queue_index(i, true)], 3233 sdbl->eventidxs[queue_index(i, true)]); 3234 } 3235 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3236 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3237 } 3238 } 3239 3240 SPDK_NOTICELOG("%s Dump Done\n", name); 3241 } 3242 3243 /* Read region 9 content and restore it to migration data structures */ 3244 static int 3245 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3246 struct vfio_user_nvme_migr_state *migr_state) 3247 { 3248 void *data_ptr = endpoint->migr_data; 3249 3250 /* Load
vfio_user_nvme_migr_header first */ 3251 memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3252 /* TODO: version check */ 3253 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3254 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3255 return -EINVAL; 3256 } 3257 3258 /* Load nvmf controller data */ 3259 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3260 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3261 3262 /* Load queue pairs */ 3263 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3264 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3265 3266 /* Load BAR0 */ 3267 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3268 memcpy(&migr_state->bar0, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3269 3270 /* Load CFG */ 3271 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3272 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3273 3274 return 0; 3275 } 3276 3277 3278 static void 3279 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3280 { 3281 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3282 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3283 struct nvmf_vfio_user_sq *sq; 3284 struct nvmf_vfio_user_cq *cq; 3285 struct vfio_user_nvme_migr_state migr_state = {}; 3286 uint64_t data_offset; 3287 void *data_ptr; 3288 int num_aers; 3289 struct spdk_nvme_registers *regs; 3290 uint32_t *doorbell_base; 3291 uint32_t i = 0; 3292 uint16_t sqid, cqid; 3293 3294 /* Save all data to vfio_user_nvme_migr_state first, then we will 3295 * copy it to device migration region at last. 
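 * The header itself is copied last, because the per-section offsets and lengths are only known once each section has been written.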
3296 */ 3297 3298 /* save magic number */ 3299 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3300 3301 /* save controller data */ 3302 num_aers = nvmf_ctrlr_save_aers(ctrlr, migr_state.ctrlr_header.aer_cids, 3303 256); 3304 assert(num_aers >= 0); 3305 migr_state.ctrlr_header.nr_aers = num_aers; 3306 3307 /* save nvmf controller data */ 3308 nvmf_ctrlr_save_migr_data(ctrlr, (struct nvmf_ctrlr_migr_data *)&migr_state.nvmf_data); 3309 3310 /* save connected queue pairs */ 3311 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3312 /* save sq */ 3313 sqid = sq->qid; 3314 migr_state.qps[sqid].sq.sqid = sq->qid; 3315 migr_state.qps[sqid].sq.cqid = sq->cqid; 3316 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3317 migr_state.qps[sqid].sq.size = sq->size; 3318 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3319 3320 /* save cq, for shared cq case, cq may be saved multiple times */ 3321 cqid = sq->cqid; 3322 cq = vu_ctrlr->cqs[cqid]; 3323 migr_state.qps[cqid].cq.cqid = cqid; 3324 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3325 migr_state.qps[cqid].cq.ien = cq->ien; 3326 migr_state.qps[cqid].cq.iv = cq->iv; 3327 migr_state.qps[cqid].cq.size = cq->size; 3328 migr_state.qps[cqid].cq.phase = cq->phase; 3329 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3330 i++; 3331 } 3332 3333 assert(i > 0); 3334 migr_state.ctrlr_header.num_io_queues = i - 1; 3335 3336 regs = (struct spdk_nvme_registers *)&migr_state.bar0; 3337 /* Save mandatory registers to bar0 */ 3338 regs->csts.raw = ctrlr->vcprop.csts.raw; 3339 regs->cap.raw = ctrlr->vcprop.cap.raw; 3340 regs->vs.raw = ctrlr->vcprop.vs.raw; 3341 regs->cc.raw = ctrlr->vcprop.cc.raw; 3342 regs->aqa.raw = ctrlr->vcprop.aqa.raw; 3343 regs->asq = ctrlr->vcprop.asq; 3344 regs->acq = ctrlr->vcprop.acq; 3345 /* Save doorbells */ 3346 doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl; 3347 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3348 3349 /* Save PCI configuration space */ 3350 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3351 3352 /* Save all data to device migration region */ 3353 data_ptr = endpoint->migr_data; 3354 3355 /* Copy nvmf controller data */ 3356 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3357 data_ptr += data_offset; 3358 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3359 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct nvmf_ctrlr_migr_data); 3360 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct nvmf_ctrlr_migr_data)); 3361 3362 /* Copy queue pairs */ 3363 data_offset += sizeof(struct nvmf_ctrlr_migr_data); 3364 data_ptr += sizeof(struct nvmf_ctrlr_migr_data); 3365 migr_state.ctrlr_header.qp_offset = data_offset; 3366 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3367 struct nvme_migr_cq_state)); 3368 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3369 3370 /* Copy BAR0 */ 3371 data_offset += migr_state.ctrlr_header.qp_len; 3372 data_ptr += migr_state.ctrlr_header.qp_len; 3373 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3374 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVME_REG_BAR0_SIZE; 3375 memcpy(data_ptr, &migr_state.bar0, NVME_REG_BAR0_SIZE); 3376 3377 /* Copy CFG */ 3378 data_offset += NVME_REG_BAR0_SIZE; 3379 data_ptr += NVME_REG_BAR0_SIZE; 3380 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3381 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] =
NVME_REG_CFG_SIZE; 3382 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3383 3384 /* copy shadow doorbells */ 3385 if (vu_ctrlr->sdbl != NULL) { 3386 migr_state.ctrlr_header.sdbl = true; 3387 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3388 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3389 } 3390 3391 /* Copy nvme migration header finally */ 3392 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3393 3394 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3395 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3396 } 3397 } 3398 3399 /* 3400 * If we are about to close the connection, we need to unregister the interrupt, 3401 * as the library will subsequently close the file descriptor we registered. 3402 */ 3403 static int 3404 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3405 { 3406 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3407 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3408 3409 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3410 3411 if (type == VFU_RESET_LOST_CONN) { 3412 if (ctrlr != NULL) { 3413 spdk_interrupt_unregister(&ctrlr->intr); 3414 ctrlr->intr_fd = -1; 3415 } 3416 return 0; 3417 } 3418 3419 /* FIXME: LOST_CONN case ? */ 3420 if (ctrlr->sdbl != NULL) { 3421 free_sdbl(vfu_ctx, ctrlr->sdbl); 3422 ctrlr->sdbl = NULL; 3423 } 3424 3425 /* FIXME: much more needed here. */ 3426 3427 return 0; 3428 } 3429 3430 static int 3431 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3432 struct vfio_user_nvme_migr_state *migr_state) 3433 { 3434 uint32_t i, qsize = 0; 3435 uint16_t sqid, cqid; 3436 struct vfio_user_nvme_migr_qp migr_qp; 3437 void *addr; 3438 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3439 int ret; 3440 3441 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3442 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3443 } 3444 3445 /* restore submission queues */ 3446 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3447 migr_qp = migr_state->qps[i]; 3448 3449 qsize = migr_qp.sq.size; 3450 if (qsize) { 3451 struct nvmf_vfio_user_sq *sq; 3452 3453 sqid = migr_qp.sq.sqid; 3454 if (sqid != i) { 3455 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3456 return -EINVAL; 3457 } 3458 3459 /* allocate sq if necessary */ 3460 if (vu_ctrlr->sqs[sqid] == NULL) { 3461 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3462 if (ret) { 3463 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3464 return -EFAULT; 3465 } 3466 } 3467 3468 sq = vu_ctrlr->sqs[sqid]; 3469 sq->size = qsize; 3470 3471 ret = alloc_sq_reqs(vu_ctrlr, sq); 3472 if (ret) { 3473 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3474 return -EFAULT; 3475 } 3476 3477 /* restore sq */ 3478 sq->sq_state = VFIO_USER_SQ_CREATED; 3479 sq->cqid = migr_qp.sq.cqid; 3480 *sq_headp(sq) = migr_qp.sq.head; 3481 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3482 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3483 sq->mapping.prp1, sq->size * 64, 3484 sq->mapping.sg, &sq->mapping.iov, 3485 PROT_READ); 3486 if (addr == NULL) { 3487 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3488 sqid, sq->mapping.prp1, sq->size); 3489 return -EFAULT; 3490 } 3491 cqs_ref[sq->cqid]++; 3492 } 3493 } 3494 3495 /* restore completion queues */ 3496 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3497 migr_qp = migr_state->qps[i]; 
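/* A saved CQ size of 0 means this slot was not in use at save time. */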
3498 3499 qsize = migr_qp.cq.size; 3500 if (qsize) { 3501 struct nvmf_vfio_user_cq *cq; 3502 3503 /* restore cq */ 3504 cqid = migr_qp.sq.cqid; 3505 assert(cqid == i); 3506 3507 /* allocate cq if necessary */ 3508 if (vu_ctrlr->cqs[cqid] == NULL) { 3509 ret = init_cq(vu_ctrlr, cqid); 3510 if (ret) { 3511 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3512 return -EFAULT; 3513 } 3514 } 3515 3516 cq = vu_ctrlr->cqs[cqid]; 3517 3518 cq->size = qsize; 3519 3520 cq->cq_state = VFIO_USER_CQ_CREATED; 3521 cq->cq_ref = cqs_ref[cqid]; 3522 *cq_tailp(cq) = migr_qp.cq.tail; 3523 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3524 cq->ien = migr_qp.cq.ien; 3525 cq->iv = migr_qp.cq.iv; 3526 cq->phase = migr_qp.cq.phase; 3527 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3528 cq->mapping.prp1, cq->size * 16, 3529 cq->mapping.sg, &cq->mapping.iov, 3530 PROT_READ | PROT_WRITE); 3531 if (addr == NULL) { 3532 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3533 cqid, cq->mapping.prp1, cq->size); 3534 return -EFAULT; 3535 } 3536 } 3537 } 3538 3539 return 0; 3540 } 3541 3542 static int 3543 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3544 { 3545 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3546 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3547 uint32_t *doorbell_base; 3548 struct vfio_user_nvme_migr_state migr_state = {}; 3549 struct spdk_nvme_registers *regs; 3550 struct spdk_nvme_cmd cmd; 3551 uint16_t i; 3552 int rc = 0; 3553 3554 assert(endpoint->migr_data != NULL); 3555 assert(ctrlr != NULL); 3556 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3557 if (rc) { 3558 return rc; 3559 } 3560 3561 /* restore shadow doorbells */ 3562 if (migr_state.ctrlr_header.sdbl) { 3563 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3564 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3565 migr_state.ctrlr_header.shadow_doorbell_buffer, 3566 migr_state.ctrlr_header.eventidx_buffer, 3567 memory_page_size(vu_ctrlr)); 3568 if (sdbl == NULL) { 3569 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3570 ctrlr_id(vu_ctrlr)); 3571 return -1; 3572 } 3573 3574 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3575 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3576 3577 SWAP(vu_ctrlr->sdbl, sdbl); 3578 } 3579 3580 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3581 if (rc) { 3582 return rc; 3583 } 3584 3585 /* restore PCI configuration space */ 3586 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3587 3588 regs = (struct spdk_nvme_registers *)&migr_state.bar0; 3589 doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl; 3590 /* restore doorbells from saved registers */ 3591 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3592 3593 /* restore controller registers after ADMIN queue connection */ 3594 ctrlr->vcprop.csts.raw = regs->csts.raw; 3595 ctrlr->vcprop.cap.raw = regs->cap.raw; 3596 ctrlr->vcprop.vs.raw = regs->vs.raw; 3597 ctrlr->vcprop.cc.raw = regs->cc.raw; 3598 ctrlr->vcprop.aqa.raw = regs->aqa.raw; 3599 ctrlr->vcprop.asq = regs->asq; 3600 ctrlr->vcprop.acq = regs->acq; 3601 3602 /* restore nvmf controller data */ 3603 rc = nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3604 if (rc) { 3605 return rc; 3606 } 3607 3608 /* resubmit pending AERs */ 3609 for (i = 0; i < migr_state.ctrlr_header.nr_aers; i++) { 3610 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n",
ctrlr_id(vu_ctrlr), 3611 migr_state.ctrlr_header.aer_cids[i]); 3612 memset(&cmd, 0, sizeof(cmd)); 3613 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3614 cmd.cid = migr_state.ctrlr_header.aer_cids[i]; 3615 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3616 if (rc) { 3617 break; 3618 } 3619 } 3620 3621 return rc; 3622 } 3623 3624 static void 3625 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3626 { 3627 uint32_t i; 3628 struct nvmf_vfio_user_sq *sq; 3629 3630 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 3631 3632 if (vu_ctrlr->sqs[0] != NULL) { 3633 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3634 queue_index(0, false); 3635 } 3636 3637 if (vu_ctrlr->cqs[0] != NULL) { 3638 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3639 queue_index(0, true); 3640 } 3641 3642 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3643 3644 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3645 sq = vu_ctrlr->sqs[i]; 3646 if (!sq || !sq->size) { 3647 continue; 3648 } 3649 3650 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3651 /* ADMIN queue pair is always in the poll group, just enable it */ 3652 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3653 } else { 3654 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3655 } 3656 } 3657 } 3658 3659 /* 3660 * We are in stop-and-copy state, but still potentially have some current dirty 3661 * sgls: while we're quiesced and thus should have no active requests, we still 3662 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3663 * mapped read only). 3664 * 3665 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3666 * mark them dirty now. 3667 */ 3668 static void 3669 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3670 { 3671 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3672 3673 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3674 3675 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3676 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3677 3678 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3679 continue; 3680 } 3681 3682 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3683 } 3684 3685 if (vu_ctrlr->sdbl != NULL) { 3686 dma_sg_t *sg; 3687 size_t i; 3688 3689 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3690 ++i) { 3691 3692 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3693 continue; 3694 } 3695 3696 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3697 3698 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3699 } 3700 } 3701 } 3702 3703 static int 3704 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3705 { 3706 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3707 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3708 struct nvmf_vfio_user_sq *sq; 3709 int ret = 0; 3710 3711 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3712 vu_ctrlr->state, state); 3713 3714 switch (state) { 3715 case VFU_MIGR_STATE_STOP_AND_COPY: 3716 vu_ctrlr->in_source_vm = true; 3717 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3718 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3719 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3720 break; 3721 case VFU_MIGR_STATE_STOP: 3722 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3723 /* The controller associated with the source VM is dead now; we will resume 3724 * the subsystem after destroying the controller
data structure, then the 3725 * subsystem can be re-used for another new client. 3726 */ 3727 if (vu_ctrlr->in_source_vm) { 3728 endpoint->need_resume = true; 3729 } 3730 break; 3731 case VFU_MIGR_STATE_PRE_COPY: 3732 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3733 break; 3734 case VFU_MIGR_STATE_RESUME: 3735 /* 3736 * Destination ADMIN queue pair is connected when starting the VM, 3737 * but the ADMIN queue pair isn't enabled in destination VM, the poll 3738 * group will do nothing to ADMIN queue pair for now. 3739 */ 3740 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3741 break; 3742 } 3743 3744 assert(!vu_ctrlr->in_source_vm); 3745 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3746 3747 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3748 assert(sq != NULL); 3749 assert(sq->qpair.qid == 0); 3750 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3751 3752 /* Free ADMIN SQ resources first, SQ resources will be 3753 * allocated based on queue size from source VM. 3754 */ 3755 free_sq_reqs(sq); 3756 sq->size = 0; 3757 break; 3758 case VFU_MIGR_STATE_RUNNING: 3759 3760 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3761 break; 3762 } 3763 3764 if (!vu_ctrlr->in_source_vm) { 3765 /* Restore destination VM from BAR9 */ 3766 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3767 if (ret) { 3768 break; 3769 } 3770 3771 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3772 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3773 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3774 /* FIXME where do we resume nvmf? */ 3775 } else { 3776 /* Rollback source VM */ 3777 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3778 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3779 vfio_user_endpoint_resume_done, endpoint); 3780 if (ret < 0) { 3781 /* TODO: fail controller with CFS bit set */ 3782 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3783 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3784 } 3785 } 3786 vu_ctrlr->migr_data_prepared = false; 3787 vu_ctrlr->in_source_vm = false; 3788 break; 3789 3790 default: 3791 return -EINVAL; 3792 } 3793 3794 return ret; 3795 } 3796 3797 static uint64_t 3798 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3799 { 3800 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3801 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3802 uint64_t pending_bytes; 3803 3804 if (ctrlr->migr_data_prepared) { 3805 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3806 pending_bytes = 0; 3807 } else { 3808 pending_bytes = vfio_user_migr_data_len(); 3809 } 3810 3811 SPDK_DEBUGLOG(nvmf_vfio, 3812 "%s current state %u, pending bytes 0x%"PRIx64"\n", 3813 endpoint_id(endpoint), ctrlr->state, pending_bytes); 3814 3815 return pending_bytes; 3816 } 3817 3818 static int 3819 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 3820 { 3821 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3822 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3823 3824 /* 3825 * When transitioning to pre-copy state we set pending_bytes to 0, 3826 * so the vfio-user client shouldn't attempt to read any migration 3827 * data. This is not yet guaranteed by libvfio-user. 
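 * If a read is attempted anyway, report an empty data section below rather than handing out stale state.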
3828 */ 3829 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3830 assert(size != NULL); 3831 *offset = 0; 3832 *size = 0; 3833 return 0; 3834 } 3835 3836 if (ctrlr->in_source_vm) { /* migration source */ 3837 assert(size != NULL); 3838 *size = vfio_user_migr_data_len(); 3839 vfio_user_migr_ctrlr_save_data(ctrlr); 3840 } else { /* migration destination */ 3841 assert(size == NULL); 3842 assert(!ctrlr->migr_data_prepared); 3843 } 3844 *offset = 0; 3845 ctrlr->migr_data_prepared = true; 3846 3847 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 3848 3849 return 0; 3850 } 3851 3852 static ssize_t 3853 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3854 void *buf __attribute__((unused)), 3855 uint64_t count __attribute__((unused)), 3856 uint64_t offset __attribute__((unused))) 3857 { 3858 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 3859 endpoint_id(vfu_get_private(vfu_ctx))); 3860 errno = ENOTSUP; 3861 return -1; 3862 } 3863 3864 static ssize_t 3865 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3866 void *buf __attribute__((unused)), 3867 uint64_t count __attribute__((unused)), 3868 uint64_t offset __attribute__((unused))) 3869 { 3870 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 3871 endpoint_id(vfu_get_private(vfu_ctx))); 3872 errno = ENOTSUP; 3873 return -1; 3874 } 3875 3876 static int 3877 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3878 uint64_t count) 3879 { 3880 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 3881 3882 if (count != vfio_user_migr_data_len()) { 3883 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 3884 endpoint_id(vfu_get_private(vfu_ctx)), count); 3885 errno = EINVAL; 3886 return -1; 3887 } 3888 3889 return 0; 3890 } 3891 3892 static int 3893 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 3894 struct nvmf_vfio_user_endpoint *endpoint) 3895 { 3896 int ret; 3897 ssize_t cap_offset; 3898 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 3899 struct iovec migr_sparse_mmap = {}; 3900 3901 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 3902 struct pxcap pxcap = { 3903 .hdr.id = PCI_CAP_ID_EXP, 3904 .pxcaps.ver = 0x2, 3905 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 3906 .pxdcap2.ctds = 0x1 3907 }; 3908 3909 struct msixcap msixcap = { 3910 .hdr.id = PCI_CAP_ID_MSIX, 3911 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 3912 .mtab = {.tbir = 0x4, .to = 0x0}, 3913 .mpba = {.pbir = 0x5, .pbao = 0x0} 3914 }; 3915 3916 struct iovec sparse_mmap[] = { 3917 { 3918 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 3919 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 3920 }, 3921 }; 3922 3923 const vfu_migration_callbacks_t migr_callbacks = { 3924 .version = VFU_MIGR_CALLBACKS_VERS, 3925 .transition = &vfio_user_migration_device_state_transition, 3926 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 3927 .prepare_data = &vfio_user_migration_prepare_data, 3928 .read_data = &vfio_user_migration_read_data, 3929 .data_written = &vfio_user_migration_data_written, 3930 .write_data = &vfio_user_migration_write_data 3931 }; 3932 3933 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 3934 if (ret < 0) { 3935 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 3936 return ret; 3937 } 3938 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 3939 /* 3940 * 0x02, controller uses the NVM Express programming interface 3941 * 0x08, 
non-volatile memory controller 3942 * 0x01, mass storage controller 3943 */ 3944 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 3945 3946 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 3947 if (cap_offset < 0) { 3948 SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx); 3949 return cap_offset; 3950 } 3951 3952 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 3953 if (cap_offset < 0) { 3954 SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx); 3955 return cap_offset; 3956 } 3957 3958 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 3959 if (cap_offset < 0) { 3960 SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx); 3961 return cap_offset; 3962 } 3963 3964 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 3965 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3966 if (ret < 0) { 3967 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 3968 return ret; 3969 } 3970 3971 if (vu_transport->transport_opts.disable_mappable_bar0) { 3972 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3973 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3974 NULL, 0, -1, 0); 3975 } else { 3976 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3977 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3978 sparse_mmap, 1, endpoint->devmem_fd, 0); 3979 } 3980 3981 if (ret < 0) { 3982 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 3983 return ret; 3984 } 3985 3986 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 3987 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3988 if (ret < 0) { 3989 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 3990 return ret; 3991 } 3992 3993 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 3994 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3995 if (ret < 0) { 3996 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 3997 return ret; 3998 } 3999 4000 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 4001 if (ret < 0) { 4002 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 4003 return ret; 4004 } 4005 4006 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 4007 if (ret < 0) { 4008 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 4009 return ret; 4010 } 4011 4012 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 4013 if (ret < 0) { 4014 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 4015 return ret; 4016 } 4017 4018 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 4019 if (ret < 0) { 4020 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 4021 return ret; 4022 } 4023 4024 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 4025 4026 migr_sparse_mmap.iov_base = (void *)4096; 4027 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 4028 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 4029 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 4030 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 4031 1, endpoint->migr_fd, 0); 4032 if (ret < 0) { 4033 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 4034 return ret; 4035 } 4036 4037 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 4038 vfu_get_migr_register_area_size()); 4039 if (ret < 0) { 4040 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 4041 return ret; 4042 }
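/* Everything (config space, BARs, IRQs, DMA, reset/quiesce and migration callbacks) has been described above; realizing the context finalizes the device model so that a vfio-user client can attach. */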
4043 4044 ret = vfu_realize_ctx(vfu_ctx); 4045 if (ret < 0) { 4046 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4047 return ret; 4048 } 4049 4050 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4051 assert(endpoint->pci_config_space != NULL); 4052 init_pci_config_space(endpoint->pci_config_space); 4053 4054 assert(cap_offset != 0); 4055 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4056 4057 return 0; 4058 } 4059 4060 static int nvmf_vfio_user_accept(void *ctx); 4061 4062 static void 4063 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4064 { 4065 /* Nothing for us to do here. */ 4066 } 4067 4068 /* 4069 * Register an "accept" poller: this is polling for incoming vfio-user socket 4070 * connections (on the listening socket). 4071 * 4072 * We need to do this on first listening, and also after destroying a 4073 * controller, so we can accept another connection. 4074 */ 4075 static int 4076 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4077 { 4078 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4079 4080 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4081 4082 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4083 endpoint, poll_rate_us); 4084 4085 if (!endpoint->accept_poller) { 4086 return -1; 4087 } 4088 4089 endpoint->accept_thread = spdk_get_thread(); 4090 endpoint->need_relisten = false; 4091 4092 if (!spdk_interrupt_mode_is_enabled()) { 4093 return 0; 4094 } 4095 4096 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4097 assert(endpoint->accept_intr_fd != -1); 4098 4099 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4100 nvmf_vfio_user_accept, endpoint); 4101 4102 assert(endpoint->accept_intr != NULL); 4103 4104 spdk_poller_register_interrupt(endpoint->accept_poller, 4105 set_intr_mode_noop, NULL); 4106 return 0; 4107 } 4108 4109 static void 4110 _vfio_user_relisten(void *ctx) 4111 { 4112 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4113 4114 vfio_user_register_accept_poller(endpoint); 4115 } 4116 4117 static void 4118 _free_ctrlr(void *ctx) 4119 { 4120 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4121 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4122 4123 free_sdbl(ctrlr->endpoint->vfu_ctx, ctrlr->sdbl); 4124 4125 spdk_interrupt_unregister(&ctrlr->intr); 4126 ctrlr->intr_fd = -1; 4127 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4128 4129 free(ctrlr); 4130 4131 if (endpoint == NULL) { 4132 return; 4133 } 4134 4135 if (endpoint->need_async_destroy) { 4136 nvmf_vfio_user_destroy_endpoint(endpoint); 4137 } else if (endpoint->need_relisten) { 4138 spdk_thread_send_msg(endpoint->accept_thread, 4139 _vfio_user_relisten, endpoint); 4140 } 4141 } 4142 4143 static void 4144 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4145 { 4146 int i; 4147 assert(ctrlr != NULL); 4148 4149 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4150 4151 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4152 free_qp(ctrlr, i); 4153 } 4154 4155 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 4156 } 4157 4158 static int 4159 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4160 struct nvmf_vfio_user_endpoint *endpoint) 4161 { 4162 struct nvmf_vfio_user_ctrlr *ctrlr; 4163 int err = 0; 4164 4165 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4166 4167 /* First, construct a vfio-user CUSTOM transport 
controller */ 4168 ctrlr = calloc(1, sizeof(*ctrlr)); 4169 if (ctrlr == NULL) { 4170 err = -ENOMEM; 4171 goto out; 4172 } 4173 /* We can only support one connection for now */ 4174 ctrlr->cntlid = 0x1; 4175 ctrlr->intr_fd = -1; 4176 ctrlr->transport = transport; 4177 ctrlr->endpoint = endpoint; 4178 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4179 TAILQ_INIT(&ctrlr->connected_sqs); 4180 4181 ctrlr->adaptive_irqs_enabled = 4182 !transport->transport_opts.disable_adaptive_irq; 4183 4184 /* Then, construct an admin queue pair */ 4185 err = init_sq(ctrlr, &transport->transport, 0); 4186 if (err != 0) { 4187 free(ctrlr); 4188 goto out; 4189 } 4190 4191 err = init_cq(ctrlr, 0); 4192 if (err != 0) { 4193 free(ctrlr); 4194 goto out; 4195 } 4196 4197 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4198 4199 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4200 if (err != 0) { 4201 free(ctrlr); 4202 goto out; 4203 } 4204 endpoint->ctrlr = ctrlr; 4205 4206 /* Notify the generic layer about the new admin queue pair */ 4207 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4208 4209 out: 4210 if (err != 0) { 4211 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4212 endpoint_id(endpoint), strerror(-err)); 4213 } 4214 4215 return err; 4216 } 4217 4218 static int 4219 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4220 const struct spdk_nvme_transport_id *trid, 4221 struct spdk_nvmf_listen_opts *listen_opts) 4222 { 4223 struct nvmf_vfio_user_transport *vu_transport; 4224 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4225 char path[PATH_MAX] = {}; 4226 char uuid[PATH_MAX] = {}; 4227 int ret; 4228 4229 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4230 transport); 4231 4232 pthread_mutex_lock(&vu_transport->lock); 4233 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4234 /* Only compare traddr */ 4235 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4236 pthread_mutex_unlock(&vu_transport->lock); 4237 return -EEXIST; 4238 } 4239 } 4240 pthread_mutex_unlock(&vu_transport->lock); 4241 4242 endpoint = calloc(1, sizeof(*endpoint)); 4243 if (!endpoint) { 4244 return -ENOMEM; 4245 } 4246 4247 pthread_mutex_init(&endpoint->lock, NULL); 4248 endpoint->devmem_fd = -1; 4249 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4250 endpoint->transport = vu_transport; 4251 4252 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4253 if (ret < 0 || ret >= PATH_MAX) { 4254 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4255 ret = -1; 4256 goto out; 4257 } 4258 4259 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4260 if (ret == -1) { 4261 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4262 endpoint_id(endpoint), path, spdk_strerror(errno)); 4263 goto out; 4264 } 4265 unlink(path); 4266 4267 endpoint->devmem_fd = ret; 4268 ret = ftruncate(endpoint->devmem_fd, 4269 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4270 if (ret != 0) { 4271 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4272 spdk_strerror(errno)); 4273 goto out; 4274 } 4275 4276 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4277 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4278 if (endpoint->bar0_doorbells == MAP_FAILED) { 4279 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, 
spdk_strerror(errno)); 4280 endpoint->bar0_doorbells = NULL; 4281 ret = -1; 4282 goto out; 4283 } 4284 4285 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4286 if (ret < 0 || ret >= PATH_MAX) { 4287 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 4288 spdk_strerror(errno)); 4289 ret = -1; 4290 goto out; 4291 } 4292 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4293 if (ret == -1) { 4294 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4295 endpoint_id(endpoint), path, spdk_strerror(errno)); 4296 goto out; 4297 } 4298 unlink(path); 4299 4300 endpoint->migr_fd = ret; 4301 ret = ftruncate(endpoint->migr_fd, 4302 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4303 if (ret != 0) { 4304 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4305 spdk_strerror(errno)); 4306 goto out; 4307 } 4308 4309 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4310 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4311 if (endpoint->migr_data == MAP_FAILED) { 4312 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4313 endpoint->migr_data = NULL; 4314 ret = -1; 4315 goto out; 4316 } 4317 4318 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 4319 if (ret < 0 || ret >= PATH_MAX) { 4320 SPDK_ERRLOG("%s: error to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 4321 ret = -1; 4322 goto out; 4323 } 4324 4325 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 4326 endpoint, VFU_DEV_TYPE_PCI); 4327 if (endpoint->vfu_ctx == NULL) { 4328 SPDK_ERRLOG("%s: error creating libmuser context: %m\n", 4329 endpoint_id(endpoint)); 4330 ret = -1; 4331 goto out; 4332 } 4333 4334 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, 4335 vfio_user_get_log_level()); 4336 if (ret < 0) { 4337 goto out; 4338 } 4339 4340 4341 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 4342 if (ret < 0) { 4343 goto out; 4344 } 4345 4346 ret = vfio_user_register_accept_poller(endpoint); 4347 4348 if (ret != 0) { 4349 goto out; 4350 } 4351 4352 pthread_mutex_lock(&vu_transport->lock); 4353 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 4354 pthread_mutex_unlock(&vu_transport->lock); 4355 4356 out: 4357 if (ret != 0) { 4358 nvmf_vfio_user_destroy_endpoint(endpoint); 4359 } 4360 4361 return ret; 4362 } 4363 4364 static void 4365 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 4366 const struct spdk_nvme_transport_id *trid) 4367 { 4368 struct nvmf_vfio_user_transport *vu_transport; 4369 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4370 4371 assert(trid != NULL); 4372 assert(trid->traddr != NULL); 4373 4374 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 4375 4376 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4377 transport); 4378 4379 pthread_mutex_lock(&vu_transport->lock); 4380 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4381 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 4382 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 4383 /* Defer to free endpoint resources until the controller 4384 * is freed. There are two cases when running here: 4385 * 1. kill nvmf target while VM is connected 4386 * 2. remove listener via RPC call 4387 * nvmf library will disconnect all queue paris. 
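 * In either case the controller is torn down asynchronously: need_async_destroy, set below, tells _free_ctrlr() to destroy the endpoint once that teardown has finished.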
4388 */ 4389 if (endpoint->ctrlr) { 4390 assert(!endpoint->need_async_destroy); 4391 endpoint->need_async_destroy = true; 4392 pthread_mutex_unlock(&vu_transport->lock); 4393 return; 4394 } 4395 4396 nvmf_vfio_user_destroy_endpoint(endpoint); 4397 pthread_mutex_unlock(&vu_transport->lock); 4398 return; 4399 } 4400 } 4401 pthread_mutex_unlock(&vu_transport->lock); 4402 4403 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4404 } 4405 4406 static void 4407 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4408 struct spdk_nvmf_subsystem *subsystem, 4409 struct spdk_nvmf_ctrlr_data *cdata) 4410 { 4411 struct nvmf_vfio_user_transport *vu_transport; 4412 4413 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4414 4415 cdata->vid = SPDK_PCI_VID_NUTANIX; 4416 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4417 cdata->ieee[0] = 0x8d; 4418 cdata->ieee[1] = 0x6b; 4419 cdata->ieee[2] = 0x50; 4420 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4421 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4422 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4423 /* libvfio-user can only support 1 connection for now */ 4424 cdata->oncs.reservations = 0; 4425 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4426 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4427 } 4428 4429 static int 4430 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4431 const struct spdk_nvmf_subsystem *subsystem, 4432 const struct spdk_nvme_transport_id *trid) 4433 { 4434 struct nvmf_vfio_user_transport *vu_transport; 4435 struct nvmf_vfio_user_endpoint *endpoint; 4436 4437 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4438 4439 pthread_mutex_lock(&vu_transport->lock); 4440 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4441 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4442 break; 4443 } 4444 } 4445 pthread_mutex_unlock(&vu_transport->lock); 4446 4447 if (endpoint == NULL) { 4448 return -ENOENT; 4449 } 4450 4451 /* Drop const - we will later need to pause/unpause. */ 4452 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4453 4454 return 0; 4455 } 4456 4457 /* 4458 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4459 * frequency. 4460 * 4461 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4462 * if we don't currently have a controller set up, peek to see if the socket is 4463 * able to accept a new connection. 4464 */ 4465 static int 4466 nvmf_vfio_user_accept(void *ctx) 4467 { 4468 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4469 struct nvmf_vfio_user_transport *vu_transport; 4470 int err; 4471 4472 vu_transport = endpoint->transport; 4473 4474 if (endpoint->ctrlr != NULL) { 4475 return SPDK_POLLER_IDLE; 4476 } 4477 4478 /* While we're here, the controller is already destroyed, 4479 * subsystem may still be in RESUMING state, we will wait 4480 * until the subsystem is in RUNNING state. 
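 * Until need_resume has been cleared (once the resume has completed), simply report this poller as idle.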
4481 */ 4482 if (endpoint->need_resume) { 4483 return SPDK_POLLER_IDLE; 4484 } 4485 4486 err = vfu_attach_ctx(endpoint->vfu_ctx); 4487 if (err == 0) { 4488 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4489 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4490 if (err == 0) { 4491 /* 4492 * Unregister ourselves: now we've accepted a 4493 * connection, there is nothing for us to poll for, and 4494 * we will poll the connection via vfu_run_ctx() 4495 * instead. 4496 */ 4497 spdk_interrupt_unregister(&endpoint->accept_intr); 4498 spdk_poller_unregister(&endpoint->accept_poller); 4499 } 4500 return SPDK_POLLER_BUSY; 4501 } 4502 4503 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4504 return SPDK_POLLER_IDLE; 4505 } 4506 4507 return SPDK_POLLER_BUSY; 4508 } 4509 4510 static void 4511 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4512 struct spdk_nvme_transport_id *trid, 4513 struct spdk_nvmf_discovery_log_page_entry *entry) 4514 { } 4515 4516 static struct spdk_nvmf_transport_poll_group * 4517 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4518 struct spdk_nvmf_poll_group *group) 4519 { 4520 struct nvmf_vfio_user_transport *vu_transport; 4521 struct nvmf_vfio_user_poll_group *vu_group; 4522 4523 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4524 4525 vu_group = calloc(1, sizeof(*vu_group)); 4526 if (vu_group == NULL) { 4527 SPDK_ERRLOG("Error allocating poll group: %m"); 4528 return NULL; 4529 } 4530 4531 TAILQ_INIT(&vu_group->sqs); 4532 4533 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4534 transport); 4535 pthread_mutex_lock(&vu_transport->pg_lock); 4536 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4537 if (vu_transport->next_pg == NULL) { 4538 vu_transport->next_pg = vu_group; 4539 } 4540 pthread_mutex_unlock(&vu_transport->pg_lock); 4541 4542 if (!spdk_interrupt_mode_is_enabled()) { 4543 return &vu_group->group; 4544 } 4545 4546 /* 4547 * Only allow the poll group to work in interrupt mode if the transport 4548 * supports it. It's our responsibility to register the actual interrupt 4549 * later (in handle_queue_connect_rsp()) that processes everything in 4550 * the poll group: for us, that's the libvfio-user context, and the 4551 * actual qpairs. 4552 * 4553 * Note that this only works in the case that nothing else shares the 4554 * spdk_nvmf_poll_group. 4555 * 4556 * If not supported, this will effectively always wake up to poll the 4557 * poll group. 
4558 */ 4559 4560 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4561 transport); 4562 4563 if (!vu_transport->intr_mode_supported) { 4564 SPDK_WARNLOG("vfio-user interrupt mode not supported\n"); 4565 return &vu_group->group; 4566 } 4567 4568 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4569 NULL); 4570 4571 return &vu_group->group; 4572 } 4573 4574 static struct spdk_nvmf_transport_poll_group * 4575 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4576 { 4577 struct nvmf_vfio_user_transport *vu_transport; 4578 struct nvmf_vfio_user_poll_group **vu_group; 4579 struct nvmf_vfio_user_sq *sq; 4580 struct nvmf_vfio_user_cq *cq; 4581 4582 struct spdk_nvmf_transport_poll_group *result = NULL; 4583 4584 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4585 cq = sq->ctrlr->cqs[sq->cqid]; 4586 assert(cq != NULL); 4587 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4588 4589 pthread_mutex_lock(&vu_transport->pg_lock); 4590 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4591 goto out; 4592 } 4593 4594 if (!nvmf_qpair_is_admin_queue(qpair)) { 4595 /* 4596 * If this is shared IO CQ case, just return the used CQ's poll 4597 * group, so I/O completions don't have to use 4598 * spdk_thread_send_msg(). 4599 */ 4600 if (cq->group != NULL) { 4601 result = cq->group; 4602 goto out; 4603 } 4604 4605 /* 4606 * If we're in interrupt mode, align all qpairs for a controller 4607 * on the same poll group, to avoid complications in 4608 * vfio_user_ctrlr_intr(). 4609 */ 4610 if (in_interrupt_mode(vu_transport)) { 4611 result = sq->ctrlr->sqs[0]->group; 4612 goto out; 4613 } 4614 4615 } 4616 4617 vu_group = &vu_transport->next_pg; 4618 assert(*vu_group != NULL); 4619 4620 result = &(*vu_group)->group; 4621 *vu_group = TAILQ_NEXT(*vu_group, link); 4622 if (*vu_group == NULL) { 4623 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4624 } 4625 4626 out: 4627 if (cq->group == NULL) { 4628 cq->group = result; 4629 } 4630 4631 pthread_mutex_unlock(&vu_transport->pg_lock); 4632 return result; 4633 } 4634 4635 /* called when process exits */ 4636 static void 4637 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4638 { 4639 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup;; 4640 struct nvmf_vfio_user_transport *vu_transport; 4641 4642 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4643 4644 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4645 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4646 transport); 4647 4648 pthread_mutex_lock(&vu_transport->pg_lock); 4649 next_tgroup = TAILQ_NEXT(vu_group, link); 4650 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4651 if (next_tgroup == NULL) { 4652 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4653 } 4654 if (vu_transport->next_pg == vu_group) { 4655 vu_transport->next_pg = next_tgroup; 4656 } 4657 pthread_mutex_unlock(&vu_transport->pg_lock); 4658 4659 free(vu_group); 4660 } 4661 4662 static void 4663 _vfio_user_qpair_disconnect(void *ctx) 4664 { 4665 struct nvmf_vfio_user_sq *sq = ctx; 4666 4667 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4668 } 4669 4670 /* The function is used when socket connection is destroyed */ 4671 static int 4672 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4673 { 4674 struct nvmf_vfio_user_sq *sq; 4675 struct nvmf_vfio_user_endpoint *endpoint; 4676 4677 
SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4678 4679 endpoint = ctrlr->endpoint; 4680 assert(endpoint != NULL); 4681 4682 pthread_mutex_lock(&endpoint->lock); 4683 endpoint->need_relisten = true; 4684 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4685 endpoint->ctrlr = NULL; 4686 free_ctrlr(ctrlr); 4687 pthread_mutex_unlock(&endpoint->lock); 4688 return 0; 4689 } 4690 4691 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4692 /* add another round thread poll to avoid recursive endpoint lock */ 4693 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4694 } 4695 pthread_mutex_unlock(&endpoint->lock); 4696 4697 return 0; 4698 } 4699 4700 /* 4701 * Poll for and process any incoming vfio-user messages. 4702 */ 4703 static int 4704 vfio_user_poll_vfu_ctx(void *ctx) 4705 { 4706 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4707 int ret; 4708 4709 assert(ctrlr != NULL); 4710 4711 /* This will call access_bar0_fn() if there are any writes 4712 * to the portion of the BAR that is not mmap'd */ 4713 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4714 if (spdk_unlikely(ret == -1)) { 4715 if (errno == EBUSY) { 4716 return SPDK_POLLER_IDLE; 4717 } 4718 4719 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4720 4721 /* 4722 * We lost the client; the reset callback will already have 4723 * unregistered the interrupt. 4724 */ 4725 if (errno == ENOTCONN) { 4726 vfio_user_destroy_ctrlr(ctrlr); 4727 return SPDK_POLLER_BUSY; 4728 } 4729 4730 /* 4731 * We might not have got a reset callback in this case, so 4732 * explicitly unregister the interrupt here. 4733 */ 4734 spdk_interrupt_unregister(&ctrlr->intr); 4735 ctrlr->intr_fd = -1; 4736 fail_ctrlr(ctrlr); 4737 } 4738 4739 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4740 } 4741 4742 struct vfio_user_post_cpl_ctx { 4743 struct nvmf_vfio_user_ctrlr *ctrlr; 4744 struct nvmf_vfio_user_cq *cq; 4745 struct spdk_nvme_cpl cpl; 4746 }; 4747 4748 static void 4749 _post_completion_msg(void *ctx) 4750 { 4751 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4752 4753 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4754 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4755 free(cpl_ctx); 4756 } 4757 4758 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4759 4760 static int vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group); 4761 4762 /* 4763 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4764 * the SQs assigned to our poll group. 4765 */ 4766 static int 4767 vfio_user_ctrlr_intr(void *ctx) 4768 { 4769 struct nvmf_vfio_user_poll_group *vu_group; 4770 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4771 int ret = 0; 4772 4773 assert(ctrlr != NULL); 4774 assert(ctrlr->sqs[0] != NULL); 4775 assert(ctrlr->sqs[0]->group != NULL); 4776 4777 ctrlr->kick_requested = false; 4778 4779 /* 4780 * Poll vfio-user for this controller. 4781 */ 4782 ret = vfio_user_poll_vfu_ctx(ctrlr); 4783 /* `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, just return 4784 * for this case. 4785 */ 4786 if (ctrlr->sqs[0] == NULL) { 4787 return ret; 4788 } 4789 4790 vu_group = ctrlr_to_poll_group(ctrlr); 4791 4792 /* 4793 * See nvmf_vfio_user_get_optimal_poll_group() for why it's OK to only 4794 * poll this poll group. 4795 * 4796 * Note that this could end up polling other controller's SQs as well 4797 * (since a single poll group can have SQs from multiple separate 4798 * controllers). 
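 * That is harmless: polling an SQ whose tail doorbell has not moved costs little.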
4799 */ 4800 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4801 4802 /* 4803 * Re-arm the event indexes. NB: this also could rearm other 4804 * controller's SQs. 4805 */ 4806 ret |= vfio_user_poll_group_rearm(vu_group); 4807 4808 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4809 } 4810 4811 static void 4812 vfio_user_set_intr_mode(struct spdk_poller *poller, void *arg, 4813 bool interrupt_mode) 4814 { 4815 struct nvmf_vfio_user_ctrlr *ctrlr = arg; 4816 assert(ctrlr != NULL); 4817 assert(ctrlr->endpoint != NULL); 4818 4819 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 4820 ctrlr_id(ctrlr), interrupt_mode); 4821 4822 /* 4823 * interrupt_mode needs to persist across controller resets, so store 4824 * it in the endpoint instead. 4825 */ 4826 ctrlr->endpoint->interrupt_mode = interrupt_mode; 4827 4828 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 4829 } 4830 4831 /* 4832 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 4833 * set up and we can start operating on this controller. 4834 */ 4835 static void 4836 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 4837 struct spdk_nvmf_ctrlr *ctrlr) 4838 { 4839 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 4840 4841 vu_ctrlr->ctrlr = ctrlr; 4842 vu_ctrlr->cntlid = ctrlr->cntlid; 4843 vu_ctrlr->thread = spdk_get_thread(); 4844 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 4845 4846 if (!in_interrupt_mode(endpoint->transport)) { 4847 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4848 vu_ctrlr, 1000); 4849 return; 4850 } 4851 4852 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4853 vu_ctrlr, 0); 4854 4855 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 4856 assert(vu_ctrlr->intr_fd != -1); 4857 4858 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 4859 vfio_user_ctrlr_intr, vu_ctrlr); 4860 4861 assert(vu_ctrlr->intr != NULL); 4862 4863 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 4864 vfio_user_set_intr_mode, 4865 vu_ctrlr); 4866 } 4867 4868 static int 4869 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 4870 { 4871 struct nvmf_vfio_user_poll_group *vu_group; 4872 struct nvmf_vfio_user_sq *sq = cb_arg; 4873 struct nvmf_vfio_user_cq *admin_cq; 4874 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 4875 struct nvmf_vfio_user_endpoint *endpoint; 4876 4877 assert(sq != NULL); 4878 assert(req != NULL); 4879 4880 vu_ctrlr = sq->ctrlr; 4881 assert(vu_ctrlr != NULL); 4882 endpoint = vu_ctrlr->endpoint; 4883 assert(endpoint != NULL); 4884 4885 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 4886 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 4887 endpoint->ctrlr = NULL; 4888 free_ctrlr(vu_ctrlr); 4889 return -1; 4890 } 4891 4892 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 4893 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 4894 4895 admin_cq = vu_ctrlr->cqs[0]; 4896 assert(admin_cq != NULL); 4897 4898 pthread_mutex_lock(&endpoint->lock); 4899 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 4900 admin_cq->thread = spdk_get_thread(); 4901 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 4902 } else { 4903 /* For I/O queues this command was generated in response to an 4904 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 4905 * been completed. Complete it now. 
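 * If the admin CQ is owned by a different thread, post the completion from that thread via spdk_thread_send_msg() rather than writing the CQ directly.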
4906 */ 4907 if (sq->post_create_io_sq_completion) { 4908 assert(admin_cq->thread != NULL); 4909 if (admin_cq->thread != spdk_get_thread()) { 4910 struct vfio_user_post_cpl_ctx *cpl_ctx; 4911 4912 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 4913 if (!cpl_ctx) { 4914 return -ENOMEM; 4915 } 4916 cpl_ctx->ctrlr = vu_ctrlr; 4917 cpl_ctx->cq = admin_cq; 4918 cpl_ctx->cpl.sqid = 0; 4919 cpl_ctx->cpl.cdw0 = 0; 4920 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 4921 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 4922 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 4923 4924 spdk_thread_send_msg(admin_cq->thread, _post_completion_msg, 4925 cpl_ctx); 4926 } else { 4927 post_completion(vu_ctrlr, admin_cq, 0, 0, 4928 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 4929 } 4930 sq->post_create_io_sq_completion = false; 4931 } else if (in_interrupt_mode(endpoint->transport)) { 4932 /* 4933 * If we're live migrating a guest, there is a window 4934 * where the I/O queues haven't been set up but the 4935 * device is in running state, during which the guest 4936 * might write to a doorbell. This doorbell write will 4937 * go unnoticed, so let's poll the whole controller to 4938 * pick that up. 4939 */ 4940 ctrlr_kick(vu_ctrlr); 4941 } 4942 sq->sq_state = VFIO_USER_SQ_ACTIVE; 4943 } 4944 4945 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 4946 pthread_mutex_unlock(&endpoint->lock); 4947 4948 free(req->req.data); 4949 req->req.data = NULL; 4950 4951 return 0; 4952 } 4953 4954 /* 4955 * Add the given qpair to the given poll group. New qpairs are added via 4956 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 4957 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 4958 * nvmf_transport_poll_group_add(). 4959 */ 4960 static int 4961 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 4962 struct spdk_nvmf_qpair *qpair) 4963 { 4964 struct nvmf_vfio_user_sq *sq; 4965 struct nvmf_vfio_user_req *vu_req; 4966 struct nvmf_vfio_user_ctrlr *ctrlr; 4967 struct spdk_nvmf_request *req; 4968 struct spdk_nvmf_fabric_connect_data *data; 4969 bool admin; 4970 4971 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4972 sq->group = group; 4973 ctrlr = sq->ctrlr; 4974 4975 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 4976 ctrlr_id(ctrlr), sq->qpair.qid, 4977 sq, qpair, group); 4978 4979 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 4980 4981 vu_req = get_nvmf_vfio_user_req(sq); 4982 if (vu_req == NULL) { 4983 return -1; 4984 } 4985 4986 req = &vu_req->req; 4987 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 4988 req->cmd->connect_cmd.cid = 0; 4989 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 4990 req->cmd->connect_cmd.recfmt = 0; 4991 req->cmd->connect_cmd.sqsize = sq->size - 1; 4992 req->cmd->connect_cmd.qid = admin ? 
0 : qpair->qid; 4993 4994 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 4995 req->data = calloc(1, req->length); 4996 if (req->data == NULL) { 4997 nvmf_vfio_user_req_free(req); 4998 return -ENOMEM; 4999 } 5000 5001 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 5002 data->cntlid = ctrlr->cntlid; 5003 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5004 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5005 5006 vu_req->cb_fn = handle_queue_connect_rsp; 5007 vu_req->cb_arg = sq; 5008 5009 SPDK_DEBUGLOG(nvmf_vfio, 5010 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5011 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5012 5013 spdk_nvmf_request_exec_fabrics(req); 5014 return 0; 5015 } 5016 5017 static int 5018 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5019 struct spdk_nvmf_qpair *qpair) 5020 { 5021 struct nvmf_vfio_user_sq *sq; 5022 struct nvmf_vfio_user_poll_group *vu_group; 5023 5024 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5025 5026 SPDK_DEBUGLOG(nvmf_vfio, 5027 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5028 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5029 5030 5031 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5032 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5033 5034 return 0; 5035 } 5036 5037 static void 5038 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5039 { 5040 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5041 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5042 vu_req->iovcnt = 0; 5043 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5044 5045 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5046 } 5047 5048 static int 5049 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5050 { 5051 struct nvmf_vfio_user_sq *sq; 5052 struct nvmf_vfio_user_req *vu_req; 5053 5054 assert(req != NULL); 5055 5056 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5057 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5058 5059 _nvmf_vfio_user_req_free(sq, vu_req); 5060 5061 return 0; 5062 } 5063 5064 static int 5065 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5066 { 5067 struct nvmf_vfio_user_sq *sq; 5068 struct nvmf_vfio_user_req *vu_req; 5069 5070 assert(req != NULL); 5071 5072 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5073 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5074 5075 if (vu_req->cb_fn != NULL) { 5076 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5077 fail_ctrlr(sq->ctrlr); 5078 } 5079 } 5080 5081 _nvmf_vfio_user_req_free(sq, vu_req); 5082 5083 return 0; 5084 } 5085 5086 static void 5087 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5088 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5089 { 5090 struct nvmf_vfio_user_sq *sq; 5091 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5092 struct nvmf_vfio_user_endpoint *endpoint; 5093 5094 assert(qpair != NULL); 5095 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5096 vu_ctrlr = sq->ctrlr; 5097 endpoint = vu_ctrlr->endpoint; 5098 5099 pthread_mutex_lock(&endpoint->lock); 5100 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5101 delete_sq_done(vu_ctrlr, sq); 5102 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5103 endpoint->ctrlr = NULL; 5104 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5105 /* The controller will be freed, we can resume the subsystem 5106 * now so that the endpoint can be ready to accept another 5107 * 
new connection. 5108 */ 5109 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5110 vfio_user_endpoint_resume_done, endpoint); 5111 } 5112 free_ctrlr(vu_ctrlr); 5113 } 5114 pthread_mutex_unlock(&endpoint->lock); 5115 5116 if (cb_fn) { 5117 cb_fn(cb_arg); 5118 } 5119 } 5120 5121 /** 5122 * Returns a preallocated request, or NULL if there isn't one available. 5123 */ 5124 static struct nvmf_vfio_user_req * 5125 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5126 { 5127 struct nvmf_vfio_user_req *req; 5128 5129 if (sq == NULL) { 5130 return NULL; 5131 } 5132 5133 req = TAILQ_FIRST(&sq->free_reqs); 5134 if (req == NULL) { 5135 return NULL; 5136 } 5137 5138 TAILQ_REMOVE(&sq->free_reqs, req, link); 5139 5140 return req; 5141 } 5142 5143 static int 5144 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5145 { 5146 uint16_t nr; 5147 uint32_t nlb, nsid; 5148 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5149 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5150 struct spdk_nvmf_ns *ns; 5151 5152 nsid = cmd->nsid; 5153 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5154 if (ns == NULL || ns->bdev == NULL) { 5155 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5156 return -EINVAL; 5157 } 5158 5159 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5160 nr = cmd->cdw10_bits.dsm.nr + 1; 5161 return nr * sizeof(struct spdk_nvme_dsm_range); 5162 } 5163 5164 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5165 return nlb * spdk_bdev_get_block_size(ns->bdev); 5166 } 5167 5168 static int 5169 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5170 { 5171 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5172 uint32_t len = 0; 5173 uint8_t fid; 5174 int iovcnt; 5175 5176 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5177 req->length = 0; 5178 req->data = NULL; 5179 5180 if (req->xfer == SPDK_NVME_DATA_NONE) { 5181 return 0; 5182 } 5183 5184 switch (cmd->opc) { 5185 case SPDK_NVME_OPC_IDENTIFY: 5186 len = 4096; 5187 break; 5188 case SPDK_NVME_OPC_GET_LOG_PAGE: 5189 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 5190 break; 5191 case SPDK_NVME_OPC_GET_FEATURES: 5192 case SPDK_NVME_OPC_SET_FEATURES: 5193 fid = cmd->cdw10_bits.set_features.fid; 5194 switch (fid) { 5195 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5196 len = 4096; 5197 break; 5198 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5199 len = 256; 5200 break; 5201 case SPDK_NVME_FEAT_TIMESTAMP: 5202 len = 8; 5203 break; 5204 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5205 len = 512; 5206 break; 5207 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5208 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5209 len = 16; 5210 } else { 5211 len = 8; 5212 } 5213 break; 5214 default: 5215 return 0; 5216 } 5217 break; 5218 default: 5219 return 0; 5220 } 5221 5222 /* ADMIN command will not use SGL */ 5223 if (cmd->psdt != 0) { 5224 return -EINVAL; 5225 } 5226 5227 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5228 if (iovcnt < 0) { 5229 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5230 ctrlr_id(ctrlr), cmd->opc); 5231 return -1; 5232 } 5233 req->length = len; 5234 req->data = req->iov[0].iov_base; 5235 req->iovcnt = iovcnt; 5236 5237 return 0; 5238 } 5239 5240 /* 5241 * Map an I/O command's buffers. 5242 * 5243 * Returns 0 on success and -errno on failure. 
5244 */ 5245 static int 5246 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5247 { 5248 int len, iovcnt; 5249 struct spdk_nvme_cmd *cmd; 5250 5251 assert(ctrlr != NULL); 5252 assert(req != NULL); 5253 5254 cmd = &req->cmd->nvme_cmd; 5255 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5256 req->length = 0; 5257 req->data = NULL; 5258 5259 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5260 return 0; 5261 } 5262 5263 len = get_nvmf_io_req_length(req); 5264 if (len < 0) { 5265 return -EINVAL; 5266 } 5267 req->length = len; 5268 5269 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5270 if (iovcnt < 0) { 5271 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5272 return -EFAULT; 5273 } 5274 req->data = req->iov[0].iov_base; 5275 req->iovcnt = iovcnt; 5276 5277 return 0; 5278 } 5279 5280 static int 5281 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5282 struct nvmf_vfio_user_sq *sq) 5283 { 5284 int err; 5285 struct nvmf_vfio_user_req *vu_req; 5286 struct spdk_nvmf_request *req; 5287 5288 assert(ctrlr != NULL); 5289 assert(cmd != NULL); 5290 5291 vu_req = get_nvmf_vfio_user_req(sq); 5292 if (spdk_unlikely(vu_req == NULL)) { 5293 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5294 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5295 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5296 5297 } 5298 req = &vu_req->req; 5299 5300 assert(req->qpair != NULL); 5301 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5302 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5303 5304 vu_req->cb_fn = handle_cmd_rsp; 5305 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5306 req->cmd->nvme_cmd = *cmd; 5307 5308 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5309 err = map_admin_cmd_req(ctrlr, req); 5310 } else { 5311 switch (cmd->opc) { 5312 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5313 case SPDK_NVME_OPC_RESERVATION_REPORT: 5314 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5315 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5316 err = -ENOTSUP; 5317 break; 5318 default: 5319 err = map_io_cmd_req(ctrlr, req); 5320 break; 5321 } 5322 } 5323 5324 if (spdk_unlikely(err < 0)) { 5325 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5326 ctrlr_id(ctrlr), cmd->opc); 5327 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5328 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5329 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5330 _nvmf_vfio_user_req_free(sq, vu_req); 5331 return err; 5332 } 5333 5334 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5335 spdk_nvmf_request_exec(req); 5336 5337 return 0; 5338 } 5339 5340 /* 5341 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5342 * here: if the host isn't up to date, and is apparently not actively processing 5343 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5344 */ 5345 static void 5346 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5347 struct nvmf_vfio_user_sq *sq) 5348 { 5349 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5350 uint32_t cq_head; 5351 uint32_t cq_tail; 5352 5353 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5354 return; 5355 } 5356 5357 cq_tail = *cq_tailp(cq); 5358 5359 /* Already sent? 
*/ 5360 if (cq_tail == cq->last_trigger_irq_tail) { 5361 return; 5362 } 5363 5364 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5365 cq_head = *cq_dbl_headp(cq); 5366 5367 if (cq_head != cq_tail && cq_head == cq->last_head) { 5368 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5369 if (err != 0) { 5370 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5371 ctrlr_id(ctrlr)); 5372 } else { 5373 cq->last_trigger_irq_tail = cq_tail; 5374 } 5375 } 5376 5377 cq->last_head = cq_head; 5378 } 5379 5380 /* Returns the number of commands processed, or a negative value on error. */ 5381 static int 5382 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5383 { 5384 struct nvmf_vfio_user_ctrlr *ctrlr; 5385 uint32_t new_tail; 5386 int count = 0; 5387 5388 assert(sq != NULL); 5389 5390 ctrlr = sq->ctrlr; 5391 5392 /* 5393 * A quiesced, or migrating, controller should never process new 5394 * commands. 5395 */ 5396 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 5397 return SPDK_POLLER_IDLE; 5398 } 5399 5400 if (ctrlr->adaptive_irqs_enabled) { 5401 handle_suppressed_irq(ctrlr, sq); 5402 } 5403 5404 /* On aarch64 platforms, doorbells update from guest VM may not be seen 5405 * on SPDK target side. This is because there is memory type mismatch 5406 * situation here. That is on guest VM side, the doorbells are treated as 5407 * device memory while on SPDK target side, it is treated as normal 5408 * memory. And this situation cause problem on ARM platform. 5409 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5410 * Memory-aliasing-and-mismatched-memory-types". Only using spdk_mb() 5411 * cannot fix this. Use "dc civac" to invalidate cache may solve 5412 * this. 5413 */ 5414 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5415 5416 /* Load-Acquire. */ 5417 new_tail = *sq_dbl_tailp(sq); 5418 5419 new_tail = new_tail & 0xffffu; 5420 if (spdk_unlikely(new_tail >= sq->size)) { 5421 union spdk_nvme_async_event_completion event = {}; 5422 5423 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5424 new_tail); 5425 event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR; 5426 event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE; 5427 nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event); 5428 5429 return -1; 5430 } 5431 5432 if (*sq_headp(sq) == new_tail) { 5433 return 0; 5434 } 5435 5436 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5437 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5438 if (ctrlr->sdbl != NULL) { 5439 SPDK_DEBUGLOG(nvmf_vfio, 5440 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5441 ctrlr_id(ctrlr), sq->qid, 5442 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5443 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5444 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5445 } 5446 5447 /* 5448 * Ensure that changes to the queue are visible to us. 5449 * The host driver should write the queue first, do a wmb(), and then 5450 * update the SQ tail doorbell (their Store-Release). 5451 */ 5452 spdk_rmb(); 5453 5454 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5455 if (count < 0) { 5456 fail_ctrlr(ctrlr); 5457 } 5458 5459 return count; 5460 } 5461 5462 /* 5463 * vfio-user transport poll handler. Note that the library context is polled in 5464 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the 5465 * active SQs. 5466 * 5467 * Returns the number of commands processed, or a negative value on error. 
5468 */ 5469 static int 5470 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 5471 { 5472 struct nvmf_vfio_user_poll_group *vu_group; 5473 struct nvmf_vfio_user_sq *sq, *tmp; 5474 int count = 0; 5475 5476 assert(group != NULL); 5477 5478 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5479 5480 SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n"); 5481 5482 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) { 5483 int ret; 5484 5485 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 5486 continue; 5487 } 5488 5489 ret = nvmf_vfio_user_sq_poll(sq); 5490 5491 if (ret < 0) { 5492 return ret; 5493 } 5494 5495 count += ret; 5496 } 5497 5498 return count; 5499 } 5500 5501 static int 5502 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5503 struct spdk_nvme_transport_id *trid) 5504 { 5505 struct nvmf_vfio_user_sq *sq; 5506 struct nvmf_vfio_user_ctrlr *ctrlr; 5507 5508 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5509 ctrlr = sq->ctrlr; 5510 5511 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5512 return 0; 5513 } 5514 5515 static int 5516 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 5517 struct spdk_nvme_transport_id *trid) 5518 { 5519 return 0; 5520 } 5521 5522 static int 5523 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5524 struct spdk_nvme_transport_id *trid) 5525 { 5526 struct nvmf_vfio_user_sq *sq; 5527 struct nvmf_vfio_user_ctrlr *ctrlr; 5528 5529 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5530 ctrlr = sq->ctrlr; 5531 5532 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5533 return 0; 5534 } 5535 5536 static void 5537 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 5538 struct spdk_nvmf_request *req) 5539 { 5540 struct spdk_nvmf_request *req_to_abort = NULL; 5541 struct spdk_nvmf_request *temp_req = NULL; 5542 uint16_t cid; 5543 5544 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5545 5546 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) { 5547 struct nvmf_vfio_user_req *vu_req; 5548 5549 vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req); 5550 5551 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 5552 req_to_abort = temp_req; 5553 break; 5554 } 5555 } 5556 5557 if (req_to_abort == NULL) { 5558 spdk_nvmf_request_complete(req); 5559 return; 5560 } 5561 5562 req->req_to_abort = req_to_abort; 5563 nvmf_ctrlr_abort_request(req); 5564 } 5565 5566 static void 5567 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 5568 { 5569 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 5570 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 5571 opts->in_capsule_data_size = 0; 5572 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 5573 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 5574 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 5575 opts->num_shared_buffers = 0; 5576 opts->buf_cache_size = 0; 5577 opts->association_timeout = 0; 5578 opts->transport_specific = NULL; 5579 } 5580 5581 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 5582 .name = "VFIOUSER", 5583 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 5584 .opts_init = nvmf_vfio_user_opts_init, 5585 .create = nvmf_vfio_user_create, 5586 .destroy = nvmf_vfio_user_destroy, 5587 5588 .listen = nvmf_vfio_user_listen, 5589 .stop_listen = nvmf_vfio_user_stop_listen, 5590 .cdata_init = nvmf_vfio_user_cdata_init, 5591 
.listen_associate = nvmf_vfio_user_listen_associate, 5592 5593 .listener_discover = nvmf_vfio_user_discover, 5594 5595 .poll_group_create = nvmf_vfio_user_poll_group_create, 5596 .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group, 5597 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 5598 .poll_group_add = nvmf_vfio_user_poll_group_add, 5599 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 5600 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 5601 5602 .req_free = nvmf_vfio_user_req_free, 5603 .req_complete = nvmf_vfio_user_req_complete, 5604 5605 .qpair_fini = nvmf_vfio_user_close_qpair, 5606 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 5607 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 5608 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 5609 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 5610 }; 5611 5612 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 5613 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 5614 SPDK_LOG_REGISTER_COMPONENT(vfio_user_db) 5615
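/*
 * Illustrative usage sketch (not part of this transport): how VFIOUSER is
 * typically wired up via SPDK's JSON-RPC tooling. Paths and flag spellings
 * below are examples and may differ between SPDK versions.
 *
 *   scripts/rpc.py nvmf_create_transport -t VFIOUSER
 *   scripts/rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a
 *   scripts/rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 \
 *       -t VFIOUSER -a /var/run/vfio-user -s 0
 *
 * The traddr passed to the listener becomes endpoint_id(): nvmf_vfio_user_listen()
 * creates the bar0, migr and cntrl files in that directory, and the accept poller
 * then waits for a vfio-user client (for example a QEMU vfio-user-pci device) to
 * connect to the cntrl socket.
 */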