1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved. 3 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 */ 5 6 /* 7 * NVMe over vfio-user transport 8 */ 9 10 #include <vfio-user/libvfio-user.h> 11 #include <vfio-user/pci_defs.h> 12 13 #include "spdk/barrier.h" 14 #include "spdk/stdinc.h" 15 #include "spdk/assert.h" 16 #include "spdk/thread.h" 17 #include "spdk/nvmf_transport.h" 18 #include "spdk/sock.h" 19 #include "spdk/string.h" 20 #include "spdk/util.h" 21 #include "spdk/log.h" 22 23 #include "transport.h" 24 25 #include "nvmf_internal.h" 26 27 #define SWAP(x, y) \ 28 do \ 29 { \ 30 typeof(x) _tmp = x; \ 31 x = y; \ 32 y = _tmp; \ 33 } while (0) 34 35 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 36 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 37 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 38 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 39 40 #define NVME_DOORBELLS_OFFSET 0x1000 41 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000 42 #define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2 43 #define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3 44 #define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX 45 46 /* 47 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 48 * available on PCI-X 2.0 and PCI Express buses 49 */ 50 #define NVME_REG_CFG_SIZE 0x1000 51 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 52 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8) 53 #define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 54 /* MSIX Table Size */ 55 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 56 /* MSIX Pending Bit Array Size */ 57 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000) 58 59 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 60 61 struct nvmf_vfio_user_req; 62 63 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 64 65 /* 1 more for PRP2 list itself */ 66 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 67 68 enum nvmf_vfio_user_req_state { 69 VFIO_USER_REQUEST_STATE_FREE = 0, 70 VFIO_USER_REQUEST_STATE_EXECUTING, 71 }; 72 73 /* 74 * Support for live migration in NVMf/vfio-user: live migration is implemented 75 * by stopping the NVMf subsystem when the device is instructed to enter the 76 * stop-and-copy state and then trivially, and most importantly safely, 77 * collecting migration state and providing it to the vfio-user client. We 78 * don't provide any migration state at the pre-copy state as that's too 79 * complicated to do, we might support this in the future. 80 */ 81 82 83 /* NVMe device state representation */ 84 struct nvme_migr_sq_state { 85 uint16_t sqid; 86 uint16_t cqid; 87 uint32_t head; 88 uint32_t size; 89 uint32_t reserved; 90 uint64_t dma_addr; 91 }; 92 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 93 94 struct nvme_migr_cq_state { 95 uint16_t cqid; 96 uint16_t phase; 97 uint32_t tail; 98 uint32_t size; 99 uint32_t iv; 100 uint32_t ien; 101 uint32_t reserved; 102 uint64_t dma_addr; 103 }; 104 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 105 106 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 107 108 /* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned. 
 *
 * NVMe device migration region is defined as below:
 * -------------------------------------------------------------------------
 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs |
 * -------------------------------------------------------------------------
 *
 * Keep vfio_user_nvme_migr_header at a fixed length of 0x1000 bytes; any newly
 * added fields should use the reserved space at the end of the data structure.
 */
struct vfio_user_nvme_migr_header {
	/* Magic value to validate migration data */
	uint32_t	magic;
	/* Version used to check that the data is the same from source to destination */
	uint32_t	version;

	/* The library uses this field to know how many fields in this
	 * structure are valid, starting at the beginning of this data
	 * structure. Newly added fields should use the `unused` memory
	 * space.
	 */
	uint32_t	opts_size;
	uint32_t	reserved0;

	/* BARs information */
	uint64_t	bar_offset[VFU_PCI_DEV_NUM_REGIONS];
	uint64_t	bar_len[VFU_PCI_DEV_NUM_REGIONS];

	/* Queue pair start offset, starting at the beginning of this
	 * data structure.
	 */
	uint64_t	qp_offset;
	uint64_t	qp_len;

	/* Controller data structure */
	uint32_t	num_io_queues;
	uint32_t	reserved1;

	/* TODO: this part will be moved to common nvmf controller data */
	uint16_t	reserved2[3];
	uint16_t	nr_aers;
	uint16_t	aer_cids[NVMF_MIGR_MAX_PENDING_AERS];

	/* NVMf controller data offset and length, if they exist, starting at
	 * the beginning of this data structure.
	 */
	uint64_t	nvmf_data_offset;
	uint64_t	nvmf_data_len;

	/*
	 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA
	 * address.
	 */
	bool		sdbl;

	/* Shadow doorbell DMA addresses. */
	uint64_t	shadow_doorbell_buffer;
	uint64_t	eventidx_buffer;

	/* Reserved memory space for newly added fields; this field is always
	 * kept at the end of this data structure.
	 */
	uint8_t		unused[3336];
};
SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");

struct vfio_user_nvme_migr_qp {
	struct nvme_migr_sq_state	sq;
	struct nvme_migr_cq_state	cq;
};

/* NVMe state definition used to load/restore from/to NVMe migration BAR region */
struct vfio_user_nvme_migr_state {
	struct vfio_user_nvme_migr_header	ctrlr_header;
	struct nvmf_ctrlr_migr_data		nvmf_data;
	struct vfio_user_nvme_migr_qp		qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	uint8_t					bar0[NVME_REG_BAR0_SIZE];
	uint8_t					cfg[NVME_REG_CFG_SIZE];
};

struct nvmf_vfio_user_req {
	struct spdk_nvmf_request		req;
	struct spdk_nvme_cpl			rsp;
	struct spdk_nvme_cmd			cmd;

	enum nvmf_vfio_user_req_state		state;
	nvmf_vfio_user_req_cb_fn		cb_fn;
	void					*cb_arg;

	/* old CC before prop_set_cc fabric command */
	union spdk_nvme_cc_register		cc;

	TAILQ_ENTRY(nvmf_vfio_user_req)		link;

	struct iovec				iov[NVMF_VFIO_USER_MAX_IOVECS];
	uint8_t					iovcnt;

	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
	uint8_t					sg[];
};

/*
 * Mapping of an NVMe queue.
 *
 * This holds the information tracking a local process mapping of an NVMe queue
 * shared by the client.
 */
struct nvme_q_mapping {
	/* iov of local process mapping. */
	struct iovec iov;
	/* Stored sg, needed for unmap. */
	dma_sg_t *sg;
	/* Client PRP of queue. */
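	/* (With this transport the queue memory is always physically contiguous:
	 * the admin queue comes from the ASQ/ACQ registers and Create I/O SQ/CQ
	 * commands are rejected unless PC=1, so a single PRP entry describes the
	 * whole queue.)
	 */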
	uint64_t prp1;
};

enum nvmf_vfio_user_sq_state {
	VFIO_USER_SQ_UNUSED = 0,
	VFIO_USER_SQ_CREATED,
	VFIO_USER_SQ_DELETED,
	VFIO_USER_SQ_ACTIVE,
	VFIO_USER_SQ_INACTIVE
};

enum nvmf_vfio_user_cq_state {
	VFIO_USER_CQ_UNUSED = 0,
	VFIO_USER_CQ_CREATED,
	VFIO_USER_CQ_DELETED,
};

enum nvmf_vfio_user_ctrlr_state {
	VFIO_USER_CTRLR_CREATING = 0,
	VFIO_USER_CTRLR_RUNNING,
	/* Quiesce requested by libvfio-user */
	VFIO_USER_CTRLR_PAUSING,
	/* The NVMf subsystem is paused; in this state it is safe to do a PCI
	 * reset, memory register, memory unregister, and a vfio migration
	 * state transition.
	 */
	VFIO_USER_CTRLR_PAUSED,
	/*
	 * Implies that the NVMf subsystem is paused. The device will be
	 * unquiesced (PCI reset, memory register and unregister, or the
	 * controller in the destination VM has been restored), and an NVMf
	 * subsystem resume has been requested.
	 */
	VFIO_USER_CTRLR_RESUMING,
	/*
	 * Implies that the NVMf subsystem is paused. Both the controller in the
	 * source VM and the controller in the destination VM are in this state
	 * during live migration.
	 */
	VFIO_USER_CTRLR_MIGRATING
};

struct nvmf_vfio_user_sq {
	struct spdk_nvmf_qpair			qpair;
	struct spdk_nvmf_transport_poll_group	*group;
	struct nvmf_vfio_user_ctrlr		*ctrlr;

	uint32_t				qid;
	/* Number of entries in queue. */
	uint32_t				size;
	struct nvme_q_mapping			mapping;
	enum nvmf_vfio_user_sq_state		sq_state;

	uint32_t				head;
	volatile uint32_t			*dbl_tailp;

	/* Whether a shadow doorbell eventidx needs setting. */
	bool					need_rearm;

	/* multiple SQs can be mapped to the same CQ */
	uint16_t				cqid;

	/* handle_queue_connect_rsp() is used both for the CREATE IO SQ response
	 * and for the SQ re-connect response in the destination VM. In the
	 * former case we post an NVMe completion to the VM; we do not set this
	 * flag when re-connecting SQs in the destination VM.
	 */
	bool					post_create_io_sq_completion;
	/* Copy of the Create IO SQ command; this field is used together with
	 * the `post_create_io_sq_completion` flag.
	 */
	struct spdk_nvme_cmd			create_io_sq_cmd;

	/* Currently unallocated reqs. */
	TAILQ_HEAD(, nvmf_vfio_user_req)	free_reqs;
	/* Poll group entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq)		link;
	/* Connected SQ entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq)		tailq;
};

struct nvmf_vfio_user_cq {
	struct spdk_nvmf_transport_poll_group	*group;
	struct spdk_thread			*thread;
	uint32_t				cq_ref;

	uint32_t				qid;
	/* Number of entries in queue. */
	uint32_t				size;
	struct nvme_q_mapping			mapping;
	enum nvmf_vfio_user_cq_state		cq_state;

	uint32_t				tail;
	volatile uint32_t			*dbl_headp;

	bool					phase;

	uint16_t				iv;
	bool					ien;

	uint32_t				last_head;
	uint32_t				last_trigger_irq_tail;
};

struct nvmf_vfio_user_poll_group {
	struct spdk_nvmf_transport_poll_group	group;
	TAILQ_ENTRY(nvmf_vfio_user_poll_group)	link;
	TAILQ_HEAD(, nvmf_vfio_user_sq)		sqs;
};

struct nvmf_vfio_user_shadow_doorbells {
	volatile uint32_t			*shadow_doorbells;
	volatile uint32_t			*eventidxs;
	dma_sg_t				*sgs;
	struct iovec				*iovs;
};

struct nvmf_vfio_user_ctrlr {
	struct nvmf_vfio_user_endpoint		*endpoint;
	struct nvmf_vfio_user_transport		*transport;

	/* Connected SQs list */
	TAILQ_HEAD(, nvmf_vfio_user_sq)		connected_sqs;
	enum nvmf_vfio_user_ctrlr_state		state;

	/*
	 * Tells whether live migration data have been prepared. This is used
	 * by the get_pending_bytes callback to tell whether or not the
	 * previous iteration finished.
	 */
	bool					migr_data_prepared;

	/* Controller is in the source VM when doing live migration */
	bool					in_source_vm;

	struct spdk_thread			*thread;
	struct spdk_poller			*vfu_ctx_poller;
	struct spdk_interrupt			*intr;
	int					intr_fd;

	bool					queued_quiesce;

	bool					reset_shn;

	uint16_t				cntlid;
	struct spdk_nvmf_ctrlr			*ctrlr;

	struct nvmf_vfio_user_sq		*sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	struct nvmf_vfio_user_cq		*cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];

	TAILQ_ENTRY(nvmf_vfio_user_ctrlr)	link;

	volatile uint32_t			*bar0_doorbells;
	struct nvmf_vfio_user_shadow_doorbells	*sdbl;
	/*
	 * Shadow doorbell PRPs to provide during the stop-and-copy state.
	 */
	uint64_t				shadow_doorbell_buffer;
	uint64_t				eventidx_buffer;

	bool					adaptive_irqs_enabled;
	bool					kick_requested;
};

/* An endpoint in vfio-user is associated with a socket file, which
 * is the representation of a PCI endpoint.
 */
struct nvmf_vfio_user_endpoint {
	struct nvmf_vfio_user_transport		*transport;
	vfu_ctx_t				*vfu_ctx;
	struct spdk_poller			*accept_poller;
	struct spdk_thread			*accept_thread;
	bool					interrupt_mode;
	struct msixcap				*msix;
	vfu_pci_config_space_t			*pci_config_space;
	int					devmem_fd;
	int					accept_intr_fd;
	struct spdk_interrupt			*accept_intr;

	volatile uint32_t			*bar0_doorbells;

	int					migr_fd;
	void					*migr_data;

	struct spdk_nvme_transport_id		trid;
	struct spdk_nvmf_subsystem		*subsystem;

	/* The controller is associated with an active socket connection; the
	 * lifecycle of the controller is the same as that of the VM.
	 * Currently we only support one active connection. As the NVMe
	 * specification allows it, we may support multiple controllers in the
	 * future, e.g. to support RESERVATION.
	 */
	struct nvmf_vfio_user_ctrlr		*ctrlr;
	pthread_mutex_t				lock;

	bool					need_async_destroy;
	/* The subsystem is in the PAUSED state and needs to be resumed; TRUE
	 * only when migration has completed successfully and the controller is
	 * in the source VM.
418 */ 419 bool need_resume; 420 421 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 422 }; 423 424 struct nvmf_vfio_user_transport_opts { 425 bool disable_mappable_bar0; 426 bool disable_adaptive_irq; 427 bool disable_shadow_doorbells; 428 bool disable_compare; 429 }; 430 431 struct nvmf_vfio_user_transport { 432 struct spdk_nvmf_transport transport; 433 struct nvmf_vfio_user_transport_opts transport_opts; 434 bool intr_mode_supported; 435 pthread_mutex_t lock; 436 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 437 438 pthread_mutex_t pg_lock; 439 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 440 struct nvmf_vfio_user_poll_group *next_pg; 441 }; 442 443 /* 444 * function prototypes 445 */ 446 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 447 448 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 449 450 /* 451 * Local process virtual address of a queue. 452 */ 453 static inline void * 454 q_addr(struct nvme_q_mapping *mapping) 455 { 456 return mapping->iov.iov_base; 457 } 458 459 static inline int 460 queue_index(uint16_t qid, bool is_cq) 461 { 462 return (qid * 2) + is_cq; 463 } 464 465 static inline volatile uint32_t * 466 sq_headp(struct nvmf_vfio_user_sq *sq) 467 { 468 assert(sq != NULL); 469 return &sq->head; 470 } 471 472 static inline volatile uint32_t * 473 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 474 { 475 assert(sq != NULL); 476 return sq->dbl_tailp; 477 } 478 479 static inline volatile uint32_t * 480 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 481 { 482 assert(cq != NULL); 483 return cq->dbl_headp; 484 } 485 486 static inline volatile uint32_t * 487 cq_tailp(struct nvmf_vfio_user_cq *cq) 488 { 489 assert(cq != NULL); 490 return &cq->tail; 491 } 492 493 static inline void 494 sq_head_advance(struct nvmf_vfio_user_sq *sq) 495 { 496 assert(sq != NULL); 497 498 assert(*sq_headp(sq) < sq->size); 499 (*sq_headp(sq))++; 500 501 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 502 *sq_headp(sq) = 0; 503 } 504 } 505 506 static inline void 507 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 508 { 509 assert(cq != NULL); 510 511 assert(*cq_tailp(cq) < cq->size); 512 (*cq_tailp(cq))++; 513 514 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 515 *cq_tailp(cq) = 0; 516 cq->phase = !cq->phase; 517 } 518 } 519 520 /* 521 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 522 * control: if there is no space in the CQ, we should wait until there is. 523 * 524 * In practice, we just fail the controller instead: as it happens, all host 525 * implementations we care about right-size the CQ: this is required anyway for 526 * NVMEoF support (see 3.3.2.8). 527 * 528 * Since reading the head doorbell is relatively expensive, we use the cached 529 * value, so we only have to read it for real if it appears that we are full. 
530 */ 531 static inline bool 532 cq_is_full(struct nvmf_vfio_user_cq *cq) 533 { 534 uint32_t qindex; 535 536 assert(cq != NULL); 537 538 qindex = *cq_tailp(cq) + 1; 539 if (spdk_unlikely(qindex == cq->size)) { 540 qindex = 0; 541 } 542 543 if (qindex != cq->last_head) { 544 return false; 545 } 546 547 cq->last_head = *cq_dbl_headp(cq); 548 549 return qindex == cq->last_head; 550 } 551 552 static bool 553 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 554 { 555 assert(vu_ctrlr != NULL); 556 557 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 558 return false; 559 } 560 561 if (is_cq) { 562 if (vu_ctrlr->cqs[qid] == NULL) { 563 return false; 564 } 565 566 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 567 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 568 } 569 570 if (vu_ctrlr->sqs[qid] == NULL) { 571 return false; 572 } 573 574 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 575 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 576 } 577 578 /* Return the poll group for the admin queue of the controller. */ 579 static inline struct nvmf_vfio_user_poll_group * 580 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 581 { 582 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 583 struct nvmf_vfio_user_poll_group, 584 group); 585 } 586 587 static inline struct spdk_thread * 588 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 589 { 590 return vu_pg->group.group->thread; 591 } 592 593 static dma_sg_t * 594 index_to_sg_t(void *arr, size_t i) 595 { 596 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 597 } 598 599 static inline size_t 600 vfio_user_migr_data_len(void) 601 { 602 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 603 } 604 605 static int vfio_user_ctrlr_intr(void *ctx); 606 607 /* 608 * Wrap vfio_user_ctrlr_intr() such that it can be used with 609 * spdk_thread_send_msg(). 610 * Pollers have type int (*)(void *) while message functions should have type 611 * void (*)(void *), so simply discard the returned value. 612 */ 613 static void 614 vfio_user_ctrlr_intr_wrapper(void *ctx) 615 { 616 vfio_user_ctrlr_intr(ctx); 617 } 618 619 /* 620 * Arrange for this controller to immediately wake up and process everything. 621 */ 622 static inline int 623 ctrlr_kick(struct nvmf_vfio_user_ctrlr *ctrlr) 624 { 625 assert(ctrlr != NULL); 626 assert(ctrlr->thread != NULL); 627 628 if (ctrlr->kick_requested) { 629 return 0; 630 } 631 632 ctrlr->kick_requested = true; 633 634 return spdk_thread_send_msg(ctrlr->thread, 635 vfio_user_ctrlr_intr_wrapper, 636 ctrlr); 637 } 638 639 /* 640 * Make the given DMA address and length available (locally mapped) via iov. 
641 */ 642 static void * 643 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 644 struct iovec *iov, int prot) 645 { 646 int ret; 647 648 assert(ctx != NULL); 649 assert(sg != NULL); 650 assert(iov != NULL); 651 652 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 653 if (ret < 0) { 654 return NULL; 655 } 656 657 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 658 if (ret != 0) { 659 return NULL; 660 } 661 662 assert(iov->iov_base != NULL); 663 return iov->iov_base; 664 } 665 666 static int 667 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 668 uint32_t max_iovcnt, uint32_t len, size_t mps, 669 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 670 { 671 uint64_t prp1, prp2; 672 void *vva; 673 uint32_t i; 674 uint32_t residue_len, nents; 675 uint64_t *prp_list; 676 uint32_t iovcnt; 677 678 assert(max_iovcnt > 0); 679 680 prp1 = cmd->dptr.prp.prp1; 681 prp2 = cmd->dptr.prp.prp2; 682 683 /* PRP1 may started with unaligned page address */ 684 residue_len = mps - (prp1 % mps); 685 residue_len = spdk_min(len, residue_len); 686 687 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 688 if (spdk_unlikely(vva == NULL)) { 689 SPDK_ERRLOG("GPA to VVA failed\n"); 690 return -EINVAL; 691 } 692 len -= residue_len; 693 if (len && max_iovcnt < 2) { 694 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 695 return -ERANGE; 696 } 697 iovs[0].iov_base = vva; 698 iovs[0].iov_len = residue_len; 699 700 if (len) { 701 if (spdk_unlikely(prp2 == 0)) { 702 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 703 return -EINVAL; 704 } 705 706 if (len <= mps) { 707 /* 2 PRP used */ 708 iovcnt = 2; 709 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 710 if (spdk_unlikely(vva == NULL)) { 711 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 712 prp2, len); 713 return -EINVAL; 714 } 715 iovs[1].iov_base = vva; 716 iovs[1].iov_len = len; 717 } else { 718 /* PRP list used */ 719 nents = (len + mps - 1) / mps; 720 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 721 SPDK_ERRLOG("Too many page entries\n"); 722 return -ERANGE; 723 } 724 725 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 726 if (spdk_unlikely(vva == NULL)) { 727 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 728 prp2, nents); 729 return -EINVAL; 730 } 731 prp_list = vva; 732 i = 0; 733 while (len != 0) { 734 residue_len = spdk_min(len, mps); 735 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 736 if (spdk_unlikely(vva == NULL)) { 737 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 738 prp_list[i], residue_len); 739 return -EINVAL; 740 } 741 iovs[i + 1].iov_base = vva; 742 iovs[i + 1].iov_len = residue_len; 743 len -= residue_len; 744 i++; 745 } 746 iovcnt = i + 1; 747 } 748 } else { 749 /* 1 PRP used */ 750 iovcnt = 1; 751 } 752 753 assert(iovcnt <= max_iovcnt); 754 return iovcnt; 755 } 756 757 static int 758 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 759 struct iovec *iovs, uint32_t max_iovcnt, 760 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 761 { 762 uint32_t i; 763 void *vva; 764 765 if (spdk_unlikely(max_iovcnt < num_sgls)) { 766 return -ERANGE; 767 } 768 769 for (i = 0; i < num_sgls; i++) { 770 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 771 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 772 return -EINVAL; 773 } 774 vva = gpa_to_vva(prv, sgls[i].address, 
sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 775 if (spdk_unlikely(vva == NULL)) { 776 SPDK_ERRLOG("GPA to VVA failed\n"); 777 return -EINVAL; 778 } 779 iovs[i].iov_base = vva; 780 iovs[i].iov_len = sgls[i].unkeyed.length; 781 } 782 783 return num_sgls; 784 } 785 786 static int 787 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 788 uint32_t len, size_t mps, 789 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 790 { 791 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 792 uint32_t num_sgls, seg_len; 793 void *vva; 794 int ret; 795 uint32_t total_iovcnt = 0; 796 797 /* SGL cases */ 798 sgl = &cmd->dptr.sgl1; 799 800 /* only one SGL segment */ 801 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 802 assert(max_iovcnt > 0); 803 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 804 if (spdk_unlikely(vva == NULL)) { 805 SPDK_ERRLOG("GPA to VVA failed\n"); 806 return -EINVAL; 807 } 808 iovs[0].iov_base = vva; 809 iovs[0].iov_len = sgl->unkeyed.length; 810 assert(sgl->unkeyed.length == len); 811 812 return 1; 813 } 814 815 for (;;) { 816 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 817 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 818 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 819 return -EINVAL; 820 } 821 822 seg_len = sgl->unkeyed.length; 823 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 824 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 825 return -EINVAL; 826 } 827 828 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 829 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 830 if (spdk_unlikely(vva == NULL)) { 831 SPDK_ERRLOG("GPA to VVA failed\n"); 832 return -EINVAL; 833 } 834 835 /* sgl point to the first segment */ 836 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 837 last_sgl = &sgl[num_sgls - 1]; 838 839 /* we are done */ 840 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 841 /* map whole sgl list */ 842 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 843 max_iovcnt - total_iovcnt, gpa_to_vva); 844 if (spdk_unlikely(ret < 0)) { 845 return ret; 846 } 847 total_iovcnt += ret; 848 849 return total_iovcnt; 850 } 851 852 if (num_sgls > 1) { 853 /* map whole sgl exclude last_sgl */ 854 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 855 max_iovcnt - total_iovcnt, gpa_to_vva); 856 if (spdk_unlikely(ret < 0)) { 857 return ret; 858 } 859 total_iovcnt += ret; 860 } 861 862 /* move to next level's segments */ 863 sgl = last_sgl; 864 } 865 866 return 0; 867 } 868 869 static int 870 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 871 uint32_t len, size_t mps, 872 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 873 { 874 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 875 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 876 } 877 878 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 879 } 880 881 static char * 882 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 883 { 884 return endpoint->trid.traddr; 885 } 886 887 static char * 888 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 889 { 890 if (!ctrlr || !ctrlr->endpoint) { 891 return "Null Ctrlr"; 892 } 893 894 return endpoint_id(ctrlr->endpoint); 895 } 896 897 /* 898 * For each queue, update the location of its doorbell to the correct location: 899 
* either our own BAR0, or the guest's configured shadow doorbell area. 900 * 901 * The Admin queue (qid: 0) does not ever use shadow doorbells. 902 */ 903 static void 904 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 905 { 906 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 907 ctrlr->bar0_doorbells; 908 909 assert(doorbells != NULL); 910 911 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 912 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 913 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 914 915 if (sq != NULL) { 916 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 917 } 918 919 if (cq != NULL) { 920 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 921 } 922 } 923 } 924 925 static void 926 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 927 { 928 assert(vfu_ctx != NULL); 929 assert(sdbl != NULL); 930 931 /* 932 * An allocation error would result in only one of the two being 933 * non-NULL. If that is the case, no memory should have been mapped. 934 */ 935 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 936 return; 937 } 938 939 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 940 struct iovec *iov; 941 dma_sg_t *sg; 942 943 if (!sdbl->iovs[i].iov_len) { 944 continue; 945 } 946 947 sg = index_to_sg_t(sdbl->sgs, i); 948 iov = sdbl->iovs + i; 949 950 vfu_sgl_put(vfu_ctx, sg, iov, 1); 951 } 952 } 953 954 static void 955 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 956 { 957 if (sdbl == NULL) { 958 return; 959 } 960 961 unmap_sdbl(vfu_ctx, sdbl); 962 963 /* 964 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 965 * not allocated, so don't free() them. 966 */ 967 free(sdbl->sgs); 968 free(sdbl->iovs); 969 free(sdbl); 970 } 971 972 static struct nvmf_vfio_user_shadow_doorbells * 973 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 974 { 975 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 976 dma_sg_t *sg2 = NULL; 977 void *p; 978 979 assert(vfu_ctx != NULL); 980 981 sdbl = calloc(1, sizeof(*sdbl)); 982 if (sdbl == NULL) { 983 goto err; 984 } 985 986 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 987 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 988 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 989 goto err; 990 } 991 992 /* Map shadow doorbell buffer (PRP1). */ 993 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, 994 PROT_READ | PROT_WRITE); 995 996 if (p == NULL) { 997 goto err; 998 } 999 1000 /* 1001 * Map eventidx buffer (PRP2). 1002 * Should only be written to by the controller. 1003 */ 1004 1005 sg2 = index_to_sg_t(sdbl->sgs, 1); 1006 1007 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, 1008 PROT_READ | PROT_WRITE); 1009 1010 if (p == NULL) { 1011 goto err; 1012 } 1013 1014 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1015 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1016 1017 return sdbl; 1018 1019 err: 1020 free_sdbl(vfu_ctx, sdbl); 1021 return NULL; 1022 } 1023 1024 /* 1025 * Copy doorbells from one buffer to the other, during switches betweeen BAR0 1026 * doorbells and shadow doorbells. 
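 * (Existing queues keep their current doorbell values across the switch; also,
 * as noted in handle_doorbell_buffer_config(), older Linux NVMe drivers do not
 * clear the shadow buffer after a controller reset, so the destination buffer
 * cannot be assumed to start out zeroed.)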
1027 */ 1028 static void 1029 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1030 const volatile uint32_t *from, volatile uint32_t *to) 1031 { 1032 assert(ctrlr != NULL); 1033 assert(from != NULL); 1034 assert(to != NULL); 1035 1036 SPDK_DEBUGLOG(vfio_user_db, 1037 "%s: migrating shadow doorbells from %p to %p\n", 1038 ctrlr_id(ctrlr), from, to); 1039 1040 /* Can't use memcpy because it doesn't respect volatile semantics. */ 1041 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1042 if (ctrlr->sqs[i] != NULL) { 1043 to[queue_index(i, false)] = from[queue_index(i, false)]; 1044 } 1045 1046 if (ctrlr->cqs[i] != NULL) { 1047 to[queue_index(i, true)] = from[queue_index(i, true)]; 1048 } 1049 } 1050 } 1051 1052 static void 1053 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1054 { 1055 const struct spdk_nvmf_registers *regs; 1056 1057 assert(vu_ctrlr != NULL); 1058 assert(vu_ctrlr->ctrlr != NULL); 1059 1060 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1061 if (regs->csts.bits.cfs == 0) { 1062 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1063 } 1064 1065 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1066 } 1067 1068 static inline bool 1069 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1070 { 1071 assert(vu_ctrlr != NULL); 1072 assert(vu_ctrlr->endpoint != NULL); 1073 1074 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1075 1076 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1077 } 1078 1079 static void 1080 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1081 { 1082 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1083 1084 spdk_interrupt_unregister(&endpoint->accept_intr); 1085 spdk_poller_unregister(&endpoint->accept_poller); 1086 1087 if (endpoint->bar0_doorbells) { 1088 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1089 } 1090 1091 if (endpoint->devmem_fd > 0) { 1092 close(endpoint->devmem_fd); 1093 } 1094 1095 if (endpoint->migr_data) { 1096 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1097 } 1098 1099 if (endpoint->migr_fd > 0) { 1100 close(endpoint->migr_fd); 1101 } 1102 1103 if (endpoint->vfu_ctx) { 1104 vfu_destroy_ctx(endpoint->vfu_ctx); 1105 } 1106 1107 pthread_mutex_destroy(&endpoint->lock); 1108 free(endpoint); 1109 } 1110 1111 /* called when process exits */ 1112 static int 1113 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1114 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1115 { 1116 struct nvmf_vfio_user_transport *vu_transport; 1117 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1118 1119 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1120 1121 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1122 transport); 1123 1124 pthread_mutex_destroy(&vu_transport->lock); 1125 pthread_mutex_destroy(&vu_transport->pg_lock); 1126 1127 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1128 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1129 nvmf_vfio_user_destroy_endpoint(endpoint); 1130 } 1131 1132 free(vu_transport); 1133 1134 if (cb_fn) { 1135 cb_fn(cb_arg); 1136 } 1137 1138 return 0; 1139 } 1140 1141 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1142 { 1143 "disable_mappable_bar0", 1144 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1145 spdk_json_decode_bool, true 1146 }, 1147 { 1148 "disable_adaptive_irq", 1149 offsetof(struct 
nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1150 spdk_json_decode_bool, true 1151 }, 1152 { 1153 "disable_shadow_doorbells", 1154 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1155 spdk_json_decode_bool, true 1156 }, 1157 { 1158 "disable_compare", 1159 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1160 spdk_json_decode_bool, true 1161 }, 1162 }; 1163 1164 static struct spdk_nvmf_transport * 1165 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1166 { 1167 struct nvmf_vfio_user_transport *vu_transport; 1168 int err; 1169 1170 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1171 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1172 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1173 return NULL; 1174 } 1175 1176 vu_transport = calloc(1, sizeof(*vu_transport)); 1177 if (vu_transport == NULL) { 1178 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1179 return NULL; 1180 } 1181 1182 err = pthread_mutex_init(&vu_transport->lock, NULL); 1183 if (err != 0) { 1184 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1185 goto err; 1186 } 1187 TAILQ_INIT(&vu_transport->endpoints); 1188 1189 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1190 if (err != 0) { 1191 pthread_mutex_destroy(&vu_transport->lock); 1192 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1193 goto err; 1194 } 1195 TAILQ_INIT(&vu_transport->poll_groups); 1196 1197 if (opts->transport_specific != NULL && 1198 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1199 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1200 vu_transport)) { 1201 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1202 goto cleanup; 1203 } 1204 1205 /* 1206 * To support interrupt mode, the transport must be configured with 1207 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1208 * when a client writes new doorbell values to BAR0, via the 1209 * libvfio-user socket fd. 1210 */ 1211 vu_transport->intr_mode_supported = 1212 vu_transport->transport_opts.disable_mappable_bar0; 1213 1214 /* 1215 * If BAR0 is mappable, it doesn't make sense to support shadow 1216 * doorbells, so explicitly turn it off. 1217 */ 1218 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1219 vu_transport->transport_opts.disable_shadow_doorbells = true; 1220 } 1221 1222 /* 1223 * If we are in interrupt mode, we cannot support adaptive IRQs, as 1224 * there is no guarantee the SQ poller will run subsequently to send 1225 * pending IRQs. 
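	 * (With adaptive IRQs, post_completion() skips triggering the interrupt
	 * for I/O CQs and leaves it to be raised later from the poller, hence
	 * the dependency on the poller running again.)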
1226 */ 1227 if (spdk_interrupt_mode_is_enabled()) { 1228 vu_transport->transport_opts.disable_adaptive_irq = true; 1229 } 1230 1231 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1232 vu_transport->transport_opts.disable_mappable_bar0); 1233 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1234 vu_transport->transport_opts.disable_adaptive_irq); 1235 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1236 vu_transport->transport_opts.disable_shadow_doorbells); 1237 1238 return &vu_transport->transport; 1239 1240 cleanup: 1241 pthread_mutex_destroy(&vu_transport->lock); 1242 pthread_mutex_destroy(&vu_transport->pg_lock); 1243 err: 1244 free(vu_transport); 1245 return NULL; 1246 } 1247 1248 static uint32_t 1249 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1250 { 1251 assert(vu_ctrlr != NULL); 1252 assert(vu_ctrlr->ctrlr != NULL); 1253 1254 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1255 } 1256 1257 static uint32_t 1258 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1259 { 1260 assert(vu_ctrlr != NULL); 1261 assert(vu_ctrlr->ctrlr != NULL); 1262 1263 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1264 } 1265 1266 static uintptr_t 1267 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1268 { 1269 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1270 return 1ul << memory_page_shift; 1271 } 1272 1273 static uintptr_t 1274 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1275 { 1276 return ~(memory_page_size(ctrlr) - 1); 1277 } 1278 1279 static int 1280 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1281 uint32_t q_size, bool is_cq, bool unmap) 1282 { 1283 uint64_t len; 1284 void *ret; 1285 1286 assert(q_size); 1287 assert(q_addr(mapping) == NULL); 1288 1289 if (is_cq) { 1290 len = q_size * sizeof(struct spdk_nvme_cpl); 1291 } else { 1292 len = q_size * sizeof(struct spdk_nvme_cmd); 1293 } 1294 1295 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1296 mapping->sg, &mapping->iov, 1297 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1298 if (ret == NULL) { 1299 return -EFAULT; 1300 } 1301 1302 if (unmap) { 1303 memset(q_addr(mapping), 0, len); 1304 } 1305 1306 return 0; 1307 } 1308 1309 static inline void 1310 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1311 { 1312 if (q_addr(mapping) != NULL) { 1313 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1314 &mapping->iov, 1); 1315 mapping->iov.iov_base = NULL; 1316 } 1317 } 1318 1319 static int 1320 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1321 { 1322 struct nvmf_vfio_user_sq *sq; 1323 const struct spdk_nvmf_registers *regs; 1324 int ret; 1325 1326 assert(ctrlr != NULL); 1327 1328 sq = ctrlr->sqs[0]; 1329 1330 assert(sq != NULL); 1331 assert(q_addr(&sq->mapping) == NULL); 1332 /* XXX ctrlr->asq == 0 is a valid memory address */ 1333 1334 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1335 sq->qid = 0; 1336 sq->size = regs->aqa.bits.asqs + 1; 1337 sq->mapping.prp1 = regs->asq; 1338 *sq_headp(sq) = 0; 1339 sq->cqid = 0; 1340 1341 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1342 if (ret) { 1343 return ret; 1344 } 1345 1346 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1347 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1348 1349 *sq_dbl_tailp(sq) = 0; 1350 1351 return 0; 1352 } 1353 1354 /* 1355 * Updates eventidx to set an SQ into interrupt or polling mode. 
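 *
 * Per the NVMe shadow-doorbell scheme, the host only writes the real BAR0
 * doorbell when the new tail value passes the eventidx we publish here.
 * Setting the eventidx to the current tail therefore guarantees that the next
 * submission causes a BAR0 write (and hence a vfio-user message that wakes us
 * up), while setting it to NVMF_VFIO_USER_EVENTIDX_POLL (UINT32_MAX, a value
 * the tail can never pass) means the host never writes BAR0 and we rely on
 * polling instead.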
1356 * 1357 * Returns false if the current SQ tail does not match the SQ head, as 1358 * this means that the host has submitted more items to the queue while we were 1359 * not looking - or during the event index update. In that case, we must retry, 1360 * or otherwise make sure we are going to wake up again. 1361 */ 1362 static bool 1363 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1364 { 1365 struct nvmf_vfio_user_ctrlr *ctrlr; 1366 volatile uint32_t *sq_tail_eidx; 1367 uint32_t old_tail, new_tail; 1368 1369 assert(sq != NULL); 1370 assert(sq->ctrlr != NULL); 1371 assert(sq->ctrlr->sdbl != NULL); 1372 assert(sq->need_rearm); 1373 1374 ctrlr = sq->ctrlr; 1375 1376 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1377 ctrlr_id(ctrlr), sq->qid); 1378 1379 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1380 1381 assert(ctrlr->endpoint != NULL); 1382 1383 if (!ctrlr->endpoint->interrupt_mode) { 1384 /* No synchronisation necessary. */ 1385 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1386 return true; 1387 } 1388 1389 old_tail = *sq_dbl_tailp(sq); 1390 *sq_tail_eidx = old_tail; 1391 1392 /* 1393 * Ensure that the event index is updated before re-reading the tail 1394 * doorbell. If it's not, then the host might race us and update the 1395 * tail after the second read but before the event index is written, so 1396 * it won't write to BAR0 and we'll miss the update. 1397 * 1398 * The driver should provide similar ordering with an mb(). 1399 */ 1400 spdk_mb(); 1401 1402 /* 1403 * Check if the host has updated the tail doorbell after we've read it 1404 * for the first time, but before the event index was written. If that's 1405 * the case, then we've lost the race and we need to update the event 1406 * index again (after polling the queue, since the host won't write to 1407 * BAR0). 1408 */ 1409 new_tail = *sq_dbl_tailp(sq); 1410 1411 /* 1412 * We might poll the queue straight after this function returns if the 1413 * tail has been updated, so we need to ensure that any changes to the 1414 * queue will be visible to us if the doorbell has been updated. 1415 * 1416 * The driver should provide similar ordering with a wmb() to ensure 1417 * that the queue is written before it updates the tail doorbell. 1418 */ 1419 spdk_rmb(); 1420 1421 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1422 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1423 new_tail, *sq_headp(sq)); 1424 1425 if (new_tail == *sq_headp(sq)) { 1426 sq->need_rearm = false; 1427 return true; 1428 } 1429 1430 /* 1431 * We've lost the race: the tail was updated since we last polled, 1432 * including if it happened within this routine. 1433 * 1434 * The caller should retry after polling (think of this as a cmpxchg 1435 * loop); if we go to sleep while the SQ is not empty, then we won't 1436 * process the remaining events. 1437 */ 1438 return false; 1439 } 1440 1441 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1442 1443 /* 1444 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1445 * processed some SQ entries. 1446 */ 1447 static int 1448 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1449 struct nvmf_vfio_user_sq *sq) 1450 { 1451 int count = 0; 1452 size_t i; 1453 1454 assert(sq->need_rearm); 1455 1456 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1457 int ret; 1458 1459 if (set_sq_eventidx(sq)) { 1460 /* We won the race and set eventidx; done. 
*/ 1461 return count; 1462 } 1463 1464 ret = nvmf_vfio_user_sq_poll(sq); 1465 1466 count += (ret < 0) ? 1 : ret; 1467 1468 /* 1469 * set_sq_eventidx() hit the race, so we expected 1470 * to process at least one command from this queue. 1471 * If there were no new commands waiting for us, then 1472 * we must have hit an unexpected race condition. 1473 */ 1474 if (ret == 0) { 1475 SPDK_ERRLOG("%s: unexpected race condition detected " 1476 "while updating the shadow doorbell buffer\n", 1477 ctrlr_id(ctrlr)); 1478 1479 fail_ctrlr(ctrlr); 1480 return count; 1481 } 1482 } 1483 1484 SPDK_DEBUGLOG(vfio_user_db, 1485 "%s: set_sq_eventidx() lost the race %zu times\n", 1486 ctrlr_id(ctrlr), i); 1487 1488 /* 1489 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1490 * we raced with the producer too many times; force ourselves to wake up 1491 * instead. We'll process all queues at that point. 1492 */ 1493 ctrlr_kick(ctrlr); 1494 1495 return count; 1496 } 1497 1498 /* 1499 * We're in interrupt mode, and potentially about to go to sleep. We need to 1500 * make sure any further I/O submissions are guaranteed to wake us up: for 1501 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1502 * every SQ that needs re-arming. 1503 * 1504 * Returns non-zero if we processed something. 1505 */ 1506 static int 1507 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1508 { 1509 struct nvmf_vfio_user_sq *sq; 1510 int count = 0; 1511 1512 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1513 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1514 continue; 1515 } 1516 1517 if (sq->need_rearm) { 1518 count += vfio_user_sq_rearm(sq->ctrlr, sq); 1519 } 1520 } 1521 1522 return count; 1523 } 1524 1525 static int 1526 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1527 { 1528 struct nvmf_vfio_user_cq *cq; 1529 const struct spdk_nvmf_registers *regs; 1530 int ret; 1531 1532 assert(ctrlr != NULL); 1533 1534 cq = ctrlr->cqs[0]; 1535 1536 assert(cq != NULL); 1537 1538 assert(q_addr(&cq->mapping) == NULL); 1539 1540 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1541 assert(regs != NULL); 1542 cq->qid = 0; 1543 cq->size = regs->aqa.bits.acqs + 1; 1544 cq->mapping.prp1 = regs->acq; 1545 *cq_tailp(cq) = 0; 1546 cq->ien = true; 1547 cq->phase = true; 1548 1549 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1550 if (ret) { 1551 return ret; 1552 } 1553 1554 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
*/ 1555 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1556 1557 *cq_dbl_headp(cq) = 0; 1558 1559 return 0; 1560 } 1561 1562 static void * 1563 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 1564 { 1565 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1566 struct spdk_nvmf_qpair *qpair; 1567 struct nvmf_vfio_user_req *vu_req; 1568 struct nvmf_vfio_user_sq *sq; 1569 void *ret; 1570 1571 assert(req != NULL); 1572 qpair = req->qpair; 1573 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1574 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1575 1576 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1577 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1578 index_to_sg_t(vu_req->sg, vu_req->iovcnt), 1579 &vu_req->iov[vu_req->iovcnt], prot); 1580 if (spdk_likely(ret != NULL)) { 1581 vu_req->iovcnt++; 1582 } 1583 return ret; 1584 } 1585 1586 static int 1587 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1588 struct iovec *iov, uint32_t length) 1589 { 1590 /* Map PRP list to from Guest physical memory to 1591 * virtual memory address. 1592 */ 1593 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1594 length, 4096, _map_one); 1595 } 1596 1597 static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1598 struct nvmf_vfio_user_sq *sq); 1599 1600 /* 1601 * Posts a CQE in the completion queue. 1602 * 1603 * @ctrlr: the vfio-user controller 1604 * @cq: the completion queue 1605 * @cdw0: cdw0 as reported by NVMf 1606 * @sqid: submission queue ID 1607 * @cid: command identifier in NVMe command 1608 * @sc: the NVMe CQE status code 1609 * @sct: the NVMe CQE status code type 1610 */ 1611 static int 1612 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1613 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1614 { 1615 struct spdk_nvme_status cpl_status = { 0 }; 1616 struct spdk_nvme_cpl *cpl; 1617 int err; 1618 1619 assert(ctrlr != NULL); 1620 1621 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1622 return 0; 1623 } 1624 1625 if (cq_is_full(cq)) { 1626 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1627 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1628 *cq_dbl_headp(cq)); 1629 return -1; 1630 } 1631 1632 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1633 1634 assert(ctrlr->sqs[sqid] != NULL); 1635 SPDK_DEBUGLOG(nvmf_vfio, 1636 "%s: request complete sqid:%d cid=%d status=%#x " 1637 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1638 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1639 1640 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1641 cpl->sqid = sqid; 1642 cpl->cid = cid; 1643 cpl->cdw0 = cdw0; 1644 1645 /* 1646 * This is a bitfield: instead of setting the individual bits we need 1647 * directly in cpl->status, which would cause a read-modify-write cycle, 1648 * we'll avoid reading from the CPL altogether by filling in a local 1649 * cpl_status variable, then writing the whole thing. 1650 */ 1651 cpl_status.sct = sct; 1652 cpl_status.sc = sc; 1653 cpl_status.p = cq->phase; 1654 cpl->status = cpl_status; 1655 1656 /* Ensure the Completion Queue Entry is visible. 
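	 * The host detects a new entry by polling the phase bit in the CQE
	 * status (written last above), so the whole CQE must be visible before
	 * we advance our tail and possibly trigger an interrupt.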
*/ 1657 spdk_wmb(); 1658 cq_tail_advance(cq); 1659 1660 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1661 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1662 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1663 if (err != 0) { 1664 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1665 ctrlr_id(ctrlr)); 1666 return err; 1667 } 1668 } 1669 1670 return 0; 1671 } 1672 1673 static void 1674 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1675 { 1676 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1677 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1678 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1679 free(vu_req); 1680 } 1681 } 1682 1683 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1684 * and the controller is being shut down or reset, then the CQ is 1685 * also deleted. 1686 */ 1687 static void 1688 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1689 { 1690 struct nvmf_vfio_user_cq *cq; 1691 uint16_t cqid; 1692 1693 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1694 sq->qid, sq); 1695 1696 /* Free SQ resources */ 1697 unmap_q(vu_ctrlr, &sq->mapping); 1698 1699 free_sq_reqs(sq); 1700 1701 sq->size = 0; 1702 1703 sq->sq_state = VFIO_USER_SQ_DELETED; 1704 1705 /* Controller RESET and SHUTDOWN are special cases, 1706 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1707 * will disconnect IO queue pairs. 1708 */ 1709 if (vu_ctrlr->reset_shn) { 1710 cqid = sq->cqid; 1711 cq = vu_ctrlr->cqs[cqid]; 1712 1713 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1714 cq->qid, cq); 1715 1716 if (cq->cq_ref) { 1717 cq->cq_ref--; 1718 } 1719 if (cq->cq_ref == 0) { 1720 unmap_q(vu_ctrlr, &cq->mapping); 1721 cq->size = 0; 1722 cq->cq_state = VFIO_USER_CQ_DELETED; 1723 cq->group = NULL; 1724 } 1725 } 1726 } 1727 1728 static void 1729 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1730 { 1731 struct nvmf_vfio_user_sq *sq; 1732 struct nvmf_vfio_user_cq *cq; 1733 1734 if (ctrlr == NULL) { 1735 return; 1736 } 1737 1738 sq = ctrlr->sqs[qid]; 1739 if (sq) { 1740 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid); 1741 unmap_q(ctrlr, &sq->mapping); 1742 1743 free_sq_reqs(sq); 1744 1745 free(sq->mapping.sg); 1746 free(sq); 1747 ctrlr->sqs[qid] = NULL; 1748 } 1749 1750 cq = ctrlr->cqs[qid]; 1751 if (cq) { 1752 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1753 unmap_q(ctrlr, &cq->mapping); 1754 free(cq->mapping.sg); 1755 free(cq); 1756 ctrlr->cqs[qid] = NULL; 1757 } 1758 } 1759 1760 static int 1761 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1762 const uint16_t id) 1763 { 1764 struct nvmf_vfio_user_sq *sq; 1765 1766 assert(ctrlr != NULL); 1767 assert(transport != NULL); 1768 assert(ctrlr->sqs[id] == NULL); 1769 1770 sq = calloc(1, sizeof(*sq)); 1771 if (sq == NULL) { 1772 return -ENOMEM; 1773 } 1774 sq->mapping.sg = calloc(1, dma_sg_size()); 1775 if (sq->mapping.sg == NULL) { 1776 free(sq); 1777 return -ENOMEM; 1778 } 1779 1780 sq->qid = id; 1781 sq->qpair.qid = id; 1782 sq->qpair.transport = transport; 1783 sq->ctrlr = ctrlr; 1784 ctrlr->sqs[id] = sq; 1785 1786 TAILQ_INIT(&sq->free_reqs); 1787 1788 return 0; 1789 } 1790 1791 static int 1792 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1793 { 1794 struct nvmf_vfio_user_cq *cq; 1795 1796 assert(vu_ctrlr != NULL); 1797 assert(vu_ctrlr->cqs[id] == NULL); 1798 1799 cq = calloc(1, sizeof(*cq)); 1800 if (cq == NULL) { 
1801 return -ENOMEM; 1802 } 1803 cq->mapping.sg = calloc(1, dma_sg_size()); 1804 if (cq->mapping.sg == NULL) { 1805 free(cq); 1806 return -ENOMEM; 1807 } 1808 1809 cq->qid = id; 1810 vu_ctrlr->cqs[id] = cq; 1811 1812 return 0; 1813 } 1814 1815 static int 1816 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1817 { 1818 struct nvmf_vfio_user_req *vu_req, *tmp; 1819 size_t req_size; 1820 uint32_t i; 1821 1822 req_size = sizeof(struct nvmf_vfio_user_req) + 1823 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1824 1825 for (i = 0; i < sq->size; i++) { 1826 struct spdk_nvmf_request *req; 1827 1828 vu_req = calloc(1, req_size); 1829 if (vu_req == NULL) { 1830 goto err; 1831 } 1832 1833 req = &vu_req->req; 1834 req->qpair = &sq->qpair; 1835 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1836 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1837 req->stripped_data = NULL; 1838 1839 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1840 } 1841 1842 return 0; 1843 1844 err: 1845 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1846 free(vu_req); 1847 } 1848 return -ENOMEM; 1849 } 1850 1851 static volatile uint32_t * 1852 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 1853 { 1854 return ctrlr->sdbl != NULL ? 1855 ctrlr->sdbl->shadow_doorbells : 1856 ctrlr->bar0_doorbells; 1857 } 1858 1859 static uint16_t 1860 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 1861 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1862 { 1863 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1864 struct nvmf_vfio_user_sq *sq; 1865 uint32_t qsize; 1866 uint16_t cqid; 1867 uint16_t qid; 1868 int err; 1869 1870 qid = cmd->cdw10_bits.create_io_q.qid; 1871 cqid = cmd->cdw11_bits.create_io_sq.cqid; 1872 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1873 1874 if (ctrlr->sqs[qid] == NULL) { 1875 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 1876 if (err != 0) { 1877 *sct = SPDK_NVME_SCT_GENERIC; 1878 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1879 } 1880 } 1881 1882 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1883 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 1884 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1885 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1886 } 1887 1888 /* CQ must be created before SQ. 
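	 * (The NVMe spec requires the CQ referenced by CQID to already exist;
	 * otherwise the command fails with Completion Queue Invalid, as below.)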
*/ 1889 if (!io_q_exists(ctrlr, cqid, true)) { 1890 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 1891 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1892 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 1893 } 1894 1895 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 1896 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 1897 *sct = SPDK_NVME_SCT_GENERIC; 1898 return SPDK_NVME_SC_INVALID_FIELD; 1899 } 1900 1901 sq = ctrlr->sqs[qid]; 1902 sq->size = qsize; 1903 1904 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 1905 qid, cqid); 1906 1907 sq->mapping.prp1 = cmd->dptr.prp.prp1; 1908 1909 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1910 if (err) { 1911 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1912 *sct = SPDK_NVME_SCT_GENERIC; 1913 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1914 } 1915 1916 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 1917 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1918 q_addr(&sq->mapping)); 1919 1920 err = alloc_sq_reqs(ctrlr, sq); 1921 if (err < 0) { 1922 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 1923 *sct = SPDK_NVME_SCT_GENERIC; 1924 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1925 } 1926 1927 sq->cqid = cqid; 1928 ctrlr->cqs[sq->cqid]->cq_ref++; 1929 sq->sq_state = VFIO_USER_SQ_CREATED; 1930 *sq_headp(sq) = 0; 1931 1932 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 1933 1934 /* 1935 * We should always reset the doorbells. 1936 * 1937 * The Specification prohibits the controller from writing to the shadow 1938 * doorbell buffer, however older versions of the Linux NVMe driver 1939 * don't reset the shadow doorbell buffer after a Queue-Level or 1940 * Controller-Level reset, which means that we're left with garbage 1941 * doorbell values. 1942 */ 1943 *sq_dbl_tailp(sq) = 0; 1944 1945 if (ctrlr->sdbl != NULL) { 1946 sq->need_rearm = true; 1947 1948 if (!set_sq_eventidx(sq)) { 1949 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 1950 "sqid:%hu was initialized\n", 1951 ctrlr_id(ctrlr), qid); 1952 fail_ctrlr(ctrlr); 1953 *sct = SPDK_NVME_SCT_GENERIC; 1954 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1955 } 1956 } 1957 1958 /* 1959 * Create our new I/O qpair. This asynchronously invokes, on a suitable 1960 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 1961 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 1962 * connect command. This command is then eventually completed via 1963 * handle_queue_connect_rsp(). 
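	 * Because that completion is asynchronous, we stash a copy of the
	 * Create IO SQ command and set post_create_io_sq_completion below;
	 * handle_queue_connect_rsp() posts the CQE once the qpair is connected.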
1964 */ 1965 sq->create_io_sq_cmd = *cmd; 1966 sq->post_create_io_sq_completion = true; 1967 1968 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 1969 &sq->qpair); 1970 1971 *sct = SPDK_NVME_SCT_GENERIC; 1972 return SPDK_NVME_SC_SUCCESS; 1973 } 1974 1975 static uint16_t 1976 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 1977 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1978 { 1979 struct nvmf_vfio_user_cq *cq; 1980 uint32_t qsize; 1981 uint16_t qid; 1982 int err; 1983 1984 qid = cmd->cdw10_bits.create_io_q.qid; 1985 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1986 1987 if (ctrlr->cqs[qid] == NULL) { 1988 err = init_cq(ctrlr, qid); 1989 if (err != 0) { 1990 *sct = SPDK_NVME_SCT_GENERIC; 1991 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1992 } 1993 } 1994 1995 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 1996 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 1997 *sct = SPDK_NVME_SCT_GENERIC; 1998 return SPDK_NVME_SC_INVALID_FIELD; 1999 } 2000 2001 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 2002 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2003 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2004 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2005 } 2006 2007 cq = ctrlr->cqs[qid]; 2008 cq->size = qsize; 2009 2010 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2011 2012 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2013 2014 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2015 if (err) { 2016 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2017 *sct = SPDK_NVME_SCT_GENERIC; 2018 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2019 } 2020 2021 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2022 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2023 q_addr(&cq->mapping)); 2024 2025 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2026 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2027 cq->phase = true; 2028 cq->cq_state = VFIO_USER_CQ_CREATED; 2029 2030 *cq_tailp(cq) = 0; 2031 2032 /* 2033 * We should always reset the doorbells. 2034 * 2035 * The Specification prohibits the controller from writing to the shadow 2036 * doorbell buffer, however older versions of the Linux NVMe driver 2037 * don't reset the shadow doorbell buffer after a Queue-Level or 2038 * Controller-Level reset, which means that we're left with garbage 2039 * doorbell values. 2040 */ 2041 *cq_dbl_headp(cq) = 0; 2042 2043 *sct = SPDK_NVME_SCT_GENERIC; 2044 return SPDK_NVME_SC_SUCCESS; 2045 } 2046 2047 /* 2048 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2049 * on error. 2050 */ 2051 static int 2052 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2053 struct spdk_nvme_cmd *cmd, const bool is_cq) 2054 { 2055 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2056 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2057 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2058 uint32_t qsize; 2059 uint16_t qid; 2060 2061 assert(ctrlr != NULL); 2062 assert(cmd != NULL); 2063 2064 qid = cmd->cdw10_bits.create_io_q.qid; 2065 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2066 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2067 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2068 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2069 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2070 goto out; 2071 } 2072 2073 if (io_q_exists(ctrlr, qid, is_cq)) { 2074 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2075 is_cq ? 
'c' : 's', qid); 2076 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2077 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2078 goto out; 2079 } 2080 2081 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2082 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2083 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2084 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2085 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2086 goto out; 2087 } 2088 2089 if (is_cq) { 2090 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2091 } else { 2092 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2093 2094 if (sct == SPDK_NVME_SCT_GENERIC && 2095 sc == SPDK_NVME_SC_SUCCESS) { 2096 /* Completion posted asynchronously. */ 2097 return 0; 2098 } 2099 } 2100 2101 out: 2102 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2103 } 2104 2105 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2106 * queue pair, so save the command in a context. 2107 */ 2108 struct vfio_user_delete_sq_ctx { 2109 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2110 struct spdk_nvme_cmd delete_io_sq_cmd; 2111 }; 2112 2113 static void 2114 vfio_user_qpair_delete_cb(void *cb_arg) 2115 { 2116 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2117 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2118 2119 post_completion(vu_ctrlr, vu_ctrlr->cqs[0], 0, 0, ctx->delete_io_sq_cmd.cid, 2120 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2121 free(ctx); 2122 } 2123 2124 /* 2125 * Deletes a completion or submission I/O queue. 2126 */ 2127 static int 2128 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2129 struct spdk_nvme_cmd *cmd, const bool is_cq) 2130 { 2131 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2132 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2133 struct nvmf_vfio_user_sq *sq; 2134 struct nvmf_vfio_user_cq *cq; 2135 struct vfio_user_delete_sq_ctx *ctx; 2136 2137 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2138 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2139 cmd->cdw10_bits.delete_io_q.qid); 2140 2141 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2142 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2143 is_cq ? 'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2144 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2145 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2146 goto out; 2147 } 2148 2149 if (is_cq) { 2150 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2151 if (cq->cq_ref) { 2152 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2153 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2154 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2155 goto out; 2156 } 2157 2158 unmap_q(ctrlr, &cq->mapping); 2159 cq->size = 0; 2160 cq->cq_state = VFIO_USER_CQ_DELETED; 2161 cq->group = NULL; 2162 } else { 2163 ctx = calloc(1, sizeof(*ctx)); 2164 if (!ctx) { 2165 sct = SPDK_NVME_SCT_GENERIC; 2166 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2167 goto out; 2168 } 2169 ctx->vu_ctrlr = ctrlr; 2170 ctx->delete_io_sq_cmd = *cmd; 2171 2172 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2173 sq->sq_state = VFIO_USER_SQ_DELETED; 2174 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2175 ctrlr->cqs[sq->cqid]->cq_ref--; 2176 2177 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 2178 return 0; 2179 } 2180 2181 out: 2182 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2183 } 2184 2185 /* 2186 * Configures Shadow Doorbells. 
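 *
 * Doorbell Buffer Config (admin opcode 0x7C) hands us two page-aligned
 * guest buffers: PRP1 holds the shadow doorbells, PRP2 the event indexes.
 * An illustrative host-side invocation (not code from this driver) looks
 * roughly like:
 *
 *     struct spdk_nvme_cmd dbc = {};
 *     dbc.opc = SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG;
 *     dbc.dptr.prp.prp1 = shadow_doorbells_gpa;  // page aligned
 *     dbc.dptr.prp.prp2 = eventidxs_gpa;         // page aligned
 *
 * The handler below validates these constraints (PRPs present, distinct,
 * page-aligned) before mapping the buffers into our address space.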
2187 */ 2188 static int 2189 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2190 { 2191 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2192 uint32_t dstrd; 2193 uintptr_t page_size, page_mask; 2194 uint64_t prp1, prp2; 2195 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2196 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2197 2198 assert(ctrlr != NULL); 2199 assert(ctrlr->endpoint != NULL); 2200 assert(cmd != NULL); 2201 2202 dstrd = doorbell_stride(ctrlr); 2203 page_size = memory_page_size(ctrlr); 2204 page_mask = memory_page_mask(ctrlr); 2205 2206 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2207 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2208 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2209 ctrlr_id(ctrlr)); 2210 2211 goto out; 2212 } 2213 2214 /* Verify guest physical addresses passed as PRPs. */ 2215 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2216 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2217 ctrlr_id(ctrlr)); 2218 2219 goto out; 2220 } 2221 2222 prp1 = cmd->dptr.prp.prp1; 2223 prp2 = cmd->dptr.prp.prp2; 2224 2225 SPDK_DEBUGLOG(nvmf_vfio, 2226 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2227 ctrlr_id(ctrlr), prp1, prp2); 2228 2229 if (prp1 == prp2 2230 || prp1 != (prp1 & page_mask) 2231 || prp2 != (prp2 & page_mask)) { 2232 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2233 ctrlr_id(ctrlr)); 2234 2235 goto out; 2236 } 2237 2238 /* Map guest physical addresses to our virtual address space. */ 2239 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2240 if (sdbl == NULL) { 2241 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2242 ctrlr_id(ctrlr)); 2243 2244 goto out; 2245 } 2246 2247 ctrlr->shadow_doorbell_buffer = prp1; 2248 ctrlr->eventidx_buffer = prp2; 2249 2250 SPDK_DEBUGLOG(nvmf_vfio, 2251 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2252 ctrlr_id(ctrlr), 2253 sdbl->iovs[0].iov_base, 2254 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2255 sdbl->iovs[1].iov_base, 2256 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2257 2258 2259 /* 2260 * Set all possible CQ head doorbells to polling mode now, such that we 2261 * don't have to worry about it later if the host creates more queues. 2262 * 2263 * We only ever want interrupts for writes to the SQ tail doorbells 2264 * (which are initialised in set_ctrlr_intr_mode() below). 2265 */ 2266 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2267 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2268 if (ctrlr->sqs[i] != NULL) { 2269 ctrlr->sqs[i]->need_rearm = true; 2270 } 2271 } 2272 2273 /* Update controller. */ 2274 SWAP(ctrlr->sdbl, sdbl); 2275 2276 /* 2277 * Copy doorbells from either the previous shadow doorbell buffer or the 2278 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2279 * 2280 * This needs to account for older versions of the Linux NVMe driver, 2281 * which don't clear out the buffer after a controller reset. 2282 */ 2283 copy_doorbells(ctrlr, sdbl != NULL ? 2284 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2285 ctrlr->sdbl->shadow_doorbells); 2286 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2287 2288 /* Update event index buffer and poll queues if necessary. 
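 *
 * For reference, the effect of an event index: the host only needs to ring
 * the BAR0 doorbell when a shadow doorbell update moves past it.  The usual
 * host-side check (cf. Linux's nvme_dbbuf_need_event(); shown here only as
 * an illustrative sketch, with 32-bit doorbell values) is:
 *
 *     static inline bool
 *     need_mmio_doorbell(uint32_t event_idx, uint32_t new, uint32_t old)
 *     {
 *             return (uint32_t)(new - event_idx - 1) < (uint32_t)(new - old);
 *     }
 *
 * Setting the CQ head event indexes to NVMF_VFIO_USER_EVENTIDX_POLL above
 * means this effectively never triggers for CQ heads, i.e. we poll them.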
*/ 2289 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 2290 2291 sc = SPDK_NVME_SC_SUCCESS; 2292 2293 out: 2294 /* 2295 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2296 * more than once (pointless, but not prohibited by the spec), or 2297 * in case of an error. 2298 * 2299 * If this is the first time Doorbell Buffer Config was processed, 2300 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2301 * free_sdbl() becomes a noop. 2302 */ 2303 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2304 2305 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2306 } 2307 2308 /* Returns 0 on success and -errno on error. */ 2309 static int 2310 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2311 { 2312 assert(ctrlr != NULL); 2313 assert(cmd != NULL); 2314 2315 if (cmd->fuse != 0) { 2316 /* Fused admin commands are not supported. */ 2317 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2318 SPDK_NVME_SC_INVALID_FIELD, 2319 SPDK_NVME_SCT_GENERIC); 2320 } 2321 2322 switch (cmd->opc) { 2323 case SPDK_NVME_OPC_CREATE_IO_CQ: 2324 case SPDK_NVME_OPC_CREATE_IO_SQ: 2325 return handle_create_io_q(ctrlr, cmd, 2326 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2327 case SPDK_NVME_OPC_DELETE_IO_SQ: 2328 case SPDK_NVME_OPC_DELETE_IO_CQ: 2329 return handle_del_io_q(ctrlr, cmd, 2330 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2331 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2332 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2333 return handle_doorbell_buffer_config(ctrlr, cmd); 2334 } 2335 /* FALLTHROUGH */ 2336 default: 2337 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2338 } 2339 } 2340 2341 static int 2342 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2343 { 2344 struct nvmf_vfio_user_sq *sq = cb_arg; 2345 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2346 uint16_t sqid, cqid; 2347 2348 assert(sq != NULL); 2349 assert(vu_req != NULL); 2350 assert(vu_ctrlr != NULL); 2351 2352 if (spdk_likely(vu_req->iovcnt)) { 2353 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2354 index_to_sg_t(vu_req->sg, 0), 2355 vu_req->iov, vu_req->iovcnt); 2356 } 2357 sqid = sq->qid; 2358 cqid = sq->cqid; 2359 2360 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2361 vu_req->req.rsp->nvme_cpl.cdw0, 2362 sqid, 2363 vu_req->req.cmd->nvme_cmd.cid, 2364 vu_req->req.rsp->nvme_cpl.status.sc, 2365 vu_req->req.rsp->nvme_cpl.status.sct); 2366 } 2367 2368 static int 2369 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2370 struct spdk_nvme_cmd *cmd) 2371 { 2372 assert(sq != NULL); 2373 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 2374 return consume_admin_cmd(ctrlr, cmd); 2375 } 2376 2377 return handle_cmd_req(ctrlr, cmd, sq); 2378 } 2379 2380 /* Returns the number of commands processed, or a negative value on error. */ 2381 static int 2382 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2383 struct nvmf_vfio_user_sq *sq) 2384 { 2385 struct spdk_nvme_cmd *queue; 2386 int count = 0; 2387 2388 assert(ctrlr != NULL); 2389 assert(sq != NULL); 2390 2391 if (ctrlr->sdbl != NULL) { 2392 /* 2393 * Submission queue index has moved past the event index, so it 2394 * needs to be re-armed before we go to sleep. 
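 *
 * (new_tail is whatever the host last wrote, read either from the BAR0
 * doorbell or from the shadow doorbell buffer.  sq_head_advance() is
 * assumed to be the usual ring step, roughly
 *
 *     *sq_headp(sq) = (*sq_headp(sq) + 1) % sq->size;
 *
 * so the loop below simply drains [head, tail) one command at a time.)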
2395 */ 2396 sq->need_rearm = true; 2397 } 2398 2399 queue = q_addr(&sq->mapping); 2400 while (*sq_headp(sq) != new_tail) { 2401 int err; 2402 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 2403 2404 count++; 2405 2406 /* 2407 * SQHD must contain the new head pointer, so we must increase 2408 * it before we generate a completion. 2409 */ 2410 sq_head_advance(sq); 2411 2412 err = consume_cmd(ctrlr, sq, cmd); 2413 if (err != 0) { 2414 return err; 2415 } 2416 } 2417 2418 return count; 2419 } 2420 2421 static void 2422 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2423 { 2424 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2425 struct nvmf_vfio_user_ctrlr *ctrlr; 2426 struct nvmf_vfio_user_sq *sq; 2427 struct nvmf_vfio_user_cq *cq; 2428 void *map_start, *map_end; 2429 int ret; 2430 2431 /* 2432 * We're not interested in any DMA regions that aren't mappable (we don't 2433 * support clients that don't share their memory). 2434 */ 2435 if (!info->vaddr) { 2436 return; 2437 } 2438 2439 map_start = info->mapping.iov_base; 2440 map_end = info->mapping.iov_base + info->mapping.iov_len; 2441 2442 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2443 (info->mapping.iov_len & MASK_2MB)) { 2444 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2445 info->vaddr, map_start, map_end); 2446 return; 2447 } 2448 2449 assert(endpoint != NULL); 2450 if (endpoint->ctrlr == NULL) { 2451 return; 2452 } 2453 ctrlr = endpoint->ctrlr; 2454 2455 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2456 map_start, map_end); 2457 2458 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2459 * check the protection bits before registering. 2460 */ 2461 if (info->prot == (PROT_WRITE | PROT_READ)) { 2462 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2463 if (ret) { 2464 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2465 map_start, map_end, ret); 2466 } 2467 } 2468 2469 pthread_mutex_lock(&endpoint->lock); 2470 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2471 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2472 continue; 2473 } 2474 2475 cq = ctrlr->cqs[sq->cqid]; 2476 2477 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2478 if (cq->size && q_addr(&cq->mapping) == NULL) { 2479 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2480 if (ret) { 2481 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2482 cq->qid, cq->mapping.prp1, 2483 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2484 continue; 2485 } 2486 } 2487 2488 if (sq->size) { 2489 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2490 if (ret) { 2491 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2492 sq->qid, sq->mapping.prp1, 2493 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2494 continue; 2495 } 2496 } 2497 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2498 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2499 } 2500 pthread_mutex_unlock(&endpoint->lock); 2501 } 2502 2503 static void 2504 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2505 { 2506 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2507 struct nvmf_vfio_user_sq *sq; 2508 struct nvmf_vfio_user_cq *cq; 2509 void *map_start, *map_end; 2510 int ret = 0; 2511 2512 if (!info->vaddr) { 2513 return; 2514 } 2515 2516 map_start = info->mapping.iov_base; 
2517 map_end = info->mapping.iov_base + info->mapping.iov_len; 2518 2519 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2520 (info->mapping.iov_len & MASK_2MB)) { 2521 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2522 info->vaddr, map_start, map_end); 2523 return; 2524 } 2525 2526 assert(endpoint != NULL); 2527 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2528 map_start, map_end); 2529 2530 if (endpoint->ctrlr != NULL) { 2531 struct nvmf_vfio_user_ctrlr *ctrlr; 2532 ctrlr = endpoint->ctrlr; 2533 2534 pthread_mutex_lock(&endpoint->lock); 2535 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2536 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2537 unmap_q(ctrlr, &sq->mapping); 2538 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2539 } 2540 2541 cq = ctrlr->cqs[sq->cqid]; 2542 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2543 unmap_q(ctrlr, &cq->mapping); 2544 } 2545 } 2546 2547 if (ctrlr->sdbl != NULL) { 2548 size_t i; 2549 2550 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2551 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2552 2553 if (iov_base >= map_start && iov_base < map_end) { 2554 copy_doorbells(ctrlr, 2555 ctrlr->sdbl->shadow_doorbells, 2556 ctrlr->bar0_doorbells); 2557 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2558 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2559 ctrlr->sdbl = NULL; 2560 break; 2561 } 2562 } 2563 } 2564 2565 pthread_mutex_unlock(&endpoint->lock); 2566 } 2567 2568 if (info->prot == (PROT_WRITE | PROT_READ)) { 2569 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2570 if (ret) { 2571 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2572 map_start, map_end, ret); 2573 } 2574 } 2575 } 2576 2577 /* Used to initiate a controller-level reset or a controller shutdown. */ 2578 static void 2579 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2580 { 2581 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2582 ctrlr_id(vu_ctrlr)); 2583 2584 /* Unmap Admin queue. */ 2585 2586 assert(vu_ctrlr->sqs[0] != NULL); 2587 assert(vu_ctrlr->cqs[0] != NULL); 2588 2589 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2590 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2591 2592 vu_ctrlr->sqs[0]->size = 0; 2593 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2594 2595 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2596 2597 vu_ctrlr->cqs[0]->size = 0; 2598 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2599 2600 /* 2601 * For PCIe controller reset or shutdown, we will drop all AER 2602 * responses. 2603 */ 2604 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2605 2606 /* Free the shadow doorbell buffer. */ 2607 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2608 vu_ctrlr->sdbl = NULL; 2609 } 2610 2611 /* Used to re-enable the controller after a controller-level reset. 
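 *
 * On the CC.EN 0 -> 1 transition the admin queues are re-created from the
 * current register values; acq_setup()/asq_setup() called below are assumed
 * to do roughly the following (illustrative only, field names per
 * spdk_nvme_registers):
 *
 *     sq->size         = regs->aqa.bits.asqs + 1;
 *     sq->mapping.prp1 = regs->asq;
 *     cq->size         = regs->aqa.bits.acqs + 1;
 *     cq->mapping.prp1 = regs->acq;
 *     // followed by map_q() on each, as for I/O queues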
*/ 2612 static int 2613 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2614 { 2615 int err; 2616 2617 assert(vu_ctrlr != NULL); 2618 2619 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2620 ctrlr_id(vu_ctrlr)); 2621 2622 err = acq_setup(vu_ctrlr); 2623 if (err != 0) { 2624 return err; 2625 } 2626 2627 err = asq_setup(vu_ctrlr); 2628 if (err != 0) { 2629 return err; 2630 } 2631 2632 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2633 2634 return 0; 2635 } 2636 2637 static int 2638 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2639 { 2640 struct nvmf_vfio_user_sq *sq = cb_arg; 2641 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2642 int ret; 2643 2644 assert(sq != NULL); 2645 assert(req != NULL); 2646 2647 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2648 assert(sq->ctrlr != NULL); 2649 assert(req != NULL); 2650 2651 memcpy(req->req.data, 2652 &req->req.rsp->prop_get_rsp.value.u64, 2653 req->req.length); 2654 } else { 2655 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2656 assert(sq->ctrlr != NULL); 2657 vu_ctrlr = sq->ctrlr; 2658 2659 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 2660 union spdk_nvme_cc_register cc, diff; 2661 2662 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2663 diff.raw = cc.raw ^ req->cc.raw; 2664 2665 if (diff.bits.en) { 2666 if (cc.bits.en) { 2667 ret = enable_ctrlr(vu_ctrlr); 2668 if (ret) { 2669 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2670 return ret; 2671 } 2672 vu_ctrlr->reset_shn = false; 2673 } else { 2674 vu_ctrlr->reset_shn = true; 2675 } 2676 } 2677 2678 if (diff.bits.shn) { 2679 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2680 vu_ctrlr->reset_shn = true; 2681 } 2682 } 2683 2684 if (vu_ctrlr->reset_shn) { 2685 disable_ctrlr(vu_ctrlr); 2686 } 2687 } 2688 } 2689 2690 return 0; 2691 } 2692 2693 /* 2694 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2695 * doorbell is written via access_bar0_fn(). 2696 * 2697 * DSTRD is set to fixed value 0 for NVMf. 2698 * 2699 */ 2700 static int 2701 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2702 const size_t count, loff_t pos, const bool is_write) 2703 { 2704 assert(ctrlr != NULL); 2705 assert(buf != NULL); 2706 2707 if (!is_write) { 2708 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2709 ctrlr_id(ctrlr), pos); 2710 errno = EPERM; 2711 return -1; 2712 } 2713 2714 if (count != sizeof(uint32_t)) { 2715 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2716 ctrlr_id(ctrlr), count); 2717 errno = EINVAL; 2718 return -1; 2719 } 2720 2721 pos -= NVME_DOORBELLS_OFFSET; 2722 2723 /* pos must be dword aligned */ 2724 if ((pos & 0x3) != 0) { 2725 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2726 errno = EINVAL; 2727 return -1; 2728 } 2729 2730 /* convert byte offset to array index */ 2731 pos >>= 2; 2732 2733 if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) { 2734 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2735 errno = EINVAL; 2736 return -1; 2737 } 2738 2739 ctrlr->bar0_doorbells[pos] = *buf; 2740 spdk_wmb(); 2741 2742 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2743 ctrlr_id(ctrlr), (pos & 1) ? 
"cqid" : "sqid", 2744 pos / 2, *buf); 2745 2746 2747 return 0; 2748 } 2749 2750 static size_t 2751 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2752 char *buf, size_t count, loff_t pos, 2753 bool is_write) 2754 { 2755 struct nvmf_vfio_user_req *req; 2756 const struct spdk_nvmf_registers *regs; 2757 2758 if ((count != 4) && (count != 8)) { 2759 errno = EINVAL; 2760 return -1; 2761 } 2762 2763 /* Construct a Fabric Property Get/Set command and send it */ 2764 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2765 if (req == NULL) { 2766 errno = ENOBUFS; 2767 return -1; 2768 } 2769 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2770 req->cc.raw = regs->cc.raw; 2771 2772 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2773 req->cb_arg = vu_ctrlr->sqs[0]; 2774 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2775 req->req.cmd->prop_set_cmd.cid = 0; 2776 if (count == 4) { 2777 req->req.cmd->prop_set_cmd.attrib.size = 0; 2778 } else { 2779 req->req.cmd->prop_set_cmd.attrib.size = 1; 2780 } 2781 req->req.cmd->prop_set_cmd.ofst = pos; 2782 if (is_write) { 2783 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2784 if (req->req.cmd->prop_set_cmd.attrib.size) { 2785 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2786 } else { 2787 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2788 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2789 } 2790 } else { 2791 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2792 } 2793 req->req.length = count; 2794 req->req.data = buf; 2795 2796 spdk_nvmf_request_exec_fabrics(&req->req); 2797 2798 return count; 2799 } 2800 2801 static ssize_t 2802 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2803 bool is_write) 2804 { 2805 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2806 struct nvmf_vfio_user_ctrlr *ctrlr; 2807 int ret; 2808 2809 ctrlr = endpoint->ctrlr; 2810 if (endpoint->need_async_destroy || !ctrlr) { 2811 errno = EIO; 2812 return -1; 2813 } 2814 2815 if (pos >= NVME_DOORBELLS_OFFSET) { 2816 /* 2817 * The fact that the doorbells can be memory mapped doesn't mean 2818 * that the client (VFIO in QEMU) is obliged to memory map them, 2819 * it might still elect to access them via regular read/write; 2820 * we might also have had disable_mappable_bar0 set. 
2821 */ 2822 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2823 pos, is_write); 2824 if (ret == 0) { 2825 return count; 2826 } 2827 return ret; 2828 } 2829 2830 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 2831 } 2832 2833 static ssize_t 2834 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 2835 bool is_write) 2836 { 2837 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2838 2839 if (is_write) { 2840 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 2841 endpoint_id(endpoint), offset, offset + count); 2842 errno = EINVAL; 2843 return -1; 2844 } 2845 2846 if (offset + count > NVME_REG_CFG_SIZE) { 2847 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 2848 endpoint_id(endpoint), offset, count, 2849 NVME_REG_CFG_SIZE); 2850 errno = ERANGE; 2851 return -1; 2852 } 2853 2854 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 2855 2856 return count; 2857 } 2858 2859 static void 2860 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 2861 { 2862 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2863 2864 if (level >= LOG_DEBUG) { 2865 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2866 } else if (level >= LOG_INFO) { 2867 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2868 } else if (level >= LOG_NOTICE) { 2869 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 2870 } else if (level >= LOG_WARNING) { 2871 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 2872 } else { 2873 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 2874 } 2875 } 2876 2877 static int 2878 vfio_user_get_log_level(void) 2879 { 2880 int level; 2881 2882 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2883 return LOG_DEBUG; 2884 } 2885 2886 level = spdk_log_to_syslog_level(spdk_log_get_level()); 2887 if (level < 0) { 2888 return LOG_ERR; 2889 } 2890 2891 return level; 2892 } 2893 2894 static void 2895 init_pci_config_space(vfu_pci_config_space_t *p) 2896 { 2897 /* MLBAR */ 2898 p->hdr.bars[0].raw = 0x0; 2899 /* MUBAR */ 2900 p->hdr.bars[1].raw = 0x0; 2901 2902 /* vendor specific, let's set them to zero for now */ 2903 p->hdr.bars[3].raw = 0x0; 2904 p->hdr.bars[4].raw = 0x0; 2905 p->hdr.bars[5].raw = 0x0; 2906 2907 /* enable INTx */ 2908 p->hdr.intr.ipin = 0x1; 2909 } 2910 2911 struct ctrlr_quiesce_ctx { 2912 struct nvmf_vfio_user_endpoint *endpoint; 2913 struct nvmf_vfio_user_poll_group *group; 2914 int status; 2915 }; 2916 2917 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 2918 2919 static void 2920 _vfio_user_endpoint_resume_done_msg(void *ctx) 2921 { 2922 struct nvmf_vfio_user_endpoint *endpoint = ctx; 2923 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2924 2925 endpoint->need_resume = false; 2926 2927 if (!vu_ctrlr) { 2928 return; 2929 } 2930 2931 if (!vu_ctrlr->queued_quiesce) { 2932 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 2933 2934 /* 2935 * We might have ignored new SQ entries while we were quiesced: 2936 * kick ourselves so we'll definitely check again while in 2937 * VFIO_USER_CTRLR_RUNNING state. 2938 */ 2939 ctrlr_kick(vu_ctrlr); 2940 return; 2941 } 2942 2943 2944 /* 2945 * Basically, once we call `vfu_device_quiesced` the device is 2946 * unquiesced from libvfio-user's perspective so from the moment 2947 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device 2948 * again. 
However, because the NVMf subsytem is an asynchronous 2949 * operation, this quiesce might come _before_ the NVMf subsystem has 2950 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 2951 * need to check whether a quiesce was requested. 2952 */ 2953 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 2954 ctrlr_id(vu_ctrlr)); 2955 ctrlr_quiesce(vu_ctrlr); 2956 } 2957 2958 static void 2959 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 2960 void *cb_arg, int status) 2961 { 2962 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 2963 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2964 2965 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status); 2966 2967 if (!vu_ctrlr) { 2968 return; 2969 } 2970 2971 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 2972 } 2973 2974 static void 2975 vfio_user_quiesce_done(void *ctx) 2976 { 2977 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 2978 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 2979 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2980 int ret; 2981 2982 if (!vu_ctrlr) { 2983 free(quiesce_ctx); 2984 return; 2985 } 2986 2987 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 2988 2989 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 2990 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 2991 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 2992 vu_ctrlr->queued_quiesce = false; 2993 free(quiesce_ctx); 2994 2995 /* `vfu_device_quiesced` can change the migration state, 2996 * so we need to re-check `vu_ctrlr->state`. 2997 */ 2998 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 2999 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 3000 return; 3001 } 3002 3003 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 3004 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3005 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3006 vfio_user_endpoint_resume_done, endpoint); 3007 if (ret < 0) { 3008 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3009 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3010 } 3011 } 3012 3013 static void 3014 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3015 void *ctx, int status) 3016 { 3017 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3018 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3019 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3020 3021 if (!vu_ctrlr) { 3022 free(quiesce_ctx); 3023 return; 3024 } 3025 3026 quiesce_ctx->status = status; 3027 3028 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3029 ctrlr_id(vu_ctrlr), status); 3030 3031 spdk_thread_send_msg(vu_ctrlr->thread, 3032 vfio_user_quiesce_done, ctx); 3033 } 3034 3035 /* 3036 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3037 * we've already set ctrlr->state, so we won't process new entries, but we need 3038 * to ensure that this PG is quiesced. This only works because there's no 3039 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3040 * 3041 * Once we've walked all PGs, we need to pause any submitted I/O via 3042 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 
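 *
 * The guarantee comes from the message passing itself: vfio_user_quiesce_pg()
 * below runs on each poll group's thread via spdk_thread_send_msg(), so by
 * the time it executes there, any in-progress nvmf_vfio_user_sq_poll() on
 * that thread has already returned and the group can pick up no new entries.
 * The walk is a simple chain:
 *
 *     PG 0 thread -> PG 1 thread -> ... -> last PG thread
 *                                            `-> spdk_nvmf_subsystem_pause()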
3043 */ 3044 static void 3045 vfio_user_quiesce_pg(void *ctx) 3046 { 3047 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3048 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3049 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3050 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3051 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3052 int ret; 3053 3054 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3055 3056 if (!vu_ctrlr) { 3057 free(quiesce_ctx); 3058 return; 3059 } 3060 3061 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3062 if (quiesce_ctx->group != NULL) { 3063 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3064 vfio_user_quiesce_pg, quiesce_ctx); 3065 return; 3066 } 3067 3068 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3069 vfio_user_pause_done, quiesce_ctx); 3070 if (ret < 0) { 3071 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3072 endpoint_id(endpoint), ret); 3073 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3074 fail_ctrlr(vu_ctrlr); 3075 free(quiesce_ctx); 3076 } 3077 } 3078 3079 static void 3080 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3081 { 3082 struct ctrlr_quiesce_ctx *quiesce_ctx; 3083 3084 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3085 3086 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3087 if (!quiesce_ctx) { 3088 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3089 assert(false); 3090 return; 3091 } 3092 3093 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3094 quiesce_ctx->status = 0; 3095 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3096 3097 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3098 vfio_user_quiesce_pg, quiesce_ctx); 3099 } 3100 3101 static int 3102 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3103 { 3104 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3105 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3106 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3107 3108 if (!vu_ctrlr) { 3109 return 0; 3110 } 3111 3112 /* NVMf library will destruct controller when no 3113 * connected queue pairs. 3114 */ 3115 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3116 return 0; 3117 } 3118 3119 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3120 3121 /* There is no race condition here as device quiesce callback 3122 * and nvmf_prop_set_cc() are running in the same thread context. 
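 *
 * The checks below skip the quiesce whenever there is nothing to pause:
 * controller not enabled (CC.EN == 0), not yet ready (CSTS.RDY == 0), or
 * shutdown already complete (CSTS.SHST == SPDK_NVME_SHST_COMPLETE).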
3123 */ 3124 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3125 return 0; 3126 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3127 return 0; 3128 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3129 return 0; 3130 } 3131 3132 switch (vu_ctrlr->state) { 3133 case VFIO_USER_CTRLR_PAUSED: 3134 case VFIO_USER_CTRLR_MIGRATING: 3135 return 0; 3136 case VFIO_USER_CTRLR_RUNNING: 3137 ctrlr_quiesce(vu_ctrlr); 3138 break; 3139 case VFIO_USER_CTRLR_RESUMING: 3140 vu_ctrlr->queued_quiesce = true; 3141 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3142 vu_ctrlr->state); 3143 break; 3144 default: 3145 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3146 break; 3147 } 3148 3149 errno = EBUSY; 3150 return -1; 3151 } 3152 3153 static void 3154 vfio_user_ctrlr_dump_migr_data(const char *name, 3155 struct vfio_user_nvme_migr_state *migr_data, 3156 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3157 { 3158 struct spdk_nvme_registers *regs; 3159 struct nvme_migr_sq_state *sq; 3160 struct nvme_migr_cq_state *cq; 3161 uint32_t *doorbell_base; 3162 uint32_t i; 3163 3164 SPDK_NOTICELOG("Dump %s\n", name); 3165 3166 regs = (struct spdk_nvme_registers *)migr_data->bar0; 3167 doorbell_base = (uint32_t *)®s->doorbell[0].sq_tdbl; 3168 3169 SPDK_NOTICELOG("Registers\n"); 3170 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3171 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3172 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3173 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3174 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3175 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3176 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3177 3178 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3179 3180 if (sdbl != NULL) { 3181 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3182 migr_data->ctrlr_header.shadow_doorbell_buffer); 3183 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3184 migr_data->ctrlr_header.eventidx_buffer); 3185 } 3186 3187 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3188 sq = &migr_data->qps[i].sq; 3189 cq = &migr_data->qps[i].cq; 3190 3191 if (sq->size) { 3192 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3193 if (i > 0 && sdbl != NULL) { 3194 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3195 sq->sqid, 3196 sdbl->shadow_doorbells[queue_index(i, false)], 3197 sdbl->eventidxs[queue_index(i, false)]); 3198 } 3199 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3200 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3201 } 3202 3203 if (cq->size) { 3204 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3205 if (i > 0 && sdbl != NULL) { 3206 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3207 cq->cqid, 3208 sdbl->shadow_doorbells[queue_index(i, true)], 3209 sdbl->eventidxs[queue_index(i, true)]); 3210 } 3211 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3212 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3213 } 3214 } 3215 3216 SPDK_NOTICELOG("%s Dump Done\n", name); 3217 } 3218 3219 /* Read region 9 content and restore it to migration data structures */ 3220 static int 3221 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3222 struct vfio_user_nvme_migr_state *migr_state) 3223 { 3224 void *data_ptr = endpoint->migr_data; 3225 3226 /* Load 
vfio_user_nvme_migr_header first */ 3227 memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3228 /* TODO: version check */ 3229 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3230 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3231 return -EINVAL; 3232 } 3233 3234 /* Load nvmf controller data */ 3235 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3236 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3237 3238 /* Load queue pairs */ 3239 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3240 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3241 3242 /* Load BAR0 */ 3243 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3244 memcpy(&migr_state->bar0, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3245 3246 /* Load CFG */ 3247 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3248 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3249 3250 return 0; 3251 } 3252 3253 3254 static void 3255 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3256 { 3257 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3258 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3259 struct nvmf_vfio_user_sq *sq; 3260 struct nvmf_vfio_user_cq *cq; 3261 struct vfio_user_nvme_migr_state migr_state = {}; 3262 uint64_t data_offset; 3263 void *data_ptr; 3264 int num_aers; 3265 struct spdk_nvme_registers *regs; 3266 uint32_t *doorbell_base; 3267 uint32_t i = 0; 3268 uint16_t sqid, cqid; 3269 3270 /* Save all data to vfio_user_nvme_migr_state first, then we will 3271 * copy it to device migration region at last. 
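 *
 * The layout written out below (offsets relative to endpoint->migr_data):
 *
 *     0x0                ctrlr_header  (sizeof(struct vfio_user_nvme_migr_header))
 *     nvmf_data_offset   nvmf_data     (sizeof(struct nvmf_ctrlr_migr_data))
 *     qp_offset          qps           (one SQ + CQ state pair per queue)
 *     bar_offset[BAR0]   bar0          (NVME_REG_BAR0_SIZE)
 *     bar_offset[CFG]    cfg           (NVME_REG_CFG_SIZE)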
3272 */ 3273 3274 /* save magic number */ 3275 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3276 3277 /* save controller data */ 3278 num_aers = nvmf_ctrlr_save_aers(ctrlr, migr_state.ctrlr_header.aer_cids, 3279 256); 3280 assert(num_aers >= 0); 3281 migr_state.ctrlr_header.nr_aers = num_aers; 3282 3283 /* save nvmf controller data */ 3284 nvmf_ctrlr_save_migr_data(ctrlr, (struct nvmf_ctrlr_migr_data *)&migr_state.nvmf_data); 3285 3286 /* save connected queue pairs */ 3287 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3288 /* save sq */ 3289 sqid = sq->qid; 3290 migr_state.qps[sqid].sq.sqid = sq->qid; 3291 migr_state.qps[sqid].sq.cqid = sq->cqid; 3292 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3293 migr_state.qps[sqid].sq.size = sq->size; 3294 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3295 3296 /* save cq, for shared cq case, cq may be saved multiple times */ 3297 cqid = sq->cqid; 3298 cq = vu_ctrlr->cqs[cqid]; 3299 migr_state.qps[cqid].cq.cqid = cqid; 3300 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3301 migr_state.qps[cqid].cq.ien = cq->ien; 3302 migr_state.qps[cqid].cq.iv = cq->iv; 3303 migr_state.qps[cqid].cq.size = cq->size; 3304 migr_state.qps[cqid].cq.phase = cq->phase; 3305 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3306 i++; 3307 } 3308 3309 assert(i > 0); 3310 migr_state.ctrlr_header.num_io_queues = i - 1; 3311 3312 regs = (struct spdk_nvme_registers *)&migr_state.bar0; 3313 /* Save mandarory registers to bar0 */ 3314 regs->csts.raw = ctrlr->vcprop.csts.raw; 3315 regs->cap.raw = ctrlr->vcprop.cap.raw; 3316 regs->vs.raw = ctrlr->vcprop.vs.raw; 3317 regs->cc.raw = ctrlr->vcprop.cc.raw; 3318 regs->aqa.raw = ctrlr->vcprop.aqa.raw; 3319 regs->asq = ctrlr->vcprop.asq; 3320 regs->acq = ctrlr->vcprop.acq; 3321 /* Save doorbells */ 3322 doorbell_base = (uint32_t *)®s->doorbell[0].sq_tdbl; 3323 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3324 3325 /* Save PCI configuration space */ 3326 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3327 3328 /* Save all data to device migration region */ 3329 data_ptr = endpoint->migr_data; 3330 3331 /* Copy nvmf controller data */ 3332 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3333 data_ptr += data_offset; 3334 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3335 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct nvmf_ctrlr_migr_data); 3336 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct nvmf_ctrlr_migr_data)); 3337 3338 /* Copy queue pairs */ 3339 data_offset += sizeof(struct nvmf_ctrlr_migr_data); 3340 data_ptr += sizeof(struct nvmf_ctrlr_migr_data); 3341 migr_state.ctrlr_header.qp_offset = data_offset; 3342 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3343 struct nvme_migr_cq_state)); 3344 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3345 3346 /* Copy BAR0 */ 3347 data_offset += migr_state.ctrlr_header.qp_len; 3348 data_ptr += migr_state.ctrlr_header.qp_len; 3349 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3350 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVME_REG_BAR0_SIZE; 3351 memcpy(data_ptr, &migr_state.bar0, NVME_REG_BAR0_SIZE); 3352 3353 /* Copy CFG */ 3354 data_offset += NVME_REG_BAR0_SIZE; 3355 data_ptr += NVME_REG_BAR0_SIZE; 3356 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3357 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = 
NVME_REG_CFG_SIZE; 3358 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3359 3360 /* copy shadow doorbells */ 3361 if (vu_ctrlr->sdbl != NULL) { 3362 migr_state.ctrlr_header.sdbl = true; 3363 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3364 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3365 } 3366 3367 /* Copy nvme migration header finally */ 3368 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3369 3370 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3371 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3372 } 3373 } 3374 3375 /* 3376 * If we are about to close the connection, we need to unregister the interrupt, 3377 * as the library will subsequently close the file descriptor we registered. 3378 */ 3379 static int 3380 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3381 { 3382 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3383 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3384 3385 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3386 3387 if (type == VFU_RESET_LOST_CONN) { 3388 if (ctrlr != NULL) { 3389 spdk_interrupt_unregister(&ctrlr->intr); 3390 ctrlr->intr_fd = -1; 3391 } 3392 return 0; 3393 } 3394 3395 /* FIXME: LOST_CONN case ? */ 3396 if (ctrlr->sdbl != NULL) { 3397 free_sdbl(vfu_ctx, ctrlr->sdbl); 3398 ctrlr->sdbl = NULL; 3399 } 3400 3401 /* FIXME: much more needed here. */ 3402 3403 return 0; 3404 } 3405 3406 static int 3407 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3408 struct vfio_user_nvme_migr_state *migr_state) 3409 { 3410 uint32_t i, qsize = 0; 3411 uint16_t sqid, cqid; 3412 struct vfio_user_nvme_migr_qp migr_qp; 3413 void *addr; 3414 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3415 int ret; 3416 3417 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3418 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3419 } 3420 3421 /* restore submission queues */ 3422 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3423 migr_qp = migr_state->qps[i]; 3424 3425 qsize = migr_qp.sq.size; 3426 if (qsize) { 3427 struct nvmf_vfio_user_sq *sq; 3428 3429 sqid = migr_qp.sq.sqid; 3430 if (sqid != i) { 3431 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3432 return -EINVAL; 3433 } 3434 3435 /* allocate sq if necessary */ 3436 if (vu_ctrlr->sqs[sqid] == NULL) { 3437 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3438 if (ret) { 3439 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3440 return -EFAULT; 3441 } 3442 } 3443 3444 sq = vu_ctrlr->sqs[sqid]; 3445 sq->size = qsize; 3446 3447 ret = alloc_sq_reqs(vu_ctrlr, sq); 3448 if (ret) { 3449 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3450 return -EFAULT; 3451 } 3452 3453 /* restore sq */ 3454 sq->sq_state = VFIO_USER_SQ_CREATED; 3455 sq->cqid = migr_qp.sq.cqid; 3456 *sq_headp(sq) = migr_qp.sq.head; 3457 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3458 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3459 sq->mapping.prp1, sq->size * 64, 3460 sq->mapping.sg, &sq->mapping.iov, 3461 PROT_READ); 3462 if (addr == NULL) { 3463 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3464 sqid, sq->mapping.prp1, sq->size); 3465 return -EFAULT; 3466 } 3467 cqs_ref[sq->cqid]++; 3468 } 3469 } 3470 3471 /* restore completion queues */ 3472 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3473 migr_qp = migr_state->qps[i]; 
3474 3475 qsize = migr_qp.cq.size; 3476 if (qsize) { 3477 struct nvmf_vfio_user_cq *cq; 3478 3479 /* restore cq */ 3480 cqid = migr_qp.sq.cqid; 3481 assert(cqid == i); 3482 3483 /* allocate cq if necessary */ 3484 if (vu_ctrlr->cqs[cqid] == NULL) { 3485 ret = init_cq(vu_ctrlr, cqid); 3486 if (ret) { 3487 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3488 return -EFAULT; 3489 } 3490 } 3491 3492 cq = vu_ctrlr->cqs[cqid]; 3493 3494 cq->size = qsize; 3495 3496 cq->cq_state = VFIO_USER_CQ_CREATED; 3497 cq->cq_ref = cqs_ref[cqid]; 3498 *cq_tailp(cq) = migr_qp.cq.tail; 3499 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3500 cq->ien = migr_qp.cq.ien; 3501 cq->iv = migr_qp.cq.iv; 3502 cq->phase = migr_qp.cq.phase; 3503 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3504 cq->mapping.prp1, cq->size * 16, 3505 cq->mapping.sg, &cq->mapping.iov, 3506 PROT_READ | PROT_WRITE); 3507 if (addr == NULL) { 3508 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3509 cqid, cq->mapping.prp1, cq->size); 3510 return -EFAULT; 3511 } 3512 } 3513 } 3514 3515 return 0; 3516 } 3517 3518 static int 3519 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3520 { 3521 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3522 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3523 uint32_t *doorbell_base; 3524 struct vfio_user_nvme_migr_state migr_state = {}; 3525 struct spdk_nvme_registers *regs; 3526 struct spdk_nvme_cmd cmd; 3527 uint16_t i; 3528 int rc = 0; 3529 3530 assert(endpoint->migr_data != NULL); 3531 assert(ctrlr != NULL); 3532 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3533 if (rc) { 3534 return rc; 3535 } 3536 3537 /* restore shadow doorbells */ 3538 if (migr_state.ctrlr_header.sdbl) { 3539 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3540 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3541 migr_state.ctrlr_header.shadow_doorbell_buffer, 3542 migr_state.ctrlr_header.eventidx_buffer, 3543 memory_page_size(vu_ctrlr)); 3544 if (sdbl == NULL) { 3545 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3546 ctrlr_id(vu_ctrlr)); 3547 return -1; 3548 } 3549 3550 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3551 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3552 3553 SWAP(vu_ctrlr->sdbl, sdbl); 3554 } 3555 3556 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3557 if (rc) { 3558 return rc; 3559 } 3560 3561 /* restore PCI configuration space */ 3562 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3563 3564 regs = (struct spdk_nvme_registers *)&migr_state.bar0; 3565 doorbell_base = (uint32_t *)®s->doorbell[0].sq_tdbl; 3566 /* restore doorbells from saved registers */ 3567 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3568 3569 /* restore controller registers after ADMIN queue connection */ 3570 ctrlr->vcprop.csts.raw = regs->csts.raw; 3571 ctrlr->vcprop.cap.raw = regs->cap.raw; 3572 ctrlr->vcprop.vs.raw = regs->vs.raw; 3573 ctrlr->vcprop.cc.raw = regs->cc.raw; 3574 ctrlr->vcprop.aqa.raw = regs->aqa.raw; 3575 ctrlr->vcprop.asq = regs->asq; 3576 ctrlr->vcprop.acq = regs->acq; 3577 3578 /* restore nvmf controller data */ 3579 rc = nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3580 if (rc) { 3581 return rc; 3582 } 3583 3584 /* resubmit pending AERs */ 3585 for (i = 0; i < migr_state.ctrlr_header.nr_aers; i++) { 3586 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", 
ctrlr_id(vu_ctrlr), 3587 migr_state.ctrlr_header.aer_cids[i]); 3588 memset(&cmd, 0, sizeof(cmd)); 3589 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3590 cmd.cid = migr_state.ctrlr_header.aer_cids[i]; 3591 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3592 if (rc) { 3593 break; 3594 } 3595 } 3596 3597 return rc; 3598 } 3599 3600 static void 3601 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3602 { 3603 uint32_t i; 3604 struct nvmf_vfio_user_sq *sq; 3605 3606 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 3607 3608 if (vu_ctrlr->sqs[0] != NULL) { 3609 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3610 queue_index(0, false); 3611 } 3612 3613 if (vu_ctrlr->cqs[0] != NULL) { 3614 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3615 queue_index(0, true); 3616 } 3617 3618 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3619 3620 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3621 sq = vu_ctrlr->sqs[i]; 3622 if (!sq || !sq->size) { 3623 continue; 3624 } 3625 3626 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3627 /* ADMIN queue pair is always in the poll group, just enable it */ 3628 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3629 } else { 3630 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3631 } 3632 } 3633 } 3634 3635 /* 3636 * We are in stop-and-copy state, but still potentially have some current dirty 3637 * sgls: while we're quiesced and thus should have no active requests, we still 3638 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3639 * mapped read only). 3640 * 3641 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3642 * mark them dirty now. 3643 */ 3644 static void 3645 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3646 { 3647 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3648 3649 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3650 3651 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3652 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3653 3654 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3655 continue; 3656 } 3657 3658 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3659 } 3660 3661 if (vu_ctrlr->sdbl != NULL) { 3662 dma_sg_t *sg; 3663 size_t i; 3664 3665 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3666 ++i) { 3667 3668 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3669 continue; 3670 } 3671 3672 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3673 3674 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3675 } 3676 } 3677 } 3678 3679 static int 3680 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3681 { 3682 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3683 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3684 struct nvmf_vfio_user_sq *sq; 3685 int ret = 0; 3686 3687 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3688 vu_ctrlr->state, state); 3689 3690 switch (state) { 3691 case VFU_MIGR_STATE_STOP_AND_COPY: 3692 vu_ctrlr->in_source_vm = true; 3693 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3694 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3695 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3696 break; 3697 case VFU_MIGR_STATE_STOP: 3698 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3699 /* The controller associates with source VM is dead now, we will resume 3700 * the subsystem after destroying the controller 
data structure, then the 3701 * subsystem can be re-used for another new client. 3702 */ 3703 if (vu_ctrlr->in_source_vm) { 3704 endpoint->need_resume = true; 3705 } 3706 break; 3707 case VFU_MIGR_STATE_PRE_COPY: 3708 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3709 break; 3710 case VFU_MIGR_STATE_RESUME: 3711 /* 3712 * Destination ADMIN queue pair is connected when starting the VM, 3713 * but the ADMIN queue pair isn't enabled in destination VM, the poll 3714 * group will do nothing to ADMIN queue pair for now. 3715 */ 3716 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3717 break; 3718 } 3719 3720 assert(!vu_ctrlr->in_source_vm); 3721 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3722 3723 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3724 assert(sq != NULL); 3725 assert(sq->qpair.qid == 0); 3726 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3727 3728 /* Free ADMIN SQ resources first, SQ resources will be 3729 * allocated based on queue size from source VM. 3730 */ 3731 free_sq_reqs(sq); 3732 sq->size = 0; 3733 break; 3734 case VFU_MIGR_STATE_RUNNING: 3735 3736 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3737 break; 3738 } 3739 3740 if (!vu_ctrlr->in_source_vm) { 3741 /* Restore destination VM from BAR9 */ 3742 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3743 if (ret) { 3744 break; 3745 } 3746 3747 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3748 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3749 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3750 /* FIXME where do we resume nvmf? */ 3751 } else { 3752 /* Rollback source VM */ 3753 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3754 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3755 vfio_user_endpoint_resume_done, endpoint); 3756 if (ret < 0) { 3757 /* TODO: fail controller with CFS bit set */ 3758 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3759 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3760 } 3761 } 3762 vu_ctrlr->migr_data_prepared = false; 3763 vu_ctrlr->in_source_vm = false; 3764 break; 3765 3766 default: 3767 return -EINVAL; 3768 } 3769 3770 return ret; 3771 } 3772 3773 static uint64_t 3774 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3775 { 3776 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3777 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3778 uint64_t pending_bytes; 3779 3780 if (ctrlr->migr_data_prepared) { 3781 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3782 pending_bytes = 0; 3783 } else { 3784 pending_bytes = vfio_user_migr_data_len(); 3785 } 3786 3787 SPDK_DEBUGLOG(nvmf_vfio, 3788 "%s current state %u, pending bytes 0x%"PRIx64"\n", 3789 endpoint_id(endpoint), ctrlr->state, pending_bytes); 3790 3791 return pending_bytes; 3792 } 3793 3794 static int 3795 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 3796 { 3797 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3798 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3799 3800 /* 3801 * When transitioning to pre-copy state we set pending_bytes to 0, 3802 * so the vfio-user client shouldn't attempt to read any migration 3803 * data. This is not yet guaranteed by libvfio-user. 
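 *
 * For context, a v1 VFIO-migration client drives the source side roughly
 * like this (illustrative only, not this driver's code):
 *
 *     while (read(pending_bytes) > 0) {        // -> get_pending_bytes()
 *             read(data_offset, data_size);    // -> prepare_data()
 *             copy [data_offset, data_offset + data_size)  // via the sparse
 *                                                          // mmap in our case
 *     }
 *
 * After the single save pass below, migr_data_prepared makes
 * get_pending_bytes() report 0 and the loop terminates.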
3804 */ 3805 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3806 assert(size != NULL); 3807 *offset = 0; 3808 *size = 0; 3809 return 0; 3810 } 3811 3812 if (ctrlr->in_source_vm) { /* migration source */ 3813 assert(size != NULL); 3814 *size = vfio_user_migr_data_len(); 3815 vfio_user_migr_ctrlr_save_data(ctrlr); 3816 } else { /* migration destination */ 3817 assert(size == NULL); 3818 assert(!ctrlr->migr_data_prepared); 3819 } 3820 *offset = 0; 3821 ctrlr->migr_data_prepared = true; 3822 3823 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 3824 3825 return 0; 3826 } 3827 3828 static ssize_t 3829 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3830 void *buf __attribute__((unused)), 3831 uint64_t count __attribute__((unused)), 3832 uint64_t offset __attribute__((unused))) 3833 { 3834 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 3835 endpoint_id(vfu_get_private(vfu_ctx))); 3836 errno = ENOTSUP; 3837 return -1; 3838 } 3839 3840 static ssize_t 3841 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3842 void *buf __attribute__((unused)), 3843 uint64_t count __attribute__((unused)), 3844 uint64_t offset __attribute__((unused))) 3845 { 3846 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 3847 endpoint_id(vfu_get_private(vfu_ctx))); 3848 errno = ENOTSUP; 3849 return -1; 3850 } 3851 3852 static int 3853 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3854 uint64_t count) 3855 { 3856 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 3857 3858 if (count != vfio_user_migr_data_len()) { 3859 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 3860 endpoint_id(vfu_get_private(vfu_ctx)), count); 3861 errno = EINVAL; 3862 return -1; 3863 } 3864 3865 return 0; 3866 } 3867 3868 static int 3869 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 3870 struct nvmf_vfio_user_endpoint *endpoint) 3871 { 3872 int ret; 3873 ssize_t cap_offset; 3874 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 3875 struct iovec migr_sparse_mmap = {}; 3876 3877 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 3878 struct pxcap pxcap = { 3879 .hdr.id = PCI_CAP_ID_EXP, 3880 .pxcaps.ver = 0x2, 3881 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 3882 .pxdcap2.ctds = 0x1 3883 }; 3884 3885 struct msixcap msixcap = { 3886 .hdr.id = PCI_CAP_ID_MSIX, 3887 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 3888 .mtab = {.tbir = 0x4, .to = 0x0}, 3889 .mpba = {.pbir = 0x5, .pbao = 0x0} 3890 }; 3891 3892 struct iovec sparse_mmap[] = { 3893 { 3894 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 3895 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 3896 }, 3897 }; 3898 3899 const vfu_migration_callbacks_t migr_callbacks = { 3900 .version = VFU_MIGR_CALLBACKS_VERS, 3901 .transition = &vfio_user_migration_device_state_transition, 3902 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 3903 .prepare_data = &vfio_user_migration_prepare_data, 3904 .read_data = &vfio_user_migration_read_data, 3905 .data_written = &vfio_user_migration_data_written, 3906 .write_data = &vfio_user_migration_write_data 3907 }; 3908 3909 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 3910 if (ret < 0) { 3911 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 3912 return ret; 3913 } 3914 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 3915 /* 3916 * 0x02, controller uses the NVM Express programming interface 3917 * 0x08, 
non-volatile memory controller 3918 * 0x01, mass storage controller 3919 */ 3920 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 3921 3922 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 3923 if (cap_offset < 0) { 3924 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 3925 return ret; 3926 } 3927 3928 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 3929 if (cap_offset < 0) { 3930 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 3931 return ret; 3932 } 3933 3934 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 3935 if (cap_offset < 0) { 3936 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 3937 return ret; 3938 } 3939 3940 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 3941 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3942 if (ret < 0) { 3943 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 3944 return ret; 3945 } 3946 3947 if (vu_transport->transport_opts.disable_mappable_bar0) { 3948 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3949 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3950 NULL, 0, -1, 0); 3951 } else { 3952 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3953 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3954 sparse_mmap, 1, endpoint->devmem_fd, 0); 3955 } 3956 3957 if (ret < 0) { 3958 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 3959 return ret; 3960 } 3961 3962 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 3963 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3964 if (ret < 0) { 3965 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 3966 return ret; 3967 } 3968 3969 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 3970 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3971 if (ret < 0) { 3972 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 3973 return ret; 3974 } 3975 3976 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 3977 if (ret < 0) { 3978 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 3979 return ret; 3980 } 3981 3982 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 3983 if (ret < 0) { 3984 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 3985 return ret; 3986 } 3987 3988 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 3989 if (ret < 0) { 3990 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 3991 return ret; 3992 } 3993 3994 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 3995 if (ret < 0) { 3996 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 3997 return ret; 3998 } 3999 4000 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 4001 4002 migr_sparse_mmap.iov_base = (void *)4096; 4003 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 4004 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 4005 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 4006 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 4007 1, endpoint->migr_fd, 0); 4008 if (ret < 0) { 4009 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 4010 return ret; 4011 } 4012 4013 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 4014 vfu_get_migr_register_area_size()); 4015 if (ret < 0) { 4016 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 4017 return ret; 4018 } 
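
	/*
	 * Summary of the region layout configured above (sizes follow the
	 * constants at the top of this file; descriptive only, nothing new
	 * is defined here):
	 *
	 *   config space: NVME_REG_CFG_SIZE, always trapped
	 *   BAR0:         [0, NVME_DOORBELLS_OFFSET) registers, trapped;
	 *                 [NVME_DOORBELLS_OFFSET, +NVMF_VFIO_USER_DOORBELLS_SIZE)
	 *                 doorbells, sparse-mmap'd from devmem_fd unless
	 *                 disable_mappable_bar0 is set
	 *   BAR4/BAR5:    MSI-X table / PBA, trapped
	 *   migration:    register area (trapped), followed by
	 *                 vfio_user_migr_data_len() bytes sparse-mmap'd
	 *                 from migr_fd
	 */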
4019 4020 ret = vfu_realize_ctx(vfu_ctx); 4021 if (ret < 0) { 4022 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4023 return ret; 4024 } 4025 4026 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4027 assert(endpoint->pci_config_space != NULL); 4028 init_pci_config_space(endpoint->pci_config_space); 4029 4030 assert(cap_offset != 0); 4031 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4032 4033 return 0; 4034 } 4035 4036 static int nvmf_vfio_user_accept(void *ctx); 4037 4038 static void 4039 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4040 { 4041 /* Nothing for us to do here. */ 4042 } 4043 4044 /* 4045 * Register an "accept" poller: this is polling for incoming vfio-user socket 4046 * connections (on the listening socket). 4047 * 4048 * We need to do this on first listening, and also after destroying a 4049 * controller, so we can accept another connection. 4050 */ 4051 static int 4052 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4053 { 4054 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4055 4056 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4057 4058 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4059 endpoint, poll_rate_us); 4060 4061 if (!endpoint->accept_poller) { 4062 return -1; 4063 } 4064 4065 endpoint->accept_thread = spdk_get_thread(); 4066 4067 if (!spdk_interrupt_mode_is_enabled()) { 4068 return 0; 4069 } 4070 4071 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4072 assert(endpoint->accept_intr_fd != -1); 4073 4074 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4075 nvmf_vfio_user_accept, endpoint); 4076 4077 assert(endpoint->accept_intr != NULL); 4078 4079 spdk_poller_register_interrupt(endpoint->accept_poller, 4080 set_intr_mode_noop, NULL); 4081 return 0; 4082 } 4083 4084 static void 4085 _vfio_user_relisten(void *ctx) 4086 { 4087 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4088 4089 vfio_user_register_accept_poller(endpoint); 4090 } 4091 4092 static void 4093 _free_ctrlr(void *ctx) 4094 { 4095 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4096 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4097 4098 free_sdbl(ctrlr->endpoint->vfu_ctx, ctrlr->sdbl); 4099 4100 spdk_interrupt_unregister(&ctrlr->intr); 4101 ctrlr->intr_fd = -1; 4102 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4103 4104 free(ctrlr); 4105 4106 if (endpoint == NULL) { 4107 return; 4108 } 4109 4110 if (endpoint->need_async_destroy) { 4111 nvmf_vfio_user_destroy_endpoint(endpoint); 4112 } else { 4113 spdk_thread_send_msg(endpoint->accept_thread, 4114 _vfio_user_relisten, endpoint); 4115 } 4116 } 4117 4118 static void 4119 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4120 { 4121 int i; 4122 assert(ctrlr != NULL); 4123 4124 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4125 4126 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4127 free_qp(ctrlr, i); 4128 } 4129 4130 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 4131 } 4132 4133 static int 4134 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4135 struct nvmf_vfio_user_endpoint *endpoint) 4136 { 4137 struct nvmf_vfio_user_ctrlr *ctrlr; 4138 int err = 0; 4139 4140 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4141 4142 /* First, construct a vfio-user CUSTOM transport controller */ 4143 ctrlr = calloc(1, sizeof(*ctrlr)); 4144 if (ctrlr == 
NULL) { 4145 err = -ENOMEM; 4146 goto out; 4147 } 4148 /* We can only support one connection for now */ 4149 ctrlr->cntlid = 0x1; 4150 ctrlr->intr_fd = -1; 4151 ctrlr->transport = transport; 4152 ctrlr->endpoint = endpoint; 4153 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4154 TAILQ_INIT(&ctrlr->connected_sqs); 4155 4156 ctrlr->adaptive_irqs_enabled = 4157 !transport->transport_opts.disable_adaptive_irq; 4158 4159 /* Then, construct an admin queue pair */ 4160 err = init_sq(ctrlr, &transport->transport, 0); 4161 if (err != 0) { 4162 free(ctrlr); 4163 goto out; 4164 } 4165 4166 err = init_cq(ctrlr, 0); 4167 if (err != 0) { 4168 free(ctrlr); 4169 goto out; 4170 } 4171 4172 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4173 4174 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4175 if (err != 0) { 4176 free(ctrlr); 4177 goto out; 4178 } 4179 endpoint->ctrlr = ctrlr; 4180 4181 /* Notify the generic layer about the new admin queue pair */ 4182 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4183 4184 out: 4185 if (err != 0) { 4186 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4187 endpoint_id(endpoint), strerror(-err)); 4188 } 4189 4190 return err; 4191 } 4192 4193 static int 4194 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4195 const struct spdk_nvme_transport_id *trid, 4196 struct spdk_nvmf_listen_opts *listen_opts) 4197 { 4198 struct nvmf_vfio_user_transport *vu_transport; 4199 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4200 char path[PATH_MAX] = {}; 4201 char uuid[PATH_MAX] = {}; 4202 int ret; 4203 4204 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4205 transport); 4206 4207 pthread_mutex_lock(&vu_transport->lock); 4208 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4209 /* Only compare traddr */ 4210 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4211 pthread_mutex_unlock(&vu_transport->lock); 4212 return -EEXIST; 4213 } 4214 } 4215 pthread_mutex_unlock(&vu_transport->lock); 4216 4217 endpoint = calloc(1, sizeof(*endpoint)); 4218 if (!endpoint) { 4219 return -ENOMEM; 4220 } 4221 4222 pthread_mutex_init(&endpoint->lock, NULL); 4223 endpoint->devmem_fd = -1; 4224 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4225 endpoint->transport = vu_transport; 4226 4227 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4228 if (ret < 0 || ret >= PATH_MAX) { 4229 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4230 ret = -1; 4231 goto out; 4232 } 4233 4234 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4235 if (ret == -1) { 4236 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4237 endpoint_id(endpoint), path, spdk_strerror(errno)); 4238 goto out; 4239 } 4240 unlink(path); 4241 4242 endpoint->devmem_fd = ret; 4243 ret = ftruncate(endpoint->devmem_fd, 4244 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4245 if (ret != 0) { 4246 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4247 spdk_strerror(errno)); 4248 goto out; 4249 } 4250 4251 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4252 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4253 if (endpoint->bar0_doorbells == MAP_FAILED) { 4254 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4255 endpoint->bar0_doorbells = NULL; 4256 ret = -1; 4257 goto out; 
4258 } 4259 4260 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4261 if (ret < 0 || ret >= PATH_MAX) { 4262 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 4263 spdk_strerror(errno)); 4264 ret = -1; 4265 goto out; 4266 } 4267 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4268 if (ret == -1) { 4269 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4270 endpoint_id(endpoint), path, spdk_strerror(errno)); 4271 goto out; 4272 } 4273 unlink(path); 4274 4275 endpoint->migr_fd = ret; 4276 ret = ftruncate(endpoint->migr_fd, 4277 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4278 if (ret != 0) { 4279 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4280 spdk_strerror(errno)); 4281 goto out; 4282 } 4283 4284 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4285 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4286 if (endpoint->migr_data == MAP_FAILED) { 4287 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4288 endpoint->migr_data = NULL; 4289 ret = -1; 4290 goto out; 4291 } 4292 4293 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 4294 if (ret < 0 || ret >= PATH_MAX) { 4295 SPDK_ERRLOG("%s: error to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 4296 ret = -1; 4297 goto out; 4298 } 4299 4300 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 4301 endpoint, VFU_DEV_TYPE_PCI); 4302 if (endpoint->vfu_ctx == NULL) { 4303 SPDK_ERRLOG("%s: error creating libmuser context: %m\n", 4304 endpoint_id(endpoint)); 4305 ret = -1; 4306 goto out; 4307 } 4308 4309 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, 4310 vfio_user_get_log_level()); 4311 if (ret < 0) { 4312 goto out; 4313 } 4314 4315 4316 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 4317 if (ret < 0) { 4318 goto out; 4319 } 4320 4321 ret = vfio_user_register_accept_poller(endpoint); 4322 4323 if (ret != 0) { 4324 goto out; 4325 } 4326 4327 pthread_mutex_lock(&vu_transport->lock); 4328 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 4329 pthread_mutex_unlock(&vu_transport->lock); 4330 4331 out: 4332 if (ret != 0) { 4333 nvmf_vfio_user_destroy_endpoint(endpoint); 4334 } 4335 4336 return ret; 4337 } 4338 4339 static void 4340 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 4341 const struct spdk_nvme_transport_id *trid) 4342 { 4343 struct nvmf_vfio_user_transport *vu_transport; 4344 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4345 4346 assert(trid != NULL); 4347 assert(trid->traddr != NULL); 4348 4349 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 4350 4351 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4352 transport); 4353 4354 pthread_mutex_lock(&vu_transport->lock); 4355 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4356 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 4357 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 4358 /* Defer to free endpoint resources until the controller 4359 * is freed. There are two cases when running here: 4360 * 1. kill nvmf target while VM is connected 4361 * 2. remove listener via RPC call 4362 * nvmf library will disconnect all queue paris. 
4363 */ 4364 if (endpoint->ctrlr) { 4365 assert(!endpoint->need_async_destroy); 4366 endpoint->need_async_destroy = true; 4367 pthread_mutex_unlock(&vu_transport->lock); 4368 return; 4369 } 4370 4371 nvmf_vfio_user_destroy_endpoint(endpoint); 4372 pthread_mutex_unlock(&vu_transport->lock); 4373 return; 4374 } 4375 } 4376 pthread_mutex_unlock(&vu_transport->lock); 4377 4378 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4379 } 4380 4381 static void 4382 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4383 struct spdk_nvmf_subsystem *subsystem, 4384 struct spdk_nvmf_ctrlr_data *cdata) 4385 { 4386 struct nvmf_vfio_user_transport *vu_transport; 4387 4388 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4389 4390 cdata->vid = SPDK_PCI_VID_NUTANIX; 4391 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4392 cdata->ieee[0] = 0x8d; 4393 cdata->ieee[1] = 0x6b; 4394 cdata->ieee[2] = 0x50; 4395 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4396 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4397 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4398 /* libvfio-user can only support 1 connection for now */ 4399 cdata->oncs.reservations = 0; 4400 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4401 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4402 } 4403 4404 static int 4405 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4406 const struct spdk_nvmf_subsystem *subsystem, 4407 const struct spdk_nvme_transport_id *trid) 4408 { 4409 struct nvmf_vfio_user_transport *vu_transport; 4410 struct nvmf_vfio_user_endpoint *endpoint; 4411 4412 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4413 4414 pthread_mutex_lock(&vu_transport->lock); 4415 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4416 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4417 break; 4418 } 4419 } 4420 pthread_mutex_unlock(&vu_transport->lock); 4421 4422 if (endpoint == NULL) { 4423 return -ENOENT; 4424 } 4425 4426 /* Drop const - we will later need to pause/unpause. */ 4427 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4428 4429 return 0; 4430 } 4431 4432 /* 4433 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4434 * frequency. 4435 * 4436 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4437 * if we don't currently have a controller set up, peek to see if the socket is 4438 * able to accept a new connection. 4439 */ 4440 static int 4441 nvmf_vfio_user_accept(void *ctx) 4442 { 4443 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4444 struct nvmf_vfio_user_transport *vu_transport; 4445 int err; 4446 4447 vu_transport = endpoint->transport; 4448 4449 if (endpoint->ctrlr != NULL) { 4450 return SPDK_POLLER_IDLE; 4451 } 4452 4453 /* While we're here, the controller is already destroyed, 4454 * subsystem may still be in RESUMING state, we will wait 4455 * until the subsystem is in RUNNING state. 
4456 */ 4457 if (endpoint->need_resume) { 4458 return SPDK_POLLER_IDLE; 4459 } 4460 4461 err = vfu_attach_ctx(endpoint->vfu_ctx); 4462 if (err == 0) { 4463 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4464 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4465 if (err == 0) { 4466 /* 4467 * Unregister ourselves: now we've accepted a 4468 * connection, there is nothing for us to poll for, and 4469 * we will poll the connection via vfu_run_ctx() 4470 * instead. 4471 */ 4472 spdk_interrupt_unregister(&endpoint->accept_intr); 4473 spdk_poller_unregister(&endpoint->accept_poller); 4474 } 4475 return SPDK_POLLER_BUSY; 4476 } 4477 4478 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4479 return SPDK_POLLER_IDLE; 4480 } 4481 4482 return SPDK_POLLER_BUSY; 4483 } 4484 4485 static void 4486 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4487 struct spdk_nvme_transport_id *trid, 4488 struct spdk_nvmf_discovery_log_page_entry *entry) 4489 { } 4490 4491 static struct spdk_nvmf_transport_poll_group * 4492 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4493 struct spdk_nvmf_poll_group *group) 4494 { 4495 struct nvmf_vfio_user_transport *vu_transport; 4496 struct nvmf_vfio_user_poll_group *vu_group; 4497 4498 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4499 4500 vu_group = calloc(1, sizeof(*vu_group)); 4501 if (vu_group == NULL) { 4502 SPDK_ERRLOG("Error allocating poll group: %m"); 4503 return NULL; 4504 } 4505 4506 TAILQ_INIT(&vu_group->sqs); 4507 4508 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4509 transport); 4510 pthread_mutex_lock(&vu_transport->pg_lock); 4511 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4512 if (vu_transport->next_pg == NULL) { 4513 vu_transport->next_pg = vu_group; 4514 } 4515 pthread_mutex_unlock(&vu_transport->pg_lock); 4516 4517 if (!spdk_interrupt_mode_is_enabled()) { 4518 return &vu_group->group; 4519 } 4520 4521 /* 4522 * Only allow the poll group to work in interrupt mode if the transport 4523 * supports it. It's our responsibility to register the actual interrupt 4524 * later (in handle_queue_connect_rsp()) that processes everything in 4525 * the poll group: for us, that's the libvfio-user context, and the 4526 * actual qpairs. 4527 * 4528 * Note that this only works in the case that nothing else shares the 4529 * spdk_nvmf_poll_group. 4530 * 4531 * If not supported, this will effectively always wake up to poll the 4532 * poll group. 
4533 */ 4534 4535 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4536 transport); 4537 4538 if (!vu_transport->intr_mode_supported) { 4539 SPDK_WARNLOG("vfio-user interrupt mode not supported\n"); 4540 return &vu_group->group; 4541 } 4542 4543 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4544 NULL); 4545 4546 return &vu_group->group; 4547 } 4548 4549 static bool 4550 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 4551 { 4552 return spdk_interrupt_mode_is_enabled() && 4553 vu_transport->intr_mode_supported; 4554 } 4555 4556 static struct spdk_nvmf_transport_poll_group * 4557 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4558 { 4559 struct nvmf_vfio_user_transport *vu_transport; 4560 struct nvmf_vfio_user_poll_group **vu_group; 4561 struct nvmf_vfio_user_sq *sq; 4562 struct nvmf_vfio_user_cq *cq; 4563 4564 struct spdk_nvmf_transport_poll_group *result = NULL; 4565 4566 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4567 cq = sq->ctrlr->cqs[sq->cqid]; 4568 assert(cq != NULL); 4569 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4570 4571 pthread_mutex_lock(&vu_transport->pg_lock); 4572 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4573 goto out; 4574 } 4575 4576 if (!nvmf_qpair_is_admin_queue(qpair)) { 4577 /* 4578 * If this is shared IO CQ case, just return the used CQ's poll 4579 * group, so I/O completions don't have to use 4580 * spdk_thread_send_msg(). 4581 */ 4582 if (cq->group != NULL) { 4583 result = cq->group; 4584 goto out; 4585 } 4586 4587 /* 4588 * If we're in interrupt mode, align all qpairs for a controller 4589 * on the same poll group, to avoid complications in 4590 * vfio_user_ctrlr_intr(). 
4591 */ 4592 if (in_interrupt_mode(vu_transport)) { 4593 result = sq->ctrlr->sqs[0]->group; 4594 goto out; 4595 } 4596 4597 } 4598 4599 vu_group = &vu_transport->next_pg; 4600 assert(*vu_group != NULL); 4601 4602 result = &(*vu_group)->group; 4603 *vu_group = TAILQ_NEXT(*vu_group, link); 4604 if (*vu_group == NULL) { 4605 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4606 } 4607 4608 out: 4609 if (cq->group == NULL) { 4610 cq->group = result; 4611 } 4612 4613 pthread_mutex_unlock(&vu_transport->pg_lock); 4614 return result; 4615 } 4616 4617 /* called when process exits */ 4618 static void 4619 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4620 { 4621 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup;; 4622 struct nvmf_vfio_user_transport *vu_transport; 4623 4624 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4625 4626 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4627 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4628 transport); 4629 4630 pthread_mutex_lock(&vu_transport->pg_lock); 4631 next_tgroup = TAILQ_NEXT(vu_group, link); 4632 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4633 if (next_tgroup == NULL) { 4634 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4635 } 4636 if (vu_transport->next_pg == vu_group) { 4637 vu_transport->next_pg = next_tgroup; 4638 } 4639 pthread_mutex_unlock(&vu_transport->pg_lock); 4640 4641 free(vu_group); 4642 } 4643 4644 static void 4645 _vfio_user_qpair_disconnect(void *ctx) 4646 { 4647 struct nvmf_vfio_user_sq *sq = ctx; 4648 4649 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4650 } 4651 4652 /* The function is used when socket connection is destroyed */ 4653 static int 4654 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4655 { 4656 struct nvmf_vfio_user_sq *sq; 4657 struct nvmf_vfio_user_endpoint *endpoint; 4658 4659 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4660 4661 endpoint = ctrlr->endpoint; 4662 assert(endpoint != NULL); 4663 4664 pthread_mutex_lock(&endpoint->lock); 4665 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4666 endpoint->ctrlr = NULL; 4667 free_ctrlr(ctrlr); 4668 pthread_mutex_unlock(&endpoint->lock); 4669 return 0; 4670 } 4671 4672 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4673 /* add another round thread poll to avoid recursive endpoint lock */ 4674 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4675 } 4676 pthread_mutex_unlock(&endpoint->lock); 4677 4678 return 0; 4679 } 4680 4681 /* 4682 * Poll for and process any incoming vfio-user messages. 4683 */ 4684 static int 4685 vfio_user_poll_vfu_ctx(void *ctx) 4686 { 4687 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4688 int ret; 4689 4690 assert(ctrlr != NULL); 4691 4692 /* This will call access_bar0_fn() if there are any writes 4693 * to the portion of the BAR that is not mmap'd */ 4694 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4695 if (spdk_unlikely(ret == -1)) { 4696 if (errno == EBUSY) { 4697 return SPDK_POLLER_IDLE; 4698 } 4699 4700 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4701 4702 /* 4703 * We lost the client; the reset callback will already have 4704 * unregistered the interrupt. 4705 */ 4706 if (errno == ENOTCONN) { 4707 vfio_user_destroy_ctrlr(ctrlr); 4708 return SPDK_POLLER_BUSY; 4709 } 4710 4711 /* 4712 * We might not have got a reset callback in this case, so 4713 * explicitly unregister the interrupt here. 
4714 */ 4715 spdk_interrupt_unregister(&ctrlr->intr); 4716 ctrlr->intr_fd = -1; 4717 fail_ctrlr(ctrlr); 4718 } 4719 4720 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4721 } 4722 4723 struct vfio_user_post_cpl_ctx { 4724 struct nvmf_vfio_user_ctrlr *ctrlr; 4725 struct nvmf_vfio_user_cq *cq; 4726 struct spdk_nvme_cpl cpl; 4727 }; 4728 4729 static void 4730 _post_completion_msg(void *ctx) 4731 { 4732 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4733 4734 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4735 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4736 free(cpl_ctx); 4737 } 4738 4739 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4740 4741 static int vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group); 4742 4743 /* 4744 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4745 * the SQs assigned to our poll group. 4746 */ 4747 static int 4748 vfio_user_ctrlr_intr(void *ctx) 4749 { 4750 struct nvmf_vfio_user_poll_group *vu_group; 4751 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4752 int ret = 0; 4753 4754 assert(ctrlr != NULL); 4755 assert(ctrlr->sqs[0] != NULL); 4756 assert(ctrlr->sqs[0]->group != NULL); 4757 4758 ctrlr->kick_requested = false; 4759 4760 /* 4761 * Poll vfio-user for this controller. 4762 */ 4763 ret = vfio_user_poll_vfu_ctx(ctrlr); 4764 4765 vu_group = ctrlr_to_poll_group(ctrlr); 4766 4767 /* 4768 * See nvmf_vfio_user_get_optimal_poll_group() for why it's OK to only 4769 * poll this poll group. 4770 * 4771 * Note that this could end up polling other controller's SQs as well 4772 * (since a single poll group can have SQs from multiple separate 4773 * controllers). 4774 */ 4775 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4776 4777 /* 4778 * Re-arm the event indexes. NB: this also could rearm other 4779 * controller's SQs. 4780 */ 4781 ret |= vfio_user_poll_group_rearm(vu_group); 4782 4783 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4784 } 4785 4786 static void 4787 vfio_user_set_intr_mode(struct spdk_poller *poller, void *arg, 4788 bool interrupt_mode) 4789 { 4790 struct nvmf_vfio_user_ctrlr *ctrlr = arg; 4791 assert(ctrlr != NULL); 4792 assert(ctrlr->endpoint != NULL); 4793 4794 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 4795 ctrlr_id(ctrlr), interrupt_mode); 4796 4797 /* 4798 * interrupt_mode needs to persist across controller resets, so store 4799 * it in the endpoint instead. 4800 */ 4801 ctrlr->endpoint->interrupt_mode = interrupt_mode; 4802 4803 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 4804 } 4805 4806 /* 4807 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 4808 * set up and we can start operating on this controller. 
4809 */ 4810 static void 4811 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 4812 struct spdk_nvmf_ctrlr *ctrlr) 4813 { 4814 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 4815 4816 vu_ctrlr->ctrlr = ctrlr; 4817 vu_ctrlr->cntlid = ctrlr->cntlid; 4818 vu_ctrlr->thread = spdk_get_thread(); 4819 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 4820 4821 if (!in_interrupt_mode(endpoint->transport)) { 4822 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4823 vu_ctrlr, 1000); 4824 return; 4825 } 4826 4827 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4828 vu_ctrlr, 0); 4829 4830 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 4831 assert(vu_ctrlr->intr_fd != -1); 4832 4833 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 4834 vfio_user_ctrlr_intr, vu_ctrlr); 4835 4836 assert(vu_ctrlr->intr != NULL); 4837 4838 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 4839 vfio_user_set_intr_mode, 4840 vu_ctrlr); 4841 } 4842 4843 static int 4844 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 4845 { 4846 struct nvmf_vfio_user_poll_group *vu_group; 4847 struct nvmf_vfio_user_sq *sq = cb_arg; 4848 struct nvmf_vfio_user_cq *admin_cq; 4849 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 4850 struct nvmf_vfio_user_endpoint *endpoint; 4851 4852 assert(sq != NULL); 4853 assert(req != NULL); 4854 4855 vu_ctrlr = sq->ctrlr; 4856 assert(vu_ctrlr != NULL); 4857 endpoint = vu_ctrlr->endpoint; 4858 assert(endpoint != NULL); 4859 4860 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 4861 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 4862 endpoint->ctrlr = NULL; 4863 free_ctrlr(vu_ctrlr); 4864 return -1; 4865 } 4866 4867 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 4868 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 4869 4870 admin_cq = vu_ctrlr->cqs[0]; 4871 assert(admin_cq != NULL); 4872 4873 pthread_mutex_lock(&endpoint->lock); 4874 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 4875 admin_cq->thread = spdk_get_thread(); 4876 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 4877 } else { 4878 /* For I/O queues this command was generated in response to an 4879 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 4880 * been completed. Complete it now. 4881 */ 4882 if (sq->post_create_io_sq_completion) { 4883 assert(admin_cq->thread != NULL); 4884 if (admin_cq->thread != spdk_get_thread()) { 4885 struct vfio_user_post_cpl_ctx *cpl_ctx; 4886 4887 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 4888 if (!cpl_ctx) { 4889 return -ENOMEM; 4890 } 4891 cpl_ctx->ctrlr = vu_ctrlr; 4892 cpl_ctx->cq = admin_cq; 4893 cpl_ctx->cpl.sqid = 0; 4894 cpl_ctx->cpl.cdw0 = 0; 4895 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 4896 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 4897 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 4898 4899 spdk_thread_send_msg(admin_cq->thread, _post_completion_msg, 4900 cpl_ctx); 4901 } else { 4902 post_completion(vu_ctrlr, admin_cq, 0, 0, 4903 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 4904 } 4905 sq->post_create_io_sq_completion = false; 4906 } else if (in_interrupt_mode(endpoint->transport)) { 4907 /* 4908 * If we're live migrating a guest, there is a window 4909 * where the I/O queues haven't been set up but the 4910 * device is in running state, during which the guest 4911 * might write to a doorbell. 
This doorbell write will 4912 * go unnoticed, so let's poll the whole controller to 4913 * pick that up. 4914 */ 4915 ctrlr_kick(vu_ctrlr); 4916 } 4917 sq->sq_state = VFIO_USER_SQ_ACTIVE; 4918 } 4919 4920 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 4921 pthread_mutex_unlock(&endpoint->lock); 4922 4923 free(req->req.data); 4924 req->req.data = NULL; 4925 4926 return 0; 4927 } 4928 4929 /* 4930 * Add the given qpair to the given poll group. New qpairs are added via 4931 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 4932 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 4933 * nvmf_transport_poll_group_add(). 4934 */ 4935 static int 4936 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 4937 struct spdk_nvmf_qpair *qpair) 4938 { 4939 struct nvmf_vfio_user_sq *sq; 4940 struct nvmf_vfio_user_req *vu_req; 4941 struct nvmf_vfio_user_ctrlr *ctrlr; 4942 struct spdk_nvmf_request *req; 4943 struct spdk_nvmf_fabric_connect_data *data; 4944 bool admin; 4945 4946 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4947 sq->group = group; 4948 ctrlr = sq->ctrlr; 4949 4950 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 4951 ctrlr_id(ctrlr), sq->qpair.qid, 4952 sq, qpair, group); 4953 4954 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 4955 4956 vu_req = get_nvmf_vfio_user_req(sq); 4957 if (vu_req == NULL) { 4958 return -1; 4959 } 4960 4961 req = &vu_req->req; 4962 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 4963 req->cmd->connect_cmd.cid = 0; 4964 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 4965 req->cmd->connect_cmd.recfmt = 0; 4966 req->cmd->connect_cmd.sqsize = sq->size - 1; 4967 req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid; 4968 4969 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 4970 req->data = calloc(1, req->length); 4971 if (req->data == NULL) { 4972 nvmf_vfio_user_req_free(req); 4973 return -ENOMEM; 4974 } 4975 4976 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 4977 data->cntlid = ctrlr->cntlid; 4978 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 4979 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 4980 4981 vu_req->cb_fn = handle_queue_connect_rsp; 4982 vu_req->cb_arg = sq; 4983 4984 SPDK_DEBUGLOG(nvmf_vfio, 4985 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 4986 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 4987 4988 spdk_nvmf_request_exec_fabrics(req); 4989 return 0; 4990 } 4991 4992 static int 4993 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 4994 struct spdk_nvmf_qpair *qpair) 4995 { 4996 struct nvmf_vfio_user_sq *sq; 4997 struct nvmf_vfio_user_poll_group *vu_group; 4998 4999 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5000 5001 SPDK_DEBUGLOG(nvmf_vfio, 5002 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5003 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5004 5005 5006 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5007 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5008 5009 return 0; 5010 } 5011 5012 static void 5013 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5014 { 5015 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5016 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5017 vu_req->iovcnt = 0; 5018 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5019 5020 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5021 } 5022 5023 static int 5024 nvmf_vfio_user_req_free(struct 
spdk_nvmf_request *req) 5025 { 5026 struct nvmf_vfio_user_sq *sq; 5027 struct nvmf_vfio_user_req *vu_req; 5028 5029 assert(req != NULL); 5030 5031 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5032 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5033 5034 _nvmf_vfio_user_req_free(sq, vu_req); 5035 5036 return 0; 5037 } 5038 5039 static int 5040 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5041 { 5042 struct nvmf_vfio_user_sq *sq; 5043 struct nvmf_vfio_user_req *vu_req; 5044 5045 assert(req != NULL); 5046 5047 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5048 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5049 5050 if (vu_req->cb_fn != NULL) { 5051 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5052 fail_ctrlr(sq->ctrlr); 5053 } 5054 } 5055 5056 _nvmf_vfio_user_req_free(sq, vu_req); 5057 5058 return 0; 5059 } 5060 5061 static void 5062 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5063 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5064 { 5065 struct nvmf_vfio_user_sq *sq; 5066 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5067 struct nvmf_vfio_user_endpoint *endpoint; 5068 5069 assert(qpair != NULL); 5070 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5071 vu_ctrlr = sq->ctrlr; 5072 endpoint = vu_ctrlr->endpoint; 5073 5074 pthread_mutex_lock(&endpoint->lock); 5075 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5076 delete_sq_done(vu_ctrlr, sq); 5077 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5078 endpoint->ctrlr = NULL; 5079 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5080 /* The controller will be freed, we can resume the subsystem 5081 * now so that the endpoint can be ready to accept another 5082 * new connection. 5083 */ 5084 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5085 vfio_user_endpoint_resume_done, endpoint); 5086 } 5087 free_ctrlr(vu_ctrlr); 5088 } 5089 pthread_mutex_unlock(&endpoint->lock); 5090 5091 if (cb_fn) { 5092 cb_fn(cb_arg); 5093 } 5094 } 5095 5096 /** 5097 * Returns a preallocated request, or NULL if there isn't one available. 
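 *
 * The request comes off sq->free_reqs and must eventually be returned via
 * _nvmf_vfio_user_req_free() (directly, or through nvmf_vfio_user_req_free()/
 * nvmf_vfio_user_req_complete()).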
5098 */ 5099 static struct nvmf_vfio_user_req * 5100 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5101 { 5102 struct nvmf_vfio_user_req *req; 5103 5104 if (sq == NULL) { 5105 return NULL; 5106 } 5107 5108 req = TAILQ_FIRST(&sq->free_reqs); 5109 if (req == NULL) { 5110 return NULL; 5111 } 5112 5113 TAILQ_REMOVE(&sq->free_reqs, req, link); 5114 5115 return req; 5116 } 5117 5118 static int 5119 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5120 { 5121 uint16_t nr; 5122 uint32_t nlb, nsid; 5123 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5124 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5125 struct spdk_nvmf_ns *ns; 5126 5127 nsid = cmd->nsid; 5128 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5129 if (ns == NULL || ns->bdev == NULL) { 5130 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5131 return -EINVAL; 5132 } 5133 5134 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5135 nr = cmd->cdw10_bits.dsm.nr + 1; 5136 return nr * sizeof(struct spdk_nvme_dsm_range); 5137 } 5138 5139 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5140 return nlb * spdk_bdev_get_block_size(ns->bdev); 5141 } 5142 5143 static int 5144 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5145 { 5146 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5147 uint32_t len = 0; 5148 uint8_t fid; 5149 int iovcnt; 5150 5151 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5152 req->length = 0; 5153 req->data = NULL; 5154 5155 if (req->xfer == SPDK_NVME_DATA_NONE) { 5156 return 0; 5157 } 5158 5159 switch (cmd->opc) { 5160 case SPDK_NVME_OPC_IDENTIFY: 5161 len = 4096; 5162 break; 5163 case SPDK_NVME_OPC_GET_LOG_PAGE: 5164 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 5165 break; 5166 case SPDK_NVME_OPC_GET_FEATURES: 5167 case SPDK_NVME_OPC_SET_FEATURES: 5168 fid = cmd->cdw10_bits.set_features.fid; 5169 switch (fid) { 5170 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5171 len = 4096; 5172 break; 5173 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5174 len = 256; 5175 break; 5176 case SPDK_NVME_FEAT_TIMESTAMP: 5177 len = 8; 5178 break; 5179 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5180 len = 512; 5181 break; 5182 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5183 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5184 len = 16; 5185 } else { 5186 len = 8; 5187 } 5188 break; 5189 default: 5190 return 0; 5191 } 5192 break; 5193 default: 5194 return 0; 5195 } 5196 5197 /* ADMIN command will not use SGL */ 5198 if (cmd->psdt != 0) { 5199 return -EINVAL; 5200 } 5201 5202 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5203 if (iovcnt < 0) { 5204 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5205 ctrlr_id(ctrlr), cmd->opc); 5206 return -1; 5207 } 5208 req->length = len; 5209 req->data = req->iov[0].iov_base; 5210 req->iovcnt = iovcnt; 5211 5212 return 0; 5213 } 5214 5215 /* 5216 * Map an I/O command's buffers. 5217 * 5218 * Returns 0 on success and -errno on failure. 
5219 */ 5220 static int 5221 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5222 { 5223 int len, iovcnt; 5224 struct spdk_nvme_cmd *cmd; 5225 5226 assert(ctrlr != NULL); 5227 assert(req != NULL); 5228 5229 cmd = &req->cmd->nvme_cmd; 5230 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5231 req->length = 0; 5232 req->data = NULL; 5233 5234 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5235 return 0; 5236 } 5237 5238 len = get_nvmf_io_req_length(req); 5239 if (len < 0) { 5240 return -EINVAL; 5241 } 5242 req->length = len; 5243 5244 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5245 if (iovcnt < 0) { 5246 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5247 return -EFAULT; 5248 } 5249 req->data = req->iov[0].iov_base; 5250 req->iovcnt = iovcnt; 5251 5252 return 0; 5253 } 5254 5255 static int 5256 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5257 struct nvmf_vfio_user_sq *sq) 5258 { 5259 int err; 5260 struct nvmf_vfio_user_req *vu_req; 5261 struct spdk_nvmf_request *req; 5262 5263 assert(ctrlr != NULL); 5264 assert(cmd != NULL); 5265 5266 vu_req = get_nvmf_vfio_user_req(sq); 5267 if (spdk_unlikely(vu_req == NULL)) { 5268 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5269 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5270 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5271 5272 } 5273 req = &vu_req->req; 5274 5275 assert(req->qpair != NULL); 5276 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5277 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5278 5279 vu_req->cb_fn = handle_cmd_rsp; 5280 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5281 req->cmd->nvme_cmd = *cmd; 5282 5283 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5284 err = map_admin_cmd_req(ctrlr, req); 5285 } else { 5286 switch (cmd->opc) { 5287 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5288 case SPDK_NVME_OPC_RESERVATION_REPORT: 5289 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5290 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5291 err = -ENOTSUP; 5292 break; 5293 default: 5294 err = map_io_cmd_req(ctrlr, req); 5295 break; 5296 } 5297 } 5298 5299 if (spdk_unlikely(err < 0)) { 5300 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5301 ctrlr_id(ctrlr), cmd->opc); 5302 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5303 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5304 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5305 _nvmf_vfio_user_req_free(sq, vu_req); 5306 return err; 5307 } 5308 5309 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5310 spdk_nvmf_request_exec(req); 5311 5312 return 0; 5313 } 5314 5315 /* 5316 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5317 * here: if the host isn't up to date, and is apparently not actively processing 5318 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5319 */ 5320 static void 5321 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5322 struct nvmf_vfio_user_sq *sq) 5323 { 5324 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5325 uint32_t cq_head; 5326 uint32_t cq_tail; 5327 5328 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5329 return; 5330 } 5331 5332 cq_tail = *cq_tailp(cq); 5333 5334 /* Already sent? 
*/ 5335 if (cq_tail == cq->last_trigger_irq_tail) { 5336 return; 5337 } 5338 5339 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5340 cq_head = *cq_dbl_headp(cq); 5341 5342 if (cq_head != cq_tail && cq_head == cq->last_head) { 5343 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5344 if (err != 0) { 5345 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5346 ctrlr_id(ctrlr)); 5347 } else { 5348 cq->last_trigger_irq_tail = cq_tail; 5349 } 5350 } 5351 5352 cq->last_head = cq_head; 5353 } 5354 5355 /* Returns the number of commands processed, or a negative value on error. */ 5356 static int 5357 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5358 { 5359 struct nvmf_vfio_user_ctrlr *ctrlr; 5360 uint32_t new_tail; 5361 int count = 0; 5362 5363 assert(sq != NULL); 5364 5365 ctrlr = sq->ctrlr; 5366 5367 /* 5368 * A quiesced, or migrating, controller should never process new 5369 * commands. 5370 */ 5371 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 5372 return SPDK_POLLER_IDLE; 5373 } 5374 5375 if (ctrlr->adaptive_irqs_enabled) { 5376 handle_suppressed_irq(ctrlr, sq); 5377 } 5378 5379 /* On aarch64 platforms, doorbells update from guest VM may not be seen 5380 * on SPDK target side. This is because there is memory type mismatch 5381 * situation here. That is on guest VM side, the doorbells are treated as 5382 * device memory while on SPDK target side, it is treated as normal 5383 * memory. And this situation cause problem on ARM platform. 5384 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5385 * Memory-aliasing-and-mismatched-memory-types". Only using spdk_mb() 5386 * cannot fix this. Use "dc civac" to invalidate cache may solve 5387 * this. 5388 */ 5389 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5390 5391 /* Load-Acquire. */ 5392 new_tail = *sq_dbl_tailp(sq); 5393 5394 new_tail = new_tail & 0xffffu; 5395 if (spdk_unlikely(new_tail >= sq->size)) { 5396 union spdk_nvme_async_event_completion event = {}; 5397 5398 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5399 new_tail); 5400 event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR; 5401 event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE; 5402 nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event); 5403 5404 return -1; 5405 } 5406 5407 if (*sq_headp(sq) == new_tail) { 5408 return 0; 5409 } 5410 5411 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5412 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5413 if (ctrlr->sdbl != NULL) { 5414 SPDK_DEBUGLOG(nvmf_vfio, 5415 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5416 ctrlr_id(ctrlr), sq->qid, 5417 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5418 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5419 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5420 } 5421 5422 /* 5423 * Ensure that changes to the queue are visible to us. 5424 * The host driver should write the queue first, do a wmb(), and then 5425 * update the SQ tail doorbell (their Store-Release). 5426 */ 5427 spdk_rmb(); 5428 5429 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5430 if (count < 0) { 5431 fail_ctrlr(ctrlr); 5432 } 5433 5434 return count; 5435 } 5436 5437 /* 5438 * vfio-user transport poll handler. Note that the library context is polled in 5439 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the 5440 * active SQs. 5441 * 5442 * Returns the number of commands processed, or a negative value on error. 
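 *
 * In interrupt mode this is additionally invoked from vfio_user_ctrlr_intr();
 * in polling mode it runs from the generic nvmf poll group poller.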
5443 */ 5444 static int 5445 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 5446 { 5447 struct nvmf_vfio_user_poll_group *vu_group; 5448 struct nvmf_vfio_user_sq *sq, *tmp; 5449 int count = 0; 5450 5451 assert(group != NULL); 5452 5453 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5454 5455 SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n"); 5456 5457 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) { 5458 int ret; 5459 5460 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 5461 continue; 5462 } 5463 5464 ret = nvmf_vfio_user_sq_poll(sq); 5465 5466 if (ret < 0) { 5467 return ret; 5468 } 5469 5470 count += ret; 5471 } 5472 5473 return count; 5474 } 5475 5476 static int 5477 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5478 struct spdk_nvme_transport_id *trid) 5479 { 5480 struct nvmf_vfio_user_sq *sq; 5481 struct nvmf_vfio_user_ctrlr *ctrlr; 5482 5483 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5484 ctrlr = sq->ctrlr; 5485 5486 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5487 return 0; 5488 } 5489 5490 static int 5491 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 5492 struct spdk_nvme_transport_id *trid) 5493 { 5494 return 0; 5495 } 5496 5497 static int 5498 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5499 struct spdk_nvme_transport_id *trid) 5500 { 5501 struct nvmf_vfio_user_sq *sq; 5502 struct nvmf_vfio_user_ctrlr *ctrlr; 5503 5504 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5505 ctrlr = sq->ctrlr; 5506 5507 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5508 return 0; 5509 } 5510 5511 static void 5512 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 5513 struct spdk_nvmf_request *req) 5514 { 5515 struct spdk_nvmf_request *req_to_abort = NULL; 5516 struct spdk_nvmf_request *temp_req = NULL; 5517 uint16_t cid; 5518 5519 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5520 5521 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) { 5522 struct nvmf_vfio_user_req *vu_req; 5523 5524 vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req); 5525 5526 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 5527 req_to_abort = temp_req; 5528 break; 5529 } 5530 } 5531 5532 if (req_to_abort == NULL) { 5533 spdk_nvmf_request_complete(req); 5534 return; 5535 } 5536 5537 req->req_to_abort = req_to_abort; 5538 nvmf_ctrlr_abort_request(req); 5539 } 5540 5541 static void 5542 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 5543 { 5544 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 5545 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 5546 opts->in_capsule_data_size = 0; 5547 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 5548 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 5549 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 5550 opts->num_shared_buffers = 0; 5551 opts->buf_cache_size = 0; 5552 opts->association_timeout = 0; 5553 opts->transport_specific = NULL; 5554 } 5555 5556 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 5557 .name = "VFIOUSER", 5558 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 5559 .opts_init = nvmf_vfio_user_opts_init, 5560 .create = nvmf_vfio_user_create, 5561 .destroy = nvmf_vfio_user_destroy, 5562 5563 .listen = nvmf_vfio_user_listen, 5564 .stop_listen = nvmf_vfio_user_stop_listen, 5565 .cdata_init = nvmf_vfio_user_cdata_init, 5566 
.listen_associate = nvmf_vfio_user_listen_associate, 5567 5568 .listener_discover = nvmf_vfio_user_discover, 5569 5570 .poll_group_create = nvmf_vfio_user_poll_group_create, 5571 .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group, 5572 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 5573 .poll_group_add = nvmf_vfio_user_poll_group_add, 5574 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 5575 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 5576 5577 .req_free = nvmf_vfio_user_req_free, 5578 .req_complete = nvmf_vfio_user_req_complete, 5579 5580 .qpair_fini = nvmf_vfio_user_close_qpair, 5581 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 5582 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 5583 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 5584 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 5585 }; 5586 5587 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 5588 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 5589 SPDK_LOG_REGISTER_COMPONENT(vfio_user_db) 5590
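
/*
 * Illustrative usage (a sketch only; a real target is normally configured
 * through the JSON-RPC interface, e.g. "nvmf_create_transport -t VFIOUSER").
 * The listener traddr is a directory: the bar0/migr backing files and the
 * "cntrl" vfio-user socket are created inside it. 'tgt' and
 * 'add_transport_done' below stand for the application's target handle and
 * completion callback; they are not defined in this file.
 *
 *	struct spdk_nvmf_transport_opts opts;
 *	struct spdk_nvmf_transport *transport;
 *
 *	spdk_nvmf_transport_opts_init("VFIOUSER", &opts, sizeof(opts));
 *	transport = spdk_nvmf_transport_create("VFIOUSER", &opts);
 *	spdk_nvmf_tgt_add_transport(tgt, transport, add_transport_done, NULL);
 *	// ...then add a VFIOUSER listener whose traddr is the endpoint
 *	// directory, and attach the guest with a vfio-user client.
 */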