1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved. 3 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 */ 5 6 /* 7 * NVMe over vfio-user transport 8 */ 9 10 #include <vfio-user/libvfio-user.h> 11 #include <vfio-user/pci_defs.h> 12 13 #include "spdk/barrier.h" 14 #include "spdk/stdinc.h" 15 #include "spdk/assert.h" 16 #include "spdk/thread.h" 17 #include "spdk/nvmf_transport.h" 18 #include "spdk/sock.h" 19 #include "spdk/string.h" 20 #include "spdk/util.h" 21 #include "spdk/log.h" 22 23 #include "transport.h" 24 25 #include "nvmf_internal.h" 26 27 #define SWAP(x, y) \ 28 do \ 29 { \ 30 typeof(x) _tmp = x; \ 31 x = y; \ 32 y = _tmp; \ 33 } while (0) 34 35 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 36 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 37 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 38 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 39 40 #define NVME_DOORBELLS_OFFSET 0x1000 41 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000 42 #define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2 43 #define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3 44 #define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX 45 46 /* 47 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 48 * available on PCI-X 2.0 and PCI Express buses 49 */ 50 #define NVME_REG_CFG_SIZE 0x1000 51 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 52 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8) 53 #define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 54 /* MSIX Table Size */ 55 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 56 /* MSIX Pending Bit Array Size */ 57 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000) 58 59 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 60 61 struct nvmf_vfio_user_req; 62 63 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 64 65 /* 1 more for PRP2 list itself */ 66 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 67 68 enum nvmf_vfio_user_req_state { 69 VFIO_USER_REQUEST_STATE_FREE = 0, 70 VFIO_USER_REQUEST_STATE_EXECUTING, 71 }; 72 73 /* 74 * Support for live migration in NVMf/vfio-user: live migration is implemented 75 * by stopping the NVMf subsystem when the device is instructed to enter the 76 * stop-and-copy state and then trivially, and most importantly safely, 77 * collecting migration state and providing it to the vfio-user client. We 78 * don't provide any migration state at the pre-copy state as that's too 79 * complicated to do, we might support this in the future. 80 */ 81 82 83 /* NVMe device state representation */ 84 struct nvme_migr_sq_state { 85 uint16_t sqid; 86 uint16_t cqid; 87 uint32_t head; 88 uint32_t size; 89 uint32_t reserved; 90 uint64_t dma_addr; 91 }; 92 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 93 94 struct nvme_migr_cq_state { 95 uint16_t cqid; 96 uint16_t phase; 97 uint32_t tail; 98 uint32_t size; 99 uint32_t iv; 100 uint32_t ien; 101 uint32_t reserved; 102 uint64_t dma_addr; 103 }; 104 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 105 106 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 107 108 /* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned. 
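 * The header below has a fixed size of 0x1000 bytes, and the total migration
 * buffer is padded up to a page multiple (see vfio_user_migr_data_len()).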
 *
 * NVMe device migration region is defined as below:
 * --------------------------------------------------------------------------
 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs |
 * --------------------------------------------------------------------------
 *
 * Keep vfio_user_nvme_migr_header at a fixed 0x1000 length; newly added fields
 * should use the reserved space at the end of the data structure.
 */
struct vfio_user_nvme_migr_header {
	/* Magic value to validate migration data */
	uint32_t	magic;
	/* Version used to check that source and destination use the same data layout */
	uint32_t	version;

	/* The library uses this field to know how many fields in this
	 * structure are valid, starting at the beginning of this data
	 * structure. Newly added fields should use the `unused` memory
	 * space.
	 */
	uint32_t	opts_size;
	uint32_t	reserved0;

	/* BARs information */
	uint64_t	bar_offset[VFU_PCI_DEV_NUM_REGIONS];
	uint64_t	bar_len[VFU_PCI_DEV_NUM_REGIONS];

	/* Queue pair start offset, starting at the beginning of this
	 * data structure.
	 */
	uint64_t	qp_offset;
	uint64_t	qp_len;

	/* Controller data structure */
	uint32_t	num_io_queues;
	uint32_t	reserved1;

	/* NVMf controller data offset and length, if present, starting at
	 * the beginning of this data structure.
	 */
	uint64_t	nvmf_data_offset;
	uint64_t	nvmf_data_len;

	/*
	 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA
	 * address.
	 */
	uint32_t	sdbl;

	/* Shadow doorbell DMA addresses. */
	uint64_t	shadow_doorbell_buffer;
	uint64_t	eventidx_buffer;

	/* Reserved memory space for newly added fields; this array is
	 * always kept at the end of this data structure.
	 */
	uint8_t		unused[3856];
};
SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");

struct vfio_user_nvme_migr_qp {
	struct nvme_migr_sq_state	sq;
	struct nvme_migr_cq_state	cq;
};

/* NVMe state definition used to load/restore from/to NVMe migration BAR region */
struct vfio_user_nvme_migr_state {
	struct vfio_user_nvme_migr_header	ctrlr_header;
	struct spdk_nvmf_ctrlr_migr_data	nvmf_data;
	struct vfio_user_nvme_migr_qp		qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	uint8_t					doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE];
	uint8_t					cfg[NVME_REG_CFG_SIZE];
};

struct nvmf_vfio_user_req {
	struct spdk_nvmf_request		req;
	struct spdk_nvme_cpl			rsp;
	struct spdk_nvme_cmd			cmd;

	enum nvmf_vfio_user_req_state		state;
	nvmf_vfio_user_req_cb_fn		cb_fn;
	void					*cb_arg;

	/* old CC before prop_set_cc fabric command */
	union spdk_nvme_cc_register		cc;

	TAILQ_ENTRY(nvmf_vfio_user_req)		link;

	struct iovec				iov[NVMF_VFIO_USER_MAX_IOVECS];
	uint8_t					iovcnt;

	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
	uint8_t					sg[];
};

/*
 * Mapping of an NVMe queue.
 *
 * This holds the information tracking a local process mapping of an NVMe queue
 * shared by the client.
 */
struct nvme_q_mapping {
	/* iov of local process mapping. */
	struct iovec iov;
	/* Stored sg, needed for unmap. */
	dma_sg_t *sg;
	/* Client PRP of queue. */
	uint64_t prp1;
};

enum nvmf_vfio_user_sq_state {
	VFIO_USER_SQ_UNUSED = 0,
	VFIO_USER_SQ_CREATED,
	VFIO_USER_SQ_DELETED,
	VFIO_USER_SQ_ACTIVE,
	VFIO_USER_SQ_INACTIVE
};

enum nvmf_vfio_user_cq_state {
	VFIO_USER_CQ_UNUSED = 0,
	VFIO_USER_CQ_CREATED,
	VFIO_USER_CQ_DELETED,
};

enum nvmf_vfio_user_ctrlr_state {
	VFIO_USER_CTRLR_CREATING = 0,
	VFIO_USER_CTRLR_RUNNING,
	/* Quiesce requested by libvfio-user */
	VFIO_USER_CTRLR_PAUSING,
	/* NVMf subsystem is paused, it's safe to do PCI reset, memory register,
	 * memory unregister, and vfio migration state transition in this state.
	 */
	VFIO_USER_CTRLR_PAUSED,
	/*
	 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI
	 * reset, memory register and unregister, controller in destination VM has
	 * been restored). NVMf subsystem resume has been requested.
	 */
	VFIO_USER_CTRLR_RESUMING,
	/*
	 * Implies that the NVMf subsystem is paused. Both the controller in the
	 * source VM and the controller in the destination VM are in this state
	 * while doing live migration.
	 */
	VFIO_USER_CTRLR_MIGRATING
};

struct nvmf_vfio_user_sq {
	struct spdk_nvmf_qpair			qpair;
	struct spdk_nvmf_transport_poll_group	*group;
	struct nvmf_vfio_user_ctrlr		*ctrlr;

	uint32_t				qid;
	/* Number of entries in queue. */
	uint32_t				size;
	struct nvme_q_mapping			mapping;
	enum nvmf_vfio_user_sq_state		sq_state;

	uint32_t				head;
	volatile uint32_t			*dbl_tailp;

	/* Whether a shadow doorbell eventidx needs setting. */
	bool					need_rearm;

	/* multiple SQs can be mapped to the same CQ */
	uint16_t				cqid;

	/* handle_queue_connect_rsp() can be used both for CREATE IO SQ response
	 * and SQ re-connect response in the destination VM; in the former case
	 * we post an NVMe completion to the VM, and we do not set this flag when
	 * re-connecting SQs in the destination VM.
	 */
	bool					post_create_io_sq_completion;
	/* Copy of the Create IO SQ command; this field is used together with
	 * the `post_create_io_sq_completion` flag.
	 */
	struct spdk_nvme_cmd			create_io_sq_cmd;

	/* Currently unallocated reqs. */
	TAILQ_HEAD(, nvmf_vfio_user_req)	free_reqs;
	/* Poll group entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq)		link;
	/* Connected SQ entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq)		tailq;
};

struct nvmf_vfio_user_cq {
	struct spdk_nvmf_transport_poll_group	*group;
	struct spdk_thread			*thread;
	uint32_t				cq_ref;

	uint32_t				qid;
	/* Number of entries in queue.
*/ 301 uint32_t size; 302 struct nvme_q_mapping mapping; 303 enum nvmf_vfio_user_cq_state cq_state; 304 305 uint32_t tail; 306 volatile uint32_t *dbl_headp; 307 308 bool phase; 309 310 uint16_t iv; 311 bool ien; 312 313 uint32_t last_head; 314 uint32_t last_trigger_irq_tail; 315 }; 316 317 struct nvmf_vfio_user_poll_group { 318 struct spdk_nvmf_transport_poll_group group; 319 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 320 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 321 }; 322 323 struct nvmf_vfio_user_shadow_doorbells { 324 volatile uint32_t *shadow_doorbells; 325 volatile uint32_t *eventidxs; 326 dma_sg_t *sgs; 327 struct iovec *iovs; 328 }; 329 330 struct nvmf_vfio_user_ctrlr { 331 struct nvmf_vfio_user_endpoint *endpoint; 332 struct nvmf_vfio_user_transport *transport; 333 334 /* Connected SQs list */ 335 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 336 enum nvmf_vfio_user_ctrlr_state state; 337 338 /* 339 * Tells whether live migration data have been prepared. This is used 340 * by the get_pending_bytes callback to tell whether or not the 341 * previous iteration finished. 342 */ 343 bool migr_data_prepared; 344 345 /* Controller is in source VM when doing live migration */ 346 bool in_source_vm; 347 348 struct spdk_thread *thread; 349 struct spdk_poller *vfu_ctx_poller; 350 struct spdk_interrupt *intr; 351 int intr_fd; 352 353 bool queued_quiesce; 354 355 bool reset_shn; 356 357 uint16_t cntlid; 358 struct spdk_nvmf_ctrlr *ctrlr; 359 360 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 361 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 362 363 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 364 365 volatile uint32_t *bar0_doorbells; 366 struct nvmf_vfio_user_shadow_doorbells *sdbl; 367 /* 368 * Shadow doorbells PRPs to provide during the stop-and-copy state. 369 */ 370 uint64_t shadow_doorbell_buffer; 371 uint64_t eventidx_buffer; 372 373 bool adaptive_irqs_enabled; 374 bool kick_requested; 375 }; 376 377 /* Endpoint in vfio-user is associated with a socket file, which 378 * is the representative of a PCI endpoint. 379 */ 380 struct nvmf_vfio_user_endpoint { 381 struct nvmf_vfio_user_transport *transport; 382 vfu_ctx_t *vfu_ctx; 383 struct spdk_poller *accept_poller; 384 struct spdk_thread *accept_thread; 385 bool interrupt_mode; 386 struct msixcap *msix; 387 vfu_pci_config_space_t *pci_config_space; 388 int devmem_fd; 389 int accept_intr_fd; 390 struct spdk_interrupt *accept_intr; 391 392 volatile uint32_t *bar0_doorbells; 393 394 int migr_fd; 395 void *migr_data; 396 397 struct spdk_nvme_transport_id trid; 398 struct spdk_nvmf_subsystem *subsystem; 399 400 /* Controller is associated with an active socket connection, 401 * the lifecycle of the controller is same as the VM. 402 * Currently we only support one active connection, as the NVMe 403 * specification defines, we may support multiple controllers in 404 * future, so that it can support e.g: RESERVATION. 405 */ 406 struct nvmf_vfio_user_ctrlr *ctrlr; 407 pthread_mutex_t lock; 408 409 bool need_async_destroy; 410 /* The subsystem is in PAUSED state and need to be resumed, TRUE 411 * only when migration is done successfully and the controller is 412 * in source VM. 
413 */ 414 bool need_resume; 415 /* Start the accept poller again after destroying the controller */ 416 bool need_relisten; 417 418 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 419 }; 420 421 struct nvmf_vfio_user_transport_opts { 422 bool disable_mappable_bar0; 423 bool disable_adaptive_irq; 424 bool disable_shadow_doorbells; 425 bool disable_compare; 426 }; 427 428 struct nvmf_vfio_user_transport { 429 struct spdk_nvmf_transport transport; 430 struct nvmf_vfio_user_transport_opts transport_opts; 431 bool intr_mode_supported; 432 pthread_mutex_t lock; 433 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 434 435 pthread_mutex_t pg_lock; 436 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 437 struct nvmf_vfio_user_poll_group *next_pg; 438 }; 439 440 /* 441 * function prototypes 442 */ 443 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 444 445 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 446 447 /* 448 * Local process virtual address of a queue. 449 */ 450 static inline void * 451 q_addr(struct nvme_q_mapping *mapping) 452 { 453 return mapping->iov.iov_base; 454 } 455 456 static inline int 457 queue_index(uint16_t qid, bool is_cq) 458 { 459 return (qid * 2) + is_cq; 460 } 461 462 static inline volatile uint32_t * 463 sq_headp(struct nvmf_vfio_user_sq *sq) 464 { 465 assert(sq != NULL); 466 return &sq->head; 467 } 468 469 static inline volatile uint32_t * 470 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 471 { 472 assert(sq != NULL); 473 return sq->dbl_tailp; 474 } 475 476 static inline volatile uint32_t * 477 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 478 { 479 assert(cq != NULL); 480 return cq->dbl_headp; 481 } 482 483 static inline volatile uint32_t * 484 cq_tailp(struct nvmf_vfio_user_cq *cq) 485 { 486 assert(cq != NULL); 487 return &cq->tail; 488 } 489 490 static inline void 491 sq_head_advance(struct nvmf_vfio_user_sq *sq) 492 { 493 assert(sq != NULL); 494 495 assert(*sq_headp(sq) < sq->size); 496 (*sq_headp(sq))++; 497 498 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 499 *sq_headp(sq) = 0; 500 } 501 } 502 503 static inline void 504 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 505 { 506 assert(cq != NULL); 507 508 assert(*cq_tailp(cq) < cq->size); 509 (*cq_tailp(cq))++; 510 511 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 512 *cq_tailp(cq) = 0; 513 cq->phase = !cq->phase; 514 } 515 } 516 517 /* 518 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 519 * control: if there is no space in the CQ, we should wait until there is. 520 * 521 * In practice, we just fail the controller instead: as it happens, all host 522 * implementations we care about right-size the CQ: this is required anyway for 523 * NVMEoF support (see 3.3.2.8). 524 * 525 * Since reading the head doorbell is relatively expensive, we use the cached 526 * value, so we only have to read it for real if it appears that we are full. 
527 */ 528 static inline bool 529 cq_is_full(struct nvmf_vfio_user_cq *cq) 530 { 531 uint32_t qindex; 532 533 assert(cq != NULL); 534 535 qindex = *cq_tailp(cq) + 1; 536 if (spdk_unlikely(qindex == cq->size)) { 537 qindex = 0; 538 } 539 540 if (qindex != cq->last_head) { 541 return false; 542 } 543 544 cq->last_head = *cq_dbl_headp(cq); 545 546 return qindex == cq->last_head; 547 } 548 549 static bool 550 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 551 { 552 assert(vu_ctrlr != NULL); 553 554 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 555 return false; 556 } 557 558 if (is_cq) { 559 if (vu_ctrlr->cqs[qid] == NULL) { 560 return false; 561 } 562 563 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 564 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 565 } 566 567 if (vu_ctrlr->sqs[qid] == NULL) { 568 return false; 569 } 570 571 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 572 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 573 } 574 575 /* Return the poll group for the admin queue of the controller. */ 576 static inline struct nvmf_vfio_user_poll_group * 577 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 578 { 579 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 580 struct nvmf_vfio_user_poll_group, 581 group); 582 } 583 584 static inline struct spdk_thread * 585 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 586 { 587 return vu_pg->group.group->thread; 588 } 589 590 static dma_sg_t * 591 index_to_sg_t(void *arr, size_t i) 592 { 593 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 594 } 595 596 static inline size_t 597 vfio_user_migr_data_len(void) 598 { 599 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 600 } 601 602 static int vfio_user_ctrlr_intr(void *ctx); 603 604 /* 605 * Wrap vfio_user_ctrlr_intr() such that it can be used with 606 * spdk_thread_send_msg(). 607 * Pollers have type int (*)(void *) while message functions should have type 608 * void (*)(void *), so simply discard the returned value. 609 */ 610 static void 611 vfio_user_ctrlr_intr_wrapper(void *ctx) 612 { 613 vfio_user_ctrlr_intr(ctx); 614 } 615 616 /* 617 * Arrange for this controller to immediately wake up and process everything. 618 */ 619 static inline int 620 ctrlr_kick(struct nvmf_vfio_user_ctrlr *ctrlr) 621 { 622 assert(ctrlr != NULL); 623 assert(ctrlr->thread != NULL); 624 625 if (ctrlr->kick_requested) { 626 return 0; 627 } 628 629 ctrlr->kick_requested = true; 630 631 return spdk_thread_send_msg(ctrlr->thread, 632 vfio_user_ctrlr_intr_wrapper, 633 ctrlr); 634 } 635 636 /* 637 * Make the given DMA address and length available (locally mapped) via iov. 
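 *
 * Illustrative caller sketch (not a literal excerpt), showing how a mapping
 * made here is later released with vfu_sgl_put(), as unmap_q() and
 * unmap_sdbl() do; gpa, len, sg and ctx are placeholder names:
 *
 *	struct iovec iov = {};
 *	void *p = map_one(ctx, gpa, len, sg, &iov, PROT_READ | PROT_WRITE);
 *	if (p != NULL) {
 *		// ... access [p, p + len) directly ...
 *		vfu_sgl_put(ctx, sg, &iov, 1);
 *	}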
638 */ 639 static void * 640 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 641 struct iovec *iov, int prot) 642 { 643 int ret; 644 645 assert(ctx != NULL); 646 assert(sg != NULL); 647 assert(iov != NULL); 648 649 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 650 if (ret < 0) { 651 return NULL; 652 } 653 654 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 655 if (ret != 0) { 656 return NULL; 657 } 658 659 assert(iov->iov_base != NULL); 660 return iov->iov_base; 661 } 662 663 static int 664 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 665 uint32_t max_iovcnt, uint32_t len, size_t mps, 666 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 667 { 668 uint64_t prp1, prp2; 669 void *vva; 670 uint32_t i; 671 uint32_t residue_len, nents; 672 uint64_t *prp_list; 673 uint32_t iovcnt; 674 675 assert(max_iovcnt > 0); 676 677 prp1 = cmd->dptr.prp.prp1; 678 prp2 = cmd->dptr.prp.prp2; 679 680 /* PRP1 may started with unaligned page address */ 681 residue_len = mps - (prp1 % mps); 682 residue_len = spdk_min(len, residue_len); 683 684 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 685 if (spdk_unlikely(vva == NULL)) { 686 SPDK_ERRLOG("GPA to VVA failed\n"); 687 return -EINVAL; 688 } 689 len -= residue_len; 690 if (len && max_iovcnt < 2) { 691 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 692 return -ERANGE; 693 } 694 iovs[0].iov_base = vva; 695 iovs[0].iov_len = residue_len; 696 697 if (len) { 698 if (spdk_unlikely(prp2 == 0)) { 699 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 700 return -EINVAL; 701 } 702 703 if (len <= mps) { 704 /* 2 PRP used */ 705 iovcnt = 2; 706 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 707 if (spdk_unlikely(vva == NULL)) { 708 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 709 prp2, len); 710 return -EINVAL; 711 } 712 iovs[1].iov_base = vva; 713 iovs[1].iov_len = len; 714 } else { 715 /* PRP list used */ 716 nents = (len + mps - 1) / mps; 717 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 718 SPDK_ERRLOG("Too many page entries\n"); 719 return -ERANGE; 720 } 721 722 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 723 if (spdk_unlikely(vva == NULL)) { 724 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 725 prp2, nents); 726 return -EINVAL; 727 } 728 prp_list = vva; 729 i = 0; 730 while (len != 0) { 731 residue_len = spdk_min(len, mps); 732 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 733 if (spdk_unlikely(vva == NULL)) { 734 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 735 prp_list[i], residue_len); 736 return -EINVAL; 737 } 738 iovs[i + 1].iov_base = vva; 739 iovs[i + 1].iov_len = residue_len; 740 len -= residue_len; 741 i++; 742 } 743 iovcnt = i + 1; 744 } 745 } else { 746 /* 1 PRP used */ 747 iovcnt = 1; 748 } 749 750 assert(iovcnt <= max_iovcnt); 751 return iovcnt; 752 } 753 754 static int 755 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 756 struct iovec *iovs, uint32_t max_iovcnt, 757 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 758 { 759 uint32_t i; 760 void *vva; 761 762 if (spdk_unlikely(max_iovcnt < num_sgls)) { 763 return -ERANGE; 764 } 765 766 for (i = 0; i < num_sgls; i++) { 767 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 768 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 769 return -EINVAL; 770 } 771 vva = gpa_to_vva(prv, sgls[i].address, 
sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 772 if (spdk_unlikely(vva == NULL)) { 773 SPDK_ERRLOG("GPA to VVA failed\n"); 774 return -EINVAL; 775 } 776 iovs[i].iov_base = vva; 777 iovs[i].iov_len = sgls[i].unkeyed.length; 778 } 779 780 return num_sgls; 781 } 782 783 static int 784 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 785 uint32_t len, size_t mps, 786 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 787 { 788 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 789 uint32_t num_sgls, seg_len; 790 void *vva; 791 int ret; 792 uint32_t total_iovcnt = 0; 793 794 /* SGL cases */ 795 sgl = &cmd->dptr.sgl1; 796 797 /* only one SGL segment */ 798 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 799 assert(max_iovcnt > 0); 800 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 801 if (spdk_unlikely(vva == NULL)) { 802 SPDK_ERRLOG("GPA to VVA failed\n"); 803 return -EINVAL; 804 } 805 iovs[0].iov_base = vva; 806 iovs[0].iov_len = sgl->unkeyed.length; 807 assert(sgl->unkeyed.length == len); 808 809 return 1; 810 } 811 812 for (;;) { 813 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 814 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 815 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 816 return -EINVAL; 817 } 818 819 seg_len = sgl->unkeyed.length; 820 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 821 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 822 return -EINVAL; 823 } 824 825 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 826 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 827 if (spdk_unlikely(vva == NULL)) { 828 SPDK_ERRLOG("GPA to VVA failed\n"); 829 return -EINVAL; 830 } 831 832 /* sgl point to the first segment */ 833 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 834 last_sgl = &sgl[num_sgls - 1]; 835 836 /* we are done */ 837 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 838 /* map whole sgl list */ 839 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 840 max_iovcnt - total_iovcnt, gpa_to_vva); 841 if (spdk_unlikely(ret < 0)) { 842 return ret; 843 } 844 total_iovcnt += ret; 845 846 return total_iovcnt; 847 } 848 849 if (num_sgls > 1) { 850 /* map whole sgl exclude last_sgl */ 851 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 852 max_iovcnt - total_iovcnt, gpa_to_vva); 853 if (spdk_unlikely(ret < 0)) { 854 return ret; 855 } 856 total_iovcnt += ret; 857 } 858 859 /* move to next level's segments */ 860 sgl = last_sgl; 861 } 862 863 return 0; 864 } 865 866 static int 867 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 868 uint32_t len, size_t mps, 869 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 870 { 871 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 872 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 873 } 874 875 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 876 } 877 878 static char * 879 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 880 { 881 return endpoint->trid.traddr; 882 } 883 884 static char * 885 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 886 { 887 if (!ctrlr || !ctrlr->endpoint) { 888 return "Null Ctrlr"; 889 } 890 891 return endpoint_id(ctrlr->endpoint); 892 } 893 894 /* 895 * For each queue, update the location of its doorbell to the correct location: 896 
* either our own BAR0, or the guest's configured shadow doorbell area. 897 * 898 * The Admin queue (qid: 0) does not ever use shadow doorbells. 899 */ 900 static void 901 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 902 { 903 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 904 ctrlr->bar0_doorbells; 905 906 assert(doorbells != NULL); 907 908 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 909 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 910 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 911 912 if (sq != NULL) { 913 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 914 } 915 916 if (cq != NULL) { 917 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 918 } 919 } 920 } 921 922 static void 923 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 924 { 925 assert(vfu_ctx != NULL); 926 assert(sdbl != NULL); 927 928 /* 929 * An allocation error would result in only one of the two being 930 * non-NULL. If that is the case, no memory should have been mapped. 931 */ 932 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 933 return; 934 } 935 936 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 937 struct iovec *iov; 938 dma_sg_t *sg; 939 940 if (!sdbl->iovs[i].iov_len) { 941 continue; 942 } 943 944 sg = index_to_sg_t(sdbl->sgs, i); 945 iov = sdbl->iovs + i; 946 947 vfu_sgl_put(vfu_ctx, sg, iov, 1); 948 } 949 } 950 951 static void 952 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 953 { 954 if (sdbl == NULL) { 955 return; 956 } 957 958 unmap_sdbl(vfu_ctx, sdbl); 959 960 /* 961 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 962 * not allocated, so don't free() them. 963 */ 964 free(sdbl->sgs); 965 free(sdbl->iovs); 966 free(sdbl); 967 } 968 969 static struct nvmf_vfio_user_shadow_doorbells * 970 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 971 { 972 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 973 dma_sg_t *sg2 = NULL; 974 void *p; 975 976 assert(vfu_ctx != NULL); 977 978 sdbl = calloc(1, sizeof(*sdbl)); 979 if (sdbl == NULL) { 980 goto err; 981 } 982 983 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 984 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 985 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 986 goto err; 987 } 988 989 /* Map shadow doorbell buffer (PRP1). */ 990 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, 991 PROT_READ | PROT_WRITE); 992 993 if (p == NULL) { 994 goto err; 995 } 996 997 /* 998 * Map eventidx buffer (PRP2). 999 * Should only be written to by the controller. 1000 */ 1001 1002 sg2 = index_to_sg_t(sdbl->sgs, 1); 1003 1004 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, 1005 PROT_READ | PROT_WRITE); 1006 1007 if (p == NULL) { 1008 goto err; 1009 } 1010 1011 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1012 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1013 1014 return sdbl; 1015 1016 err: 1017 free_sdbl(vfu_ctx, sdbl); 1018 return NULL; 1019 } 1020 1021 /* 1022 * Copy doorbells from one buffer to the other, during switches betweeen BAR0 1023 * doorbells and shadow doorbells. 
1024 */ 1025 static void 1026 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1027 const volatile uint32_t *from, volatile uint32_t *to) 1028 { 1029 assert(ctrlr != NULL); 1030 assert(from != NULL); 1031 assert(to != NULL); 1032 1033 SPDK_DEBUGLOG(vfio_user_db, 1034 "%s: migrating shadow doorbells from %p to %p\n", 1035 ctrlr_id(ctrlr), from, to); 1036 1037 /* Can't use memcpy because it doesn't respect volatile semantics. */ 1038 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1039 if (ctrlr->sqs[i] != NULL) { 1040 to[queue_index(i, false)] = from[queue_index(i, false)]; 1041 } 1042 1043 if (ctrlr->cqs[i] != NULL) { 1044 to[queue_index(i, true)] = from[queue_index(i, true)]; 1045 } 1046 } 1047 } 1048 1049 static void 1050 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1051 { 1052 const struct spdk_nvmf_registers *regs; 1053 1054 assert(vu_ctrlr != NULL); 1055 assert(vu_ctrlr->ctrlr != NULL); 1056 1057 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1058 if (regs->csts.bits.cfs == 0) { 1059 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1060 } 1061 1062 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1063 } 1064 1065 static inline bool 1066 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1067 { 1068 assert(vu_ctrlr != NULL); 1069 assert(vu_ctrlr->endpoint != NULL); 1070 1071 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1072 1073 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1074 } 1075 1076 static void 1077 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1078 { 1079 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1080 1081 spdk_interrupt_unregister(&endpoint->accept_intr); 1082 spdk_poller_unregister(&endpoint->accept_poller); 1083 1084 if (endpoint->bar0_doorbells) { 1085 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1086 } 1087 1088 if (endpoint->devmem_fd > 0) { 1089 close(endpoint->devmem_fd); 1090 } 1091 1092 if (endpoint->migr_data) { 1093 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1094 } 1095 1096 if (endpoint->migr_fd > 0) { 1097 close(endpoint->migr_fd); 1098 } 1099 1100 if (endpoint->vfu_ctx) { 1101 vfu_destroy_ctx(endpoint->vfu_ctx); 1102 } 1103 1104 pthread_mutex_destroy(&endpoint->lock); 1105 free(endpoint); 1106 } 1107 1108 /* called when process exits */ 1109 static int 1110 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1111 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1112 { 1113 struct nvmf_vfio_user_transport *vu_transport; 1114 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1115 1116 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1117 1118 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1119 transport); 1120 1121 pthread_mutex_destroy(&vu_transport->lock); 1122 pthread_mutex_destroy(&vu_transport->pg_lock); 1123 1124 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1125 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1126 nvmf_vfio_user_destroy_endpoint(endpoint); 1127 } 1128 1129 free(vu_transport); 1130 1131 if (cb_fn) { 1132 cb_fn(cb_arg); 1133 } 1134 1135 return 0; 1136 } 1137 1138 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1139 { 1140 "disable_mappable_bar0", 1141 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1142 spdk_json_decode_bool, true 1143 }, 1144 { 1145 "disable_adaptive_irq", 1146 offsetof(struct 
nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1147 spdk_json_decode_bool, true 1148 }, 1149 { 1150 "disable_shadow_doorbells", 1151 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1152 spdk_json_decode_bool, true 1153 }, 1154 { 1155 "disable_compare", 1156 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1157 spdk_json_decode_bool, true 1158 }, 1159 }; 1160 1161 static struct spdk_nvmf_transport * 1162 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1163 { 1164 struct nvmf_vfio_user_transport *vu_transport; 1165 int err; 1166 1167 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1168 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1169 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1170 return NULL; 1171 } 1172 1173 vu_transport = calloc(1, sizeof(*vu_transport)); 1174 if (vu_transport == NULL) { 1175 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1176 return NULL; 1177 } 1178 1179 err = pthread_mutex_init(&vu_transport->lock, NULL); 1180 if (err != 0) { 1181 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1182 goto err; 1183 } 1184 TAILQ_INIT(&vu_transport->endpoints); 1185 1186 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1187 if (err != 0) { 1188 pthread_mutex_destroy(&vu_transport->lock); 1189 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1190 goto err; 1191 } 1192 TAILQ_INIT(&vu_transport->poll_groups); 1193 1194 if (opts->transport_specific != NULL && 1195 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1196 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1197 vu_transport)) { 1198 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1199 goto cleanup; 1200 } 1201 1202 /* 1203 * To support interrupt mode, the transport must be configured with 1204 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1205 * when a client writes new doorbell values to BAR0, via the 1206 * libvfio-user socket fd. 1207 */ 1208 vu_transport->intr_mode_supported = 1209 vu_transport->transport_opts.disable_mappable_bar0; 1210 1211 /* 1212 * If BAR0 is mappable, it doesn't make sense to support shadow 1213 * doorbells, so explicitly turn it off. 1214 */ 1215 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1216 vu_transport->transport_opts.disable_shadow_doorbells = true; 1217 } 1218 1219 /* 1220 * If we are in interrupt mode, we cannot support adaptive IRQs, as 1221 * there is no guarantee the SQ poller will run subsequently to send 1222 * pending IRQs. 
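 * (When adaptive IRQs are in use, post_completion() skips the immediate
 * vfu_irq_trigger() for I/O completion queues and relies on the SQ poller to
 * send the pending interrupt later; see the cq->qid / adaptive_irqs_enabled
 * check there.)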
1223 */ 1224 if (spdk_interrupt_mode_is_enabled()) { 1225 vu_transport->transport_opts.disable_adaptive_irq = true; 1226 } 1227 1228 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1229 vu_transport->transport_opts.disable_mappable_bar0); 1230 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1231 vu_transport->transport_opts.disable_adaptive_irq); 1232 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1233 vu_transport->transport_opts.disable_shadow_doorbells); 1234 1235 return &vu_transport->transport; 1236 1237 cleanup: 1238 pthread_mutex_destroy(&vu_transport->lock); 1239 pthread_mutex_destroy(&vu_transport->pg_lock); 1240 err: 1241 free(vu_transport); 1242 return NULL; 1243 } 1244 1245 static uint32_t 1246 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1247 { 1248 assert(vu_ctrlr != NULL); 1249 assert(vu_ctrlr->ctrlr != NULL); 1250 1251 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1252 } 1253 1254 static uint32_t 1255 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1256 { 1257 assert(vu_ctrlr != NULL); 1258 assert(vu_ctrlr->ctrlr != NULL); 1259 1260 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1261 } 1262 1263 static uintptr_t 1264 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1265 { 1266 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1267 return 1ul << memory_page_shift; 1268 } 1269 1270 static uintptr_t 1271 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1272 { 1273 return ~(memory_page_size(ctrlr) - 1); 1274 } 1275 1276 static int 1277 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1278 uint32_t q_size, bool is_cq, bool unmap) 1279 { 1280 uint64_t len; 1281 void *ret; 1282 1283 assert(q_size); 1284 assert(q_addr(mapping) == NULL); 1285 1286 if (is_cq) { 1287 len = q_size * sizeof(struct spdk_nvme_cpl); 1288 } else { 1289 len = q_size * sizeof(struct spdk_nvme_cmd); 1290 } 1291 1292 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1293 mapping->sg, &mapping->iov, 1294 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1295 if (ret == NULL) { 1296 return -EFAULT; 1297 } 1298 1299 if (unmap) { 1300 memset(q_addr(mapping), 0, len); 1301 } 1302 1303 return 0; 1304 } 1305 1306 static inline void 1307 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1308 { 1309 if (q_addr(mapping) != NULL) { 1310 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1311 &mapping->iov, 1); 1312 mapping->iov.iov_base = NULL; 1313 } 1314 } 1315 1316 static int 1317 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1318 { 1319 struct nvmf_vfio_user_sq *sq; 1320 const struct spdk_nvmf_registers *regs; 1321 int ret; 1322 1323 assert(ctrlr != NULL); 1324 1325 sq = ctrlr->sqs[0]; 1326 1327 assert(sq != NULL); 1328 assert(q_addr(&sq->mapping) == NULL); 1329 /* XXX ctrlr->asq == 0 is a valid memory address */ 1330 1331 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1332 sq->qid = 0; 1333 sq->size = regs->aqa.bits.asqs + 1; 1334 sq->mapping.prp1 = regs->asq; 1335 *sq_headp(sq) = 0; 1336 sq->cqid = 0; 1337 1338 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1339 if (ret) { 1340 return ret; 1341 } 1342 1343 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1344 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1345 1346 *sq_dbl_tailp(sq) = 0; 1347 1348 return 0; 1349 } 1350 1351 /* 1352 * Updates eventidx to set an SQ into interrupt or polling mode. 
1353 * 1354 * Returns false if the current SQ tail does not match the SQ head, as 1355 * this means that the host has submitted more items to the queue while we were 1356 * not looking - or during the event index update. In that case, we must retry, 1357 * or otherwise make sure we are going to wake up again. 1358 */ 1359 static bool 1360 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1361 { 1362 struct nvmf_vfio_user_ctrlr *ctrlr; 1363 volatile uint32_t *sq_tail_eidx; 1364 uint32_t old_tail, new_tail; 1365 1366 assert(sq != NULL); 1367 assert(sq->ctrlr != NULL); 1368 assert(sq->ctrlr->sdbl != NULL); 1369 assert(sq->need_rearm); 1370 1371 ctrlr = sq->ctrlr; 1372 1373 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1374 ctrlr_id(ctrlr), sq->qid); 1375 1376 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1377 1378 assert(ctrlr->endpoint != NULL); 1379 1380 if (!ctrlr->endpoint->interrupt_mode) { 1381 /* No synchronisation necessary. */ 1382 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1383 return true; 1384 } 1385 1386 old_tail = *sq_dbl_tailp(sq); 1387 *sq_tail_eidx = old_tail; 1388 1389 /* 1390 * Ensure that the event index is updated before re-reading the tail 1391 * doorbell. If it's not, then the host might race us and update the 1392 * tail after the second read but before the event index is written, so 1393 * it won't write to BAR0 and we'll miss the update. 1394 * 1395 * The driver should provide similar ordering with an mb(). 1396 */ 1397 spdk_mb(); 1398 1399 /* 1400 * Check if the host has updated the tail doorbell after we've read it 1401 * for the first time, but before the event index was written. If that's 1402 * the case, then we've lost the race and we need to update the event 1403 * index again (after polling the queue, since the host won't write to 1404 * BAR0). 1405 */ 1406 new_tail = *sq_dbl_tailp(sq); 1407 1408 /* 1409 * We might poll the queue straight after this function returns if the 1410 * tail has been updated, so we need to ensure that any changes to the 1411 * queue will be visible to us if the doorbell has been updated. 1412 * 1413 * The driver should provide similar ordering with a wmb() to ensure 1414 * that the queue is written before it updates the tail doorbell. 1415 */ 1416 spdk_rmb(); 1417 1418 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1419 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1420 new_tail, *sq_headp(sq)); 1421 1422 if (new_tail == *sq_headp(sq)) { 1423 sq->need_rearm = false; 1424 return true; 1425 } 1426 1427 /* 1428 * We've lost the race: the tail was updated since we last polled, 1429 * including if it happened within this routine. 1430 * 1431 * The caller should retry after polling (think of this as a cmpxchg 1432 * loop); if we go to sleep while the SQ is not empty, then we won't 1433 * process the remaining events. 1434 */ 1435 return false; 1436 } 1437 1438 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1439 1440 /* 1441 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1442 * processed some SQ entries. 1443 */ 1444 static int 1445 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1446 struct nvmf_vfio_user_sq *sq) 1447 { 1448 int count = 0; 1449 size_t i; 1450 1451 assert(sq->need_rearm); 1452 1453 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1454 int ret; 1455 1456 if (set_sq_eventidx(sq)) { 1457 /* We won the race and set eventidx; done. 
*/ 1458 return count; 1459 } 1460 1461 ret = nvmf_vfio_user_sq_poll(sq); 1462 1463 count += (ret < 0) ? 1 : ret; 1464 1465 /* 1466 * set_sq_eventidx() hit the race, so we expected 1467 * to process at least one command from this queue. 1468 * If there were no new commands waiting for us, then 1469 * we must have hit an unexpected race condition. 1470 */ 1471 if (ret == 0) { 1472 SPDK_ERRLOG("%s: unexpected race condition detected " 1473 "while updating the shadow doorbell buffer\n", 1474 ctrlr_id(ctrlr)); 1475 1476 fail_ctrlr(ctrlr); 1477 return count; 1478 } 1479 } 1480 1481 SPDK_DEBUGLOG(vfio_user_db, 1482 "%s: set_sq_eventidx() lost the race %zu times\n", 1483 ctrlr_id(ctrlr), i); 1484 1485 /* 1486 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1487 * we raced with the producer too many times; force ourselves to wake up 1488 * instead. We'll process all queues at that point. 1489 */ 1490 ctrlr_kick(ctrlr); 1491 1492 return count; 1493 } 1494 1495 /* 1496 * We're in interrupt mode, and potentially about to go to sleep. We need to 1497 * make sure any further I/O submissions are guaranteed to wake us up: for 1498 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1499 * every SQ that needs re-arming. 1500 * 1501 * Returns non-zero if we processed something. 1502 */ 1503 static int 1504 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1505 { 1506 struct nvmf_vfio_user_sq *sq; 1507 int count = 0; 1508 1509 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1510 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1511 continue; 1512 } 1513 1514 if (sq->need_rearm) { 1515 count += vfio_user_sq_rearm(sq->ctrlr, sq); 1516 } 1517 } 1518 1519 return count; 1520 } 1521 1522 static int 1523 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1524 { 1525 struct nvmf_vfio_user_cq *cq; 1526 const struct spdk_nvmf_registers *regs; 1527 int ret; 1528 1529 assert(ctrlr != NULL); 1530 1531 cq = ctrlr->cqs[0]; 1532 1533 assert(cq != NULL); 1534 1535 assert(q_addr(&cq->mapping) == NULL); 1536 1537 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1538 assert(regs != NULL); 1539 cq->qid = 0; 1540 cq->size = regs->aqa.bits.acqs + 1; 1541 cq->mapping.prp1 = regs->acq; 1542 *cq_tailp(cq) = 0; 1543 cq->ien = true; 1544 cq->phase = true; 1545 1546 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1547 if (ret) { 1548 return ret; 1549 } 1550 1551 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
*/ 1552 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1553 1554 *cq_dbl_headp(cq) = 0; 1555 1556 return 0; 1557 } 1558 1559 static void * 1560 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 1561 { 1562 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1563 struct spdk_nvmf_qpair *qpair; 1564 struct nvmf_vfio_user_req *vu_req; 1565 struct nvmf_vfio_user_sq *sq; 1566 void *ret; 1567 1568 assert(req != NULL); 1569 qpair = req->qpair; 1570 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1571 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1572 1573 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1574 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1575 index_to_sg_t(vu_req->sg, vu_req->iovcnt), 1576 &vu_req->iov[vu_req->iovcnt], prot); 1577 if (spdk_likely(ret != NULL)) { 1578 vu_req->iovcnt++; 1579 } 1580 return ret; 1581 } 1582 1583 static int 1584 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1585 struct iovec *iov, uint32_t length) 1586 { 1587 /* Map PRP list to from Guest physical memory to 1588 * virtual memory address. 1589 */ 1590 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1591 length, 4096, _map_one); 1592 } 1593 1594 static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1595 struct nvmf_vfio_user_sq *sq); 1596 1597 /* 1598 * Posts a CQE in the completion queue. 1599 * 1600 * @ctrlr: the vfio-user controller 1601 * @cq: the completion queue 1602 * @cdw0: cdw0 as reported by NVMf 1603 * @sqid: submission queue ID 1604 * @cid: command identifier in NVMe command 1605 * @sc: the NVMe CQE status code 1606 * @sct: the NVMe CQE status code type 1607 */ 1608 static int 1609 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1610 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1611 { 1612 struct spdk_nvme_status cpl_status = { 0 }; 1613 struct spdk_nvme_cpl *cpl; 1614 int err; 1615 1616 assert(ctrlr != NULL); 1617 1618 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1619 return 0; 1620 } 1621 1622 if (cq->qid == 0) { 1623 assert(spdk_get_thread() == cq->thread); 1624 } 1625 1626 if (cq_is_full(cq)) { 1627 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1628 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1629 *cq_dbl_headp(cq)); 1630 return -1; 1631 } 1632 1633 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1634 1635 assert(ctrlr->sqs[sqid] != NULL); 1636 SPDK_DEBUGLOG(nvmf_vfio, 1637 "%s: request complete sqid:%d cid=%d status=%#x " 1638 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1639 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1640 1641 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1642 cpl->sqid = sqid; 1643 cpl->cid = cid; 1644 cpl->cdw0 = cdw0; 1645 1646 /* 1647 * This is a bitfield: instead of setting the individual bits we need 1648 * directly in cpl->status, which would cause a read-modify-write cycle, 1649 * we'll avoid reading from the CPL altogether by filling in a local 1650 * cpl_status variable, then writing the whole thing. 1651 */ 1652 cpl_status.sct = sct; 1653 cpl_status.sc = sc; 1654 cpl_status.p = cq->phase; 1655 cpl->status = cpl_status; 1656 1657 /* Ensure the Completion Queue Entry is visible. 
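 * (i.e. make sure the CQE stores above have completed before we advance our
 * tail and possibly trigger an interrupt below)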
*/ 1658 spdk_wmb(); 1659 cq_tail_advance(cq); 1660 1661 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1662 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1663 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1664 if (err != 0) { 1665 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1666 ctrlr_id(ctrlr)); 1667 return err; 1668 } 1669 } 1670 1671 return 0; 1672 } 1673 1674 static void 1675 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1676 { 1677 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1678 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1679 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1680 free(vu_req); 1681 } 1682 } 1683 1684 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1685 * and the controller is being shut down or reset, then the CQ is 1686 * also deleted. 1687 */ 1688 static void 1689 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1690 { 1691 struct nvmf_vfio_user_cq *cq; 1692 uint16_t cqid; 1693 1694 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1695 sq->qid, sq); 1696 1697 /* Free SQ resources */ 1698 unmap_q(vu_ctrlr, &sq->mapping); 1699 1700 free_sq_reqs(sq); 1701 1702 sq->size = 0; 1703 1704 sq->sq_state = VFIO_USER_SQ_DELETED; 1705 1706 /* Controller RESET and SHUTDOWN are special cases, 1707 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1708 * will disconnect IO queue pairs. 1709 */ 1710 if (vu_ctrlr->reset_shn) { 1711 cqid = sq->cqid; 1712 cq = vu_ctrlr->cqs[cqid]; 1713 1714 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1715 cq->qid, cq); 1716 1717 if (cq->cq_ref) { 1718 cq->cq_ref--; 1719 } 1720 if (cq->cq_ref == 0) { 1721 unmap_q(vu_ctrlr, &cq->mapping); 1722 cq->size = 0; 1723 cq->cq_state = VFIO_USER_CQ_DELETED; 1724 cq->group = NULL; 1725 } 1726 } 1727 } 1728 1729 static void 1730 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1731 { 1732 struct nvmf_vfio_user_sq *sq; 1733 struct nvmf_vfio_user_cq *cq; 1734 1735 if (ctrlr == NULL) { 1736 return; 1737 } 1738 1739 sq = ctrlr->sqs[qid]; 1740 if (sq) { 1741 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid); 1742 unmap_q(ctrlr, &sq->mapping); 1743 1744 free_sq_reqs(sq); 1745 1746 free(sq->mapping.sg); 1747 free(sq); 1748 ctrlr->sqs[qid] = NULL; 1749 } 1750 1751 cq = ctrlr->cqs[qid]; 1752 if (cq) { 1753 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1754 unmap_q(ctrlr, &cq->mapping); 1755 free(cq->mapping.sg); 1756 free(cq); 1757 ctrlr->cqs[qid] = NULL; 1758 } 1759 } 1760 1761 static int 1762 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1763 const uint16_t id) 1764 { 1765 struct nvmf_vfio_user_sq *sq; 1766 1767 assert(ctrlr != NULL); 1768 assert(transport != NULL); 1769 assert(ctrlr->sqs[id] == NULL); 1770 1771 sq = calloc(1, sizeof(*sq)); 1772 if (sq == NULL) { 1773 return -ENOMEM; 1774 } 1775 sq->mapping.sg = calloc(1, dma_sg_size()); 1776 if (sq->mapping.sg == NULL) { 1777 free(sq); 1778 return -ENOMEM; 1779 } 1780 1781 sq->qid = id; 1782 sq->qpair.qid = id; 1783 sq->qpair.transport = transport; 1784 sq->ctrlr = ctrlr; 1785 ctrlr->sqs[id] = sq; 1786 1787 TAILQ_INIT(&sq->free_reqs); 1788 1789 return 0; 1790 } 1791 1792 static int 1793 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1794 { 1795 struct nvmf_vfio_user_cq *cq; 1796 1797 assert(vu_ctrlr != NULL); 1798 assert(vu_ctrlr->cqs[id] == NULL); 1799 1800 cq = calloc(1, sizeof(*cq)); 1801 if (cq == NULL) { 
1802 return -ENOMEM; 1803 } 1804 cq->mapping.sg = calloc(1, dma_sg_size()); 1805 if (cq->mapping.sg == NULL) { 1806 free(cq); 1807 return -ENOMEM; 1808 } 1809 1810 cq->qid = id; 1811 vu_ctrlr->cqs[id] = cq; 1812 1813 return 0; 1814 } 1815 1816 static int 1817 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1818 { 1819 struct nvmf_vfio_user_req *vu_req, *tmp; 1820 size_t req_size; 1821 uint32_t i; 1822 1823 req_size = sizeof(struct nvmf_vfio_user_req) + 1824 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1825 1826 for (i = 0; i < sq->size; i++) { 1827 struct spdk_nvmf_request *req; 1828 1829 vu_req = calloc(1, req_size); 1830 if (vu_req == NULL) { 1831 goto err; 1832 } 1833 1834 req = &vu_req->req; 1835 req->qpair = &sq->qpair; 1836 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1837 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1838 req->stripped_data = NULL; 1839 1840 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1841 } 1842 1843 return 0; 1844 1845 err: 1846 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1847 free(vu_req); 1848 } 1849 return -ENOMEM; 1850 } 1851 1852 static volatile uint32_t * 1853 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 1854 { 1855 return ctrlr->sdbl != NULL ? 1856 ctrlr->sdbl->shadow_doorbells : 1857 ctrlr->bar0_doorbells; 1858 } 1859 1860 static uint16_t 1861 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 1862 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1863 { 1864 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1865 struct nvmf_vfio_user_sq *sq; 1866 uint32_t qsize; 1867 uint16_t cqid; 1868 uint16_t qid; 1869 int err; 1870 1871 qid = cmd->cdw10_bits.create_io_q.qid; 1872 cqid = cmd->cdw11_bits.create_io_sq.cqid; 1873 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1874 1875 if (ctrlr->sqs[qid] == NULL) { 1876 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 1877 if (err != 0) { 1878 *sct = SPDK_NVME_SCT_GENERIC; 1879 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1880 } 1881 } 1882 1883 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1884 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 1885 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1886 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1887 } 1888 1889 /* CQ must be created before SQ. 
*/ 1890 if (!io_q_exists(ctrlr, cqid, true)) { 1891 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 1892 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1893 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 1894 } 1895 1896 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 1897 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 1898 *sct = SPDK_NVME_SCT_GENERIC; 1899 return SPDK_NVME_SC_INVALID_FIELD; 1900 } 1901 1902 sq = ctrlr->sqs[qid]; 1903 sq->size = qsize; 1904 1905 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 1906 qid, cqid); 1907 1908 sq->mapping.prp1 = cmd->dptr.prp.prp1; 1909 1910 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1911 if (err) { 1912 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1913 *sct = SPDK_NVME_SCT_GENERIC; 1914 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1915 } 1916 1917 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 1918 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1919 q_addr(&sq->mapping)); 1920 1921 err = alloc_sq_reqs(ctrlr, sq); 1922 if (err < 0) { 1923 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 1924 *sct = SPDK_NVME_SCT_GENERIC; 1925 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1926 } 1927 1928 sq->cqid = cqid; 1929 ctrlr->cqs[sq->cqid]->cq_ref++; 1930 sq->sq_state = VFIO_USER_SQ_CREATED; 1931 *sq_headp(sq) = 0; 1932 1933 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 1934 1935 /* 1936 * We should always reset the doorbells. 1937 * 1938 * The Specification prohibits the controller from writing to the shadow 1939 * doorbell buffer, however older versions of the Linux NVMe driver 1940 * don't reset the shadow doorbell buffer after a Queue-Level or 1941 * Controller-Level reset, which means that we're left with garbage 1942 * doorbell values. 1943 */ 1944 *sq_dbl_tailp(sq) = 0; 1945 1946 if (ctrlr->sdbl != NULL) { 1947 sq->need_rearm = true; 1948 1949 if (!set_sq_eventidx(sq)) { 1950 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 1951 "sqid:%hu was initialized\n", 1952 ctrlr_id(ctrlr), qid); 1953 fail_ctrlr(ctrlr); 1954 *sct = SPDK_NVME_SCT_GENERIC; 1955 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1956 } 1957 } 1958 1959 /* 1960 * Create our new I/O qpair. This asynchronously invokes, on a suitable 1961 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 1962 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 1963 * connect command. This command is then eventually completed via 1964 * handle_queue_connect_rsp(). 
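 * (The post_create_io_sq_completion flag set just below tells
 * handle_queue_connect_rsp() to post the resulting CQE back to the guest; it
 * is left unset when SQs are merely re-connected on a migration destination.)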
1965 */ 1966 sq->create_io_sq_cmd = *cmd; 1967 sq->post_create_io_sq_completion = true; 1968 1969 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 1970 &sq->qpair); 1971 1972 *sct = SPDK_NVME_SCT_GENERIC; 1973 return SPDK_NVME_SC_SUCCESS; 1974 } 1975 1976 static uint16_t 1977 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 1978 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1979 { 1980 struct nvmf_vfio_user_cq *cq; 1981 uint32_t qsize; 1982 uint16_t qid; 1983 int err; 1984 1985 qid = cmd->cdw10_bits.create_io_q.qid; 1986 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1987 1988 if (ctrlr->cqs[qid] == NULL) { 1989 err = init_cq(ctrlr, qid); 1990 if (err != 0) { 1991 *sct = SPDK_NVME_SCT_GENERIC; 1992 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1993 } 1994 } 1995 1996 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 1997 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 1998 *sct = SPDK_NVME_SCT_GENERIC; 1999 return SPDK_NVME_SC_INVALID_FIELD; 2000 } 2001 2002 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 2003 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2004 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2005 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2006 } 2007 2008 cq = ctrlr->cqs[qid]; 2009 cq->size = qsize; 2010 2011 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2012 2013 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2014 2015 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2016 if (err) { 2017 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2018 *sct = SPDK_NVME_SCT_GENERIC; 2019 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2020 } 2021 2022 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2023 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2024 q_addr(&cq->mapping)); 2025 2026 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2027 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2028 cq->phase = true; 2029 cq->cq_state = VFIO_USER_CQ_CREATED; 2030 2031 *cq_tailp(cq) = 0; 2032 2033 /* 2034 * We should always reset the doorbells. 2035 * 2036 * The Specification prohibits the controller from writing to the shadow 2037 * doorbell buffer, however older versions of the Linux NVMe driver 2038 * don't reset the shadow doorbell buffer after a Queue-Level or 2039 * Controller-Level reset, which means that we're left with garbage 2040 * doorbell values. 2041 */ 2042 *cq_dbl_headp(cq) = 0; 2043 2044 *sct = SPDK_NVME_SCT_GENERIC; 2045 return SPDK_NVME_SC_SUCCESS; 2046 } 2047 2048 /* 2049 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2050 * on error. 2051 */ 2052 static int 2053 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2054 struct spdk_nvme_cmd *cmd, const bool is_cq) 2055 { 2056 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2057 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2058 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2059 uint32_t qsize; 2060 uint16_t qid; 2061 2062 assert(ctrlr != NULL); 2063 assert(cmd != NULL); 2064 2065 qid = cmd->cdw10_bits.create_io_q.qid; 2066 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2067 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2068 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2069 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2070 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2071 goto out; 2072 } 2073 2074 if (io_q_exists(ctrlr, qid, is_cq)) { 2075 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2076 is_cq ? 
'c' : 's', qid); 2077 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2078 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2079 goto out; 2080 } 2081 2082 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2083 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2084 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2085 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2086 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2087 goto out; 2088 } 2089 2090 if (is_cq) { 2091 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2092 } else { 2093 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2094 2095 if (sct == SPDK_NVME_SCT_GENERIC && 2096 sc == SPDK_NVME_SC_SUCCESS) { 2097 /* Completion posted asynchronously. */ 2098 return 0; 2099 } 2100 } 2101 2102 out: 2103 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2104 } 2105 2106 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2107 * queue pair, so save the command in a context. 2108 */ 2109 struct vfio_user_delete_sq_ctx { 2110 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2111 struct spdk_nvme_cmd delete_io_sq_cmd; 2112 }; 2113 2114 static void 2115 vfio_user_qpair_delete_cb(void *cb_arg) 2116 { 2117 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2118 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2119 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2120 2121 if (admin_cq->thread != spdk_get_thread()) { 2122 assert(admin_cq->thread != NULL); 2123 spdk_thread_send_msg(admin_cq->thread, 2124 vfio_user_qpair_delete_cb, 2125 cb_arg); 2126 } else { 2127 post_completion(vu_ctrlr, admin_cq, 0, 0, 2128 ctx->delete_io_sq_cmd.cid, 2129 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2130 free(ctx); 2131 } 2132 } 2133 2134 /* 2135 * Deletes a completion or submission I/O queue. 2136 */ 2137 static int 2138 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2139 struct spdk_nvme_cmd *cmd, const bool is_cq) 2140 { 2141 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2142 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2143 struct nvmf_vfio_user_sq *sq; 2144 struct nvmf_vfio_user_cq *cq; 2145 struct vfio_user_delete_sq_ctx *ctx; 2146 2147 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2148 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2149 cmd->cdw10_bits.delete_io_q.qid); 2150 2151 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2152 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2153 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2154 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2155 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2156 goto out; 2157 } 2158 2159 if (is_cq) { 2160 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2161 if (cq->cq_ref) { 2162 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2163 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2164 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2165 goto out; 2166 } 2167 2168 unmap_q(ctrlr, &cq->mapping); 2169 cq->size = 0; 2170 cq->cq_state = VFIO_USER_CQ_DELETED; 2171 cq->group = NULL; 2172 } else { 2173 ctx = calloc(1, sizeof(*ctx)); 2174 if (!ctx) { 2175 sct = SPDK_NVME_SCT_GENERIC; 2176 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2177 goto out; 2178 } 2179 ctx->vu_ctrlr = ctrlr; 2180 ctx->delete_io_sq_cmd = *cmd; 2181 2182 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2183 sq->sq_state = VFIO_USER_SQ_DELETED; 2184 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2185 ctrlr->cqs[sq->cqid]->cq_ref--; 2186 2187 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 2188 return 0; 2189 } 2190 2191 out: 2192 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2193 } 2194 2195 /* 2196 * Configures Shadow Doorbells. 2197 */ 2198 static int 2199 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2200 { 2201 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2202 uint32_t dstrd; 2203 uintptr_t page_size, page_mask; 2204 uint64_t prp1, prp2; 2205 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2206 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2207 2208 assert(ctrlr != NULL); 2209 assert(ctrlr->endpoint != NULL); 2210 assert(cmd != NULL); 2211 2212 dstrd = doorbell_stride(ctrlr); 2213 page_size = memory_page_size(ctrlr); 2214 page_mask = memory_page_mask(ctrlr); 2215 2216 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2217 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2218 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2219 ctrlr_id(ctrlr)); 2220 2221 goto out; 2222 } 2223 2224 /* Verify guest physical addresses passed as PRPs. */ 2225 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2226 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2227 ctrlr_id(ctrlr)); 2228 2229 goto out; 2230 } 2231 2232 prp1 = cmd->dptr.prp.prp1; 2233 prp2 = cmd->dptr.prp.prp2; 2234 2235 SPDK_DEBUGLOG(nvmf_vfio, 2236 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2237 ctrlr_id(ctrlr), prp1, prp2); 2238 2239 if (prp1 == prp2 2240 || prp1 != (prp1 & page_mask) 2241 || prp2 != (prp2 & page_mask)) { 2242 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2243 ctrlr_id(ctrlr)); 2244 2245 goto out; 2246 } 2247 2248 /* Map guest physical addresses to our virtual address space. 
*/ 2249 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2250 if (sdbl == NULL) { 2251 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2252 ctrlr_id(ctrlr)); 2253 2254 goto out; 2255 } 2256 2257 ctrlr->shadow_doorbell_buffer = prp1; 2258 ctrlr->eventidx_buffer = prp2; 2259 2260 SPDK_DEBUGLOG(nvmf_vfio, 2261 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2262 ctrlr_id(ctrlr), 2263 sdbl->iovs[0].iov_base, 2264 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2265 sdbl->iovs[1].iov_base, 2266 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2267 2268 2269 /* 2270 * Set all possible CQ head doorbells to polling mode now, such that we 2271 * don't have to worry about it later if the host creates more queues. 2272 * 2273 * We only ever want interrupts for writes to the SQ tail doorbells 2274 * (which are initialised in set_ctrlr_intr_mode() below). 2275 */ 2276 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2277 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2278 if (ctrlr->sqs[i] != NULL) { 2279 ctrlr->sqs[i]->need_rearm = true; 2280 } 2281 } 2282 2283 /* Update controller. */ 2284 SWAP(ctrlr->sdbl, sdbl); 2285 2286 /* 2287 * Copy doorbells from either the previous shadow doorbell buffer or the 2288 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2289 * 2290 * This needs to account for older versions of the Linux NVMe driver, 2291 * which don't clear out the buffer after a controller reset. 2292 */ 2293 copy_doorbells(ctrlr, sdbl != NULL ? 2294 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2295 ctrlr->sdbl->shadow_doorbells); 2296 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2297 2298 /* Update event index buffer and poll queues if necessary. */ 2299 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 2300 2301 sc = SPDK_NVME_SC_SUCCESS; 2302 2303 out: 2304 /* 2305 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2306 * more than once (pointless, but not prohibited by the spec), or 2307 * in case of an error. 2308 * 2309 * If this is the first time Doorbell Buffer Config was processed, 2310 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2311 * free_sdbl() becomes a noop. 2312 */ 2313 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2314 2315 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2316 } 2317 2318 /* Returns 0 on success and -errno on error. */ 2319 static int 2320 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2321 { 2322 assert(ctrlr != NULL); 2323 assert(cmd != NULL); 2324 2325 if (cmd->fuse != 0) { 2326 /* Fused admin commands are not supported. 
*/ 2327 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2328 SPDK_NVME_SC_INVALID_FIELD, 2329 SPDK_NVME_SCT_GENERIC); 2330 } 2331 2332 switch (cmd->opc) { 2333 case SPDK_NVME_OPC_CREATE_IO_CQ: 2334 case SPDK_NVME_OPC_CREATE_IO_SQ: 2335 return handle_create_io_q(ctrlr, cmd, 2336 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2337 case SPDK_NVME_OPC_DELETE_IO_SQ: 2338 case SPDK_NVME_OPC_DELETE_IO_CQ: 2339 return handle_del_io_q(ctrlr, cmd, 2340 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2341 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2342 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2343 return handle_doorbell_buffer_config(ctrlr, cmd); 2344 } 2345 /* FALLTHROUGH */ 2346 default: 2347 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2348 } 2349 } 2350 2351 static int 2352 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2353 { 2354 struct nvmf_vfio_user_sq *sq = cb_arg; 2355 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2356 uint16_t sqid, cqid; 2357 2358 assert(sq != NULL); 2359 assert(vu_req != NULL); 2360 assert(vu_ctrlr != NULL); 2361 2362 if (spdk_likely(vu_req->iovcnt)) { 2363 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2364 index_to_sg_t(vu_req->sg, 0), 2365 vu_req->iov, vu_req->iovcnt); 2366 } 2367 sqid = sq->qid; 2368 cqid = sq->cqid; 2369 2370 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2371 vu_req->req.rsp->nvme_cpl.cdw0, 2372 sqid, 2373 vu_req->req.cmd->nvme_cmd.cid, 2374 vu_req->req.rsp->nvme_cpl.status.sc, 2375 vu_req->req.rsp->nvme_cpl.status.sct); 2376 } 2377 2378 static int 2379 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2380 struct spdk_nvme_cmd *cmd) 2381 { 2382 assert(sq != NULL); 2383 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 2384 return consume_admin_cmd(ctrlr, cmd); 2385 } 2386 2387 return handle_cmd_req(ctrlr, cmd, sq); 2388 } 2389 2390 /* Returns the number of commands processed, or a negative value on error. */ 2391 static int 2392 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2393 struct nvmf_vfio_user_sq *sq) 2394 { 2395 struct spdk_nvme_cmd *queue; 2396 int count = 0; 2397 2398 assert(ctrlr != NULL); 2399 assert(sq != NULL); 2400 2401 if (ctrlr->sdbl != NULL) { 2402 /* 2403 * Submission queue index has moved past the event index, so it 2404 * needs to be re-armed before we go to sleep. 2405 */ 2406 sq->need_rearm = true; 2407 } 2408 2409 queue = q_addr(&sq->mapping); 2410 while (*sq_headp(sq) != new_tail) { 2411 int err; 2412 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 2413 2414 count++; 2415 2416 /* 2417 * SQHD must contain the new head pointer, so we must increase 2418 * it before we generate a completion. 
2419 */ 2420 sq_head_advance(sq); 2421 2422 err = consume_cmd(ctrlr, sq, cmd); 2423 if (err != 0) { 2424 return err; 2425 } 2426 } 2427 2428 return count; 2429 } 2430 2431 /* Checks whether endpoint is connected from the same process */ 2432 static bool 2433 is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint) 2434 { 2435 struct ucred ucred; 2436 socklen_t ucredlen = sizeof(ucred); 2437 2438 if (endpoint == NULL) { 2439 return false; 2440 } 2441 2442 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred, 2443 &ucredlen) < 0) { 2444 SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno)); 2445 return false; 2446 } 2447 2448 return ucred.pid == getpid(); 2449 } 2450 2451 static void 2452 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2453 { 2454 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2455 struct nvmf_vfio_user_ctrlr *ctrlr; 2456 struct nvmf_vfio_user_sq *sq; 2457 struct nvmf_vfio_user_cq *cq; 2458 void *map_start, *map_end; 2459 int ret; 2460 2461 /* 2462 * We're not interested in any DMA regions that aren't mappable (we don't 2463 * support clients that don't share their memory). 2464 */ 2465 if (!info->vaddr) { 2466 return; 2467 } 2468 2469 map_start = info->mapping.iov_base; 2470 map_end = info->mapping.iov_base + info->mapping.iov_len; 2471 2472 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2473 (info->mapping.iov_len & MASK_2MB)) { 2474 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2475 info->vaddr, map_start, map_end); 2476 return; 2477 } 2478 2479 assert(endpoint != NULL); 2480 if (endpoint->ctrlr == NULL) { 2481 return; 2482 } 2483 ctrlr = endpoint->ctrlr; 2484 2485 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2486 map_start, map_end); 2487 2488 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO; here we also 2489 * check the protection bits before registering. When the vfio client and server run in the same process 2490 * there is no need to register the same memory again.
2491 */ 2492 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2493 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2494 if (ret) { 2495 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2496 map_start, map_end, ret); 2497 } 2498 } 2499 2500 pthread_mutex_lock(&endpoint->lock); 2501 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2502 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2503 continue; 2504 } 2505 2506 cq = ctrlr->cqs[sq->cqid]; 2507 2508 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2509 if (cq->size && q_addr(&cq->mapping) == NULL) { 2510 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2511 if (ret) { 2512 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2513 cq->qid, cq->mapping.prp1, 2514 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2515 continue; 2516 } 2517 } 2518 2519 if (sq->size) { 2520 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2521 if (ret) { 2522 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2523 sq->qid, sq->mapping.prp1, 2524 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2525 continue; 2526 } 2527 } 2528 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2529 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2530 } 2531 pthread_mutex_unlock(&endpoint->lock); 2532 } 2533 2534 static void 2535 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2536 { 2537 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2538 struct nvmf_vfio_user_sq *sq; 2539 struct nvmf_vfio_user_cq *cq; 2540 void *map_start, *map_end; 2541 int ret = 0; 2542 2543 if (!info->vaddr) { 2544 return; 2545 } 2546 2547 map_start = info->mapping.iov_base; 2548 map_end = info->mapping.iov_base + info->mapping.iov_len; 2549 2550 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2551 (info->mapping.iov_len & MASK_2MB)) { 2552 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2553 info->vaddr, map_start, map_end); 2554 return; 2555 } 2556 2557 assert(endpoint != NULL); 2558 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2559 map_start, map_end); 2560 2561 if (endpoint->ctrlr != NULL) { 2562 struct nvmf_vfio_user_ctrlr *ctrlr; 2563 ctrlr = endpoint->ctrlr; 2564 2565 pthread_mutex_lock(&endpoint->lock); 2566 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2567 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2568 unmap_q(ctrlr, &sq->mapping); 2569 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2570 } 2571 2572 cq = ctrlr->cqs[sq->cqid]; 2573 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2574 unmap_q(ctrlr, &cq->mapping); 2575 } 2576 } 2577 2578 if (ctrlr->sdbl != NULL) { 2579 size_t i; 2580 2581 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2582 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2583 2584 if (iov_base >= map_start && iov_base < map_end) { 2585 copy_doorbells(ctrlr, 2586 ctrlr->sdbl->shadow_doorbells, 2587 ctrlr->bar0_doorbells); 2588 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2589 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2590 ctrlr->sdbl = NULL; 2591 break; 2592 } 2593 } 2594 } 2595 2596 pthread_mutex_unlock(&endpoint->lock); 2597 } 2598 2599 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2600 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2601 if 
(ret) { 2602 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2603 map_start, map_end, ret); 2604 } 2605 } 2606 } 2607 2608 /* Used to initiate a controller-level reset or a controller shutdown. */ 2609 static void 2610 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2611 { 2612 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2613 ctrlr_id(vu_ctrlr)); 2614 2615 /* Unmap Admin queue. */ 2616 2617 assert(vu_ctrlr->sqs[0] != NULL); 2618 assert(vu_ctrlr->cqs[0] != NULL); 2619 2620 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2621 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2622 2623 vu_ctrlr->sqs[0]->size = 0; 2624 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2625 2626 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2627 2628 vu_ctrlr->cqs[0]->size = 0; 2629 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2630 2631 /* 2632 * For PCIe controller reset or shutdown, we will drop all AER 2633 * responses. 2634 */ 2635 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2636 2637 /* Free the shadow doorbell buffer. */ 2638 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2639 vu_ctrlr->sdbl = NULL; 2640 } 2641 2642 /* Used to re-enable the controller after a controller-level reset. */ 2643 static int 2644 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2645 { 2646 int err; 2647 2648 assert(vu_ctrlr != NULL); 2649 2650 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2651 ctrlr_id(vu_ctrlr)); 2652 2653 err = acq_setup(vu_ctrlr); 2654 if (err != 0) { 2655 return err; 2656 } 2657 2658 err = asq_setup(vu_ctrlr); 2659 if (err != 0) { 2660 return err; 2661 } 2662 2663 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2664 2665 return 0; 2666 } 2667 2668 static int 2669 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2670 { 2671 struct nvmf_vfio_user_sq *sq = cb_arg; 2672 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2673 int ret; 2674 2675 assert(sq != NULL); 2676 assert(req != NULL); 2677 2678 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2679 assert(sq->ctrlr != NULL); 2680 assert(req != NULL); 2681 2682 memcpy(req->req.data, 2683 &req->req.rsp->prop_get_rsp.value.u64, 2684 req->req.length); 2685 } else { 2686 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2687 assert(sq->ctrlr != NULL); 2688 vu_ctrlr = sq->ctrlr; 2689 2690 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 2691 union spdk_nvme_cc_register cc, diff; 2692 2693 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2694 diff.raw = cc.raw ^ req->cc.raw; 2695 2696 if (diff.bits.en) { 2697 if (cc.bits.en) { 2698 ret = enable_ctrlr(vu_ctrlr); 2699 if (ret) { 2700 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2701 return ret; 2702 } 2703 vu_ctrlr->reset_shn = false; 2704 } else { 2705 vu_ctrlr->reset_shn = true; 2706 } 2707 } 2708 2709 if (diff.bits.shn) { 2710 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2711 vu_ctrlr->reset_shn = true; 2712 } 2713 } 2714 2715 if (vu_ctrlr->reset_shn) { 2716 disable_ctrlr(vu_ctrlr); 2717 } 2718 } 2719 } 2720 2721 return 0; 2722 } 2723 2724 /* 2725 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2726 * doorbell is written via access_bar0_fn(). 2727 * 2728 * DSTRD is set to fixed value 0 for NVMf. 
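 * With DSTRD 0 every doorbell register is 4 bytes wide: the tail doorbell of
 * SQ y lives at BAR0 offset 0x1000 + (2y * 4) and the head doorbell of CQ y
 * at 0x1000 + ((2y + 1) * 4), i.e. SQ doorbells sit at even dword indices and
 * CQ doorbells at odd ones, which is what the index arithmetic below relies on.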
2729 * 2730 */ 2731 static int 2732 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2733 const size_t count, loff_t pos, const bool is_write) 2734 { 2735 assert(ctrlr != NULL); 2736 assert(buf != NULL); 2737 2738 if (!is_write) { 2739 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2740 ctrlr_id(ctrlr), pos); 2741 errno = EPERM; 2742 return -1; 2743 } 2744 2745 if (count != sizeof(uint32_t)) { 2746 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2747 ctrlr_id(ctrlr), count); 2748 errno = EINVAL; 2749 return -1; 2750 } 2751 2752 pos -= NVME_DOORBELLS_OFFSET; 2753 2754 /* pos must be dword aligned */ 2755 if ((pos & 0x3) != 0) { 2756 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2757 errno = EINVAL; 2758 return -1; 2759 } 2760 2761 /* convert byte offset to array index */ 2762 pos >>= 2; 2763 2764 if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) { 2765 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2766 errno = EINVAL; 2767 return -1; 2768 } 2769 2770 ctrlr->bar0_doorbells[pos] = *buf; 2771 spdk_wmb(); 2772 2773 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2774 ctrlr_id(ctrlr), (pos & 1) ? "cqid" : "sqid", 2775 pos / 2, *buf); 2776 2777 2778 return 0; 2779 } 2780 2781 static size_t 2782 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2783 char *buf, size_t count, loff_t pos, 2784 bool is_write) 2785 { 2786 struct nvmf_vfio_user_req *req; 2787 const struct spdk_nvmf_registers *regs; 2788 2789 if ((count != 4) && (count != 8)) { 2790 errno = EINVAL; 2791 return -1; 2792 } 2793 2794 /* Construct a Fabric Property Get/Set command and send it */ 2795 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2796 if (req == NULL) { 2797 errno = ENOBUFS; 2798 return -1; 2799 } 2800 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2801 req->cc.raw = regs->cc.raw; 2802 2803 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2804 req->cb_arg = vu_ctrlr->sqs[0]; 2805 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2806 req->req.cmd->prop_set_cmd.cid = 0; 2807 if (count == 4) { 2808 req->req.cmd->prop_set_cmd.attrib.size = 0; 2809 } else { 2810 req->req.cmd->prop_set_cmd.attrib.size = 1; 2811 } 2812 req->req.cmd->prop_set_cmd.ofst = pos; 2813 if (is_write) { 2814 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2815 if (req->req.cmd->prop_set_cmd.attrib.size) { 2816 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2817 } else { 2818 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2819 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2820 } 2821 } else { 2822 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2823 } 2824 req->req.length = count; 2825 req->req.data = buf; 2826 2827 spdk_nvmf_request_exec_fabrics(&req->req); 2828 2829 return count; 2830 } 2831 2832 static ssize_t 2833 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2834 bool is_write) 2835 { 2836 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2837 struct nvmf_vfio_user_ctrlr *ctrlr; 2838 int ret; 2839 2840 ctrlr = endpoint->ctrlr; 2841 if (endpoint->need_async_destroy || !ctrlr) { 2842 errno = EIO; 2843 return -1; 2844 } 2845 2846 if (pos >= NVME_DOORBELLS_OFFSET) { 2847 /* 2848 * The fact that the doorbells can be memory mapped doesn't mean 2849 * that the client (VFIO in QEMU) is obliged to memory map them, 2850 * it might still elect to access them via regular read/write; 2851 * we might also have had 
disable_mappable_bar0 set. 2852 */ 2853 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2854 pos, is_write); 2855 if (ret == 0) { 2856 return count; 2857 } 2858 return ret; 2859 } 2860 2861 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 2862 } 2863 2864 static ssize_t 2865 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 2866 bool is_write) 2867 { 2868 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2869 2870 if (is_write) { 2871 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 2872 endpoint_id(endpoint), offset, offset + count); 2873 errno = EINVAL; 2874 return -1; 2875 } 2876 2877 if (offset + count > NVME_REG_CFG_SIZE) { 2878 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 2879 endpoint_id(endpoint), offset, count, 2880 NVME_REG_CFG_SIZE); 2881 errno = ERANGE; 2882 return -1; 2883 } 2884 2885 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 2886 2887 return count; 2888 } 2889 2890 static void 2891 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 2892 { 2893 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2894 2895 if (level >= LOG_DEBUG) { 2896 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2897 } else if (level >= LOG_INFO) { 2898 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2899 } else if (level >= LOG_NOTICE) { 2900 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 2901 } else if (level >= LOG_WARNING) { 2902 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 2903 } else { 2904 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 2905 } 2906 } 2907 2908 static int 2909 vfio_user_get_log_level(void) 2910 { 2911 int level; 2912 2913 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2914 return LOG_DEBUG; 2915 } 2916 2917 level = spdk_log_to_syslog_level(spdk_log_get_level()); 2918 if (level < 0) { 2919 return LOG_ERR; 2920 } 2921 2922 return level; 2923 } 2924 2925 static void 2926 init_pci_config_space(vfu_pci_config_space_t *p) 2927 { 2928 /* MLBAR */ 2929 p->hdr.bars[0].raw = 0x0; 2930 /* MUBAR */ 2931 p->hdr.bars[1].raw = 0x0; 2932 2933 /* vendor specific, let's set them to zero for now */ 2934 p->hdr.bars[3].raw = 0x0; 2935 p->hdr.bars[4].raw = 0x0; 2936 p->hdr.bars[5].raw = 0x0; 2937 2938 /* enable INTx */ 2939 p->hdr.intr.ipin = 0x1; 2940 } 2941 2942 struct ctrlr_quiesce_ctx { 2943 struct nvmf_vfio_user_endpoint *endpoint; 2944 struct nvmf_vfio_user_poll_group *group; 2945 int status; 2946 }; 2947 2948 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 2949 2950 static inline bool 2951 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 2952 { 2953 return spdk_interrupt_mode_is_enabled() && 2954 vu_transport->intr_mode_supported; 2955 } 2956 2957 static void 2958 _vfio_user_endpoint_resume_done_msg(void *ctx) 2959 { 2960 struct nvmf_vfio_user_endpoint *endpoint = ctx; 2961 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2962 2963 endpoint->need_resume = false; 2964 2965 if (!vu_ctrlr) { 2966 return; 2967 } 2968 2969 if (!vu_ctrlr->queued_quiesce) { 2970 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 2971 2972 /* 2973 * We might have ignored new SQ entries while we were quiesced: 2974 * kick ourselves so we'll definitely check again while in 2975 * VFIO_USER_CTRLR_RUNNING state. 
2976 */ 2977 if (in_interrupt_mode(endpoint->transport)) { 2978 ctrlr_kick(vu_ctrlr); 2979 } 2980 return; 2981 } 2982 2983 2984 /* 2985 * Basically, once we call `vfu_device_quiesced` the device is 2986 * unquiesced from libvfio-user's perspective, so from the moment 2987 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device 2988 * again. However, because resuming the NVMf subsystem is an asynchronous 2989 * operation, this quiesce might come _before_ the NVMf subsystem has 2990 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 2991 * need to check whether a quiesce was requested. 2992 */ 2993 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 2994 ctrlr_id(vu_ctrlr)); 2995 ctrlr_quiesce(vu_ctrlr); 2996 } 2997 2998 static void 2999 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 3000 void *cb_arg, int status) 3001 { 3002 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 3003 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3004 3005 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status); 3006 3007 if (!vu_ctrlr) { 3008 return; 3009 } 3010 3011 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 3012 } 3013 3014 static void 3015 vfio_user_quiesce_done(void *ctx) 3016 { 3017 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3018 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3019 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3020 int ret; 3021 3022 if (!vu_ctrlr) { 3023 free(quiesce_ctx); 3024 return; 3025 } 3026 3027 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 3028 3029 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 3030 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3031 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 3032 vu_ctrlr->queued_quiesce = false; 3033 free(quiesce_ctx); 3034 3035 /* `vfu_device_quiesced` can change the migration state, 3036 * so we need to re-check `vu_ctrlr->state`. 3037 */ 3038 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3039 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 3040 return; 3041 } 3042 3043 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 3044 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3045 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3046 vfio_user_endpoint_resume_done, endpoint); 3047 if (ret < 0) { 3048 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3049 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3050 } 3051 } 3052 3053 static void 3054 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3055 void *ctx, int status) 3056 { 3057 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3058 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3059 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3060 3061 if (!vu_ctrlr) { 3062 free(quiesce_ctx); 3063 return; 3064 } 3065 3066 quiesce_ctx->status = status; 3067 3068 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3069 ctrlr_id(vu_ctrlr), status); 3070 3071 spdk_thread_send_msg(vu_ctrlr->thread, 3072 vfio_user_quiesce_done, ctx); 3073 } 3074 3075 /* 3076 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3077 * we've already set ctrlr->state, so we won't process new entries, but we need 3078 * to ensure that this PG is quiesced.
This only works because there's no 3079 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3080 * 3081 * Once we've walked all PGs, we need to pause any submitted I/O via 3082 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3083 */ 3084 static void 3085 vfio_user_quiesce_pg(void *ctx) 3086 { 3087 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3088 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3089 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3090 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3091 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3092 int ret; 3093 3094 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3095 3096 if (!vu_ctrlr) { 3097 free(quiesce_ctx); 3098 return; 3099 } 3100 3101 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3102 if (quiesce_ctx->group != NULL) { 3103 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3104 vfio_user_quiesce_pg, quiesce_ctx); 3105 return; 3106 } 3107 3108 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3109 vfio_user_pause_done, quiesce_ctx); 3110 if (ret < 0) { 3111 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3112 endpoint_id(endpoint), ret); 3113 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3114 fail_ctrlr(vu_ctrlr); 3115 free(quiesce_ctx); 3116 } 3117 } 3118 3119 static void 3120 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3121 { 3122 struct ctrlr_quiesce_ctx *quiesce_ctx; 3123 3124 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3125 3126 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3127 if (!quiesce_ctx) { 3128 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3129 assert(false); 3130 return; 3131 } 3132 3133 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3134 quiesce_ctx->status = 0; 3135 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3136 3137 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3138 vfio_user_quiesce_pg, quiesce_ctx); 3139 } 3140 3141 static int 3142 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3143 { 3144 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3145 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3146 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3147 3148 if (!vu_ctrlr) { 3149 return 0; 3150 } 3151 3152 /* NVMf library will destruct controller when no 3153 * connected queue pairs. 3154 */ 3155 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3156 return 0; 3157 } 3158 3159 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3160 3161 /* There is no race condition here as device quiesce callback 3162 * and nvmf_prop_set_cc() are running in the same thread context. 
3163 */ 3164 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3165 return 0; 3166 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3167 return 0; 3168 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3169 return 0; 3170 } 3171 3172 switch (vu_ctrlr->state) { 3173 case VFIO_USER_CTRLR_PAUSED: 3174 case VFIO_USER_CTRLR_MIGRATING: 3175 return 0; 3176 case VFIO_USER_CTRLR_RUNNING: 3177 ctrlr_quiesce(vu_ctrlr); 3178 break; 3179 case VFIO_USER_CTRLR_RESUMING: 3180 vu_ctrlr->queued_quiesce = true; 3181 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3182 vu_ctrlr->state); 3183 break; 3184 default: 3185 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3186 break; 3187 } 3188 3189 errno = EBUSY; 3190 return -1; 3191 } 3192 3193 static void 3194 vfio_user_ctrlr_dump_migr_data(const char *name, 3195 struct vfio_user_nvme_migr_state *migr_data, 3196 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3197 { 3198 struct spdk_nvmf_registers *regs; 3199 struct nvme_migr_sq_state *sq; 3200 struct nvme_migr_cq_state *cq; 3201 uint32_t *doorbell_base; 3202 uint32_t i; 3203 3204 SPDK_NOTICELOG("Dump %s\n", name); 3205 3206 regs = &migr_data->nvmf_data.regs; 3207 doorbell_base = (uint32_t *)&migr_data->doorbells; 3208 3209 SPDK_NOTICELOG("Registers\n"); 3210 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3211 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3212 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3213 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3214 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3215 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3216 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3217 3218 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3219 3220 if (sdbl != NULL) { 3221 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3222 migr_data->ctrlr_header.shadow_doorbell_buffer); 3223 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3224 migr_data->ctrlr_header.eventidx_buffer); 3225 } 3226 3227 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3228 sq = &migr_data->qps[i].sq; 3229 cq = &migr_data->qps[i].cq; 3230 3231 if (sq->size) { 3232 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3233 if (i > 0 && sdbl != NULL) { 3234 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3235 sq->sqid, 3236 sdbl->shadow_doorbells[queue_index(i, false)], 3237 sdbl->eventidxs[queue_index(i, false)]); 3238 } 3239 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3240 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3241 } 3242 3243 if (cq->size) { 3244 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3245 if (i > 0 && sdbl != NULL) { 3246 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3247 cq->cqid, 3248 sdbl->shadow_doorbells[queue_index(i, true)], 3249 sdbl->eventidxs[queue_index(i, true)]); 3250 } 3251 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3252 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3253 } 3254 } 3255 3256 SPDK_NOTICELOG("%s Dump Done\n", name); 3257 } 3258 3259 /* Read region 9 content and restore it to migration data structures */ 3260 static int 3261 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3262 struct vfio_user_nvme_migr_state *migr_state) 3263 { 3264 void *data_ptr = endpoint->migr_data; 3265 3266 /* Load vfio_user_nvme_migr_header first */ 3267 
memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3268 /* TODO: version check */ 3269 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3270 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3271 return -EINVAL; 3272 } 3273 3274 /* Load nvmf controller data */ 3275 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3276 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3277 3278 /* Load queue pairs */ 3279 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3280 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3281 3282 /* Load doorbells */ 3283 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3284 memcpy(&migr_state->doorbells, data_ptr, 3285 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3286 3287 /* Load CFG */ 3288 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3289 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3290 3291 return 0; 3292 } 3293 3294 3295 static void 3296 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3297 { 3298 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3299 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3300 struct nvmf_vfio_user_sq *sq; 3301 struct nvmf_vfio_user_cq *cq; 3302 uint64_t data_offset; 3303 void *data_ptr; 3304 uint32_t *doorbell_base; 3305 uint32_t i = 0; 3306 uint16_t sqid, cqid; 3307 struct vfio_user_nvme_migr_state migr_state = { 3308 .nvmf_data = { 3309 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3310 .regs_size = sizeof(struct spdk_nvmf_registers), 3311 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3312 } 3313 }; 3314 3315 /* Save all data to vfio_user_nvme_migr_state first, then we will 3316 * copy it to device migration region at last. 
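 * The order written out below is: nvmf controller data, queue pairs, BAR0
 * doorbells, PCI config space, and the header itself last, so that the
 * offsets and lengths recorded in the header are complete when it is written.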
3317 */ 3318 3319 /* save magic number */ 3320 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3321 3322 /* save controller data */ 3323 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3324 3325 /* save connected queue pairs */ 3326 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3327 /* save sq */ 3328 sqid = sq->qid; 3329 migr_state.qps[sqid].sq.sqid = sq->qid; 3330 migr_state.qps[sqid].sq.cqid = sq->cqid; 3331 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3332 migr_state.qps[sqid].sq.size = sq->size; 3333 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3334 3335 /* save cq, for shared cq case, cq may be saved multiple times */ 3336 cqid = sq->cqid; 3337 cq = vu_ctrlr->cqs[cqid]; 3338 migr_state.qps[cqid].cq.cqid = cqid; 3339 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3340 migr_state.qps[cqid].cq.ien = cq->ien; 3341 migr_state.qps[cqid].cq.iv = cq->iv; 3342 migr_state.qps[cqid].cq.size = cq->size; 3343 migr_state.qps[cqid].cq.phase = cq->phase; 3344 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3345 i++; 3346 } 3347 3348 assert(i > 0); 3349 migr_state.ctrlr_header.num_io_queues = i - 1; 3350 3351 /* Save doorbells */ 3352 doorbell_base = (uint32_t *)&migr_state.doorbells; 3353 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3354 3355 /* Save PCI configuration space */ 3356 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3357 3358 /* Save all data to device migration region */ 3359 data_ptr = endpoint->migr_data; 3360 3361 /* Copy nvmf controller data */ 3362 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3363 data_ptr += data_offset; 3364 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3365 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3366 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3367 3368 /* Copy queue pairs */ 3369 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3370 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3371 migr_state.ctrlr_header.qp_offset = data_offset; 3372 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3373 struct nvme_migr_cq_state)); 3374 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3375 3376 /* Copy doorbells */ 3377 data_offset += migr_state.ctrlr_header.qp_len; 3378 data_ptr += migr_state.ctrlr_header.qp_len; 3379 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3380 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3381 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3382 3383 /* Copy CFG */ 3384 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3385 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3386 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3387 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3388 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3389 3390 /* copy shadow doorbells */ 3391 if (vu_ctrlr->sdbl != NULL) { 3392 migr_state.ctrlr_header.sdbl = true; 3393 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3394 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3395 } 3396 3397 /* Copy nvme migration header finally */ 3398 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3399 3400 if 
(SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3401 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3402 } 3403 } 3404 3405 /* 3406 * If we are about to close the connection, we need to unregister the interrupt, 3407 * as the library will subsequently close the file descriptor we registered. 3408 */ 3409 static int 3410 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3411 { 3412 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3413 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3414 3415 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3416 3417 if (type == VFU_RESET_LOST_CONN) { 3418 if (ctrlr != NULL) { 3419 spdk_interrupt_unregister(&ctrlr->intr); 3420 ctrlr->intr_fd = -1; 3421 } 3422 return 0; 3423 } 3424 3425 /* FIXME: LOST_CONN case ? */ 3426 if (ctrlr->sdbl != NULL) { 3427 free_sdbl(vfu_ctx, ctrlr->sdbl); 3428 ctrlr->sdbl = NULL; 3429 } 3430 3431 /* FIXME: much more needed here. */ 3432 3433 return 0; 3434 } 3435 3436 static int 3437 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3438 struct vfio_user_nvme_migr_state *migr_state) 3439 { 3440 uint32_t i, qsize = 0; 3441 uint16_t sqid, cqid; 3442 struct vfio_user_nvme_migr_qp migr_qp; 3443 void *addr; 3444 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3445 int ret; 3446 3447 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3448 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3449 } 3450 3451 /* restore submission queues */ 3452 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3453 migr_qp = migr_state->qps[i]; 3454 3455 qsize = migr_qp.sq.size; 3456 if (qsize) { 3457 struct nvmf_vfio_user_sq *sq; 3458 3459 sqid = migr_qp.sq.sqid; 3460 if (sqid != i) { 3461 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3462 return -EINVAL; 3463 } 3464 3465 /* allocate sq if necessary */ 3466 if (vu_ctrlr->sqs[sqid] == NULL) { 3467 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3468 if (ret) { 3469 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3470 return -EFAULT; 3471 } 3472 } 3473 3474 sq = vu_ctrlr->sqs[sqid]; 3475 sq->size = qsize; 3476 3477 ret = alloc_sq_reqs(vu_ctrlr, sq); 3478 if (ret) { 3479 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3480 return -EFAULT; 3481 } 3482 3483 /* restore sq */ 3484 sq->sq_state = VFIO_USER_SQ_CREATED; 3485 sq->cqid = migr_qp.sq.cqid; 3486 *sq_headp(sq) = migr_qp.sq.head; 3487 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3488 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3489 sq->mapping.prp1, sq->size * 64, 3490 sq->mapping.sg, &sq->mapping.iov, 3491 PROT_READ); 3492 if (addr == NULL) { 3493 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3494 sqid, sq->mapping.prp1, sq->size); 3495 return -EFAULT; 3496 } 3497 cqs_ref[sq->cqid]++; 3498 } 3499 } 3500 3501 /* restore completion queues */ 3502 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3503 migr_qp = migr_state->qps[i]; 3504 3505 qsize = migr_qp.cq.size; 3506 if (qsize) { 3507 struct nvmf_vfio_user_cq *cq; 3508 3509 /* restore cq */ 3510 cqid = migr_qp.sq.cqid; 3511 assert(cqid == i); 3512 3513 /* allocate cq if necessary */ 3514 if (vu_ctrlr->cqs[cqid] == NULL) { 3515 ret = init_cq(vu_ctrlr, cqid); 3516 if (ret) { 3517 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3518 return -EFAULT; 3519 } 3520 } 3521 3522 cq = vu_ctrlr->cqs[cqid]; 3523 3524 cq->size = qsize; 3525 3526 cq->cq_state = VFIO_USER_CQ_CREATED; 3527 cq->cq_ref = 
cqs_ref[cqid]; 3528 *cq_tailp(cq) = migr_qp.cq.tail; 3529 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3530 cq->ien = migr_qp.cq.ien; 3531 cq->iv = migr_qp.cq.iv; 3532 cq->phase = migr_qp.cq.phase; 3533 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3534 cq->mapping.prp1, cq->size * 16, 3535 cq->mapping.sg, &cq->mapping.iov, 3536 PROT_READ | PROT_WRITE); 3537 if (addr == NULL) { 3538 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3539 cqid, cq->mapping.prp1, cq->size); 3540 return -EFAULT; 3541 } 3542 } 3543 } 3544 3545 return 0; 3546 } 3547 3548 static int 3549 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3550 { 3551 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3552 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3553 uint32_t *doorbell_base; 3554 struct spdk_nvme_cmd cmd; 3555 uint16_t i; 3556 int rc = 0; 3557 struct vfio_user_nvme_migr_state migr_state = { 3558 .nvmf_data = { 3559 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3560 .regs_size = sizeof(struct spdk_nvmf_registers), 3561 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3562 } 3563 }; 3564 3565 assert(endpoint->migr_data != NULL); 3566 assert(ctrlr != NULL); 3567 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3568 if (rc) { 3569 return rc; 3570 } 3571 3572 /* restore shadow doorbells */ 3573 if (migr_state.ctrlr_header.sdbl) { 3574 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3575 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3576 migr_state.ctrlr_header.shadow_doorbell_buffer, 3577 migr_state.ctrlr_header.eventidx_buffer, 3578 memory_page_size(vu_ctrlr)); 3579 if (sdbl == NULL) { 3580 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3581 ctrlr_id(vu_ctrlr)); 3582 return -1; 3583 } 3584 3585 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3586 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3587 3588 SWAP(vu_ctrlr->sdbl, sdbl); 3589 } 3590 3591 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3592 if (rc) { 3593 return rc; 3594 } 3595 3596 /* restore PCI configuration space */ 3597 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3598 3599 doorbell_base = (uint32_t *)&migr_state.doorbells; 3600 /* restore doorbells from saved registers */ 3601 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3602 3603 /* restore nvmf controller data */ 3604 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3605 if (rc) { 3606 return rc; 3607 } 3608 3609 /* resubmit pending AERs */ 3610 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) { 3611 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3612 migr_state.nvmf_data.aer_cids[i]); 3613 memset(&cmd, 0, sizeof(cmd)); 3614 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3615 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3616 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3617 if (rc) { 3618 break; 3619 } 3620 } 3621 3622 return rc; 3623 } 3624 3625 static void 3626 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3627 { 3628 uint32_t i; 3629 struct nvmf_vfio_user_sq *sq; 3630 3631 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
*/ 3632 3633 if (vu_ctrlr->sqs[0] != NULL) { 3634 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3635 queue_index(0, false); 3636 } 3637 3638 if (vu_ctrlr->cqs[0] != NULL) { 3639 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3640 queue_index(0, true); 3641 } 3642 3643 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3644 3645 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3646 sq = vu_ctrlr->sqs[i]; 3647 if (!sq || !sq->size) { 3648 continue; 3649 } 3650 3651 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3652 /* ADMIN queue pair is always in the poll group, just enable it */ 3653 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3654 } else { 3655 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3656 } 3657 } 3658 } 3659 3660 /* 3661 * We are in stop-and-copy state, but still potentially have some current dirty 3662 * sgls: while we're quiesced and thus should have no active requests, we still 3663 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3664 * mapped read only). 3665 * 3666 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3667 * mark them dirty now. 3668 */ 3669 static void 3670 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3671 { 3672 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3673 3674 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3675 3676 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3677 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3678 3679 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3680 continue; 3681 } 3682 3683 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3684 } 3685 3686 if (vu_ctrlr->sdbl != NULL) { 3687 dma_sg_t *sg; 3688 size_t i; 3689 3690 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3691 ++i) { 3692 3693 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3694 continue; 3695 } 3696 3697 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3698 3699 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3700 } 3701 } 3702 } 3703 3704 static int 3705 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3706 { 3707 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3708 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3709 struct nvmf_vfio_user_sq *sq; 3710 int ret = 0; 3711 3712 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3713 vu_ctrlr->state, state); 3714 3715 switch (state) { 3716 case VFU_MIGR_STATE_STOP_AND_COPY: 3717 vu_ctrlr->in_source_vm = true; 3718 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3719 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3720 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3721 break; 3722 case VFU_MIGR_STATE_STOP: 3723 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3724 /* The controller associated with the source VM is dead now; we will resume 3725 * the subsystem after destroying the controller data structure, and then the 3726 * subsystem can be re-used for another new client. 3727 */ 3728 if (vu_ctrlr->in_source_vm) { 3729 endpoint->need_resume = true; 3730 } 3731 break; 3732 case VFU_MIGR_STATE_PRE_COPY: 3733 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3734 break; 3735 case VFU_MIGR_STATE_RESUME: 3736 /* 3737 * The destination ADMIN queue pair is connected when starting the VM, 3738 * but it isn't enabled in the destination VM yet, so the poll 3739 * group will do nothing with the ADMIN queue pair for now.
3740 */ 3741 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3742 break; 3743 } 3744 3745 assert(!vu_ctrlr->in_source_vm); 3746 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3747 3748 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3749 assert(sq != NULL); 3750 assert(sq->qpair.qid == 0); 3751 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3752 3753 /* Free ADMIN SQ resources first, SQ resources will be 3754 * allocated based on queue size from source VM. 3755 */ 3756 free_sq_reqs(sq); 3757 sq->size = 0; 3758 break; 3759 case VFU_MIGR_STATE_RUNNING: 3760 3761 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3762 break; 3763 } 3764 3765 if (!vu_ctrlr->in_source_vm) { 3766 /* Restore destination VM from BAR9 */ 3767 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3768 if (ret) { 3769 break; 3770 } 3771 3772 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3773 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3774 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3775 /* FIXME where do we resume nvmf? */ 3776 } else { 3777 /* Rollback source VM */ 3778 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3779 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3780 vfio_user_endpoint_resume_done, endpoint); 3781 if (ret < 0) { 3782 /* TODO: fail controller with CFS bit set */ 3783 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3784 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3785 } 3786 } 3787 vu_ctrlr->migr_data_prepared = false; 3788 vu_ctrlr->in_source_vm = false; 3789 break; 3790 3791 default: 3792 return -EINVAL; 3793 } 3794 3795 return ret; 3796 } 3797 3798 static uint64_t 3799 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3800 { 3801 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3802 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3803 uint64_t pending_bytes; 3804 3805 if (ctrlr->migr_data_prepared) { 3806 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3807 pending_bytes = 0; 3808 } else { 3809 pending_bytes = vfio_user_migr_data_len(); 3810 } 3811 3812 SPDK_DEBUGLOG(nvmf_vfio, 3813 "%s current state %u, pending bytes 0x%"PRIx64"\n", 3814 endpoint_id(endpoint), ctrlr->state, pending_bytes); 3815 3816 return pending_bytes; 3817 } 3818 3819 static int 3820 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 3821 { 3822 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3823 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3824 3825 /* 3826 * When transitioning to pre-copy state we set pending_bytes to 0, 3827 * so the vfio-user client shouldn't attempt to read any migration 3828 * data. This is not yet guaranteed by libvfio-user. 
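 * Hence the check below: if the controller is not yet in the MIGRATING
 * state we simply report a zero-sized data section.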
3829 */ 3830 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3831 assert(size != NULL); 3832 *offset = 0; 3833 *size = 0; 3834 return 0; 3835 } 3836 3837 if (ctrlr->in_source_vm) { /* migration source */ 3838 assert(size != NULL); 3839 *size = vfio_user_migr_data_len(); 3840 vfio_user_migr_ctrlr_save_data(ctrlr); 3841 } else { /* migration destination */ 3842 assert(size == NULL); 3843 assert(!ctrlr->migr_data_prepared); 3844 } 3845 *offset = 0; 3846 ctrlr->migr_data_prepared = true; 3847 3848 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 3849 3850 return 0; 3851 } 3852 3853 static ssize_t 3854 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3855 void *buf __attribute__((unused)), 3856 uint64_t count __attribute__((unused)), 3857 uint64_t offset __attribute__((unused))) 3858 { 3859 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 3860 endpoint_id(vfu_get_private(vfu_ctx))); 3861 errno = ENOTSUP; 3862 return -1; 3863 } 3864 3865 static ssize_t 3866 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3867 void *buf __attribute__((unused)), 3868 uint64_t count __attribute__((unused)), 3869 uint64_t offset __attribute__((unused))) 3870 { 3871 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 3872 endpoint_id(vfu_get_private(vfu_ctx))); 3873 errno = ENOTSUP; 3874 return -1; 3875 } 3876 3877 static int 3878 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3879 uint64_t count) 3880 { 3881 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 3882 3883 if (count != vfio_user_migr_data_len()) { 3884 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 3885 endpoint_id(vfu_get_private(vfu_ctx)), count); 3886 errno = EINVAL; 3887 return -1; 3888 } 3889 3890 return 0; 3891 } 3892 3893 static int 3894 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 3895 struct nvmf_vfio_user_endpoint *endpoint) 3896 { 3897 int ret; 3898 ssize_t cap_offset; 3899 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 3900 struct iovec migr_sparse_mmap = {}; 3901 3902 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 3903 struct pxcap pxcap = { 3904 .hdr.id = PCI_CAP_ID_EXP, 3905 .pxcaps.ver = 0x2, 3906 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 3907 .pxdcap2.ctds = 0x1 3908 }; 3909 3910 struct msixcap msixcap = { 3911 .hdr.id = PCI_CAP_ID_MSIX, 3912 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 3913 .mtab = {.tbir = 0x4, .to = 0x0}, 3914 .mpba = {.pbir = 0x5, .pbao = 0x0} 3915 }; 3916 3917 struct iovec sparse_mmap[] = { 3918 { 3919 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 3920 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 3921 }, 3922 }; 3923 3924 const vfu_migration_callbacks_t migr_callbacks = { 3925 .version = VFU_MIGR_CALLBACKS_VERS, 3926 .transition = &vfio_user_migration_device_state_transition, 3927 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 3928 .prepare_data = &vfio_user_migration_prepare_data, 3929 .read_data = &vfio_user_migration_read_data, 3930 .data_written = &vfio_user_migration_data_written, 3931 .write_data = &vfio_user_migration_write_data 3932 }; 3933 3934 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 3935 if (ret < 0) { 3936 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 3937 return ret; 3938 } 3939 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 3940 /* 3941 * 0x02, controller uses the NVM Express programming interface 3942 * 0x08, 
non-volatile memory controller 3943 * 0x01, mass storage controller 3944 */ 3945 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 3946 3947 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 3948 if (cap_offset < 0) { 3949 SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx); 3950 return cap_offset; 3951 } 3952 3953 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 3954 if (cap_offset < 0) { 3955 SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx); 3956 return cap_offset; 3957 } 3958 3959 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 3960 if (cap_offset < 0) { 3961 SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx); 3962 return cap_offset; 3963 } 3964 3965 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 3966 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3967 if (ret < 0) { 3968 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 3969 return ret; 3970 } 3971 3972 if (vu_transport->transport_opts.disable_mappable_bar0) { 3973 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3974 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3975 NULL, 0, -1, 0); 3976 } else { 3977 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3978 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3979 sparse_mmap, 1, endpoint->devmem_fd, 0); 3980 } 3981 3982 if (ret < 0) { 3983 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 3984 return ret; 3985 } 3986 3987 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 3988 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3989 if (ret < 0) { 3990 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 3991 return ret; 3992 } 3993 3994 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 3995 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3996 if (ret < 0) { 3997 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 3998 return ret; 3999 } 4000 4001 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 4002 if (ret < 0) { 4003 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 4004 return ret; 4005 } 4006 4007 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 4008 if (ret < 0) { 4009 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 4010 return ret; 4011 } 4012 4013 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 4014 if (ret < 0) { 4015 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 4016 return ret; 4017 } 4018 4019 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 4020 if (ret < 0) { 4021 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 4022 return ret; 4023 } 4024 4025 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 4026 4027 migr_sparse_mmap.iov_base = (void *)4096; 4028 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 4029 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 4030 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 4031 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 4032 1, endpoint->migr_fd, 0); 4033 if (ret < 0) { 4034 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 4035 return ret; 4036 } 4037 4038 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 4039 vfu_get_migr_register_area_size()); 4040 if (ret < 0) { 4041 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 4042 return ret; 4043 }
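/* All PCI capabilities, BAR regions, DMA/reset/quiesce callbacks and the migration region are now registered; realizing the context below finalizes the device configuration so that a vfio-user client can attach to it afterwards (see nvmf_vfio_user_accept()). */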
4044 4045 ret = vfu_realize_ctx(vfu_ctx); 4046 if (ret < 0) { 4047 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4048 return ret; 4049 } 4050 4051 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4052 assert(endpoint->pci_config_space != NULL); 4053 init_pci_config_space(endpoint->pci_config_space); 4054 4055 assert(cap_offset != 0); 4056 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4057 4058 return 0; 4059 } 4060 4061 static int nvmf_vfio_user_accept(void *ctx); 4062 4063 static void 4064 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4065 { 4066 /* Nothing for us to do here. */ 4067 } 4068 4069 /* 4070 * Register an "accept" poller: this is polling for incoming vfio-user socket 4071 * connections (on the listening socket). 4072 * 4073 * We need to do this on first listening, and also after destroying a 4074 * controller, so we can accept another connection. 4075 */ 4076 static int 4077 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4078 { 4079 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4080 4081 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4082 4083 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4084 endpoint, poll_rate_us); 4085 4086 if (!endpoint->accept_poller) { 4087 return -1; 4088 } 4089 4090 endpoint->accept_thread = spdk_get_thread(); 4091 endpoint->need_relisten = false; 4092 4093 if (!spdk_interrupt_mode_is_enabled()) { 4094 return 0; 4095 } 4096 4097 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4098 assert(endpoint->accept_intr_fd != -1); 4099 4100 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4101 nvmf_vfio_user_accept, endpoint); 4102 4103 assert(endpoint->accept_intr != NULL); 4104 4105 spdk_poller_register_interrupt(endpoint->accept_poller, 4106 set_intr_mode_noop, NULL); 4107 return 0; 4108 } 4109 4110 static void 4111 _vfio_user_relisten(void *ctx) 4112 { 4113 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4114 4115 vfio_user_register_accept_poller(endpoint); 4116 } 4117 4118 static void 4119 _free_ctrlr(void *ctx) 4120 { 4121 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4122 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4123 4124 free_sdbl(ctrlr->endpoint->vfu_ctx, ctrlr->sdbl); 4125 4126 spdk_interrupt_unregister(&ctrlr->intr); 4127 ctrlr->intr_fd = -1; 4128 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4129 4130 free(ctrlr); 4131 4132 if (endpoint == NULL) { 4133 return; 4134 } 4135 4136 if (endpoint->need_async_destroy) { 4137 nvmf_vfio_user_destroy_endpoint(endpoint); 4138 } else if (endpoint->need_relisten) { 4139 spdk_thread_send_msg(endpoint->accept_thread, 4140 _vfio_user_relisten, endpoint); 4141 } 4142 } 4143 4144 static void 4145 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4146 { 4147 int i; 4148 assert(ctrlr != NULL); 4149 4150 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4151 4152 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4153 free_qp(ctrlr, i); 4154 } 4155 4156 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 4157 } 4158 4159 static int 4160 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4161 struct nvmf_vfio_user_endpoint *endpoint) 4162 { 4163 struct nvmf_vfio_user_ctrlr *ctrlr; 4164 int err = 0; 4165 4166 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4167 4168 /* First, construct a vfio-user CUSTOM transport 
controller */ 4169 ctrlr = calloc(1, sizeof(*ctrlr)); 4170 if (ctrlr == NULL) { 4171 err = -ENOMEM; 4172 goto out; 4173 } 4174 /* We can only support one connection for now */ 4175 ctrlr->cntlid = 0x1; 4176 ctrlr->intr_fd = -1; 4177 ctrlr->transport = transport; 4178 ctrlr->endpoint = endpoint; 4179 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4180 TAILQ_INIT(&ctrlr->connected_sqs); 4181 4182 ctrlr->adaptive_irqs_enabled = 4183 !transport->transport_opts.disable_adaptive_irq; 4184 4185 /* Then, construct an admin queue pair */ 4186 err = init_sq(ctrlr, &transport->transport, 0); 4187 if (err != 0) { 4188 free(ctrlr); 4189 goto out; 4190 } 4191 4192 err = init_cq(ctrlr, 0); 4193 if (err != 0) { 4194 free(ctrlr); 4195 goto out; 4196 } 4197 4198 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4199 4200 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4201 if (err != 0) { 4202 free(ctrlr); 4203 goto out; 4204 } 4205 endpoint->ctrlr = ctrlr; 4206 4207 /* Notify the generic layer about the new admin queue pair */ 4208 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4209 4210 out: 4211 if (err != 0) { 4212 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4213 endpoint_id(endpoint), strerror(-err)); 4214 } 4215 4216 return err; 4217 } 4218 4219 static int 4220 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4221 const struct spdk_nvme_transport_id *trid, 4222 struct spdk_nvmf_listen_opts *listen_opts) 4223 { 4224 struct nvmf_vfio_user_transport *vu_transport; 4225 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4226 char path[PATH_MAX] = {}; 4227 char uuid[PATH_MAX] = {}; 4228 int ret; 4229 4230 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4231 transport); 4232 4233 pthread_mutex_lock(&vu_transport->lock); 4234 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4235 /* Only compare traddr */ 4236 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4237 pthread_mutex_unlock(&vu_transport->lock); 4238 return -EEXIST; 4239 } 4240 } 4241 pthread_mutex_unlock(&vu_transport->lock); 4242 4243 endpoint = calloc(1, sizeof(*endpoint)); 4244 if (!endpoint) { 4245 return -ENOMEM; 4246 } 4247 4248 pthread_mutex_init(&endpoint->lock, NULL); 4249 endpoint->devmem_fd = -1; 4250 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4251 endpoint->transport = vu_transport; 4252 4253 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4254 if (ret < 0 || ret >= PATH_MAX) { 4255 SPDK_ERRLOG("%s: failed to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4256 ret = -1; 4257 goto out; 4258 } 4259 4260 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4261 if (ret == -1) { 4262 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4263 endpoint_id(endpoint), path, spdk_strerror(errno)); 4264 goto out; 4265 } 4266 unlink(path); 4267 4268 endpoint->devmem_fd = ret; 4269 ret = ftruncate(endpoint->devmem_fd, 4270 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4271 if (ret != 0) { 4272 SPDK_ERRLOG("%s: failed to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4273 spdk_strerror(errno)); 4274 goto out; 4275 } 4276 4277 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4278 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4279 if (endpoint->bar0_doorbells == MAP_FAILED) { 4280 SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, 
spdk_strerror(errno)); 4281 endpoint->bar0_doorbells = NULL; 4282 ret = -1; 4283 goto out; 4284 } 4285 4286 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4287 if (ret < 0 || ret >= PATH_MAX) { 4288 SPDK_ERRLOG("%s: failed to get migration file path: %s.\n", endpoint_id(endpoint), 4289 spdk_strerror(errno)); 4290 ret = -1; 4291 goto out; 4292 } 4293 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4294 if (ret == -1) { 4295 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4296 endpoint_id(endpoint), path, spdk_strerror(errno)); 4297 goto out; 4298 } 4299 unlink(path); 4300 4301 endpoint->migr_fd = ret; 4302 ret = ftruncate(endpoint->migr_fd, 4303 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4304 if (ret != 0) { 4305 SPDK_ERRLOG("%s: failed to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4306 spdk_strerror(errno)); 4307 goto out; 4308 } 4309 4310 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4311 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4312 if (endpoint->migr_data == MAP_FAILED) { 4313 SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4314 endpoint->migr_data = NULL; 4315 ret = -1; 4316 goto out; 4317 } 4318 4319 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 4320 if (ret < 0 || ret >= PATH_MAX) { 4321 SPDK_ERRLOG("%s: failed to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 4322 ret = -1; 4323 goto out; 4324 } 4325 4326 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 4327 endpoint, VFU_DEV_TYPE_PCI); 4328 if (endpoint->vfu_ctx == NULL) { 4329 SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n", 4330 endpoint_id(endpoint)); 4331 ret = -1; 4332 goto out; 4333 } 4334 4335 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, 4336 vfio_user_get_log_level()); 4337 if (ret < 0) { 4338 goto out; 4339 } 4340 4341 4342 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 4343 if (ret < 0) { 4344 goto out; 4345 } 4346 4347 ret = vfio_user_register_accept_poller(endpoint); 4348 4349 if (ret != 0) { 4350 goto out; 4351 } 4352 4353 pthread_mutex_lock(&vu_transport->lock); 4354 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 4355 pthread_mutex_unlock(&vu_transport->lock); 4356 4357 out: 4358 if (ret != 0) { 4359 nvmf_vfio_user_destroy_endpoint(endpoint); 4360 } 4361 4362 return ret; 4363 } 4364 4365 static void 4366 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 4367 const struct spdk_nvme_transport_id *trid) 4368 { 4369 struct nvmf_vfio_user_transport *vu_transport; 4370 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4371 4372 assert(trid != NULL); 4373 assert(trid->traddr != NULL); 4374 4375 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 4376 4377 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4378 transport); 4379 4380 pthread_mutex_lock(&vu_transport->lock); 4381 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4382 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 4383 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 4384 /* Defer freeing endpoint resources until the controller 4385 * is freed. There are two cases when we get here: 4386 * 1. the nvmf target is killed while a VM is connected 4387 * 2. the listener is removed via an RPC call 4388 * In both cases the nvmf library will disconnect all queue pairs. 
4389 */ 4390 if (endpoint->ctrlr) { 4391 assert(!endpoint->need_async_destroy); 4392 endpoint->need_async_destroy = true; 4393 pthread_mutex_unlock(&vu_transport->lock); 4394 return; 4395 } 4396 4397 nvmf_vfio_user_destroy_endpoint(endpoint); 4398 pthread_mutex_unlock(&vu_transport->lock); 4399 return; 4400 } 4401 } 4402 pthread_mutex_unlock(&vu_transport->lock); 4403 4404 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4405 } 4406 4407 static void 4408 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4409 struct spdk_nvmf_subsystem *subsystem, 4410 struct spdk_nvmf_ctrlr_data *cdata) 4411 { 4412 struct nvmf_vfio_user_transport *vu_transport; 4413 4414 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4415 4416 cdata->vid = SPDK_PCI_VID_NUTANIX; 4417 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4418 cdata->ieee[0] = 0x8d; 4419 cdata->ieee[1] = 0x6b; 4420 cdata->ieee[2] = 0x50; 4421 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4422 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4423 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4424 /* libvfio-user can only support 1 connection for now */ 4425 cdata->oncs.reservations = 0; 4426 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4427 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4428 } 4429 4430 static int 4431 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4432 const struct spdk_nvmf_subsystem *subsystem, 4433 const struct spdk_nvme_transport_id *trid) 4434 { 4435 struct nvmf_vfio_user_transport *vu_transport; 4436 struct nvmf_vfio_user_endpoint *endpoint; 4437 4438 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4439 4440 pthread_mutex_lock(&vu_transport->lock); 4441 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4442 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4443 break; 4444 } 4445 } 4446 pthread_mutex_unlock(&vu_transport->lock); 4447 4448 if (endpoint == NULL) { 4449 return -ENOENT; 4450 } 4451 4452 /* Drop const - we will later need to pause/unpause. */ 4453 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4454 4455 return 0; 4456 } 4457 4458 /* 4459 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4460 * frequency. 4461 * 4462 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4463 * if we don't currently have a controller set up, peek to see if the socket is 4464 * able to accept a new connection. 4465 */ 4466 static int 4467 nvmf_vfio_user_accept(void *ctx) 4468 { 4469 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4470 struct nvmf_vfio_user_transport *vu_transport; 4471 int err; 4472 4473 vu_transport = endpoint->transport; 4474 4475 if (endpoint->ctrlr != NULL) { 4476 return SPDK_POLLER_IDLE; 4477 } 4478 4479 /* While we're here, the controller is already destroyed, 4480 * subsystem may still be in RESUMING state, we will wait 4481 * until the subsystem is in RUNNING state. 
4482 */ 4483 if (endpoint->need_resume) { 4484 return SPDK_POLLER_IDLE; 4485 } 4486 4487 err = vfu_attach_ctx(endpoint->vfu_ctx); 4488 if (err == 0) { 4489 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4490 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4491 if (err == 0) { 4492 /* 4493 * Unregister ourselves: now we've accepted a 4494 * connection, there is nothing for us to poll for, and 4495 * we will poll the connection via vfu_run_ctx() 4496 * instead. 4497 */ 4498 spdk_interrupt_unregister(&endpoint->accept_intr); 4499 spdk_poller_unregister(&endpoint->accept_poller); 4500 } 4501 return SPDK_POLLER_BUSY; 4502 } 4503 4504 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4505 return SPDK_POLLER_IDLE; 4506 } 4507 4508 return SPDK_POLLER_BUSY; 4509 } 4510 4511 static void 4512 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4513 struct spdk_nvme_transport_id *trid, 4514 struct spdk_nvmf_discovery_log_page_entry *entry) 4515 { } 4516 4517 static struct spdk_nvmf_transport_poll_group * 4518 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4519 struct spdk_nvmf_poll_group *group) 4520 { 4521 struct nvmf_vfio_user_transport *vu_transport; 4522 struct nvmf_vfio_user_poll_group *vu_group; 4523 4524 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4525 4526 vu_group = calloc(1, sizeof(*vu_group)); 4527 if (vu_group == NULL) { 4528 SPDK_ERRLOG("Error allocating poll group: %m"); 4529 return NULL; 4530 } 4531 4532 TAILQ_INIT(&vu_group->sqs); 4533 4534 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4535 transport); 4536 pthread_mutex_lock(&vu_transport->pg_lock); 4537 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4538 if (vu_transport->next_pg == NULL) { 4539 vu_transport->next_pg = vu_group; 4540 } 4541 pthread_mutex_unlock(&vu_transport->pg_lock); 4542 4543 if (!spdk_interrupt_mode_is_enabled()) { 4544 return &vu_group->group; 4545 } 4546 4547 /* 4548 * Only allow the poll group to work in interrupt mode if the transport 4549 * supports it. It's our responsibility to register the actual interrupt 4550 * later (in handle_queue_connect_rsp()) that processes everything in 4551 * the poll group: for us, that's the libvfio-user context, and the 4552 * actual qpairs. 4553 * 4554 * Note that this only works in the case that nothing else shares the 4555 * spdk_nvmf_poll_group. 4556 * 4557 * If not supported, this will effectively always wake up to poll the 4558 * poll group. 
4559 */ 4560 4561 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4562 transport); 4563 4564 if (!vu_transport->intr_mode_supported) { 4565 SPDK_WARNLOG("vfio-user interrupt mode not supported\n"); 4566 return &vu_group->group; 4567 } 4568 4569 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4570 NULL); 4571 4572 return &vu_group->group; 4573 } 4574 4575 static struct spdk_nvmf_transport_poll_group * 4576 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4577 { 4578 struct nvmf_vfio_user_transport *vu_transport; 4579 struct nvmf_vfio_user_poll_group **vu_group; 4580 struct nvmf_vfio_user_sq *sq; 4581 struct nvmf_vfio_user_cq *cq; 4582 4583 struct spdk_nvmf_transport_poll_group *result = NULL; 4584 4585 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4586 cq = sq->ctrlr->cqs[sq->cqid]; 4587 assert(cq != NULL); 4588 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4589 4590 pthread_mutex_lock(&vu_transport->pg_lock); 4591 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4592 goto out; 4593 } 4594 4595 if (!nvmf_qpair_is_admin_queue(qpair)) { 4596 /* 4597 * If this is the shared IO CQ case, just return the used CQ's poll 4598 * group, so I/O completions don't have to use 4599 * spdk_thread_send_msg(). 4600 */ 4601 if (cq->group != NULL) { 4602 result = cq->group; 4603 goto out; 4604 } 4605 4606 /* 4607 * If we're in interrupt mode, align all qpairs for a controller 4608 * on the same poll group, to avoid complications in 4609 * vfio_user_ctrlr_intr(). 4610 */ 4611 if (in_interrupt_mode(vu_transport)) { 4612 result = sq->ctrlr->sqs[0]->group; 4613 goto out; 4614 } 4615 4616 } 4617 4618 vu_group = &vu_transport->next_pg; 4619 assert(*vu_group != NULL); 4620 4621 result = &(*vu_group)->group; 4622 *vu_group = TAILQ_NEXT(*vu_group, link); 4623 if (*vu_group == NULL) { 4624 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4625 } 4626 4627 out: 4628 if (cq->group == NULL) { 4629 cq->group = result; 4630 } 4631 4632 pthread_mutex_unlock(&vu_transport->pg_lock); 4633 return result; 4634 } 4635 4636 /* called when process exits */ 4637 static void 4638 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4639 { 4640 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4641 struct nvmf_vfio_user_transport *vu_transport; 4642 4643 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4644 4645 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4646 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4647 transport); 4648 4649 pthread_mutex_lock(&vu_transport->pg_lock); 4650 next_tgroup = TAILQ_NEXT(vu_group, link); 4651 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4652 if (next_tgroup == NULL) { 4653 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4654 } 4655 if (vu_transport->next_pg == vu_group) { 4656 vu_transport->next_pg = next_tgroup; 4657 } 4658 pthread_mutex_unlock(&vu_transport->pg_lock); 4659 4660 free(vu_group); 4661 } 4662 4663 static void 4664 _vfio_user_qpair_disconnect(void *ctx) 4665 { 4666 struct nvmf_vfio_user_sq *sq = ctx; 4667 4668 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4669 } 4670 4671 /* This function is called when the socket connection is destroyed */ 4672 static int 4673 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4674 { 4675 struct nvmf_vfio_user_sq *sq; 4676 struct nvmf_vfio_user_endpoint *endpoint; 4677 4678 
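/* Disconnecting the SQs is deferred to the controller's thread to avoid taking the endpoint lock recursively; once the last connected SQ has been removed (see nvmf_vfio_user_close_qpair()), the controller itself is freed. */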
SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4679 4680 endpoint = ctrlr->endpoint; 4681 assert(endpoint != NULL); 4682 4683 pthread_mutex_lock(&endpoint->lock); 4684 endpoint->need_relisten = true; 4685 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4686 endpoint->ctrlr = NULL; 4687 free_ctrlr(ctrlr); 4688 pthread_mutex_unlock(&endpoint->lock); 4689 return 0; 4690 } 4691 4692 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4693 /* add another round thread poll to avoid recursive endpoint lock */ 4694 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4695 } 4696 pthread_mutex_unlock(&endpoint->lock); 4697 4698 return 0; 4699 } 4700 4701 /* 4702 * Poll for and process any incoming vfio-user messages. 4703 */ 4704 static int 4705 vfio_user_poll_vfu_ctx(void *ctx) 4706 { 4707 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4708 int ret; 4709 4710 assert(ctrlr != NULL); 4711 4712 /* This will call access_bar0_fn() if there are any writes 4713 * to the portion of the BAR that is not mmap'd */ 4714 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4715 if (spdk_unlikely(ret == -1)) { 4716 if (errno == EBUSY) { 4717 return SPDK_POLLER_IDLE; 4718 } 4719 4720 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4721 4722 /* 4723 * We lost the client; the reset callback will already have 4724 * unregistered the interrupt. 4725 */ 4726 if (errno == ENOTCONN) { 4727 vfio_user_destroy_ctrlr(ctrlr); 4728 return SPDK_POLLER_BUSY; 4729 } 4730 4731 /* 4732 * We might not have got a reset callback in this case, so 4733 * explicitly unregister the interrupt here. 4734 */ 4735 spdk_interrupt_unregister(&ctrlr->intr); 4736 ctrlr->intr_fd = -1; 4737 fail_ctrlr(ctrlr); 4738 } 4739 4740 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4741 } 4742 4743 struct vfio_user_post_cpl_ctx { 4744 struct nvmf_vfio_user_ctrlr *ctrlr; 4745 struct nvmf_vfio_user_cq *cq; 4746 struct spdk_nvme_cpl cpl; 4747 }; 4748 4749 static void 4750 _post_completion_msg(void *ctx) 4751 { 4752 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4753 4754 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4755 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4756 free(cpl_ctx); 4757 } 4758 4759 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4760 4761 static int vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group); 4762 4763 /* 4764 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4765 * the SQs assigned to our poll group. 4766 */ 4767 static int 4768 vfio_user_ctrlr_intr(void *ctx) 4769 { 4770 struct nvmf_vfio_user_poll_group *vu_group; 4771 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4772 int ret = 0; 4773 4774 assert(ctrlr != NULL); 4775 assert(ctrlr->sqs[0] != NULL); 4776 assert(ctrlr->sqs[0]->group != NULL); 4777 4778 ctrlr->kick_requested = false; 4779 4780 /* 4781 * Poll vfio-user for this controller. 4782 */ 4783 ret = vfio_user_poll_vfu_ctx(ctrlr); 4784 /* `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, just return 4785 * for this case. 4786 */ 4787 if (ctrlr->sqs[0] == NULL) { 4788 return ret; 4789 } 4790 4791 vu_group = ctrlr_to_poll_group(ctrlr); 4792 4793 /* 4794 * See nvmf_vfio_user_get_optimal_poll_group() for why it's OK to only 4795 * poll this poll group. 4796 * 4797 * Note that this could end up polling other controller's SQs as well 4798 * (since a single poll group can have SQs from multiple separate 4799 * controllers). 
4800 */ 4801 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4802 4803 /* 4804 * Re-arm the event indexes. NB: this also could rearm other 4805 * controller's SQs. 4806 */ 4807 ret |= vfio_user_poll_group_rearm(vu_group); 4808 4809 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4810 } 4811 4812 static void 4813 vfio_user_set_intr_mode(struct spdk_poller *poller, void *arg, 4814 bool interrupt_mode) 4815 { 4816 struct nvmf_vfio_user_ctrlr *ctrlr = arg; 4817 assert(ctrlr != NULL); 4818 assert(ctrlr->endpoint != NULL); 4819 4820 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 4821 ctrlr_id(ctrlr), interrupt_mode); 4822 4823 /* 4824 * interrupt_mode needs to persist across controller resets, so store 4825 * it in the endpoint instead. 4826 */ 4827 ctrlr->endpoint->interrupt_mode = interrupt_mode; 4828 4829 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 4830 } 4831 4832 /* 4833 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 4834 * set up and we can start operating on this controller. 4835 */ 4836 static void 4837 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 4838 struct spdk_nvmf_ctrlr *ctrlr) 4839 { 4840 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 4841 4842 vu_ctrlr->ctrlr = ctrlr; 4843 vu_ctrlr->cntlid = ctrlr->cntlid; 4844 vu_ctrlr->thread = spdk_get_thread(); 4845 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 4846 4847 if (!in_interrupt_mode(endpoint->transport)) { 4848 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4849 vu_ctrlr, 1000); 4850 return; 4851 } 4852 4853 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4854 vu_ctrlr, 0); 4855 4856 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 4857 assert(vu_ctrlr->intr_fd != -1); 4858 4859 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 4860 vfio_user_ctrlr_intr, vu_ctrlr); 4861 4862 assert(vu_ctrlr->intr != NULL); 4863 4864 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 4865 vfio_user_set_intr_mode, 4866 vu_ctrlr); 4867 } 4868 4869 static int 4870 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 4871 { 4872 struct nvmf_vfio_user_poll_group *vu_group; 4873 struct nvmf_vfio_user_sq *sq = cb_arg; 4874 struct nvmf_vfio_user_cq *admin_cq; 4875 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 4876 struct nvmf_vfio_user_endpoint *endpoint; 4877 4878 assert(sq != NULL); 4879 assert(req != NULL); 4880 4881 vu_ctrlr = sq->ctrlr; 4882 assert(vu_ctrlr != NULL); 4883 endpoint = vu_ctrlr->endpoint; 4884 assert(endpoint != NULL); 4885 4886 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 4887 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 4888 endpoint->ctrlr = NULL; 4889 free_ctrlr(vu_ctrlr); 4890 return -1; 4891 } 4892 4893 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 4894 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 4895 4896 admin_cq = vu_ctrlr->cqs[0]; 4897 assert(admin_cq != NULL); 4898 4899 pthread_mutex_lock(&endpoint->lock); 4900 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 4901 admin_cq->thread = spdk_get_thread(); 4902 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 4903 } else { 4904 /* For I/O queues this command was generated in response to an 4905 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 4906 * been completed. Complete it now. 
4907 */ 4908 if (sq->post_create_io_sq_completion) { 4909 assert(admin_cq->thread != NULL); 4910 if (admin_cq->thread != spdk_get_thread()) { 4911 struct vfio_user_post_cpl_ctx *cpl_ctx; 4912 4913 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 4914 if (!cpl_ctx) { 4915 return -ENOMEM; 4916 } 4917 cpl_ctx->ctrlr = vu_ctrlr; 4918 cpl_ctx->cq = admin_cq; 4919 cpl_ctx->cpl.sqid = 0; 4920 cpl_ctx->cpl.cdw0 = 0; 4921 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 4922 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 4923 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 4924 4925 spdk_thread_send_msg(admin_cq->thread, _post_completion_msg, 4926 cpl_ctx); 4927 } else { 4928 post_completion(vu_ctrlr, admin_cq, 0, 0, 4929 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 4930 } 4931 sq->post_create_io_sq_completion = false; 4932 } else if (in_interrupt_mode(endpoint->transport)) { 4933 /* 4934 * If we're live migrating a guest, there is a window 4935 * where the I/O queues haven't been set up but the 4936 * device is in running state, during which the guest 4937 * might write to a doorbell. This doorbell write will 4938 * go unnoticed, so let's poll the whole controller to 4939 * pick that up. 4940 */ 4941 ctrlr_kick(vu_ctrlr); 4942 } 4943 sq->sq_state = VFIO_USER_SQ_ACTIVE; 4944 } 4945 4946 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 4947 pthread_mutex_unlock(&endpoint->lock); 4948 4949 free(req->req.data); 4950 req->req.data = NULL; 4951 4952 return 0; 4953 } 4954 4955 /* 4956 * Add the given qpair to the given poll group. New qpairs are added via 4957 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 4958 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 4959 * nvmf_transport_poll_group_add(). 4960 */ 4961 static int 4962 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 4963 struct spdk_nvmf_qpair *qpair) 4964 { 4965 struct nvmf_vfio_user_sq *sq; 4966 struct nvmf_vfio_user_req *vu_req; 4967 struct nvmf_vfio_user_ctrlr *ctrlr; 4968 struct spdk_nvmf_request *req; 4969 struct spdk_nvmf_fabric_connect_data *data; 4970 bool admin; 4971 4972 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4973 sq->group = group; 4974 ctrlr = sq->ctrlr; 4975 4976 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 4977 ctrlr_id(ctrlr), sq->qpair.qid, 4978 sq, qpair, group); 4979 4980 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 4981 4982 vu_req = get_nvmf_vfio_user_req(sq); 4983 if (vu_req == NULL) { 4984 return -1; 4985 } 4986 4987 req = &vu_req->req; 4988 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 4989 req->cmd->connect_cmd.cid = 0; 4990 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 4991 req->cmd->connect_cmd.recfmt = 0; 4992 req->cmd->connect_cmd.sqsize = sq->size - 1; 4993 req->cmd->connect_cmd.qid = admin ? 
0 : qpair->qid; 4994 4995 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 4996 req->data = calloc(1, req->length); 4997 if (req->data == NULL) { 4998 nvmf_vfio_user_req_free(req); 4999 return -ENOMEM; 5000 } 5001 5002 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 5003 data->cntlid = ctrlr->cntlid; 5004 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5005 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5006 5007 vu_req->cb_fn = handle_queue_connect_rsp; 5008 vu_req->cb_arg = sq; 5009 5010 SPDK_DEBUGLOG(nvmf_vfio, 5011 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5012 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5013 5014 spdk_nvmf_request_exec_fabrics(req); 5015 return 0; 5016 } 5017 5018 static int 5019 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5020 struct spdk_nvmf_qpair *qpair) 5021 { 5022 struct nvmf_vfio_user_sq *sq; 5023 struct nvmf_vfio_user_poll_group *vu_group; 5024 5025 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5026 5027 SPDK_DEBUGLOG(nvmf_vfio, 5028 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5029 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5030 5031 5032 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5033 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5034 5035 return 0; 5036 } 5037 5038 static void 5039 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5040 { 5041 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5042 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5043 vu_req->iovcnt = 0; 5044 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5045 5046 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5047 } 5048 5049 static int 5050 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5051 { 5052 struct nvmf_vfio_user_sq *sq; 5053 struct nvmf_vfio_user_req *vu_req; 5054 5055 assert(req != NULL); 5056 5057 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5058 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5059 5060 _nvmf_vfio_user_req_free(sq, vu_req); 5061 5062 return 0; 5063 } 5064 5065 static int 5066 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5067 { 5068 struct nvmf_vfio_user_sq *sq; 5069 struct nvmf_vfio_user_req *vu_req; 5070 5071 assert(req != NULL); 5072 5073 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5074 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5075 5076 if (vu_req->cb_fn != NULL) { 5077 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5078 fail_ctrlr(sq->ctrlr); 5079 } 5080 } 5081 5082 _nvmf_vfio_user_req_free(sq, vu_req); 5083 5084 return 0; 5085 } 5086 5087 static void 5088 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5089 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5090 { 5091 struct nvmf_vfio_user_sq *sq; 5092 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5093 struct nvmf_vfio_user_endpoint *endpoint; 5094 5095 assert(qpair != NULL); 5096 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5097 vu_ctrlr = sq->ctrlr; 5098 endpoint = vu_ctrlr->endpoint; 5099 5100 pthread_mutex_lock(&endpoint->lock); 5101 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5102 delete_sq_done(vu_ctrlr, sq); 5103 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5104 endpoint->ctrlr = NULL; 5105 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5106 /* The controller will be freed, we can resume the subsystem 5107 * now so that the endpoint can be ready to accept another 5108 * 
new connection. 5109 */ 5110 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5111 vfio_user_endpoint_resume_done, endpoint); 5112 } 5113 free_ctrlr(vu_ctrlr); 5114 } 5115 pthread_mutex_unlock(&endpoint->lock); 5116 5117 if (cb_fn) { 5118 cb_fn(cb_arg); 5119 } 5120 } 5121 5122 /** 5123 * Returns a preallocated request, or NULL if there isn't one available. 5124 */ 5125 static struct nvmf_vfio_user_req * 5126 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5127 { 5128 struct nvmf_vfio_user_req *req; 5129 5130 if (sq == NULL) { 5131 return NULL; 5132 } 5133 5134 req = TAILQ_FIRST(&sq->free_reqs); 5135 if (req == NULL) { 5136 return NULL; 5137 } 5138 5139 TAILQ_REMOVE(&sq->free_reqs, req, link); 5140 5141 return req; 5142 } 5143 5144 static int 5145 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5146 { 5147 uint16_t nr; 5148 uint32_t nlb, nsid; 5149 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5150 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5151 struct spdk_nvmf_ns *ns; 5152 5153 nsid = cmd->nsid; 5154 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5155 if (ns == NULL || ns->bdev == NULL) { 5156 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5157 return -EINVAL; 5158 } 5159 5160 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5161 nr = cmd->cdw10_bits.dsm.nr + 1; 5162 return nr * sizeof(struct spdk_nvme_dsm_range); 5163 } 5164 5165 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5166 return nlb * spdk_bdev_get_block_size(ns->bdev); 5167 } 5168 5169 static int 5170 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5171 { 5172 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5173 uint32_t len = 0; 5174 uint8_t fid; 5175 int iovcnt; 5176 5177 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5178 req->length = 0; 5179 req->data = NULL; 5180 5181 if (req->xfer == SPDK_NVME_DATA_NONE) { 5182 return 0; 5183 } 5184 5185 switch (cmd->opc) { 5186 case SPDK_NVME_OPC_IDENTIFY: 5187 len = 4096; 5188 break; 5189 case SPDK_NVME_OPC_GET_LOG_PAGE: 5190 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 5191 break; 5192 case SPDK_NVME_OPC_GET_FEATURES: 5193 case SPDK_NVME_OPC_SET_FEATURES: 5194 fid = cmd->cdw10_bits.set_features.fid; 5195 switch (fid) { 5196 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5197 len = 4096; 5198 break; 5199 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5200 len = 256; 5201 break; 5202 case SPDK_NVME_FEAT_TIMESTAMP: 5203 len = 8; 5204 break; 5205 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5206 len = 512; 5207 break; 5208 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5209 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5210 len = 16; 5211 } else { 5212 len = 8; 5213 } 5214 break; 5215 default: 5216 return 0; 5217 } 5218 break; 5219 default: 5220 return 0; 5221 } 5222 5223 /* ADMIN command will not use SGL */ 5224 if (cmd->psdt != 0) { 5225 return -EINVAL; 5226 } 5227 5228 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5229 if (iovcnt < 0) { 5230 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5231 ctrlr_id(ctrlr), cmd->opc); 5232 return -1; 5233 } 5234 req->length = len; 5235 req->data = req->iov[0].iov_base; 5236 req->iovcnt = iovcnt; 5237 5238 return 0; 5239 } 5240 5241 /* 5242 * Map an I/O command's buffers. 5243 * 5244 * Returns 0 on success and -errno on failure. 
5245 */ 5246 static int 5247 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5248 { 5249 int len, iovcnt; 5250 struct spdk_nvme_cmd *cmd; 5251 5252 assert(ctrlr != NULL); 5253 assert(req != NULL); 5254 5255 cmd = &req->cmd->nvme_cmd; 5256 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5257 req->length = 0; 5258 req->data = NULL; 5259 5260 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5261 return 0; 5262 } 5263 5264 len = get_nvmf_io_req_length(req); 5265 if (len < 0) { 5266 return -EINVAL; 5267 } 5268 req->length = len; 5269 5270 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5271 if (iovcnt < 0) { 5272 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5273 return -EFAULT; 5274 } 5275 req->data = req->iov[0].iov_base; 5276 req->iovcnt = iovcnt; 5277 5278 return 0; 5279 } 5280 5281 static int 5282 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5283 struct nvmf_vfio_user_sq *sq) 5284 { 5285 int err; 5286 struct nvmf_vfio_user_req *vu_req; 5287 struct spdk_nvmf_request *req; 5288 5289 assert(ctrlr != NULL); 5290 assert(cmd != NULL); 5291 5292 vu_req = get_nvmf_vfio_user_req(sq); 5293 if (spdk_unlikely(vu_req == NULL)) { 5294 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5295 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5296 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5297 5298 } 5299 req = &vu_req->req; 5300 5301 assert(req->qpair != NULL); 5302 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5303 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5304 5305 vu_req->cb_fn = handle_cmd_rsp; 5306 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5307 req->cmd->nvme_cmd = *cmd; 5308 5309 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5310 err = map_admin_cmd_req(ctrlr, req); 5311 } else { 5312 switch (cmd->opc) { 5313 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5314 case SPDK_NVME_OPC_RESERVATION_REPORT: 5315 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5316 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5317 err = -ENOTSUP; 5318 break; 5319 default: 5320 err = map_io_cmd_req(ctrlr, req); 5321 break; 5322 } 5323 } 5324 5325 if (spdk_unlikely(err < 0)) { 5326 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5327 ctrlr_id(ctrlr), cmd->opc); 5328 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5329 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5330 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5331 _nvmf_vfio_user_req_free(sq, vu_req); 5332 return err; 5333 } 5334 5335 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5336 spdk_nvmf_request_exec(req); 5337 5338 return 0; 5339 } 5340 5341 /* 5342 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5343 * here: if the host isn't up to date, and is apparently not actively processing 5344 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5345 */ 5346 static void 5347 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5348 struct nvmf_vfio_user_sq *sq) 5349 { 5350 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5351 uint32_t cq_head; 5352 uint32_t cq_tail; 5353 5354 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5355 return; 5356 } 5357 5358 cq_tail = *cq_tailp(cq); 5359 5360 /* Already sent? 
*/ 5361 if (cq_tail == cq->last_trigger_irq_tail) { 5362 return; 5363 } 5364 5365 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5366 cq_head = *cq_dbl_headp(cq); 5367 5368 if (cq_head != cq_tail && cq_head == cq->last_head) { 5369 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5370 if (err != 0) { 5371 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5372 ctrlr_id(ctrlr)); 5373 } else { 5374 cq->last_trigger_irq_tail = cq_tail; 5375 } 5376 } 5377 5378 cq->last_head = cq_head; 5379 } 5380 5381 /* Returns the number of commands processed, or a negative value on error. */ 5382 static int 5383 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5384 { 5385 struct nvmf_vfio_user_ctrlr *ctrlr; 5386 uint32_t new_tail; 5387 int count = 0; 5388 5389 assert(sq != NULL); 5390 5391 ctrlr = sq->ctrlr; 5392 5393 /* 5394 * A quiesced, or migrating, controller should never process new 5395 * commands. 5396 */ 5397 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 5398 return SPDK_POLLER_IDLE; 5399 } 5400 5401 if (ctrlr->adaptive_irqs_enabled) { 5402 handle_suppressed_irq(ctrlr, sq); 5403 } 5404 5405 /* On aarch64 platforms, doorbell updates from the guest VM may not be 5406 * visible on the SPDK target side. This is caused by a memory type 5407 * mismatch: on the guest side the doorbells are mapped as device 5408 * memory, while on the SPDK target side they are mapped as normal 5409 * memory, which is problematic on ARM platforms. 5410 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5411 * Memory-aliasing-and-mismatched-memory-types". Using spdk_mb() alone 5412 * cannot fix this; invalidating the cache with "dc civac" may solve 5413 * it. 5414 */ 5415 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5416 5417 /* Load-Acquire. */ 5418 new_tail = *sq_dbl_tailp(sq); 5419 5420 new_tail = new_tail & 0xffffu; 5421 if (spdk_unlikely(new_tail >= sq->size)) { 5422 union spdk_nvme_async_event_completion event = {}; 5423 5424 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5425 new_tail); 5426 event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR; 5427 event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE; 5428 nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event); 5429 5430 return -1; 5431 } 5432 5433 if (*sq_headp(sq) == new_tail) { 5434 return 0; 5435 } 5436 5437 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5438 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5439 if (ctrlr->sdbl != NULL) { 5440 SPDK_DEBUGLOG(nvmf_vfio, 5441 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5442 ctrlr_id(ctrlr), sq->qid, 5443 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5444 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5445 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5446 } 5447 5448 /* 5449 * Ensure that changes to the queue are visible to us. 5450 * The host driver should write the queue first, do a wmb(), and then 5451 * update the SQ tail doorbell (their Store-Release). 5452 */ 5453 spdk_rmb(); 5454 5455 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5456 if (count < 0) { 5457 fail_ctrlr(ctrlr); 5458 } 5459 5460 return count; 5461 } 5462 5463 /* 5464 * vfio-user transport poll handler. Note that the library context is polled in 5465 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the 5466 * active SQs. 5467 * 5468 * Returns the number of commands processed, or a negative value on error. 
5469 */ 5470 static int 5471 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 5472 { 5473 struct nvmf_vfio_user_poll_group *vu_group; 5474 struct nvmf_vfio_user_sq *sq, *tmp; 5475 int count = 0; 5476 5477 assert(group != NULL); 5478 5479 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5480 5481 SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n"); 5482 5483 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) { 5484 int ret; 5485 5486 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 5487 continue; 5488 } 5489 5490 ret = nvmf_vfio_user_sq_poll(sq); 5491 5492 if (ret < 0) { 5493 return ret; 5494 } 5495 5496 count += ret; 5497 } 5498 5499 return count; 5500 } 5501 5502 static int 5503 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5504 struct spdk_nvme_transport_id *trid) 5505 { 5506 struct nvmf_vfio_user_sq *sq; 5507 struct nvmf_vfio_user_ctrlr *ctrlr; 5508 5509 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5510 ctrlr = sq->ctrlr; 5511 5512 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5513 return 0; 5514 } 5515 5516 static int 5517 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 5518 struct spdk_nvme_transport_id *trid) 5519 { 5520 return 0; 5521 } 5522 5523 static int 5524 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5525 struct spdk_nvme_transport_id *trid) 5526 { 5527 struct nvmf_vfio_user_sq *sq; 5528 struct nvmf_vfio_user_ctrlr *ctrlr; 5529 5530 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5531 ctrlr = sq->ctrlr; 5532 5533 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5534 return 0; 5535 } 5536 5537 static void 5538 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 5539 struct spdk_nvmf_request *req) 5540 { 5541 struct spdk_nvmf_request *req_to_abort = NULL; 5542 struct spdk_nvmf_request *temp_req = NULL; 5543 uint16_t cid; 5544 5545 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5546 5547 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) { 5548 struct nvmf_vfio_user_req *vu_req; 5549 5550 vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req); 5551 5552 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 5553 req_to_abort = temp_req; 5554 break; 5555 } 5556 } 5557 5558 if (req_to_abort == NULL) { 5559 spdk_nvmf_request_complete(req); 5560 return; 5561 } 5562 5563 req->req_to_abort = req_to_abort; 5564 nvmf_ctrlr_abort_request(req); 5565 } 5566 5567 static void 5568 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 5569 { 5570 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 5571 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 5572 opts->in_capsule_data_size = 0; 5573 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 5574 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 5575 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 5576 opts->num_shared_buffers = 0; 5577 opts->buf_cache_size = 0; 5578 opts->association_timeout = 0; 5579 opts->transport_specific = NULL; 5580 } 5581 5582 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 5583 .name = "VFIOUSER", 5584 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 5585 .opts_init = nvmf_vfio_user_opts_init, 5586 .create = nvmf_vfio_user_create, 5587 .destroy = nvmf_vfio_user_destroy, 5588 5589 .listen = nvmf_vfio_user_listen, 5590 .stop_listen = nvmf_vfio_user_stop_listen, 5591 .cdata_init = nvmf_vfio_user_cdata_init, 5592 
.listen_associate = nvmf_vfio_user_listen_associate, 5593 5594 .listener_discover = nvmf_vfio_user_discover, 5595 5596 .poll_group_create = nvmf_vfio_user_poll_group_create, 5597 .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group, 5598 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 5599 .poll_group_add = nvmf_vfio_user_poll_group_add, 5600 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 5601 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 5602 5603 .req_free = nvmf_vfio_user_req_free, 5604 .req_complete = nvmf_vfio_user_req_complete, 5605 5606 .qpair_fini = nvmf_vfio_user_close_qpair, 5607 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 5608 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 5609 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 5610 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 5611 }; 5612 5613 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 5614 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 5615 SPDK_LOG_REGISTER_COMPONENT(vfio_user_db) 5616