1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved. 3 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 */ 5 6 /* 7 * NVMe over vfio-user transport 8 */ 9 10 #include <vfio-user/libvfio-user.h> 11 #include <vfio-user/pci_defs.h> 12 13 #include "spdk/barrier.h" 14 #include "spdk/stdinc.h" 15 #include "spdk/assert.h" 16 #include "spdk/thread.h" 17 #include "spdk/nvmf_transport.h" 18 #include "spdk/sock.h" 19 #include "spdk/string.h" 20 #include "spdk/util.h" 21 #include "spdk/log.h" 22 23 #include "transport.h" 24 25 #include "nvmf_internal.h" 26 27 #define SWAP(x, y) \ 28 do \ 29 { \ 30 typeof(x) _tmp = x; \ 31 x = y; \ 32 y = _tmp; \ 33 } while (0) 34 35 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 36 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 37 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 38 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 39 40 #define NVME_DOORBELLS_OFFSET 0x1000 41 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000 42 #define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2 43 #define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3 44 #define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX 45 46 /* 47 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 48 * available on PCI-X 2.0 and PCI Express buses 49 */ 50 #define NVME_REG_CFG_SIZE 0x1000 51 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 52 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8) 53 #define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 54 /* MSIX Table Size */ 55 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 56 /* MSIX Pending Bit Array Size */ 57 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000) 58 59 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 60 61 struct nvmf_vfio_user_req; 62 63 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 64 65 /* 1 more for PRP2 list itself */ 66 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 67 68 enum nvmf_vfio_user_req_state { 69 VFIO_USER_REQUEST_STATE_FREE = 0, 70 VFIO_USER_REQUEST_STATE_EXECUTING, 71 }; 72 73 /* 74 * Support for live migration in NVMf/vfio-user: live migration is implemented 75 * by stopping the NVMf subsystem when the device is instructed to enter the 76 * stop-and-copy state and then trivially, and most importantly safely, 77 * collecting migration state and providing it to the vfio-user client. We 78 * don't provide any migration state at the pre-copy state as that's too 79 * complicated to do, we might support this in the future. 80 */ 81 82 83 /* NVMe device state representation */ 84 struct nvme_migr_sq_state { 85 uint16_t sqid; 86 uint16_t cqid; 87 uint32_t head; 88 uint32_t size; 89 uint32_t reserved; 90 uint64_t dma_addr; 91 }; 92 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 93 94 struct nvme_migr_cq_state { 95 uint16_t cqid; 96 uint16_t phase; 97 uint32_t tail; 98 uint32_t size; 99 uint32_t iv; 100 uint32_t ien; 101 uint32_t reserved; 102 uint64_t dma_addr; 103 }; 104 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 105 106 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 107 108 /* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned. 
109 * 110 * NVMe device migration region is defined as below: 111 * ------------------------------------------------------------------------- 112 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs | 113 * ------------------------------------------------------------------------- 114 * 115 * Keep vfio_user_nvme_migr_header as a fixed 0x1000 length, all new added fields 116 * can use the reserved space at the end of the data structure. 117 */ 118 struct vfio_user_nvme_migr_header { 119 /* Magic value to validate migration data */ 120 uint32_t magic; 121 /* Version to check the data is same from source to destination */ 122 uint32_t version; 123 124 /* The library uses this field to know how many fields in this 125 * structure are valid, starting at the beginning of this data 126 * structure. New added fields in future use `unused` memory 127 * spaces. 128 */ 129 uint32_t opts_size; 130 uint32_t reserved0; 131 132 /* BARs information */ 133 uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS]; 134 uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS]; 135 136 /* Queue pair start offset, starting at the beginning of this 137 * data structure. 138 */ 139 uint64_t qp_offset; 140 uint64_t qp_len; 141 142 /* Controller data structure */ 143 uint32_t num_io_queues; 144 uint32_t reserved1; 145 146 /* NVMf controller data offset and length if exist, starting at 147 * the beginning of this data structure. 148 */ 149 uint64_t nvmf_data_offset; 150 uint64_t nvmf_data_len; 151 152 /* 153 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA 154 * address. 155 */ 156 uint32_t sdbl; 157 158 /* Shadow doorbell DMA addresses. */ 159 uint64_t shadow_doorbell_buffer; 160 uint64_t eventidx_buffer; 161 162 /* Reserved memory space for new added fields, the 163 * field is always at the end of this data structure. 164 */ 165 uint8_t unused[3856]; 166 }; 167 SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size"); 168 169 struct vfio_user_nvme_migr_qp { 170 struct nvme_migr_sq_state sq; 171 struct nvme_migr_cq_state cq; 172 }; 173 174 /* NVMe state definition used to load/restore from/to NVMe migration BAR region */ 175 struct vfio_user_nvme_migr_state { 176 struct vfio_user_nvme_migr_header ctrlr_header; 177 struct spdk_nvmf_ctrlr_migr_data nvmf_data; 178 struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 179 uint8_t doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE]; 180 uint8_t cfg[NVME_REG_CFG_SIZE]; 181 }; 182 183 struct nvmf_vfio_user_req { 184 struct spdk_nvmf_request req; 185 struct spdk_nvme_cpl rsp; 186 struct spdk_nvme_cmd cmd; 187 188 enum nvmf_vfio_user_req_state state; 189 nvmf_vfio_user_req_cb_fn cb_fn; 190 void *cb_arg; 191 192 /* old CC before prop_set_cc fabric command */ 193 union spdk_nvme_cc_register cc; 194 195 TAILQ_ENTRY(nvmf_vfio_user_req) link; 196 197 struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS]; 198 uint8_t iovcnt; 199 200 /* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */ 201 uint8_t sg[]; 202 }; 203 204 /* 205 * Mapping of an NVMe queue. 206 * 207 * This holds the information tracking a local process mapping of an NVMe queue 208 * shared by the client. 209 */ 210 struct nvme_q_mapping { 211 /* iov of local process mapping. */ 212 struct iovec iov; 213 /* Stored sg, needed for unmap. */ 214 dma_sg_t *sg; 215 /* Client PRP of queue. 
*/ 216 uint64_t prp1; 217 }; 218 219 enum nvmf_vfio_user_sq_state { 220 VFIO_USER_SQ_UNUSED = 0, 221 VFIO_USER_SQ_CREATED, 222 VFIO_USER_SQ_DELETED, 223 VFIO_USER_SQ_ACTIVE, 224 VFIO_USER_SQ_INACTIVE 225 }; 226 227 enum nvmf_vfio_user_cq_state { 228 VFIO_USER_CQ_UNUSED = 0, 229 VFIO_USER_CQ_CREATED, 230 VFIO_USER_CQ_DELETED, 231 }; 232 233 enum nvmf_vfio_user_ctrlr_state { 234 VFIO_USER_CTRLR_CREATING = 0, 235 VFIO_USER_CTRLR_RUNNING, 236 /* Quiesce requested by libvfio-user */ 237 VFIO_USER_CTRLR_PAUSING, 238 /* NVMf subsystem is paused, it's safe to do PCI reset, memory register, 239 * memory unergister, and vfio migration state transition in this state. 240 */ 241 VFIO_USER_CTRLR_PAUSED, 242 /* 243 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI 244 * reset, memory register and unregister, controller in destination VM has 245 * been restored). NVMf subsystem resume has been requested. 246 */ 247 VFIO_USER_CTRLR_RESUMING, 248 /* 249 * Implies that the NVMf subsystem is paused. Both controller in source VM and 250 * destinatiom VM is in this state when doing live migration. 251 */ 252 VFIO_USER_CTRLR_MIGRATING 253 }; 254 255 struct nvmf_vfio_user_sq { 256 struct spdk_nvmf_qpair qpair; 257 struct spdk_nvmf_transport_poll_group *group; 258 struct nvmf_vfio_user_ctrlr *ctrlr; 259 260 uint32_t qid; 261 /* Number of entries in queue. */ 262 uint32_t size; 263 struct nvme_q_mapping mapping; 264 enum nvmf_vfio_user_sq_state sq_state; 265 266 uint32_t head; 267 volatile uint32_t *dbl_tailp; 268 269 /* Whether a shadow doorbell eventidx needs setting. */ 270 bool need_rearm; 271 272 /* multiple SQs can be mapped to the same CQ */ 273 uint16_t cqid; 274 275 /* handle_queue_connect_rsp() can be used both for CREATE IO SQ response 276 * and SQ re-connect response in the destination VM, for the prior case, 277 * we will post a NVMe completion to VM, we will not set this flag when 278 * re-connecting SQs in the destination VM. 279 */ 280 bool post_create_io_sq_completion; 281 /* Copy of Create IO SQ command, this field is used together with 282 * `post_create_io_sq_completion` flag. 283 */ 284 struct spdk_nvme_cmd create_io_sq_cmd; 285 286 /* Currently unallocated reqs. */ 287 TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs; 288 /* Poll group entry */ 289 TAILQ_ENTRY(nvmf_vfio_user_sq) link; 290 /* Connected SQ entry */ 291 TAILQ_ENTRY(nvmf_vfio_user_sq) tailq; 292 }; 293 294 struct nvmf_vfio_user_cq { 295 struct spdk_nvmf_transport_poll_group *group; 296 struct spdk_thread *thread; 297 uint32_t cq_ref; 298 299 uint32_t qid; 300 /* Number of entries in queue. 
*/ 301 uint32_t size; 302 struct nvme_q_mapping mapping; 303 enum nvmf_vfio_user_cq_state cq_state; 304 305 uint32_t tail; 306 volatile uint32_t *dbl_headp; 307 308 bool phase; 309 310 uint16_t iv; 311 bool ien; 312 313 uint32_t last_head; 314 uint32_t last_trigger_irq_tail; 315 }; 316 317 struct nvmf_vfio_user_poll_group { 318 struct spdk_nvmf_transport_poll_group group; 319 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 320 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 321 struct spdk_interrupt *intr; 322 int intr_fd; 323 }; 324 325 struct nvmf_vfio_user_shadow_doorbells { 326 volatile uint32_t *shadow_doorbells; 327 volatile uint32_t *eventidxs; 328 dma_sg_t *sgs; 329 struct iovec *iovs; 330 }; 331 332 struct nvmf_vfio_user_ctrlr { 333 struct nvmf_vfio_user_endpoint *endpoint; 334 struct nvmf_vfio_user_transport *transport; 335 336 /* Connected SQs list */ 337 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 338 enum nvmf_vfio_user_ctrlr_state state; 339 340 /* 341 * Tells whether live migration data have been prepared. This is used 342 * by the get_pending_bytes callback to tell whether or not the 343 * previous iteration finished. 344 */ 345 bool migr_data_prepared; 346 347 /* Controller is in source VM when doing live migration */ 348 bool in_source_vm; 349 350 struct spdk_thread *thread; 351 struct spdk_poller *vfu_ctx_poller; 352 struct spdk_interrupt *intr; 353 int intr_fd; 354 355 bool queued_quiesce; 356 357 bool reset_shn; 358 359 uint16_t cntlid; 360 struct spdk_nvmf_ctrlr *ctrlr; 361 362 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 363 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 364 365 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 366 367 volatile uint32_t *bar0_doorbells; 368 struct nvmf_vfio_user_shadow_doorbells *sdbl; 369 /* 370 * Shadow doorbells PRPs to provide during the stop-and-copy state. 371 */ 372 uint64_t shadow_doorbell_buffer; 373 uint64_t eventidx_buffer; 374 375 bool adaptive_irqs_enabled; 376 }; 377 378 /* Endpoint in vfio-user is associated with a socket file, which 379 * is the representative of a PCI endpoint. 380 */ 381 struct nvmf_vfio_user_endpoint { 382 struct nvmf_vfio_user_transport *transport; 383 vfu_ctx_t *vfu_ctx; 384 struct spdk_poller *accept_poller; 385 struct spdk_thread *accept_thread; 386 bool interrupt_mode; 387 struct msixcap *msix; 388 vfu_pci_config_space_t *pci_config_space; 389 int devmem_fd; 390 int accept_intr_fd; 391 struct spdk_interrupt *accept_intr; 392 393 volatile uint32_t *bar0_doorbells; 394 395 int migr_fd; 396 void *migr_data; 397 398 struct spdk_nvme_transport_id trid; 399 struct spdk_nvmf_subsystem *subsystem; 400 401 /* Controller is associated with an active socket connection, 402 * the lifecycle of the controller is same as the VM. 403 * Currently we only support one active connection, as the NVMe 404 * specification defines, we may support multiple controllers in 405 * future, so that it can support e.g: RESERVATION. 406 */ 407 struct nvmf_vfio_user_ctrlr *ctrlr; 408 pthread_mutex_t lock; 409 410 bool need_async_destroy; 411 /* The subsystem is in PAUSED state and need to be resumed, TRUE 412 * only when migration is done successfully and the controller is 413 * in source VM. 
414 */ 415 bool need_resume; 416 /* Start the accept poller again after destroying the controller */ 417 bool need_relisten; 418 419 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 420 }; 421 422 struct nvmf_vfio_user_transport_opts { 423 bool disable_mappable_bar0; 424 bool disable_adaptive_irq; 425 bool disable_shadow_doorbells; 426 bool disable_compare; 427 bool enable_intr_mode_sq_spreading; 428 }; 429 430 struct nvmf_vfio_user_transport { 431 struct spdk_nvmf_transport transport; 432 struct nvmf_vfio_user_transport_opts transport_opts; 433 bool intr_mode_supported; 434 pthread_mutex_t lock; 435 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 436 437 pthread_mutex_t pg_lock; 438 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 439 struct nvmf_vfio_user_poll_group *next_pg; 440 }; 441 442 /* 443 * function prototypes 444 */ 445 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 446 447 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 448 449 /* 450 * Local process virtual address of a queue. 451 */ 452 static inline void * 453 q_addr(struct nvme_q_mapping *mapping) 454 { 455 return mapping->iov.iov_base; 456 } 457 458 static inline int 459 queue_index(uint16_t qid, bool is_cq) 460 { 461 return (qid * 2) + is_cq; 462 } 463 464 static inline volatile uint32_t * 465 sq_headp(struct nvmf_vfio_user_sq *sq) 466 { 467 assert(sq != NULL); 468 return &sq->head; 469 } 470 471 static inline volatile uint32_t * 472 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 473 { 474 assert(sq != NULL); 475 return sq->dbl_tailp; 476 } 477 478 static inline volatile uint32_t * 479 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 480 { 481 assert(cq != NULL); 482 return cq->dbl_headp; 483 } 484 485 static inline volatile uint32_t * 486 cq_tailp(struct nvmf_vfio_user_cq *cq) 487 { 488 assert(cq != NULL); 489 return &cq->tail; 490 } 491 492 static inline void 493 sq_head_advance(struct nvmf_vfio_user_sq *sq) 494 { 495 assert(sq != NULL); 496 497 assert(*sq_headp(sq) < sq->size); 498 (*sq_headp(sq))++; 499 500 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 501 *sq_headp(sq) = 0; 502 } 503 } 504 505 static inline void 506 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 507 { 508 assert(cq != NULL); 509 510 assert(*cq_tailp(cq) < cq->size); 511 (*cq_tailp(cq))++; 512 513 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 514 *cq_tailp(cq) = 0; 515 cq->phase = !cq->phase; 516 } 517 } 518 519 /* 520 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 521 * control: if there is no space in the CQ, we should wait until there is. 522 * 523 * In practice, we just fail the controller instead: as it happens, all host 524 * implementations we care about right-size the CQ: this is required anyway for 525 * NVMEoF support (see 3.3.2.8). 526 * 527 * Since reading the head doorbell is relatively expensive, we use the cached 528 * value, so we only have to read it for real if it appears that we are full. 
529 */ 530 static inline bool 531 cq_is_full(struct nvmf_vfio_user_cq *cq) 532 { 533 uint32_t qindex; 534 535 assert(cq != NULL); 536 537 qindex = *cq_tailp(cq) + 1; 538 if (spdk_unlikely(qindex == cq->size)) { 539 qindex = 0; 540 } 541 542 if (qindex != cq->last_head) { 543 return false; 544 } 545 546 cq->last_head = *cq_dbl_headp(cq); 547 548 return qindex == cq->last_head; 549 } 550 551 static bool 552 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 553 { 554 assert(vu_ctrlr != NULL); 555 556 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 557 return false; 558 } 559 560 if (is_cq) { 561 if (vu_ctrlr->cqs[qid] == NULL) { 562 return false; 563 } 564 565 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 566 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 567 } 568 569 if (vu_ctrlr->sqs[qid] == NULL) { 570 return false; 571 } 572 573 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 574 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 575 } 576 577 static char * 578 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 579 { 580 return endpoint->trid.traddr; 581 } 582 583 static char * 584 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 585 { 586 if (!ctrlr || !ctrlr->endpoint) { 587 return "Null Ctrlr"; 588 } 589 590 return endpoint_id(ctrlr->endpoint); 591 } 592 593 /* Return the poll group for the admin queue of the controller. */ 594 static inline struct nvmf_vfio_user_poll_group * 595 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 596 { 597 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 598 struct nvmf_vfio_user_poll_group, 599 group); 600 } 601 602 static inline struct spdk_thread * 603 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 604 { 605 return vu_pg->group.group->thread; 606 } 607 608 static dma_sg_t * 609 index_to_sg_t(void *arr, size_t i) 610 { 611 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 612 } 613 614 static inline size_t 615 vfio_user_migr_data_len(void) 616 { 617 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 618 } 619 620 static inline bool 621 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 622 { 623 return spdk_interrupt_mode_is_enabled() && 624 vu_transport->intr_mode_supported; 625 } 626 627 static int vfio_user_ctrlr_intr(void *ctx); 628 629 static void 630 vfio_user_msg_ctrlr_intr(void *ctx) 631 { 632 vfio_user_ctrlr_intr(ctx); 633 } 634 635 /* 636 * Kick (force a wakeup) of all poll groups for this controller. 637 * vfio_user_ctrlr_intr() itself arranges for kicking other poll groups if 638 * needed. 639 */ 640 static void 641 ctrlr_kick(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 642 { 643 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 644 645 SPDK_DEBUGLOG(vfio_user_db, "%s: kicked\n", ctrlr_id(vu_ctrlr)); 646 647 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 648 649 spdk_thread_send_msg(poll_group_to_thread(vu_ctrlr_group), 650 vfio_user_msg_ctrlr_intr, vu_ctrlr); 651 } 652 653 /* 654 * Make the given DMA address and length available (locally mapped) via iov. 
655 */ 656 static void * 657 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 658 struct iovec *iov, int prot) 659 { 660 int ret; 661 662 assert(ctx != NULL); 663 assert(sg != NULL); 664 assert(iov != NULL); 665 666 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 667 if (ret < 0) { 668 return NULL; 669 } 670 671 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 672 if (ret != 0) { 673 return NULL; 674 } 675 676 assert(iov->iov_base != NULL); 677 return iov->iov_base; 678 } 679 680 static int 681 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 682 uint32_t max_iovcnt, uint32_t len, size_t mps, 683 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 684 { 685 uint64_t prp1, prp2; 686 void *vva; 687 uint32_t i; 688 uint32_t residue_len, nents; 689 uint64_t *prp_list; 690 uint32_t iovcnt; 691 692 assert(max_iovcnt > 0); 693 694 prp1 = cmd->dptr.prp.prp1; 695 prp2 = cmd->dptr.prp.prp2; 696 697 /* PRP1 may started with unaligned page address */ 698 residue_len = mps - (prp1 % mps); 699 residue_len = spdk_min(len, residue_len); 700 701 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 702 if (spdk_unlikely(vva == NULL)) { 703 SPDK_ERRLOG("GPA to VVA failed\n"); 704 return -EINVAL; 705 } 706 len -= residue_len; 707 if (len && max_iovcnt < 2) { 708 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 709 return -ERANGE; 710 } 711 iovs[0].iov_base = vva; 712 iovs[0].iov_len = residue_len; 713 714 if (len) { 715 if (spdk_unlikely(prp2 == 0)) { 716 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 717 return -EINVAL; 718 } 719 720 if (len <= mps) { 721 /* 2 PRP used */ 722 iovcnt = 2; 723 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 724 if (spdk_unlikely(vva == NULL)) { 725 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 726 prp2, len); 727 return -EINVAL; 728 } 729 iovs[1].iov_base = vva; 730 iovs[1].iov_len = len; 731 } else { 732 /* PRP list used */ 733 nents = (len + mps - 1) / mps; 734 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 735 SPDK_ERRLOG("Too many page entries\n"); 736 return -ERANGE; 737 } 738 739 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 740 if (spdk_unlikely(vva == NULL)) { 741 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 742 prp2, nents); 743 return -EINVAL; 744 } 745 prp_list = vva; 746 i = 0; 747 while (len != 0) { 748 residue_len = spdk_min(len, mps); 749 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 750 if (spdk_unlikely(vva == NULL)) { 751 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 752 prp_list[i], residue_len); 753 return -EINVAL; 754 } 755 iovs[i + 1].iov_base = vva; 756 iovs[i + 1].iov_len = residue_len; 757 len -= residue_len; 758 i++; 759 } 760 iovcnt = i + 1; 761 } 762 } else { 763 /* 1 PRP used */ 764 iovcnt = 1; 765 } 766 767 assert(iovcnt <= max_iovcnt); 768 return iovcnt; 769 } 770 771 static int 772 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 773 struct iovec *iovs, uint32_t max_iovcnt, 774 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 775 { 776 uint32_t i; 777 void *vva; 778 779 if (spdk_unlikely(max_iovcnt < num_sgls)) { 780 return -ERANGE; 781 } 782 783 for (i = 0; i < num_sgls; i++) { 784 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 785 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 786 return -EINVAL; 787 } 788 vva = gpa_to_vva(prv, sgls[i].address, 
sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 789 if (spdk_unlikely(vva == NULL)) { 790 SPDK_ERRLOG("GPA to VVA failed\n"); 791 return -EINVAL; 792 } 793 iovs[i].iov_base = vva; 794 iovs[i].iov_len = sgls[i].unkeyed.length; 795 } 796 797 return num_sgls; 798 } 799 800 static int 801 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 802 uint32_t len, size_t mps, 803 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 804 { 805 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 806 uint32_t num_sgls, seg_len; 807 void *vva; 808 int ret; 809 uint32_t total_iovcnt = 0; 810 811 /* SGL cases */ 812 sgl = &cmd->dptr.sgl1; 813 814 /* only one SGL segment */ 815 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 816 assert(max_iovcnt > 0); 817 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 818 if (spdk_unlikely(vva == NULL)) { 819 SPDK_ERRLOG("GPA to VVA failed\n"); 820 return -EINVAL; 821 } 822 iovs[0].iov_base = vva; 823 iovs[0].iov_len = sgl->unkeyed.length; 824 assert(sgl->unkeyed.length == len); 825 826 return 1; 827 } 828 829 for (;;) { 830 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 831 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 832 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 833 return -EINVAL; 834 } 835 836 seg_len = sgl->unkeyed.length; 837 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 838 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 839 return -EINVAL; 840 } 841 842 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 843 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 844 if (spdk_unlikely(vva == NULL)) { 845 SPDK_ERRLOG("GPA to VVA failed\n"); 846 return -EINVAL; 847 } 848 849 /* sgl point to the first segment */ 850 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 851 last_sgl = &sgl[num_sgls - 1]; 852 853 /* we are done */ 854 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 855 /* map whole sgl list */ 856 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 857 max_iovcnt - total_iovcnt, gpa_to_vva); 858 if (spdk_unlikely(ret < 0)) { 859 return ret; 860 } 861 total_iovcnt += ret; 862 863 return total_iovcnt; 864 } 865 866 if (num_sgls > 1) { 867 /* map whole sgl exclude last_sgl */ 868 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 869 max_iovcnt - total_iovcnt, gpa_to_vva); 870 if (spdk_unlikely(ret < 0)) { 871 return ret; 872 } 873 total_iovcnt += ret; 874 } 875 876 /* move to next level's segments */ 877 sgl = last_sgl; 878 } 879 880 return 0; 881 } 882 883 static int 884 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 885 uint32_t len, size_t mps, 886 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 887 { 888 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 889 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 890 } 891 892 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 893 } 894 895 /* 896 * For each queue, update the location of its doorbell to the correct location: 897 * either our own BAR0, or the guest's configured shadow doorbell area. 898 * 899 * The Admin queue (qid: 0) does not ever use shadow doorbells. 900 */ 901 static void 902 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 903 { 904 volatile uint32_t *doorbells = shadow ? 
ctrlr->sdbl->shadow_doorbells : 905 ctrlr->bar0_doorbells; 906 907 assert(doorbells != NULL); 908 909 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 910 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 911 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 912 913 if (sq != NULL) { 914 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 915 916 ctrlr->sqs[i]->need_rearm = shadow; 917 } 918 919 if (cq != NULL) { 920 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 921 } 922 } 923 } 924 925 static void 926 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 927 { 928 assert(vfu_ctx != NULL); 929 assert(sdbl != NULL); 930 931 /* 932 * An allocation error would result in only one of the two being 933 * non-NULL. If that is the case, no memory should have been mapped. 934 */ 935 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 936 return; 937 } 938 939 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 940 struct iovec *iov; 941 dma_sg_t *sg; 942 943 if (!sdbl->iovs[i].iov_len) { 944 continue; 945 } 946 947 sg = index_to_sg_t(sdbl->sgs, i); 948 iov = sdbl->iovs + i; 949 950 vfu_sgl_put(vfu_ctx, sg, iov, 1); 951 } 952 } 953 954 static void 955 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 956 { 957 if (sdbl == NULL) { 958 return; 959 } 960 961 unmap_sdbl(vfu_ctx, sdbl); 962 963 /* 964 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 965 * not allocated, so don't free() them. 966 */ 967 free(sdbl->sgs); 968 free(sdbl->iovs); 969 free(sdbl); 970 } 971 972 static struct nvmf_vfio_user_shadow_doorbells * 973 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 974 { 975 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 976 dma_sg_t *sg2 = NULL; 977 void *p; 978 979 assert(vfu_ctx != NULL); 980 981 sdbl = calloc(1, sizeof(*sdbl)); 982 if (sdbl == NULL) { 983 goto err; 984 } 985 986 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 987 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 988 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 989 goto err; 990 } 991 992 /* Map shadow doorbell buffer (PRP1). */ 993 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, 994 PROT_READ | PROT_WRITE); 995 996 if (p == NULL) { 997 goto err; 998 } 999 1000 /* 1001 * Map eventidx buffer (PRP2). 1002 * Should only be written to by the controller. 1003 */ 1004 1005 sg2 = index_to_sg_t(sdbl->sgs, 1); 1006 1007 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, 1008 PROT_READ | PROT_WRITE); 1009 1010 if (p == NULL) { 1011 goto err; 1012 } 1013 1014 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1015 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1016 1017 return sdbl; 1018 1019 err: 1020 free_sdbl(vfu_ctx, sdbl); 1021 return NULL; 1022 } 1023 1024 /* 1025 * Copy doorbells from one buffer to the other, during switches betweeen BAR0 1026 * doorbells and shadow doorbells. 1027 */ 1028 static void 1029 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1030 const volatile uint32_t *from, volatile uint32_t *to) 1031 { 1032 assert(ctrlr != NULL); 1033 assert(from != NULL); 1034 assert(to != NULL); 1035 1036 SPDK_DEBUGLOG(vfio_user_db, 1037 "%s: migrating shadow doorbells from %p to %p\n", 1038 ctrlr_id(ctrlr), from, to); 1039 1040 /* Can't use memcpy because it doesn't respect volatile semantics. 
*/ 1041 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1042 if (ctrlr->sqs[i] != NULL) { 1043 to[queue_index(i, false)] = from[queue_index(i, false)]; 1044 } 1045 1046 if (ctrlr->cqs[i] != NULL) { 1047 to[queue_index(i, true)] = from[queue_index(i, true)]; 1048 } 1049 } 1050 } 1051 1052 static void 1053 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1054 { 1055 const struct spdk_nvmf_registers *regs; 1056 1057 assert(vu_ctrlr != NULL); 1058 assert(vu_ctrlr->ctrlr != NULL); 1059 1060 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1061 if (regs->csts.bits.cfs == 0) { 1062 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1063 } 1064 1065 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1066 } 1067 1068 static inline bool 1069 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1070 { 1071 assert(vu_ctrlr != NULL); 1072 assert(vu_ctrlr->endpoint != NULL); 1073 1074 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1075 1076 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1077 } 1078 1079 static void 1080 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1081 { 1082 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1083 1084 spdk_interrupt_unregister(&endpoint->accept_intr); 1085 spdk_poller_unregister(&endpoint->accept_poller); 1086 1087 if (endpoint->bar0_doorbells) { 1088 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1089 } 1090 1091 if (endpoint->devmem_fd > 0) { 1092 close(endpoint->devmem_fd); 1093 } 1094 1095 if (endpoint->migr_data) { 1096 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1097 } 1098 1099 if (endpoint->migr_fd > 0) { 1100 close(endpoint->migr_fd); 1101 } 1102 1103 if (endpoint->vfu_ctx) { 1104 vfu_destroy_ctx(endpoint->vfu_ctx); 1105 } 1106 1107 pthread_mutex_destroy(&endpoint->lock); 1108 free(endpoint); 1109 } 1110 1111 /* called when process exits */ 1112 static int 1113 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1114 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1115 { 1116 struct nvmf_vfio_user_transport *vu_transport; 1117 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1118 1119 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1120 1121 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1122 transport); 1123 1124 pthread_mutex_destroy(&vu_transport->lock); 1125 pthread_mutex_destroy(&vu_transport->pg_lock); 1126 1127 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1128 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1129 nvmf_vfio_user_destroy_endpoint(endpoint); 1130 } 1131 1132 free(vu_transport); 1133 1134 if (cb_fn) { 1135 cb_fn(cb_arg); 1136 } 1137 1138 return 0; 1139 } 1140 1141 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1142 { 1143 "disable_mappable_bar0", 1144 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1145 spdk_json_decode_bool, true 1146 }, 1147 { 1148 "disable_adaptive_irq", 1149 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1150 spdk_json_decode_bool, true 1151 }, 1152 { 1153 "disable_shadow_doorbells", 1154 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1155 spdk_json_decode_bool, true 1156 }, 1157 { 1158 "disable_compare", 1159 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1160 spdk_json_decode_bool, true 1161 }, 1162 { 1163 
"enable_intr_mode_sq_spreading", 1164 offsetof(struct nvmf_vfio_user_transport, transport_opts.enable_intr_mode_sq_spreading), 1165 spdk_json_decode_bool, true 1166 }, 1167 }; 1168 1169 static struct spdk_nvmf_transport * 1170 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1171 { 1172 struct nvmf_vfio_user_transport *vu_transport; 1173 int err; 1174 1175 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1176 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1177 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1178 return NULL; 1179 } 1180 1181 vu_transport = calloc(1, sizeof(*vu_transport)); 1182 if (vu_transport == NULL) { 1183 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1184 return NULL; 1185 } 1186 1187 err = pthread_mutex_init(&vu_transport->lock, NULL); 1188 if (err != 0) { 1189 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1190 goto err; 1191 } 1192 TAILQ_INIT(&vu_transport->endpoints); 1193 1194 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1195 if (err != 0) { 1196 pthread_mutex_destroy(&vu_transport->lock); 1197 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1198 goto err; 1199 } 1200 TAILQ_INIT(&vu_transport->poll_groups); 1201 1202 if (opts->transport_specific != NULL && 1203 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1204 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1205 vu_transport)) { 1206 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1207 goto cleanup; 1208 } 1209 1210 /* 1211 * To support interrupt mode, the transport must be configured with 1212 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1213 * when a client writes new doorbell values to BAR0, via the 1214 * libvfio-user socket fd. 1215 */ 1216 vu_transport->intr_mode_supported = 1217 vu_transport->transport_opts.disable_mappable_bar0; 1218 1219 /* 1220 * If BAR0 is mappable, it doesn't make sense to support shadow 1221 * doorbells, so explicitly turn it off. 1222 */ 1223 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1224 vu_transport->transport_opts.disable_shadow_doorbells = true; 1225 } 1226 1227 if (spdk_interrupt_mode_is_enabled()) { 1228 if (!vu_transport->intr_mode_supported) { 1229 SPDK_ERRLOG("interrupt mode not supported\n"); 1230 goto cleanup; 1231 } 1232 1233 /* 1234 * If we are in interrupt mode, we cannot support adaptive IRQs, 1235 * as there is no guarantee the SQ poller will run subsequently 1236 * to send pending IRQs. 
1237 */ 1238 vu_transport->transport_opts.disable_adaptive_irq = true; 1239 } 1240 1241 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1242 vu_transport->transport_opts.disable_mappable_bar0); 1243 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1244 vu_transport->transport_opts.disable_adaptive_irq); 1245 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1246 vu_transport->transport_opts.disable_shadow_doorbells); 1247 1248 return &vu_transport->transport; 1249 1250 cleanup: 1251 pthread_mutex_destroy(&vu_transport->lock); 1252 pthread_mutex_destroy(&vu_transport->pg_lock); 1253 err: 1254 free(vu_transport); 1255 return NULL; 1256 } 1257 1258 static uint32_t 1259 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1260 { 1261 assert(vu_ctrlr != NULL); 1262 assert(vu_ctrlr->ctrlr != NULL); 1263 1264 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1265 } 1266 1267 static uint32_t 1268 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1269 { 1270 assert(vu_ctrlr != NULL); 1271 assert(vu_ctrlr->ctrlr != NULL); 1272 1273 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1274 } 1275 1276 static uintptr_t 1277 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1278 { 1279 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1280 return 1ul << memory_page_shift; 1281 } 1282 1283 static uintptr_t 1284 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1285 { 1286 return ~(memory_page_size(ctrlr) - 1); 1287 } 1288 1289 static int 1290 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1291 uint32_t q_size, bool is_cq, bool unmap) 1292 { 1293 uint64_t len; 1294 void *ret; 1295 1296 assert(q_size); 1297 assert(q_addr(mapping) == NULL); 1298 1299 if (is_cq) { 1300 len = q_size * sizeof(struct spdk_nvme_cpl); 1301 } else { 1302 len = q_size * sizeof(struct spdk_nvme_cmd); 1303 } 1304 1305 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1306 mapping->sg, &mapping->iov, 1307 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1308 if (ret == NULL) { 1309 return -EFAULT; 1310 } 1311 1312 if (unmap) { 1313 memset(q_addr(mapping), 0, len); 1314 } 1315 1316 return 0; 1317 } 1318 1319 static inline void 1320 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1321 { 1322 if (q_addr(mapping) != NULL) { 1323 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1324 &mapping->iov, 1); 1325 mapping->iov.iov_base = NULL; 1326 } 1327 } 1328 1329 static int 1330 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1331 { 1332 struct nvmf_vfio_user_sq *sq; 1333 const struct spdk_nvmf_registers *regs; 1334 int ret; 1335 1336 assert(ctrlr != NULL); 1337 1338 sq = ctrlr->sqs[0]; 1339 1340 assert(sq != NULL); 1341 assert(q_addr(&sq->mapping) == NULL); 1342 /* XXX ctrlr->asq == 0 is a valid memory address */ 1343 1344 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1345 sq->qid = 0; 1346 sq->size = regs->aqa.bits.asqs + 1; 1347 sq->mapping.prp1 = regs->asq; 1348 *sq_headp(sq) = 0; 1349 sq->cqid = 0; 1350 1351 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1352 if (ret) { 1353 return ret; 1354 } 1355 1356 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1357 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1358 1359 *sq_dbl_tailp(sq) = 0; 1360 1361 return 0; 1362 } 1363 1364 /* 1365 * Updates eventidx to set an SQ into interrupt or polling mode. 
1366 * 1367 * Returns false if the current SQ tail does not match the SQ head, as 1368 * this means that the host has submitted more items to the queue while we were 1369 * not looking - or during the event index update. In that case, we must retry, 1370 * or otherwise make sure we are going to wake up again. 1371 */ 1372 static bool 1373 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1374 { 1375 struct nvmf_vfio_user_ctrlr *ctrlr; 1376 volatile uint32_t *sq_tail_eidx; 1377 uint32_t old_tail, new_tail; 1378 1379 assert(sq != NULL); 1380 assert(sq->ctrlr != NULL); 1381 assert(sq->ctrlr->sdbl != NULL); 1382 assert(sq->need_rearm); 1383 assert(sq->qid != 0); 1384 1385 ctrlr = sq->ctrlr; 1386 1387 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1388 ctrlr_id(ctrlr), sq->qid); 1389 1390 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1391 1392 assert(ctrlr->endpoint != NULL); 1393 1394 if (!ctrlr->endpoint->interrupt_mode) { 1395 /* No synchronisation necessary. */ 1396 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1397 return true; 1398 } 1399 1400 old_tail = *sq_dbl_tailp(sq); 1401 *sq_tail_eidx = old_tail; 1402 1403 /* 1404 * Ensure that the event index is updated before re-reading the tail 1405 * doorbell. If it's not, then the host might race us and update the 1406 * tail after the second read but before the event index is written, so 1407 * it won't write to BAR0 and we'll miss the update. 1408 * 1409 * The driver should provide similar ordering with an mb(). 1410 */ 1411 spdk_mb(); 1412 1413 /* 1414 * Check if the host has updated the tail doorbell after we've read it 1415 * for the first time, but before the event index was written. If that's 1416 * the case, then we've lost the race and we need to update the event 1417 * index again (after polling the queue, since the host won't write to 1418 * BAR0). 1419 */ 1420 new_tail = *sq_dbl_tailp(sq); 1421 1422 /* 1423 * We might poll the queue straight after this function returns if the 1424 * tail has been updated, so we need to ensure that any changes to the 1425 * queue will be visible to us if the doorbell has been updated. 1426 * 1427 * The driver should provide similar ordering with a wmb() to ensure 1428 * that the queue is written before it updates the tail doorbell. 1429 */ 1430 spdk_rmb(); 1431 1432 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1433 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1434 new_tail, *sq_headp(sq)); 1435 1436 if (new_tail == *sq_headp(sq)) { 1437 sq->need_rearm = false; 1438 return true; 1439 } 1440 1441 /* 1442 * We've lost the race: the tail was updated since we last polled, 1443 * including if it happened within this routine. 1444 * 1445 * The caller should retry after polling (think of this as a cmpxchg 1446 * loop); if we go to sleep while the SQ is not empty, then we won't 1447 * process the remaining events. 1448 */ 1449 return false; 1450 } 1451 1452 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1453 1454 /* 1455 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1456 * processed some SQ entries. 1457 */ 1458 static int 1459 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1460 struct nvmf_vfio_user_sq *sq) 1461 { 1462 int count = 0; 1463 size_t i; 1464 1465 assert(sq->need_rearm); 1466 1467 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1468 int ret; 1469 1470 if (set_sq_eventidx(sq)) { 1471 /* We won the race and set eventidx; done. 
*/ 1472 return count; 1473 } 1474 1475 ret = nvmf_vfio_user_sq_poll(sq); 1476 1477 count += (ret < 0) ? 1 : ret; 1478 1479 /* 1480 * set_sq_eventidx() hit the race, so we expected 1481 * to process at least one command from this queue. 1482 * If there were no new commands waiting for us, then 1483 * we must have hit an unexpected race condition. 1484 */ 1485 if (ret == 0) { 1486 SPDK_ERRLOG("%s: unexpected race condition detected " 1487 "while updating the shadow doorbell buffer\n", 1488 ctrlr_id(ctrlr)); 1489 1490 fail_ctrlr(ctrlr); 1491 return count; 1492 } 1493 } 1494 1495 SPDK_DEBUGLOG(vfio_user_db, 1496 "%s: set_sq_eventidx() lost the race %zu times\n", 1497 ctrlr_id(ctrlr), i); 1498 1499 /* 1500 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1501 * we raced with the producer too many times; force ourselves to wake up 1502 * instead. We'll process all queues at that point. 1503 */ 1504 ctrlr_kick(ctrlr); 1505 1506 return count; 1507 } 1508 1509 /* 1510 * We're in interrupt mode, and potentially about to go to sleep. We need to 1511 * make sure any further I/O submissions are guaranteed to wake us up: for 1512 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1513 * every SQ that needs re-arming. 1514 * 1515 * Returns non-zero if we processed something. 1516 */ 1517 static int 1518 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1519 { 1520 struct nvmf_vfio_user_sq *sq; 1521 int count = 0; 1522 1523 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1524 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1525 continue; 1526 } 1527 1528 if (sq->need_rearm) { 1529 count += vfio_user_sq_rearm(sq->ctrlr, sq); 1530 } 1531 } 1532 1533 return count; 1534 } 1535 1536 static int 1537 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1538 { 1539 struct nvmf_vfio_user_cq *cq; 1540 const struct spdk_nvmf_registers *regs; 1541 int ret; 1542 1543 assert(ctrlr != NULL); 1544 1545 cq = ctrlr->cqs[0]; 1546 1547 assert(cq != NULL); 1548 1549 assert(q_addr(&cq->mapping) == NULL); 1550 1551 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1552 assert(regs != NULL); 1553 cq->qid = 0; 1554 cq->size = regs->aqa.bits.acqs + 1; 1555 cq->mapping.prp1 = regs->acq; 1556 *cq_tailp(cq) = 0; 1557 cq->ien = true; 1558 cq->phase = true; 1559 1560 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1561 if (ret) { 1562 return ret; 1563 } 1564 1565 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
*/ 1566 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1567 1568 *cq_dbl_headp(cq) = 0; 1569 1570 return 0; 1571 } 1572 1573 static void * 1574 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 1575 { 1576 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1577 struct spdk_nvmf_qpair *qpair; 1578 struct nvmf_vfio_user_req *vu_req; 1579 struct nvmf_vfio_user_sq *sq; 1580 void *ret; 1581 1582 assert(req != NULL); 1583 qpair = req->qpair; 1584 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1585 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1586 1587 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1588 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1589 index_to_sg_t(vu_req->sg, vu_req->iovcnt), 1590 &vu_req->iov[vu_req->iovcnt], prot); 1591 if (spdk_likely(ret != NULL)) { 1592 vu_req->iovcnt++; 1593 } 1594 return ret; 1595 } 1596 1597 static int 1598 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1599 struct iovec *iov, uint32_t length) 1600 { 1601 /* Map PRP list to from Guest physical memory to 1602 * virtual memory address. 1603 */ 1604 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1605 length, 4096, _map_one); 1606 } 1607 1608 static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1609 struct nvmf_vfio_user_sq *sq); 1610 1611 /* 1612 * Posts a CQE in the completion queue. 1613 * 1614 * @ctrlr: the vfio-user controller 1615 * @cq: the completion queue 1616 * @cdw0: cdw0 as reported by NVMf 1617 * @sqid: submission queue ID 1618 * @cid: command identifier in NVMe command 1619 * @sc: the NVMe CQE status code 1620 * @sct: the NVMe CQE status code type 1621 */ 1622 static int 1623 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1624 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1625 { 1626 struct spdk_nvme_status cpl_status = { 0 }; 1627 struct spdk_nvme_cpl *cpl; 1628 int err; 1629 1630 assert(ctrlr != NULL); 1631 1632 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1633 return 0; 1634 } 1635 1636 if (cq->qid == 0) { 1637 assert(spdk_get_thread() == cq->thread); 1638 } 1639 1640 if (cq_is_full(cq)) { 1641 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1642 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1643 *cq_dbl_headp(cq)); 1644 return -1; 1645 } 1646 1647 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1648 1649 assert(ctrlr->sqs[sqid] != NULL); 1650 SPDK_DEBUGLOG(nvmf_vfio, 1651 "%s: request complete sqid:%d cid=%d status=%#x " 1652 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1653 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1654 1655 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1656 cpl->sqid = sqid; 1657 cpl->cid = cid; 1658 cpl->cdw0 = cdw0; 1659 1660 /* 1661 * This is a bitfield: instead of setting the individual bits we need 1662 * directly in cpl->status, which would cause a read-modify-write cycle, 1663 * we'll avoid reading from the CPL altogether by filling in a local 1664 * cpl_status variable, then writing the whole thing. 1665 */ 1666 cpl_status.sct = sct; 1667 cpl_status.sc = sc; 1668 cpl_status.p = cq->phase; 1669 cpl->status = cpl_status; 1670 1671 /* Ensure the Completion Queue Entry is visible. 
*/ 1672 spdk_wmb(); 1673 cq_tail_advance(cq); 1674 1675 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1676 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1677 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1678 if (err != 0) { 1679 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1680 ctrlr_id(ctrlr)); 1681 return err; 1682 } 1683 } 1684 1685 return 0; 1686 } 1687 1688 static void 1689 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1690 { 1691 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1692 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1693 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1694 free(vu_req); 1695 } 1696 } 1697 1698 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1699 * and the controller is being shut down or reset, then the CQ is 1700 * also deleted. 1701 */ 1702 static void 1703 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1704 { 1705 struct nvmf_vfio_user_cq *cq; 1706 uint16_t cqid; 1707 1708 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1709 sq->qid, sq); 1710 1711 /* Free SQ resources */ 1712 unmap_q(vu_ctrlr, &sq->mapping); 1713 1714 free_sq_reqs(sq); 1715 1716 sq->size = 0; 1717 1718 sq->sq_state = VFIO_USER_SQ_DELETED; 1719 1720 /* Controller RESET and SHUTDOWN are special cases, 1721 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1722 * will disconnect IO queue pairs. 1723 */ 1724 if (vu_ctrlr->reset_shn) { 1725 cqid = sq->cqid; 1726 cq = vu_ctrlr->cqs[cqid]; 1727 1728 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1729 cq->qid, cq); 1730 1731 if (cq->cq_ref) { 1732 cq->cq_ref--; 1733 } 1734 if (cq->cq_ref == 0) { 1735 unmap_q(vu_ctrlr, &cq->mapping); 1736 cq->size = 0; 1737 cq->cq_state = VFIO_USER_CQ_DELETED; 1738 cq->group = NULL; 1739 } 1740 } 1741 } 1742 1743 static void 1744 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1745 { 1746 struct nvmf_vfio_user_sq *sq; 1747 struct nvmf_vfio_user_cq *cq; 1748 1749 if (ctrlr == NULL) { 1750 return; 1751 } 1752 1753 sq = ctrlr->sqs[qid]; 1754 if (sq) { 1755 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid); 1756 unmap_q(ctrlr, &sq->mapping); 1757 1758 free_sq_reqs(sq); 1759 1760 free(sq->mapping.sg); 1761 free(sq); 1762 ctrlr->sqs[qid] = NULL; 1763 } 1764 1765 cq = ctrlr->cqs[qid]; 1766 if (cq) { 1767 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1768 unmap_q(ctrlr, &cq->mapping); 1769 free(cq->mapping.sg); 1770 free(cq); 1771 ctrlr->cqs[qid] = NULL; 1772 } 1773 } 1774 1775 static int 1776 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1777 const uint16_t id) 1778 { 1779 struct nvmf_vfio_user_sq *sq; 1780 1781 assert(ctrlr != NULL); 1782 assert(transport != NULL); 1783 assert(ctrlr->sqs[id] == NULL); 1784 1785 sq = calloc(1, sizeof(*sq)); 1786 if (sq == NULL) { 1787 return -ENOMEM; 1788 } 1789 sq->mapping.sg = calloc(1, dma_sg_size()); 1790 if (sq->mapping.sg == NULL) { 1791 free(sq); 1792 return -ENOMEM; 1793 } 1794 1795 sq->qid = id; 1796 sq->qpair.qid = id; 1797 sq->qpair.transport = transport; 1798 sq->ctrlr = ctrlr; 1799 ctrlr->sqs[id] = sq; 1800 1801 TAILQ_INIT(&sq->free_reqs); 1802 1803 return 0; 1804 } 1805 1806 static int 1807 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1808 { 1809 struct nvmf_vfio_user_cq *cq; 1810 1811 assert(vu_ctrlr != NULL); 1812 assert(vu_ctrlr->cqs[id] == NULL); 1813 1814 cq = calloc(1, sizeof(*cq)); 1815 if (cq == NULL) { 
1816 return -ENOMEM; 1817 } 1818 cq->mapping.sg = calloc(1, dma_sg_size()); 1819 if (cq->mapping.sg == NULL) { 1820 free(cq); 1821 return -ENOMEM; 1822 } 1823 1824 cq->qid = id; 1825 vu_ctrlr->cqs[id] = cq; 1826 1827 return 0; 1828 } 1829 1830 static int 1831 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1832 { 1833 struct nvmf_vfio_user_req *vu_req, *tmp; 1834 size_t req_size; 1835 uint32_t i; 1836 1837 req_size = sizeof(struct nvmf_vfio_user_req) + 1838 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1839 1840 for (i = 0; i < sq->size; i++) { 1841 struct spdk_nvmf_request *req; 1842 1843 vu_req = calloc(1, req_size); 1844 if (vu_req == NULL) { 1845 goto err; 1846 } 1847 1848 req = &vu_req->req; 1849 req->qpair = &sq->qpair; 1850 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1851 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1852 req->stripped_data = NULL; 1853 1854 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1855 } 1856 1857 return 0; 1858 1859 err: 1860 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1861 free(vu_req); 1862 } 1863 return -ENOMEM; 1864 } 1865 1866 static volatile uint32_t * 1867 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 1868 { 1869 return ctrlr->sdbl != NULL ? 1870 ctrlr->sdbl->shadow_doorbells : 1871 ctrlr->bar0_doorbells; 1872 } 1873 1874 static uint16_t 1875 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 1876 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1877 { 1878 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1879 struct nvmf_vfio_user_sq *sq; 1880 uint32_t qsize; 1881 uint16_t cqid; 1882 uint16_t qid; 1883 int err; 1884 1885 qid = cmd->cdw10_bits.create_io_q.qid; 1886 cqid = cmd->cdw11_bits.create_io_sq.cqid; 1887 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1888 1889 if (ctrlr->sqs[qid] == NULL) { 1890 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 1891 if (err != 0) { 1892 *sct = SPDK_NVME_SCT_GENERIC; 1893 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1894 } 1895 } 1896 1897 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1898 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 1899 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1900 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1901 } 1902 1903 /* CQ must be created before SQ. 
*/ 1904 if (!io_q_exists(ctrlr, cqid, true)) { 1905 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 1906 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1907 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 1908 } 1909 1910 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 1911 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 1912 *sct = SPDK_NVME_SCT_GENERIC; 1913 return SPDK_NVME_SC_INVALID_FIELD; 1914 } 1915 1916 sq = ctrlr->sqs[qid]; 1917 sq->size = qsize; 1918 1919 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 1920 qid, cqid); 1921 1922 sq->mapping.prp1 = cmd->dptr.prp.prp1; 1923 1924 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1925 if (err) { 1926 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1927 *sct = SPDK_NVME_SCT_GENERIC; 1928 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1929 } 1930 1931 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 1932 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1933 q_addr(&sq->mapping)); 1934 1935 err = alloc_sq_reqs(ctrlr, sq); 1936 if (err < 0) { 1937 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 1938 *sct = SPDK_NVME_SCT_GENERIC; 1939 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1940 } 1941 1942 sq->cqid = cqid; 1943 ctrlr->cqs[sq->cqid]->cq_ref++; 1944 sq->sq_state = VFIO_USER_SQ_CREATED; 1945 *sq_headp(sq) = 0; 1946 1947 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 1948 1949 /* 1950 * We should always reset the doorbells. 1951 * 1952 * The Specification prohibits the controller from writing to the shadow 1953 * doorbell buffer, however older versions of the Linux NVMe driver 1954 * don't reset the shadow doorbell buffer after a Queue-Level or 1955 * Controller-Level reset, which means that we're left with garbage 1956 * doorbell values. 1957 */ 1958 *sq_dbl_tailp(sq) = 0; 1959 1960 if (ctrlr->sdbl != NULL) { 1961 sq->need_rearm = true; 1962 1963 if (!set_sq_eventidx(sq)) { 1964 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 1965 "sqid:%hu was initialized\n", 1966 ctrlr_id(ctrlr), qid); 1967 fail_ctrlr(ctrlr); 1968 *sct = SPDK_NVME_SCT_GENERIC; 1969 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1970 } 1971 } 1972 1973 /* 1974 * Create our new I/O qpair. This asynchronously invokes, on a suitable 1975 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 1976 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 1977 * connect command. This command is then eventually completed via 1978 * handle_queue_connect_rsp(). 
1979 */ 1980 sq->create_io_sq_cmd = *cmd; 1981 sq->post_create_io_sq_completion = true; 1982 1983 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 1984 &sq->qpair); 1985 1986 *sct = SPDK_NVME_SCT_GENERIC; 1987 return SPDK_NVME_SC_SUCCESS; 1988 } 1989 1990 static uint16_t 1991 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 1992 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1993 { 1994 struct nvmf_vfio_user_cq *cq; 1995 uint32_t qsize; 1996 uint16_t qid; 1997 int err; 1998 1999 qid = cmd->cdw10_bits.create_io_q.qid; 2000 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2001 2002 if (ctrlr->cqs[qid] == NULL) { 2003 err = init_cq(ctrlr, qid); 2004 if (err != 0) { 2005 *sct = SPDK_NVME_SCT_GENERIC; 2006 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2007 } 2008 } 2009 2010 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2011 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2012 *sct = SPDK_NVME_SCT_GENERIC; 2013 return SPDK_NVME_SC_INVALID_FIELD; 2014 } 2015 2016 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 2017 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2018 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2019 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2020 } 2021 2022 cq = ctrlr->cqs[qid]; 2023 cq->size = qsize; 2024 2025 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2026 2027 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2028 2029 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2030 if (err) { 2031 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2032 *sct = SPDK_NVME_SCT_GENERIC; 2033 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2034 } 2035 2036 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2037 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2038 q_addr(&cq->mapping)); 2039 2040 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2041 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2042 cq->phase = true; 2043 cq->cq_state = VFIO_USER_CQ_CREATED; 2044 2045 *cq_tailp(cq) = 0; 2046 2047 /* 2048 * We should always reset the doorbells. 2049 * 2050 * The Specification prohibits the controller from writing to the shadow 2051 * doorbell buffer, however older versions of the Linux NVMe driver 2052 * don't reset the shadow doorbell buffer after a Queue-Level or 2053 * Controller-Level reset, which means that we're left with garbage 2054 * doorbell values. 2055 */ 2056 *cq_dbl_headp(cq) = 0; 2057 2058 *sct = SPDK_NVME_SCT_GENERIC; 2059 return SPDK_NVME_SC_SUCCESS; 2060 } 2061 2062 /* 2063 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2064 * on error. 2065 */ 2066 static int 2067 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2068 struct spdk_nvme_cmd *cmd, const bool is_cq) 2069 { 2070 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2071 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2072 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2073 uint32_t qsize; 2074 uint16_t qid; 2075 2076 assert(ctrlr != NULL); 2077 assert(cmd != NULL); 2078 2079 qid = cmd->cdw10_bits.create_io_q.qid; 2080 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2081 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2082 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2083 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2084 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2085 goto out; 2086 } 2087 2088 if (io_q_exists(ctrlr, qid, is_cq)) { 2089 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2090 is_cq ? 
'c' : 's', qid); 2091 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2092 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2093 goto out; 2094 } 2095 2096 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2097 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2098 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2099 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2100 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2101 goto out; 2102 } 2103 2104 if (is_cq) { 2105 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2106 } else { 2107 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2108 2109 if (sct == SPDK_NVME_SCT_GENERIC && 2110 sc == SPDK_NVME_SC_SUCCESS) { 2111 /* Completion posted asynchronously. */ 2112 return 0; 2113 } 2114 } 2115 2116 out: 2117 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2118 } 2119 2120 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2121 * queue pair, so save the command in a context. 2122 */ 2123 struct vfio_user_delete_sq_ctx { 2124 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2125 struct spdk_nvme_cmd delete_io_sq_cmd; 2126 }; 2127 2128 static void 2129 vfio_user_qpair_delete_cb(void *cb_arg) 2130 { 2131 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2132 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2133 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2134 2135 if (admin_cq->thread != spdk_get_thread()) { 2136 assert(admin_cq->thread != NULL); 2137 spdk_thread_send_msg(admin_cq->thread, 2138 vfio_user_qpair_delete_cb, 2139 cb_arg); 2140 } else { 2141 post_completion(vu_ctrlr, admin_cq, 0, 0, 2142 ctx->delete_io_sq_cmd.cid, 2143 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2144 free(ctx); 2145 } 2146 } 2147 2148 /* 2149 * Deletes a completion or submission I/O queue. 2150 */ 2151 static int 2152 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2153 struct spdk_nvme_cmd *cmd, const bool is_cq) 2154 { 2155 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2156 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2157 struct nvmf_vfio_user_sq *sq; 2158 struct nvmf_vfio_user_cq *cq; 2159 struct vfio_user_delete_sq_ctx *ctx; 2160 2161 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2162 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2163 cmd->cdw10_bits.delete_io_q.qid); 2164 2165 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2166 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2167 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2168 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2169 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2170 goto out; 2171 } 2172 2173 if (is_cq) { 2174 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2175 if (cq->cq_ref) { 2176 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2177 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2178 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2179 goto out; 2180 } 2181 2182 unmap_q(ctrlr, &cq->mapping); 2183 cq->size = 0; 2184 cq->cq_state = VFIO_USER_CQ_DELETED; 2185 cq->group = NULL; 2186 } else { 2187 ctx = calloc(1, sizeof(*ctx)); 2188 if (!ctx) { 2189 sct = SPDK_NVME_SCT_GENERIC; 2190 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2191 goto out; 2192 } 2193 ctx->vu_ctrlr = ctrlr; 2194 ctx->delete_io_sq_cmd = *cmd; 2195 2196 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2197 sq->sq_state = VFIO_USER_SQ_DELETED; 2198 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2199 ctrlr->cqs[sq->cqid]->cq_ref--; 2200 2201 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 2202 return 0; 2203 } 2204 2205 out: 2206 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2207 } 2208 2209 /* 2210 * Configures Shadow Doorbells. 2211 */ 2212 static int 2213 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2214 { 2215 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2216 uint32_t dstrd; 2217 uintptr_t page_size, page_mask; 2218 uint64_t prp1, prp2; 2219 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2220 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2221 2222 assert(ctrlr != NULL); 2223 assert(ctrlr->endpoint != NULL); 2224 assert(cmd != NULL); 2225 2226 dstrd = doorbell_stride(ctrlr); 2227 page_size = memory_page_size(ctrlr); 2228 page_mask = memory_page_mask(ctrlr); 2229 2230 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2231 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2232 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2233 ctrlr_id(ctrlr)); 2234 2235 goto out; 2236 } 2237 2238 /* Verify guest physical addresses passed as PRPs. */ 2239 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2240 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2241 ctrlr_id(ctrlr)); 2242 2243 goto out; 2244 } 2245 2246 prp1 = cmd->dptr.prp.prp1; 2247 prp2 = cmd->dptr.prp.prp2; 2248 2249 SPDK_DEBUGLOG(nvmf_vfio, 2250 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2251 ctrlr_id(ctrlr), prp1, prp2); 2252 2253 if (prp1 == prp2 2254 || prp1 != (prp1 & page_mask) 2255 || prp2 != (prp2 & page_mask)) { 2256 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2257 ctrlr_id(ctrlr)); 2258 2259 goto out; 2260 } 2261 2262 /* Map guest physical addresses to our virtual address space. 
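 * Per the NVMe Doorbell Buffer Config command, PRP1 is the shadow doorbell
 * page (written by the host) and PRP2 the EventIdx page (written by us); each
 * is one memory page and uses the same layout as the BAR0 doorbell registers,
 * so with DSTRD = 0 the sqid:1 tail lives at byte offset 8 and the cqid:1
 * head at byte offset 12 of its page.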
*/ 2263 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2264 if (sdbl == NULL) { 2265 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2266 ctrlr_id(ctrlr)); 2267 2268 goto out; 2269 } 2270 2271 ctrlr->shadow_doorbell_buffer = prp1; 2272 ctrlr->eventidx_buffer = prp2; 2273 2274 SPDK_DEBUGLOG(nvmf_vfio, 2275 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2276 ctrlr_id(ctrlr), 2277 sdbl->iovs[0].iov_base, 2278 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2279 sdbl->iovs[1].iov_base, 2280 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2281 2282 2283 /* 2284 * Set all possible CQ head doorbells to polling mode now, such that we 2285 * don't have to worry about it later if the host creates more queues. 2286 * 2287 * We only ever want interrupts for writes to the SQ tail doorbells 2288 * (which are initialised in set_ctrlr_intr_mode() below). 2289 */ 2290 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2291 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2292 } 2293 2294 /* Update controller. */ 2295 SWAP(ctrlr->sdbl, sdbl); 2296 2297 /* 2298 * Copy doorbells from either the previous shadow doorbell buffer or the 2299 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2300 * 2301 * This needs to account for older versions of the Linux NVMe driver, 2302 * which don't clear out the buffer after a controller reset. 2303 */ 2304 copy_doorbells(ctrlr, sdbl != NULL ? 2305 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2306 ctrlr->sdbl->shadow_doorbells); 2307 2308 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2309 2310 ctrlr_kick(ctrlr); 2311 2312 sc = SPDK_NVME_SC_SUCCESS; 2313 2314 out: 2315 /* 2316 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2317 * more than once (pointless, but not prohibited by the spec), or 2318 * in case of an error. 2319 * 2320 * If this is the first time Doorbell Buffer Config was processed, 2321 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2322 * free_sdbl() becomes a noop. 2323 */ 2324 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2325 2326 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2327 } 2328 2329 /* Returns 0 on success and -errno on error. */ 2330 static int 2331 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2332 { 2333 assert(ctrlr != NULL); 2334 assert(cmd != NULL); 2335 2336 if (cmd->fuse != 0) { 2337 /* Fused admin commands are not supported. 
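 * A non-zero CDW0.FUSE (01b for the first, 10b for the second command of a
 * fused pair) is simply completed with Invalid Field in Command.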
*/ 2338 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2339 SPDK_NVME_SC_INVALID_FIELD, 2340 SPDK_NVME_SCT_GENERIC); 2341 } 2342 2343 switch (cmd->opc) { 2344 case SPDK_NVME_OPC_CREATE_IO_CQ: 2345 case SPDK_NVME_OPC_CREATE_IO_SQ: 2346 return handle_create_io_q(ctrlr, cmd, 2347 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2348 case SPDK_NVME_OPC_DELETE_IO_SQ: 2349 case SPDK_NVME_OPC_DELETE_IO_CQ: 2350 return handle_del_io_q(ctrlr, cmd, 2351 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2352 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2353 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2354 return handle_doorbell_buffer_config(ctrlr, cmd); 2355 } 2356 /* FALLTHROUGH */ 2357 default: 2358 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2359 } 2360 } 2361 2362 static int 2363 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2364 { 2365 struct nvmf_vfio_user_sq *sq = cb_arg; 2366 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2367 uint16_t sqid, cqid; 2368 2369 assert(sq != NULL); 2370 assert(vu_req != NULL); 2371 assert(vu_ctrlr != NULL); 2372 2373 if (spdk_likely(vu_req->iovcnt)) { 2374 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2375 index_to_sg_t(vu_req->sg, 0), 2376 vu_req->iov, vu_req->iovcnt); 2377 } 2378 sqid = sq->qid; 2379 cqid = sq->cqid; 2380 2381 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2382 vu_req->req.rsp->nvme_cpl.cdw0, 2383 sqid, 2384 vu_req->req.cmd->nvme_cmd.cid, 2385 vu_req->req.rsp->nvme_cpl.status.sc, 2386 vu_req->req.rsp->nvme_cpl.status.sct); 2387 } 2388 2389 static int 2390 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2391 struct spdk_nvme_cmd *cmd) 2392 { 2393 assert(sq != NULL); 2394 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 2395 return consume_admin_cmd(ctrlr, cmd); 2396 } 2397 2398 return handle_cmd_req(ctrlr, cmd, sq); 2399 } 2400 2401 /* Returns the number of commands processed, or a negative value on error. */ 2402 static int 2403 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2404 struct nvmf_vfio_user_sq *sq) 2405 { 2406 struct spdk_nvme_cmd *queue; 2407 int count = 0; 2408 2409 assert(ctrlr != NULL); 2410 assert(sq != NULL); 2411 2412 if (ctrlr->sdbl != NULL && sq->qid != 0) { 2413 /* 2414 * Submission queue index has moved past the event index, so it 2415 * needs to be re-armed before we go to sleep. 2416 */ 2417 sq->need_rearm = true; 2418 } 2419 2420 queue = q_addr(&sq->mapping); 2421 while (*sq_headp(sq) != new_tail) { 2422 int err; 2423 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 2424 2425 count++; 2426 2427 /* 2428 * SQHD must contain the new head pointer, so we must increase 2429 * it before we generate a completion. 
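 * For example, in a 4-entry SQ whose head is currently 3, consuming that
 * entry wraps the head back to 0, and 0 is the SQHD value reported in the
 * resulting completion.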
2430 */
2431 sq_head_advance(sq);
2432
2433 err = consume_cmd(ctrlr, sq, cmd);
2434 if (err != 0) {
2435 return err;
2436 }
2437 }
2438
2439 return count;
2440 }
2441
2442 /* Checks whether endpoint is connected from the same process */
2443 static bool
2444 is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint)
2445 {
2446 struct ucred ucred;
2447 socklen_t ucredlen = sizeof(ucred);
2448
2449 if (endpoint == NULL) {
2450 return false;
2451 }
2452
2453 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred,
2454 &ucredlen) < 0) {
2455 SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno));
2456 return false;
2457 }
2458
2459 return ucred.pid == getpid();
2460 }
2461
2462 static void
2463 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
2464 {
2465 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
2466 struct nvmf_vfio_user_ctrlr *ctrlr;
2467 struct nvmf_vfio_user_sq *sq;
2468 struct nvmf_vfio_user_cq *cq;
2469 void *map_start, *map_end;
2470 int ret;
2471
2472 /*
2473 * We're not interested in any DMA regions that aren't mappable (we don't
2474 * support clients that don't share their memory).
2475 */
2476 if (!info->vaddr) {
2477 return;
2478 }
2479
2480 map_start = info->mapping.iov_base;
2481 map_end = info->mapping.iov_base + info->mapping.iov_len;
2482
2483 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
2484 (info->mapping.iov_len & MASK_2MB)) {
2485 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n",
2486 info->vaddr, map_start, map_end);
2487 return;
2488 }
2489
2490 assert(endpoint != NULL);
2491 if (endpoint->ctrlr == NULL) {
2492 return;
2493 }
2494 ctrlr = endpoint->ctrlr;
2495
2496 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint),
2497 map_start, map_end);
2498
2499 /* The region is mapped into VFIO with both VFIO_DMA_MAP_FLAG_READ and VFIO_DMA_MAP_FLAG_WRITE
2500 * enabled, so we also check the protection bits here before registering the memory with SPDK.
2501 * When the vfio-user client and server run in the same process, there is no need to register the same memory again.
2502 */ 2503 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2504 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2505 if (ret) { 2506 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2507 map_start, map_end, ret); 2508 } 2509 } 2510 2511 pthread_mutex_lock(&endpoint->lock); 2512 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2513 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2514 continue; 2515 } 2516 2517 cq = ctrlr->cqs[sq->cqid]; 2518 2519 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2520 if (cq->size && q_addr(&cq->mapping) == NULL) { 2521 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2522 if (ret) { 2523 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2524 cq->qid, cq->mapping.prp1, 2525 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2526 continue; 2527 } 2528 } 2529 2530 if (sq->size) { 2531 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2532 if (ret) { 2533 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2534 sq->qid, sq->mapping.prp1, 2535 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2536 continue; 2537 } 2538 } 2539 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2540 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2541 } 2542 pthread_mutex_unlock(&endpoint->lock); 2543 } 2544 2545 static void 2546 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2547 { 2548 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2549 struct nvmf_vfio_user_sq *sq; 2550 struct nvmf_vfio_user_cq *cq; 2551 void *map_start, *map_end; 2552 int ret = 0; 2553 2554 if (!info->vaddr) { 2555 return; 2556 } 2557 2558 map_start = info->mapping.iov_base; 2559 map_end = info->mapping.iov_base + info->mapping.iov_len; 2560 2561 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2562 (info->mapping.iov_len & MASK_2MB)) { 2563 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2564 info->vaddr, map_start, map_end); 2565 return; 2566 } 2567 2568 assert(endpoint != NULL); 2569 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2570 map_start, map_end); 2571 2572 if (endpoint->ctrlr != NULL) { 2573 struct nvmf_vfio_user_ctrlr *ctrlr; 2574 ctrlr = endpoint->ctrlr; 2575 2576 pthread_mutex_lock(&endpoint->lock); 2577 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2578 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2579 unmap_q(ctrlr, &sq->mapping); 2580 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2581 } 2582 2583 cq = ctrlr->cqs[sq->cqid]; 2584 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2585 unmap_q(ctrlr, &cq->mapping); 2586 } 2587 } 2588 2589 if (ctrlr->sdbl != NULL) { 2590 size_t i; 2591 2592 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2593 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2594 2595 if (iov_base >= map_start && iov_base < map_end) { 2596 copy_doorbells(ctrlr, 2597 ctrlr->sdbl->shadow_doorbells, 2598 ctrlr->bar0_doorbells); 2599 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2600 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2601 ctrlr->sdbl = NULL; 2602 break; 2603 } 2604 } 2605 } 2606 2607 pthread_mutex_unlock(&endpoint->lock); 2608 } 2609 2610 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2611 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2612 if 
(ret) { 2613 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2614 map_start, map_end, ret); 2615 } 2616 } 2617 } 2618 2619 /* Used to initiate a controller-level reset or a controller shutdown. */ 2620 static void 2621 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2622 { 2623 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2624 ctrlr_id(vu_ctrlr)); 2625 2626 /* Unmap Admin queue. */ 2627 2628 assert(vu_ctrlr->sqs[0] != NULL); 2629 assert(vu_ctrlr->cqs[0] != NULL); 2630 2631 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2632 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2633 2634 vu_ctrlr->sqs[0]->size = 0; 2635 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2636 2637 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2638 2639 vu_ctrlr->cqs[0]->size = 0; 2640 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2641 2642 /* 2643 * For PCIe controller reset or shutdown, we will drop all AER 2644 * responses. 2645 */ 2646 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2647 2648 /* Free the shadow doorbell buffer. */ 2649 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 2650 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2651 vu_ctrlr->sdbl = NULL; 2652 } 2653 2654 /* Used to re-enable the controller after a controller-level reset. */ 2655 static int 2656 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2657 { 2658 int err; 2659 2660 assert(vu_ctrlr != NULL); 2661 2662 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2663 ctrlr_id(vu_ctrlr)); 2664 2665 err = acq_setup(vu_ctrlr); 2666 if (err != 0) { 2667 return err; 2668 } 2669 2670 err = asq_setup(vu_ctrlr); 2671 if (err != 0) { 2672 return err; 2673 } 2674 2675 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2676 2677 return 0; 2678 } 2679 2680 static int 2681 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2682 { 2683 struct nvmf_vfio_user_sq *sq = cb_arg; 2684 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2685 int ret; 2686 2687 assert(sq != NULL); 2688 assert(req != NULL); 2689 2690 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2691 assert(sq->ctrlr != NULL); 2692 assert(req != NULL); 2693 2694 memcpy(req->req.data, 2695 &req->req.rsp->prop_get_rsp.value.u64, 2696 req->req.length); 2697 } else { 2698 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2699 assert(sq->ctrlr != NULL); 2700 vu_ctrlr = sq->ctrlr; 2701 2702 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 2703 union spdk_nvme_cc_register cc, diff; 2704 2705 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2706 diff.raw = cc.raw ^ req->cc.raw; 2707 2708 if (diff.bits.en) { 2709 if (cc.bits.en) { 2710 ret = enable_ctrlr(vu_ctrlr); 2711 if (ret) { 2712 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2713 return ret; 2714 } 2715 vu_ctrlr->reset_shn = false; 2716 } else { 2717 vu_ctrlr->reset_shn = true; 2718 } 2719 } 2720 2721 if (diff.bits.shn) { 2722 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2723 vu_ctrlr->reset_shn = true; 2724 } 2725 } 2726 2727 if (vu_ctrlr->reset_shn) { 2728 disable_ctrlr(vu_ctrlr); 2729 } 2730 } 2731 } 2732 2733 return 0; 2734 } 2735 2736 /* 2737 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2738 * doorbell is written via access_bar0_fn(). 2739 * 2740 * DSTRD is set to fixed value 0 for NVMf. 
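 * With DSTRD = 0 the tail doorbell of SQ y sits at BAR0 offset 0x1000 + (2y * 4)
 * and the head doorbell of CQ y at 0x1000 + ((2y + 1) * 4). After subtracting
 * the base and dividing by four, even indices are SQ tails and odd indices are
 * CQ heads, which is what the (pos & 1) and pos / 2 decoding below relies on;
 * a write at offset 0x100C, for instance, updates the cqid:1 head doorbell.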
2741 * 2742 */ 2743 static int 2744 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2745 const size_t count, loff_t pos, const bool is_write) 2746 { 2747 assert(ctrlr != NULL); 2748 assert(buf != NULL); 2749 2750 if (!is_write) { 2751 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2752 ctrlr_id(ctrlr), pos); 2753 errno = EPERM; 2754 return -1; 2755 } 2756 2757 if (count != sizeof(uint32_t)) { 2758 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2759 ctrlr_id(ctrlr), count); 2760 errno = EINVAL; 2761 return -1; 2762 } 2763 2764 pos -= NVME_DOORBELLS_OFFSET; 2765 2766 /* pos must be dword aligned */ 2767 if ((pos & 0x3) != 0) { 2768 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2769 errno = EINVAL; 2770 return -1; 2771 } 2772 2773 /* convert byte offset to array index */ 2774 pos >>= 2; 2775 2776 if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) { 2777 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2778 errno = EINVAL; 2779 return -1; 2780 } 2781 2782 ctrlr->bar0_doorbells[pos] = *buf; 2783 spdk_wmb(); 2784 2785 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2786 ctrlr_id(ctrlr), (pos & 1) ? "cqid" : "sqid", 2787 pos / 2, *buf); 2788 2789 2790 return 0; 2791 } 2792 2793 static size_t 2794 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2795 char *buf, size_t count, loff_t pos, 2796 bool is_write) 2797 { 2798 struct nvmf_vfio_user_req *req; 2799 const struct spdk_nvmf_registers *regs; 2800 2801 if ((count != 4) && (count != 8)) { 2802 errno = EINVAL; 2803 return -1; 2804 } 2805 2806 /* Construct a Fabric Property Get/Set command and send it */ 2807 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2808 if (req == NULL) { 2809 errno = ENOBUFS; 2810 return -1; 2811 } 2812 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2813 req->cc.raw = regs->cc.raw; 2814 2815 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2816 req->cb_arg = vu_ctrlr->sqs[0]; 2817 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2818 req->req.cmd->prop_set_cmd.cid = 0; 2819 if (count == 4) { 2820 req->req.cmd->prop_set_cmd.attrib.size = 0; 2821 } else { 2822 req->req.cmd->prop_set_cmd.attrib.size = 1; 2823 } 2824 req->req.cmd->prop_set_cmd.ofst = pos; 2825 if (is_write) { 2826 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2827 if (req->req.cmd->prop_set_cmd.attrib.size) { 2828 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2829 } else { 2830 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2831 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2832 } 2833 } else { 2834 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2835 } 2836 req->req.length = count; 2837 req->req.data = buf; 2838 2839 spdk_nvmf_request_exec_fabrics(&req->req); 2840 2841 return count; 2842 } 2843 2844 static ssize_t 2845 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2846 bool is_write) 2847 { 2848 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2849 struct nvmf_vfio_user_ctrlr *ctrlr; 2850 int ret; 2851 2852 ctrlr = endpoint->ctrlr; 2853 if (endpoint->need_async_destroy || !ctrlr) { 2854 errno = EIO; 2855 return -1; 2856 } 2857 2858 if (pos >= NVME_DOORBELLS_OFFSET) { 2859 /* 2860 * The fact that the doorbells can be memory mapped doesn't mean 2861 * that the client (VFIO in QEMU) is obliged to memory map them, 2862 * it might still elect to access them via regular read/write; 2863 * we might also have had 
disable_mappable_bar0 set. 2864 */ 2865 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2866 pos, is_write); 2867 if (ret == 0) { 2868 return count; 2869 } 2870 return ret; 2871 } 2872 2873 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 2874 } 2875 2876 static ssize_t 2877 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 2878 bool is_write) 2879 { 2880 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2881 2882 if (is_write) { 2883 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 2884 endpoint_id(endpoint), offset, offset + count); 2885 errno = EINVAL; 2886 return -1; 2887 } 2888 2889 if (offset + count > NVME_REG_CFG_SIZE) { 2890 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 2891 endpoint_id(endpoint), offset, count, 2892 NVME_REG_CFG_SIZE); 2893 errno = ERANGE; 2894 return -1; 2895 } 2896 2897 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 2898 2899 return count; 2900 } 2901 2902 static void 2903 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 2904 { 2905 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2906 2907 if (level >= LOG_DEBUG) { 2908 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2909 } else if (level >= LOG_INFO) { 2910 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2911 } else if (level >= LOG_NOTICE) { 2912 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 2913 } else if (level >= LOG_WARNING) { 2914 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 2915 } else { 2916 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 2917 } 2918 } 2919 2920 static int 2921 vfio_user_get_log_level(void) 2922 { 2923 int level; 2924 2925 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2926 return LOG_DEBUG; 2927 } 2928 2929 level = spdk_log_to_syslog_level(spdk_log_get_level()); 2930 if (level < 0) { 2931 return LOG_ERR; 2932 } 2933 2934 return level; 2935 } 2936 2937 static void 2938 init_pci_config_space(vfu_pci_config_space_t *p) 2939 { 2940 /* MLBAR */ 2941 p->hdr.bars[0].raw = 0x0; 2942 /* MUBAR */ 2943 p->hdr.bars[1].raw = 0x0; 2944 2945 /* vendor specific, let's set them to zero for now */ 2946 p->hdr.bars[3].raw = 0x0; 2947 p->hdr.bars[4].raw = 0x0; 2948 p->hdr.bars[5].raw = 0x0; 2949 2950 /* enable INTx */ 2951 p->hdr.intr.ipin = 0x1; 2952 } 2953 2954 struct ctrlr_quiesce_ctx { 2955 struct nvmf_vfio_user_endpoint *endpoint; 2956 struct nvmf_vfio_user_poll_group *group; 2957 int status; 2958 }; 2959 2960 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 2961 2962 static void 2963 _vfio_user_endpoint_resume_done_msg(void *ctx) 2964 { 2965 struct nvmf_vfio_user_endpoint *endpoint = ctx; 2966 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2967 2968 endpoint->need_resume = false; 2969 2970 if (!vu_ctrlr) { 2971 return; 2972 } 2973 2974 if (!vu_ctrlr->queued_quiesce) { 2975 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 2976 2977 /* 2978 * We might have ignored new SQ entries while we were quiesced: 2979 * kick ourselves so we'll definitely check again while in 2980 * VFIO_USER_CTRLR_RUNNING state. 
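 * (This matters in interrupt mode because a doorbell write that arrived while
 * we were quiesced will not, by itself, generate another event to wake us up.)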
2981 */
2982 if (in_interrupt_mode(endpoint->transport)) {
2983 ctrlr_kick(vu_ctrlr);
2984 }
2985 return;
2986 }
2987
2988
2989 /*
2990 * Basically, once we call `vfu_device_quiesced` the device is
2991 * unquiesced from libvfio-user's perspective, so from the moment
2992 * `vfio_user_quiesce_done` returns, libvfio-user might quiesce the device
2993 * again. However, because resuming the NVMf subsystem is an asynchronous
2994 * operation, that new quiesce might come _before_ the NVMf subsystem has
2995 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we
2996 * need to check whether a quiesce was requested.
2997 */
2998 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n",
2999 ctrlr_id(vu_ctrlr));
3000 ctrlr_quiesce(vu_ctrlr);
3001 }
3002
3003 static void
3004 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem,
3005 void *cb_arg, int status)
3006 {
3007 struct nvmf_vfio_user_endpoint *endpoint = cb_arg;
3008 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3009
3010 SPDK_DEBUGLOG(nvmf_vfio, "%s resume done with status %d\n", endpoint_id(endpoint), status);
3011
3012 if (!vu_ctrlr) {
3013 return;
3014 }
3015
3016 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint);
3017 }
3018
3019 static void
3020 vfio_user_quiesce_done(void *ctx)
3021 {
3022 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx;
3023 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint;
3024 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3025 int ret;
3026
3027 if (!vu_ctrlr) {
3028 free(quiesce_ctx);
3029 return;
3030 }
3031
3032 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr));
3033
3034 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING);
3035 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
3036 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status);
3037 vu_ctrlr->queued_quiesce = false;
3038 free(quiesce_ctx);
3039
3040 /* `vfu_device_quiesced` can change the migration state,
3041 * so we need to re-check `vu_ctrlr->state`.
3042 */
3043 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) {
3044 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr));
3045 return;
3046 }
3047
3048 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr));
3049 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
3050 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
3051 vfio_user_endpoint_resume_done, endpoint);
3052 if (ret < 0) {
3053 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
3054 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret);
3055 }
3056 }
3057
3058 static void
3059 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem,
3060 void *ctx, int status)
3061 {
3062 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx;
3063 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint;
3064 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3065
3066 if (!vu_ctrlr) {
3067 free(quiesce_ctx);
3068 return;
3069 }
3070
3071 quiesce_ctx->status = status;
3072
3073 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n",
3074 ctrlr_id(vu_ctrlr), status);
3075
3076 spdk_thread_send_msg(vu_ctrlr->thread,
3077 vfio_user_quiesce_done, ctx);
3078 }
3079
3080 /*
3081 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll();
3082 * we've already set ctrlr->state, so we won't process new entries, but we need
3083 * to ensure that this PG is quiesced.
This only works because there's no 3084 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3085 * 3086 * Once we've walked all PGs, we need to pause any submitted I/O via 3087 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3088 */ 3089 static void 3090 vfio_user_quiesce_pg(void *ctx) 3091 { 3092 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3093 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3094 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3095 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3096 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3097 int ret; 3098 3099 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3100 3101 if (!vu_ctrlr) { 3102 free(quiesce_ctx); 3103 return; 3104 } 3105 3106 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3107 if (quiesce_ctx->group != NULL) { 3108 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3109 vfio_user_quiesce_pg, quiesce_ctx); 3110 return; 3111 } 3112 3113 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3114 vfio_user_pause_done, quiesce_ctx); 3115 if (ret < 0) { 3116 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3117 endpoint_id(endpoint), ret); 3118 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3119 fail_ctrlr(vu_ctrlr); 3120 free(quiesce_ctx); 3121 } 3122 } 3123 3124 static void 3125 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3126 { 3127 struct ctrlr_quiesce_ctx *quiesce_ctx; 3128 3129 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3130 3131 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3132 if (!quiesce_ctx) { 3133 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3134 assert(false); 3135 return; 3136 } 3137 3138 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3139 quiesce_ctx->status = 0; 3140 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3141 3142 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3143 vfio_user_quiesce_pg, quiesce_ctx); 3144 } 3145 3146 static int 3147 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3148 { 3149 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3150 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3151 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3152 3153 if (!vu_ctrlr) { 3154 return 0; 3155 } 3156 3157 /* NVMf library will destruct controller when no 3158 * connected queue pairs. 3159 */ 3160 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3161 return 0; 3162 } 3163 3164 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3165 3166 /* There is no race condition here as device quiesce callback 3167 * and nvmf_prop_set_cc() are running in the same thread context. 
3168 */ 3169 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3170 return 0; 3171 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3172 return 0; 3173 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3174 return 0; 3175 } 3176 3177 switch (vu_ctrlr->state) { 3178 case VFIO_USER_CTRLR_PAUSED: 3179 case VFIO_USER_CTRLR_MIGRATING: 3180 return 0; 3181 case VFIO_USER_CTRLR_RUNNING: 3182 ctrlr_quiesce(vu_ctrlr); 3183 break; 3184 case VFIO_USER_CTRLR_RESUMING: 3185 vu_ctrlr->queued_quiesce = true; 3186 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3187 vu_ctrlr->state); 3188 break; 3189 default: 3190 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3191 break; 3192 } 3193 3194 errno = EBUSY; 3195 return -1; 3196 } 3197 3198 static void 3199 vfio_user_ctrlr_dump_migr_data(const char *name, 3200 struct vfio_user_nvme_migr_state *migr_data, 3201 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3202 { 3203 struct spdk_nvmf_registers *regs; 3204 struct nvme_migr_sq_state *sq; 3205 struct nvme_migr_cq_state *cq; 3206 uint32_t *doorbell_base; 3207 uint32_t i; 3208 3209 SPDK_NOTICELOG("Dump %s\n", name); 3210 3211 regs = &migr_data->nvmf_data.regs; 3212 doorbell_base = (uint32_t *)&migr_data->doorbells; 3213 3214 SPDK_NOTICELOG("Registers\n"); 3215 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3216 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3217 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3218 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3219 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3220 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3221 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3222 3223 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3224 3225 if (sdbl != NULL) { 3226 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3227 migr_data->ctrlr_header.shadow_doorbell_buffer); 3228 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3229 migr_data->ctrlr_header.eventidx_buffer); 3230 } 3231 3232 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3233 sq = &migr_data->qps[i].sq; 3234 cq = &migr_data->qps[i].cq; 3235 3236 if (sq->size) { 3237 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3238 if (i > 0 && sdbl != NULL) { 3239 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3240 sq->sqid, 3241 sdbl->shadow_doorbells[queue_index(i, false)], 3242 sdbl->eventidxs[queue_index(i, false)]); 3243 } 3244 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3245 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3246 } 3247 3248 if (cq->size) { 3249 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3250 if (i > 0 && sdbl != NULL) { 3251 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3252 cq->cqid, 3253 sdbl->shadow_doorbells[queue_index(i, true)], 3254 sdbl->eventidxs[queue_index(i, true)]); 3255 } 3256 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3257 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3258 } 3259 } 3260 3261 SPDK_NOTICELOG("%s Dump Done\n", name); 3262 } 3263 3264 /* Read region 9 content and restore it to migration data structures */ 3265 static int 3266 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3267 struct vfio_user_nvme_migr_state *migr_state) 3268 { 3269 void *data_ptr = endpoint->migr_data; 3270 3271 /* Load vfio_user_nvme_migr_header first */ 3272 
memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3273 /* TODO: version check */ 3274 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3275 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3276 return -EINVAL; 3277 } 3278 3279 /* Load nvmf controller data */ 3280 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3281 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3282 3283 /* Load queue pairs */ 3284 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3285 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3286 3287 /* Load doorbells */ 3288 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3289 memcpy(&migr_state->doorbells, data_ptr, 3290 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3291 3292 /* Load CFG */ 3293 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3294 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3295 3296 return 0; 3297 } 3298 3299 3300 static void 3301 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3302 { 3303 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3304 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3305 struct nvmf_vfio_user_sq *sq; 3306 struct nvmf_vfio_user_cq *cq; 3307 uint64_t data_offset; 3308 void *data_ptr; 3309 uint32_t *doorbell_base; 3310 uint32_t i = 0; 3311 uint16_t sqid, cqid; 3312 struct vfio_user_nvme_migr_state migr_state = { 3313 .nvmf_data = { 3314 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3315 .regs_size = sizeof(struct spdk_nvmf_registers), 3316 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3317 } 3318 }; 3319 3320 /* Save all data to vfio_user_nvme_migr_state first, then we will 3321 * copy it to device migration region at last. 
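 * The layout produced below is: the fixed 0x1000-byte header at offset 0,
 * followed by the nvmf controller data, the queue pair states, the
 * 0x1000-byte BAR0 doorbells and finally the 0x1000-byte PCI configuration
 * space; every offset and length is recorded in the header so the
 * destination can locate each blob without assuming the layout.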
3322 */ 3323 3324 /* save magic number */ 3325 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3326 3327 /* save controller data */ 3328 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3329 3330 /* save connected queue pairs */ 3331 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3332 /* save sq */ 3333 sqid = sq->qid; 3334 migr_state.qps[sqid].sq.sqid = sq->qid; 3335 migr_state.qps[sqid].sq.cqid = sq->cqid; 3336 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3337 migr_state.qps[sqid].sq.size = sq->size; 3338 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3339 3340 /* save cq, for shared cq case, cq may be saved multiple times */ 3341 cqid = sq->cqid; 3342 cq = vu_ctrlr->cqs[cqid]; 3343 migr_state.qps[cqid].cq.cqid = cqid; 3344 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3345 migr_state.qps[cqid].cq.ien = cq->ien; 3346 migr_state.qps[cqid].cq.iv = cq->iv; 3347 migr_state.qps[cqid].cq.size = cq->size; 3348 migr_state.qps[cqid].cq.phase = cq->phase; 3349 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3350 i++; 3351 } 3352 3353 assert(i > 0); 3354 migr_state.ctrlr_header.num_io_queues = i - 1; 3355 3356 /* Save doorbells */ 3357 doorbell_base = (uint32_t *)&migr_state.doorbells; 3358 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3359 3360 /* Save PCI configuration space */ 3361 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3362 3363 /* Save all data to device migration region */ 3364 data_ptr = endpoint->migr_data; 3365 3366 /* Copy nvmf controller data */ 3367 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3368 data_ptr += data_offset; 3369 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3370 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3371 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3372 3373 /* Copy queue pairs */ 3374 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3375 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3376 migr_state.ctrlr_header.qp_offset = data_offset; 3377 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3378 struct nvme_migr_cq_state)); 3379 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3380 3381 /* Copy doorbells */ 3382 data_offset += migr_state.ctrlr_header.qp_len; 3383 data_ptr += migr_state.ctrlr_header.qp_len; 3384 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3385 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3386 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3387 3388 /* Copy CFG */ 3389 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3390 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3391 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3392 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3393 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3394 3395 /* copy shadow doorbells */ 3396 if (vu_ctrlr->sdbl != NULL) { 3397 migr_state.ctrlr_header.sdbl = true; 3398 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3399 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3400 } 3401 3402 /* Copy nvme migration header finally */ 3403 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3404 3405 if 
(SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3406 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3407 } 3408 } 3409 3410 /* 3411 * If we are about to close the connection, we need to unregister the interrupt, 3412 * as the library will subsequently close the file descriptor we registered. 3413 */ 3414 static int 3415 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3416 { 3417 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3418 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3419 3420 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3421 3422 if (type == VFU_RESET_LOST_CONN) { 3423 if (ctrlr != NULL) { 3424 spdk_interrupt_unregister(&ctrlr->intr); 3425 ctrlr->intr_fd = -1; 3426 } 3427 return 0; 3428 } 3429 3430 /* FIXME: LOST_CONN case ? */ 3431 if (ctrlr->sdbl != NULL) { 3432 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 3433 free_sdbl(vfu_ctx, ctrlr->sdbl); 3434 ctrlr->sdbl = NULL; 3435 } 3436 3437 /* FIXME: much more needed here. */ 3438 3439 return 0; 3440 } 3441 3442 static int 3443 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3444 struct vfio_user_nvme_migr_state *migr_state) 3445 { 3446 uint32_t i, qsize = 0; 3447 uint16_t sqid, cqid; 3448 struct vfio_user_nvme_migr_qp migr_qp; 3449 void *addr; 3450 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3451 int ret; 3452 3453 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3454 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3455 } 3456 3457 /* restore submission queues */ 3458 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3459 migr_qp = migr_state->qps[i]; 3460 3461 qsize = migr_qp.sq.size; 3462 if (qsize) { 3463 struct nvmf_vfio_user_sq *sq; 3464 3465 sqid = migr_qp.sq.sqid; 3466 if (sqid != i) { 3467 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3468 return -EINVAL; 3469 } 3470 3471 /* allocate sq if necessary */ 3472 if (vu_ctrlr->sqs[sqid] == NULL) { 3473 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3474 if (ret) { 3475 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3476 return -EFAULT; 3477 } 3478 } 3479 3480 sq = vu_ctrlr->sqs[sqid]; 3481 sq->size = qsize; 3482 3483 ret = alloc_sq_reqs(vu_ctrlr, sq); 3484 if (ret) { 3485 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3486 return -EFAULT; 3487 } 3488 3489 /* restore sq */ 3490 sq->sq_state = VFIO_USER_SQ_CREATED; 3491 sq->cqid = migr_qp.sq.cqid; 3492 *sq_headp(sq) = migr_qp.sq.head; 3493 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3494 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3495 sq->mapping.prp1, sq->size * 64, 3496 sq->mapping.sg, &sq->mapping.iov, 3497 PROT_READ); 3498 if (addr == NULL) { 3499 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3500 sqid, sq->mapping.prp1, sq->size); 3501 return -EFAULT; 3502 } 3503 cqs_ref[sq->cqid]++; 3504 } 3505 } 3506 3507 /* restore completion queues */ 3508 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3509 migr_qp = migr_state->qps[i]; 3510 3511 qsize = migr_qp.cq.size; 3512 if (qsize) { 3513 struct nvmf_vfio_user_cq *cq; 3514 3515 /* restore cq */ 3516 cqid = migr_qp.sq.cqid; 3517 assert(cqid == i); 3518 3519 /* allocate cq if necessary */ 3520 if (vu_ctrlr->cqs[cqid] == NULL) { 3521 ret = init_cq(vu_ctrlr, cqid); 3522 if (ret) { 3523 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3524 return -EFAULT; 3525 } 3526 } 3527 3528 cq = vu_ctrlr->cqs[cqid]; 3529 3530 cq->size = qsize; 3531 3532 
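/* Restore the CQ fields exactly as the source saved them and remap the ring
 * below: each completion entry is 16 bytes (sizeof(struct spdk_nvme_cpl)) and,
 * unlike the read-only SQ mapping above, the CQ must be mapped read-write so
 * completions can be posted into it after resume.
 */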
cq->cq_state = VFIO_USER_CQ_CREATED; 3533 cq->cq_ref = cqs_ref[cqid]; 3534 *cq_tailp(cq) = migr_qp.cq.tail; 3535 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3536 cq->ien = migr_qp.cq.ien; 3537 cq->iv = migr_qp.cq.iv; 3538 cq->phase = migr_qp.cq.phase; 3539 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3540 cq->mapping.prp1, cq->size * 16, 3541 cq->mapping.sg, &cq->mapping.iov, 3542 PROT_READ | PROT_WRITE); 3543 if (addr == NULL) { 3544 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3545 cqid, cq->mapping.prp1, cq->size); 3546 return -EFAULT; 3547 } 3548 } 3549 } 3550 3551 return 0; 3552 } 3553 3554 static int 3555 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3556 { 3557 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3558 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3559 uint32_t *doorbell_base; 3560 struct spdk_nvme_cmd cmd; 3561 uint16_t i; 3562 int rc = 0; 3563 struct vfio_user_nvme_migr_state migr_state = { 3564 .nvmf_data = { 3565 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3566 .regs_size = sizeof(struct spdk_nvmf_registers), 3567 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3568 } 3569 }; 3570 3571 assert(endpoint->migr_data != NULL); 3572 assert(ctrlr != NULL); 3573 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3574 if (rc) { 3575 return rc; 3576 } 3577 3578 /* restore shadow doorbells */ 3579 if (migr_state.ctrlr_header.sdbl) { 3580 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3581 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3582 migr_state.ctrlr_header.shadow_doorbell_buffer, 3583 migr_state.ctrlr_header.eventidx_buffer, 3584 memory_page_size(vu_ctrlr)); 3585 if (sdbl == NULL) { 3586 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3587 ctrlr_id(vu_ctrlr)); 3588 return -1; 3589 } 3590 3591 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3592 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3593 3594 SWAP(vu_ctrlr->sdbl, sdbl); 3595 } 3596 3597 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3598 if (rc) { 3599 return rc; 3600 } 3601 3602 /* restore PCI configuration space */ 3603 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3604 3605 doorbell_base = (uint32_t *)&migr_state.doorbells; 3606 /* restore doorbells from saved registers */ 3607 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3608 3609 /* restore nvmf controller data */ 3610 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3611 if (rc) { 3612 return rc; 3613 } 3614 3615 /* resubmit pending AERs */ 3616 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) { 3617 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3618 migr_state.nvmf_data.aer_cids[i]); 3619 memset(&cmd, 0, sizeof(cmd)); 3620 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3621 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3622 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3623 if (rc) { 3624 break; 3625 } 3626 } 3627 3628 return rc; 3629 } 3630 3631 static void 3632 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3633 { 3634 uint32_t i; 3635 struct nvmf_vfio_user_sq *sq; 3636 3637 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
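 * (the Doorbell Buffer Config feature applies to I/O queues only), so the
 * admin SQ tail and CQ head pointers are wired straight to BAR0 here.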
*/
3638
3639 if (vu_ctrlr->sqs[0] != NULL) {
3640 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells +
3641 queue_index(0, false);
3642 }
3643
3644 if (vu_ctrlr->cqs[0] != NULL) {
3645 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells +
3646 queue_index(0, true);
3647 }
3648
3649 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL);
3650
3651 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
3652 sq = vu_ctrlr->sqs[i];
3653 if (!sq || !sq->size) {
3654 continue;
3655 }
3656
3657 if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
3658 /* ADMIN queue pair is always in the poll group, just enable it */
3659 sq->sq_state = VFIO_USER_SQ_ACTIVE;
3660 } else {
3661 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair);
3662 }
3663 }
3664 }
3665
3666 /*
3667 * We are in stop-and-copy state, but still potentially have some current dirty
3668 * sgls: while we're quiesced and thus should have no active requests, we still
3669 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are
3670 * mapped read only).
3671 *
3672 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly
3673 * mark them dirty now.
3674 */
3675 static void
3676 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
3677 {
3678 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
3679
3680 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING);
3681
3682 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
3683 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i];
3684
3685 if (cq == NULL || q_addr(&cq->mapping) == NULL) {
3686 continue;
3687 }
3688
3689 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1);
3690 }
3691
3692 if (vu_ctrlr->sdbl != NULL) {
3693 dma_sg_t *sg;
3694 size_t i;
3695
3696 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT;
3697 ++i) {
3698
3699 if (!vu_ctrlr->sdbl->iovs[i].iov_len) {
3700 continue;
3701 }
3702
3703 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i);
3704
3705 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1);
3706 }
3707 }
3708 }
3709
3710 static int
3711 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
3712 {
3713 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
3714 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3715 struct nvmf_vfio_user_sq *sq;
3716 int ret = 0;
3717
3718 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint),
3719 vu_ctrlr->state, state);
3720
3721 switch (state) {
3722 case VFU_MIGR_STATE_STOP_AND_COPY:
3723 vu_ctrlr->in_source_vm = true;
3724 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
3725 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr);
3726 vfio_user_migr_ctrlr_save_data(vu_ctrlr);
3727 break;
3728 case VFU_MIGR_STATE_STOP:
3729 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
3730 /* The controller associated with the source VM is dead now. We will resume
3731 * the subsystem after destroying the controller data structure, so that the
3732 * subsystem can be reused by another new client.
3733 */
3734 if (vu_ctrlr->in_source_vm) {
3735 endpoint->need_resume = true;
3736 }
3737 break;
3738 case VFU_MIGR_STATE_PRE_COPY:
3739 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED);
3740 break;
3741 case VFU_MIGR_STATE_RESUME:
3742 /*
3743 * The destination ADMIN queue pair is connected when the VM starts, but it
3744 * isn't enabled in the destination VM yet, so the poll group does nothing
3745 * with the ADMIN queue pair for now.
3746 */ 3747 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3748 break; 3749 } 3750 3751 assert(!vu_ctrlr->in_source_vm); 3752 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3753 3754 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3755 assert(sq != NULL); 3756 assert(sq->qpair.qid == 0); 3757 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3758 3759 /* Free ADMIN SQ resources first, SQ resources will be 3760 * allocated based on queue size from source VM. 3761 */ 3762 free_sq_reqs(sq); 3763 sq->size = 0; 3764 break; 3765 case VFU_MIGR_STATE_RUNNING: 3766 3767 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3768 break; 3769 } 3770 3771 if (!vu_ctrlr->in_source_vm) { 3772 /* Restore destination VM from BAR9 */ 3773 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3774 if (ret) { 3775 break; 3776 } 3777 3778 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3779 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3780 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3781 /* FIXME where do we resume nvmf? */ 3782 } else { 3783 /* Rollback source VM */ 3784 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3785 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3786 vfio_user_endpoint_resume_done, endpoint); 3787 if (ret < 0) { 3788 /* TODO: fail controller with CFS bit set */ 3789 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3790 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3791 } 3792 } 3793 vu_ctrlr->migr_data_prepared = false; 3794 vu_ctrlr->in_source_vm = false; 3795 break; 3796 3797 default: 3798 return -EINVAL; 3799 } 3800 3801 return ret; 3802 } 3803 3804 static uint64_t 3805 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3806 { 3807 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3808 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3809 uint64_t pending_bytes; 3810 3811 if (ctrlr->migr_data_prepared) { 3812 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3813 pending_bytes = 0; 3814 } else { 3815 pending_bytes = vfio_user_migr_data_len(); 3816 } 3817 3818 SPDK_DEBUGLOG(nvmf_vfio, 3819 "%s current state %u, pending bytes 0x%"PRIx64"\n", 3820 endpoint_id(endpoint), ctrlr->state, pending_bytes); 3821 3822 return pending_bytes; 3823 } 3824 3825 static int 3826 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 3827 { 3828 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3829 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3830 3831 /* 3832 * When transitioning to pre-copy state we set pending_bytes to 0, 3833 * so the vfio-user client shouldn't attempt to read any migration 3834 * data. This is not yet guaranteed by libvfio-user. 
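 * In practice this callback only does real work in the stop-and-copy phase:
 * on the source we serialise the whole controller state and report its full
 * size, while on the destination (where size == NULL) we merely note that
 * migration data is about to be written.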
3835 */ 3836 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3837 assert(size != NULL); 3838 *offset = 0; 3839 *size = 0; 3840 return 0; 3841 } 3842 3843 if (ctrlr->in_source_vm) { /* migration source */ 3844 assert(size != NULL); 3845 *size = vfio_user_migr_data_len(); 3846 vfio_user_migr_ctrlr_save_data(ctrlr); 3847 } else { /* migration destination */ 3848 assert(size == NULL); 3849 assert(!ctrlr->migr_data_prepared); 3850 } 3851 *offset = 0; 3852 ctrlr->migr_data_prepared = true; 3853 3854 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 3855 3856 return 0; 3857 } 3858 3859 static ssize_t 3860 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3861 void *buf __attribute__((unused)), 3862 uint64_t count __attribute__((unused)), 3863 uint64_t offset __attribute__((unused))) 3864 { 3865 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 3866 endpoint_id(vfu_get_private(vfu_ctx))); 3867 errno = ENOTSUP; 3868 return -1; 3869 } 3870 3871 static ssize_t 3872 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3873 void *buf __attribute__((unused)), 3874 uint64_t count __attribute__((unused)), 3875 uint64_t offset __attribute__((unused))) 3876 { 3877 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 3878 endpoint_id(vfu_get_private(vfu_ctx))); 3879 errno = ENOTSUP; 3880 return -1; 3881 } 3882 3883 static int 3884 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3885 uint64_t count) 3886 { 3887 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 3888 3889 if (count != vfio_user_migr_data_len()) { 3890 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 3891 endpoint_id(vfu_get_private(vfu_ctx)), count); 3892 errno = EINVAL; 3893 return -1; 3894 } 3895 3896 return 0; 3897 } 3898 3899 static int 3900 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 3901 struct nvmf_vfio_user_endpoint *endpoint) 3902 { 3903 int ret; 3904 ssize_t cap_offset; 3905 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 3906 struct iovec migr_sparse_mmap = {}; 3907 3908 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 3909 struct pxcap pxcap = { 3910 .hdr.id = PCI_CAP_ID_EXP, 3911 .pxcaps.ver = 0x2, 3912 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 3913 .pxdcap2.ctds = 0x1 3914 }; 3915 3916 struct msixcap msixcap = { 3917 .hdr.id = PCI_CAP_ID_MSIX, 3918 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 3919 .mtab = {.tbir = 0x4, .to = 0x0}, 3920 .mpba = {.pbir = 0x5, .pbao = 0x0} 3921 }; 3922 3923 struct iovec sparse_mmap[] = { 3924 { 3925 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 3926 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 3927 }, 3928 }; 3929 3930 const vfu_migration_callbacks_t migr_callbacks = { 3931 .version = VFU_MIGR_CALLBACKS_VERS, 3932 .transition = &vfio_user_migration_device_state_transition, 3933 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 3934 .prepare_data = &vfio_user_migration_prepare_data, 3935 .read_data = &vfio_user_migration_read_data, 3936 .data_written = &vfio_user_migration_data_written, 3937 .write_data = &vfio_user_migration_write_data 3938 }; 3939 3940 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 3941 if (ret < 0) { 3942 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 3943 return ret; 3944 } 3945 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 3946 /* 3947 * 0x02, controller uses the NVM Express programming interface 3948 * 0x08, 
non-volatile memory controller
3949 * 0x01, mass storage controller
3950 */
3951 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);
3952
3953 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
3954 if (cap_offset < 0) {
3955 SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx);
3956 return cap_offset;
3957 }
3958
3959 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
3960 if (cap_offset < 0) {
3961 SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx);
3962 return cap_offset;
3963 }
3964
3965 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
3966 if (cap_offset < 0) {
3967 SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx);
3968 return cap_offset;
3969 }
3970
3971 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
3972 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
3973 if (ret < 0) {
3974 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
3975 return ret;
3976 }
3977
3978 if (vu_transport->transport_opts.disable_mappable_bar0) {
3979 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
3980 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
3981 NULL, 0, -1, 0);
3982 } else {
3983 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
3984 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
3985 sparse_mmap, 1, endpoint->devmem_fd, 0);
3986 }
3987
3988 if (ret < 0) {
3989 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
3990 return ret;
3991 }
3992
3993 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE,
3994 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
3995 if (ret < 0) {
3996 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
3997 return ret;
3998 }
3999
4000 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE,
4001 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
4002 if (ret < 0) {
4003 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
4004 return ret;
4005 }
4006
4007 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
4008 if (ret < 0) {
4009 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
4010 return ret;
4011 }
4012
4013 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset);
4014 if (ret < 0) {
4015 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx);
4016 return ret;
4017 }
4018
4019 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
4020 if (ret < 0) {
4021 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
4022 return ret;
4023 }
4024
4025 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
4026 if (ret < 0) {
4027 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
4028 return ret;
4029 }
4030
4031 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb);
4032
4033 migr_sparse_mmap.iov_base = (void *)4096;
4034 migr_sparse_mmap.iov_len = vfio_user_migr_data_len();
4035 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX,
4036 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(),
4037 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap,
4038 1, endpoint->migr_fd, 0);
4039 if (ret < 0) {
4040 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx);
4041 return ret;
4042 }
4043
4044 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks,
4045 vfu_get_migr_register_area_size());
4046 if (ret < 0) {
4047 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx);
4048 return ret;
4049 }
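/* All regions, IRQs, capabilities and callbacks are now configured;
 * realize the device context before a client can attach to it. */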
4050 4051 ret = vfu_realize_ctx(vfu_ctx); 4052 if (ret < 0) { 4053 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4054 return ret; 4055 } 4056 4057 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4058 assert(endpoint->pci_config_space != NULL); 4059 init_pci_config_space(endpoint->pci_config_space); 4060 4061 assert(cap_offset != 0); 4062 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4063 4064 return 0; 4065 } 4066 4067 static int nvmf_vfio_user_accept(void *ctx); 4068 4069 static void 4070 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4071 { 4072 /* Nothing for us to do here. */ 4073 } 4074 4075 /* 4076 * Register an "accept" poller: this is polling for incoming vfio-user socket 4077 * connections (on the listening socket). 4078 * 4079 * We need to do this on first listening, and also after destroying a 4080 * controller, so we can accept another connection. 4081 */ 4082 static int 4083 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4084 { 4085 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4086 4087 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4088 4089 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4090 endpoint, poll_rate_us); 4091 4092 if (!endpoint->accept_poller) { 4093 return -1; 4094 } 4095 4096 endpoint->accept_thread = spdk_get_thread(); 4097 endpoint->need_relisten = false; 4098 4099 if (!spdk_interrupt_mode_is_enabled()) { 4100 return 0; 4101 } 4102 4103 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4104 assert(endpoint->accept_intr_fd != -1); 4105 4106 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4107 nvmf_vfio_user_accept, endpoint); 4108 4109 assert(endpoint->accept_intr != NULL); 4110 4111 spdk_poller_register_interrupt(endpoint->accept_poller, 4112 set_intr_mode_noop, NULL); 4113 return 0; 4114 } 4115 4116 static void 4117 _vfio_user_relisten(void *ctx) 4118 { 4119 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4120 4121 vfio_user_register_accept_poller(endpoint); 4122 } 4123 4124 static void 4125 _free_ctrlr(void *ctx) 4126 { 4127 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4128 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4129 4130 free_sdbl(ctrlr->endpoint->vfu_ctx, ctrlr->sdbl); 4131 4132 spdk_interrupt_unregister(&ctrlr->intr); 4133 ctrlr->intr_fd = -1; 4134 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4135 4136 free(ctrlr); 4137 4138 if (endpoint == NULL) { 4139 return; 4140 } 4141 4142 if (endpoint->need_async_destroy) { 4143 nvmf_vfio_user_destroy_endpoint(endpoint); 4144 } else if (endpoint->need_relisten) { 4145 spdk_thread_send_msg(endpoint->accept_thread, 4146 _vfio_user_relisten, endpoint); 4147 } 4148 } 4149 4150 static void 4151 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4152 { 4153 int i; 4154 assert(ctrlr != NULL); 4155 4156 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4157 4158 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4159 free_qp(ctrlr, i); 4160 } 4161 4162 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 4163 } 4164 4165 static int 4166 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4167 struct nvmf_vfio_user_endpoint *endpoint) 4168 { 4169 struct nvmf_vfio_user_ctrlr *ctrlr; 4170 int err = 0; 4171 4172 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4173 4174 /* First, construct a vfio-user CUSTOM transport 
controller */
4175 ctrlr = calloc(1, sizeof(*ctrlr));
4176 if (ctrlr == NULL) {
4177 err = -ENOMEM;
4178 goto out;
4179 }
4180 /* We can only support one connection for now */
4181 ctrlr->cntlid = 0x1;
4182 ctrlr->intr_fd = -1;
4183 ctrlr->transport = transport;
4184 ctrlr->endpoint = endpoint;
4185 ctrlr->bar0_doorbells = endpoint->bar0_doorbells;
4186 TAILQ_INIT(&ctrlr->connected_sqs);
4187
4188 ctrlr->adaptive_irqs_enabled =
4189 !transport->transport_opts.disable_adaptive_irq;
4190
4191 /* Then, construct an admin queue pair */
4192 err = init_sq(ctrlr, &transport->transport, 0);
4193 if (err != 0) {
4194 free(ctrlr);
4195 goto out;
4196 }
4197
4198 err = init_cq(ctrlr, 0);
4199 if (err != 0) {
4200 free(ctrlr);
4201 goto out;
4202 }
4203
4204 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
4205
4206 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]);
4207 if (err != 0) {
4208 free(ctrlr);
4209 goto out;
4210 }
4211 endpoint->ctrlr = ctrlr;
4212
4213 /* Notify the generic layer about the new admin queue pair */
4214 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair);
4215
4216 out:
4217 if (err != 0) {
4218 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n",
4219 endpoint_id(endpoint), strerror(-err));
4220 }
4221
4222 return err;
4223 }
4224
4225 static int
4226 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport,
4227 const struct spdk_nvme_transport_id *trid,
4228 struct spdk_nvmf_listen_opts *listen_opts)
4229 {
4230 struct nvmf_vfio_user_transport *vu_transport;
4231 struct nvmf_vfio_user_endpoint *endpoint, *tmp;
4232 char path[PATH_MAX] = {};
4233 char uuid[PATH_MAX] = {};
4234 int ret;
4235
4236 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
4237 transport);
4238
4239 pthread_mutex_lock(&vu_transport->lock);
4240 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
4241 /* Only compare traddr */
4242 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
4243 pthread_mutex_unlock(&vu_transport->lock);
4244 return -EEXIST;
4245 }
4246 }
4247 pthread_mutex_unlock(&vu_transport->lock);
4248
4249 endpoint = calloc(1, sizeof(*endpoint));
4250 if (!endpoint) {
4251 return -ENOMEM;
4252 }
4253
4254 pthread_mutex_init(&endpoint->lock, NULL);
4255 endpoint->devmem_fd = -1;
4256 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));
4257 endpoint->transport = vu_transport;
4258
4259 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint));
4260 if (ret < 0 || ret >= PATH_MAX) {
4261 SPDK_ERRLOG("%s: failed to format BAR0 file path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno));
4262 ret = -1;
4263 goto out;
4264 }
4265
4266 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
4267 if (ret == -1) {
4268 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n",
4269 endpoint_id(endpoint), path, spdk_strerror(errno));
4270 goto out;
4271 }
4272 unlink(path);
4273
4274 endpoint->devmem_fd = ret;
4275 ret = ftruncate(endpoint->devmem_fd,
4276 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE);
4277 if (ret != 0) {
4278 SPDK_ERRLOG("%s: failed to ftruncate file %s: %s.\n", endpoint_id(endpoint), path,
4279 spdk_strerror(errno));
4280 goto out;
4281 }
4282
4283 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
4284 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET);
4285 if (endpoint->bar0_doorbells == MAP_FAILED) {
4286 SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path,
spdk_strerror(errno));
4287 endpoint->bar0_doorbells = NULL;
4288 ret = -1;
4289 goto out;
4290 }
4291
4292 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint));
4293 if (ret < 0 || ret >= PATH_MAX) {
4294 SPDK_ERRLOG("%s: failed to format migration file path: %s.\n", endpoint_id(endpoint),
4295 spdk_strerror(errno));
4296 ret = -1;
4297 goto out;
4298 }
4299 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
4300 if (ret == -1) {
4301 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n",
4302 endpoint_id(endpoint), path, spdk_strerror(errno));
4303 goto out;
4304 }
4305 unlink(path);
4306
4307 endpoint->migr_fd = ret;
4308 ret = ftruncate(endpoint->migr_fd,
4309 vfu_get_migr_register_area_size() + vfio_user_migr_data_len());
4310 if (ret != 0) {
4311 SPDK_ERRLOG("%s: failed to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path,
4312 spdk_strerror(errno));
4313 goto out;
4314 }
4315
4316 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(),
4317 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size());
4318 if (endpoint->migr_data == MAP_FAILED) {
4319 SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
4320 endpoint->migr_data = NULL;
4321 ret = -1;
4322 goto out;
4323 }
4324
4325 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
4326 if (ret < 0 || ret >= PATH_MAX) {
4327 SPDK_ERRLOG("%s: failed to format ctrlr socket path: %s\n", endpoint_id(endpoint), spdk_strerror(errno));
4328 ret = -1;
4329 goto out;
4330 }
4331
4332 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
4333 endpoint, VFU_DEV_TYPE_PCI);
4334 if (endpoint->vfu_ctx == NULL) {
4335 SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
4336 endpoint_id(endpoint));
4337 ret = -1;
4338 goto out;
4339 }
4340
4341 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log,
4342 vfio_user_get_log_level());
4343 if (ret < 0) {
4344 goto out;
4345 }
4346
4347
4348 ret = vfio_user_dev_info_fill(vu_transport, endpoint);
4349 if (ret < 0) {
4350 goto out;
4351 }
4352
4353 ret = vfio_user_register_accept_poller(endpoint);
4354
4355 if (ret != 0) {
4356 goto out;
4357 }
4358
4359 pthread_mutex_lock(&vu_transport->lock);
4360 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
4361 pthread_mutex_unlock(&vu_transport->lock);
4362
4363 out:
4364 if (ret != 0) {
4365 nvmf_vfio_user_destroy_endpoint(endpoint);
4366 }
4367
4368 return ret;
4369 }
4370
4371 static void
4372 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
4373 const struct spdk_nvme_transport_id *trid)
4374 {
4375 struct nvmf_vfio_user_transport *vu_transport;
4376 struct nvmf_vfio_user_endpoint *endpoint, *tmp;
4377
4378 assert(trid != NULL);
4379 assert(trid->traddr != NULL);
4380
4381 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
4382
4383 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
4384 transport);
4385
4386 pthread_mutex_lock(&vu_transport->lock);
4387 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
4388 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
4389 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
4390 /* Defer freeing endpoint resources until the controller
4391 * is freed. There are two ways we can get here:
4392 * 1. the nvmf target is killed while a VM is connected
4393 * 2. the listener is removed via an RPC call
4394 * In both cases the nvmf library will disconnect all queue pairs.
4395 */ 4396 if (endpoint->ctrlr) { 4397 assert(!endpoint->need_async_destroy); 4398 endpoint->need_async_destroy = true; 4399 pthread_mutex_unlock(&vu_transport->lock); 4400 return; 4401 } 4402 4403 nvmf_vfio_user_destroy_endpoint(endpoint); 4404 pthread_mutex_unlock(&vu_transport->lock); 4405 return; 4406 } 4407 } 4408 pthread_mutex_unlock(&vu_transport->lock); 4409 4410 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4411 } 4412 4413 static void 4414 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4415 struct spdk_nvmf_subsystem *subsystem, 4416 struct spdk_nvmf_ctrlr_data *cdata) 4417 { 4418 struct nvmf_vfio_user_transport *vu_transport; 4419 4420 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4421 4422 cdata->vid = SPDK_PCI_VID_NUTANIX; 4423 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4424 cdata->ieee[0] = 0x8d; 4425 cdata->ieee[1] = 0x6b; 4426 cdata->ieee[2] = 0x50; 4427 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4428 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4429 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4430 /* libvfio-user can only support 1 connection for now */ 4431 cdata->oncs.reservations = 0; 4432 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4433 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4434 } 4435 4436 static int 4437 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4438 const struct spdk_nvmf_subsystem *subsystem, 4439 const struct spdk_nvme_transport_id *trid) 4440 { 4441 struct nvmf_vfio_user_transport *vu_transport; 4442 struct nvmf_vfio_user_endpoint *endpoint; 4443 4444 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4445 4446 pthread_mutex_lock(&vu_transport->lock); 4447 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4448 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4449 break; 4450 } 4451 } 4452 pthread_mutex_unlock(&vu_transport->lock); 4453 4454 if (endpoint == NULL) { 4455 return -ENOENT; 4456 } 4457 4458 /* Drop const - we will later need to pause/unpause. */ 4459 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4460 4461 return 0; 4462 } 4463 4464 /* 4465 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4466 * frequency. 4467 * 4468 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4469 * if we don't currently have a controller set up, peek to see if the socket is 4470 * able to accept a new connection. 4471 */ 4472 static int 4473 nvmf_vfio_user_accept(void *ctx) 4474 { 4475 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4476 struct nvmf_vfio_user_transport *vu_transport; 4477 int err; 4478 4479 vu_transport = endpoint->transport; 4480 4481 if (endpoint->ctrlr != NULL) { 4482 return SPDK_POLLER_IDLE; 4483 } 4484 4485 /* While we're here, the controller is already destroyed, 4486 * subsystem may still be in RESUMING state, we will wait 4487 * until the subsystem is in RUNNING state. 
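 * Until then, simply report this poller as idle.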
4488 */ 4489 if (endpoint->need_resume) { 4490 return SPDK_POLLER_IDLE; 4491 } 4492 4493 err = vfu_attach_ctx(endpoint->vfu_ctx); 4494 if (err == 0) { 4495 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4496 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4497 if (err == 0) { 4498 /* 4499 * Unregister ourselves: now we've accepted a 4500 * connection, there is nothing for us to poll for, and 4501 * we will poll the connection via vfu_run_ctx() 4502 * instead. 4503 */ 4504 spdk_interrupt_unregister(&endpoint->accept_intr); 4505 spdk_poller_unregister(&endpoint->accept_poller); 4506 } 4507 return SPDK_POLLER_BUSY; 4508 } 4509 4510 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4511 return SPDK_POLLER_IDLE; 4512 } 4513 4514 return SPDK_POLLER_BUSY; 4515 } 4516 4517 static void 4518 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4519 struct spdk_nvme_transport_id *trid, 4520 struct spdk_nvmf_discovery_log_page_entry *entry) 4521 { } 4522 4523 static int vfio_user_poll_group_intr(void *ctx); 4524 4525 static void 4526 vfio_user_poll_group_add_intr(struct nvmf_vfio_user_poll_group *vu_group, 4527 struct spdk_nvmf_poll_group *group) 4528 { 4529 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK); 4530 assert(vu_group->intr_fd != -1); 4531 4532 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd, 4533 vfio_user_poll_group_intr, vu_group); 4534 assert(vu_group->intr != NULL); 4535 4536 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4537 vu_group); 4538 } 4539 4540 static struct spdk_nvmf_transport_poll_group * 4541 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4542 struct spdk_nvmf_poll_group *group) 4543 { 4544 struct nvmf_vfio_user_transport *vu_transport; 4545 struct nvmf_vfio_user_poll_group *vu_group; 4546 4547 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4548 transport); 4549 4550 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4551 4552 vu_group = calloc(1, sizeof(*vu_group)); 4553 if (vu_group == NULL) { 4554 SPDK_ERRLOG("Error allocating poll group: %m"); 4555 return NULL; 4556 } 4557 4558 if (in_interrupt_mode(vu_transport)) { 4559 vfio_user_poll_group_add_intr(vu_group, group); 4560 } 4561 4562 TAILQ_INIT(&vu_group->sqs); 4563 4564 pthread_mutex_lock(&vu_transport->pg_lock); 4565 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4566 if (vu_transport->next_pg == NULL) { 4567 vu_transport->next_pg = vu_group; 4568 } 4569 pthread_mutex_unlock(&vu_transport->pg_lock); 4570 4571 return &vu_group->group; 4572 } 4573 4574 static struct spdk_nvmf_transport_poll_group * 4575 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4576 { 4577 struct nvmf_vfio_user_transport *vu_transport; 4578 struct nvmf_vfio_user_poll_group **vu_group; 4579 struct nvmf_vfio_user_sq *sq; 4580 struct nvmf_vfio_user_cq *cq; 4581 4582 struct spdk_nvmf_transport_poll_group *result = NULL; 4583 4584 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4585 cq = sq->ctrlr->cqs[sq->cqid]; 4586 assert(cq != NULL); 4587 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4588 4589 pthread_mutex_lock(&vu_transport->pg_lock); 4590 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4591 goto out; 4592 } 4593 4594 if (!nvmf_qpair_is_admin_queue(qpair)) { 4595 /* 4596 * If this is shared IO CQ case, just return the used CQ's poll 4597 * group, so I/O completions don't have to use 4598 * spdk_thread_send_msg(). 
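 * (cq->group is recorded at the end of this function the first time a
 * qpair using this CQ is assigned a poll group.)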
4599 */ 4600 if (cq->group != NULL) { 4601 result = cq->group; 4602 goto out; 4603 } 4604 4605 /* 4606 * If we're in interrupt mode, align all qpairs for a controller 4607 * on the same poll group by default, unless requested. This can 4608 * be lower in performance than running on a single poll group, 4609 * so we disable spreading by default. 4610 */ 4611 if (in_interrupt_mode(vu_transport) && 4612 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) { 4613 result = sq->ctrlr->sqs[0]->group; 4614 goto out; 4615 } 4616 4617 } 4618 4619 vu_group = &vu_transport->next_pg; 4620 assert(*vu_group != NULL); 4621 4622 result = &(*vu_group)->group; 4623 *vu_group = TAILQ_NEXT(*vu_group, link); 4624 if (*vu_group == NULL) { 4625 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4626 } 4627 4628 out: 4629 if (cq->group == NULL) { 4630 cq->group = result; 4631 } 4632 4633 pthread_mutex_unlock(&vu_transport->pg_lock); 4634 return result; 4635 } 4636 4637 static void 4638 vfio_user_poll_group_del_intr(struct nvmf_vfio_user_poll_group *vu_group) 4639 { 4640 assert(vu_group->intr_fd != -1); 4641 4642 spdk_interrupt_unregister(&vu_group->intr); 4643 4644 close(vu_group->intr_fd); 4645 vu_group->intr_fd = -1; 4646 } 4647 4648 /* called when process exits */ 4649 static void 4650 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4651 { 4652 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4653 struct nvmf_vfio_user_transport *vu_transport; 4654 4655 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4656 4657 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4658 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4659 transport); 4660 4661 if (in_interrupt_mode(vu_transport)) { 4662 vfio_user_poll_group_del_intr(vu_group); 4663 } 4664 4665 pthread_mutex_lock(&vu_transport->pg_lock); 4666 next_tgroup = TAILQ_NEXT(vu_group, link); 4667 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4668 if (next_tgroup == NULL) { 4669 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4670 } 4671 if (vu_transport->next_pg == vu_group) { 4672 vu_transport->next_pg = next_tgroup; 4673 } 4674 pthread_mutex_unlock(&vu_transport->pg_lock); 4675 4676 free(vu_group); 4677 } 4678 4679 static void 4680 _vfio_user_qpair_disconnect(void *ctx) 4681 { 4682 struct nvmf_vfio_user_sq *sq = ctx; 4683 4684 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4685 } 4686 4687 /* The function is used when socket connection is destroyed */ 4688 static int 4689 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4690 { 4691 struct nvmf_vfio_user_sq *sq; 4692 struct nvmf_vfio_user_endpoint *endpoint; 4693 4694 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4695 4696 endpoint = ctrlr->endpoint; 4697 assert(endpoint != NULL); 4698 4699 pthread_mutex_lock(&endpoint->lock); 4700 endpoint->need_relisten = true; 4701 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4702 endpoint->ctrlr = NULL; 4703 free_ctrlr(ctrlr); 4704 pthread_mutex_unlock(&endpoint->lock); 4705 return 0; 4706 } 4707 4708 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4709 /* add another round thread poll to avoid recursive endpoint lock */ 4710 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4711 } 4712 pthread_mutex_unlock(&endpoint->lock); 4713 4714 return 0; 4715 } 4716 4717 /* 4718 * Poll for and process any incoming vfio-user messages. 
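 * This wraps vfu_run_ctx(): accesses to the non-mmap'd portion of BAR0
 * and client disconnects are handled from here.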
4719 */ 4720 static int 4721 vfio_user_poll_vfu_ctx(void *ctx) 4722 { 4723 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4724 int ret; 4725 4726 assert(ctrlr != NULL); 4727 4728 /* This will call access_bar0_fn() if there are any writes 4729 * to the portion of the BAR that is not mmap'd */ 4730 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4731 if (spdk_unlikely(ret == -1)) { 4732 if (errno == EBUSY) { 4733 return SPDK_POLLER_IDLE; 4734 } 4735 4736 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4737 4738 /* 4739 * We lost the client; the reset callback will already have 4740 * unregistered the interrupt. 4741 */ 4742 if (errno == ENOTCONN) { 4743 vfio_user_destroy_ctrlr(ctrlr); 4744 return SPDK_POLLER_BUSY; 4745 } 4746 4747 /* 4748 * We might not have got a reset callback in this case, so 4749 * explicitly unregister the interrupt here. 4750 */ 4751 spdk_interrupt_unregister(&ctrlr->intr); 4752 ctrlr->intr_fd = -1; 4753 fail_ctrlr(ctrlr); 4754 } 4755 4756 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4757 } 4758 4759 struct vfio_user_post_cpl_ctx { 4760 struct nvmf_vfio_user_ctrlr *ctrlr; 4761 struct nvmf_vfio_user_cq *cq; 4762 struct spdk_nvme_cpl cpl; 4763 }; 4764 4765 static void 4766 _post_completion_msg(void *ctx) 4767 { 4768 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4769 4770 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4771 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4772 free(cpl_ctx); 4773 } 4774 4775 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4776 4777 static int 4778 vfio_user_poll_group_intr(void *ctx) 4779 { 4780 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4781 eventfd_t val; 4782 int ret = 0; 4783 4784 SPDK_DEBUGLOG(vfio_user_db, "pg:%p got intr\n", vu_group); 4785 4786 /* 4787 * NB: this might fail if called from vfio_user_ctrlr_intr(), but it's 4788 * non-blocking, so not an issue. 4789 */ 4790 eventfd_read(vu_group->intr_fd, &val); 4791 4792 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4793 4794 /* 4795 * Re-arm the event indexes. NB: this also could rearm other 4796 * controller's SQs. 4797 */ 4798 ret |= vfio_user_poll_group_rearm(vu_group); 4799 4800 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4801 } 4802 4803 /* 4804 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4805 * the SQs assigned to our own poll group. Other poll groups are handled via 4806 * vfio_user_poll_group_intr(). 4807 */ 4808 static int 4809 vfio_user_ctrlr_intr(void *ctx) 4810 { 4811 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 4812 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 4813 struct nvmf_vfio_user_poll_group *vu_group; 4814 int ret = SPDK_POLLER_IDLE; 4815 4816 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 4817 4818 SPDK_DEBUGLOG(vfio_user_db, "ctrlr pg:%p got intr\n", vu_ctrlr_group); 4819 4820 /* 4821 * Poll vfio-user for this controller. We need to do this before polling 4822 * any SQs, as this is where doorbell writes may be handled. 4823 */ 4824 ret = vfio_user_poll_vfu_ctx(vu_ctrlr); 4825 4826 /* 4827 * `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, 4828 * just return for this case. 
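 * (This can happen if, for example, the client disconnected and the
 * controller was torn down while the message was being handled.)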
4829 */ 4830 if (vu_ctrlr->sqs[0] == NULL) { 4831 return ret; 4832 } 4833 4834 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) { 4835 /* 4836 * We may have just written to a doorbell owned by another 4837 * reactor: we need to prod them to make sure its SQs are polled 4838 * *after* the doorbell value is updated. 4839 */ 4840 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) { 4841 if (vu_group != vu_ctrlr_group) { 4842 SPDK_DEBUGLOG(vfio_user_db, "prodding pg:%p\n", vu_group); 4843 eventfd_write(vu_group->intr_fd, 1); 4844 } 4845 } 4846 } 4847 4848 ret |= vfio_user_poll_group_intr(vu_ctrlr_group); 4849 4850 return ret; 4851 } 4852 4853 static void 4854 vfio_user_ctrlr_set_intr_mode(struct spdk_poller *poller, void *ctx, 4855 bool interrupt_mode) 4856 { 4857 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4858 assert(ctrlr != NULL); 4859 assert(ctrlr->endpoint != NULL); 4860 4861 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 4862 ctrlr_id(ctrlr), interrupt_mode); 4863 4864 /* 4865 * interrupt_mode needs to persist across controller resets, so store 4866 * it in the endpoint instead. 4867 */ 4868 ctrlr->endpoint->interrupt_mode = interrupt_mode; 4869 4870 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 4871 } 4872 4873 /* 4874 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 4875 * set up and we can start operating on this controller. 4876 */ 4877 static void 4878 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 4879 struct spdk_nvmf_ctrlr *ctrlr) 4880 { 4881 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 4882 4883 vu_ctrlr->ctrlr = ctrlr; 4884 vu_ctrlr->cntlid = ctrlr->cntlid; 4885 vu_ctrlr->thread = spdk_get_thread(); 4886 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 4887 4888 if (!in_interrupt_mode(endpoint->transport)) { 4889 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4890 vu_ctrlr, 1000); 4891 return; 4892 } 4893 4894 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4895 vu_ctrlr, 0); 4896 4897 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 4898 assert(vu_ctrlr->intr_fd != -1); 4899 4900 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 4901 vfio_user_ctrlr_intr, vu_ctrlr); 4902 4903 assert(vu_ctrlr->intr != NULL); 4904 4905 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 4906 vfio_user_ctrlr_set_intr_mode, 4907 vu_ctrlr); 4908 } 4909 4910 static int 4911 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 4912 { 4913 struct nvmf_vfio_user_poll_group *vu_group; 4914 struct nvmf_vfio_user_sq *sq = cb_arg; 4915 struct nvmf_vfio_user_cq *admin_cq; 4916 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 4917 struct nvmf_vfio_user_endpoint *endpoint; 4918 4919 assert(sq != NULL); 4920 assert(req != NULL); 4921 4922 vu_ctrlr = sq->ctrlr; 4923 assert(vu_ctrlr != NULL); 4924 endpoint = vu_ctrlr->endpoint; 4925 assert(endpoint != NULL); 4926 4927 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 4928 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 4929 endpoint->ctrlr = NULL; 4930 free_ctrlr(vu_ctrlr); 4931 return -1; 4932 } 4933 4934 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 4935 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 4936 4937 admin_cq = vu_ctrlr->cqs[0]; 4938 assert(admin_cq != NULL); 4939 4940 pthread_mutex_lock(&endpoint->lock); 4941 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 4942 
admin_cq->thread = spdk_get_thread(); 4943 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 4944 } else { 4945 /* For I/O queues this command was generated in response to an 4946 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 4947 * been completed. Complete it now. 4948 */ 4949 if (sq->post_create_io_sq_completion) { 4950 assert(admin_cq->thread != NULL); 4951 if (admin_cq->thread != spdk_get_thread()) { 4952 struct vfio_user_post_cpl_ctx *cpl_ctx; 4953 4954 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 4955 if (!cpl_ctx) { 4956 return -ENOMEM; 4957 } 4958 cpl_ctx->ctrlr = vu_ctrlr; 4959 cpl_ctx->cq = admin_cq; 4960 cpl_ctx->cpl.sqid = 0; 4961 cpl_ctx->cpl.cdw0 = 0; 4962 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 4963 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 4964 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 4965 4966 spdk_thread_send_msg(admin_cq->thread, _post_completion_msg, 4967 cpl_ctx); 4968 } else { 4969 post_completion(vu_ctrlr, admin_cq, 0, 0, 4970 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 4971 } 4972 sq->post_create_io_sq_completion = false; 4973 } else if (in_interrupt_mode(endpoint->transport)) { 4974 /* 4975 * If we're live migrating a guest, there is a window 4976 * where the I/O queues haven't been set up but the 4977 * device is in running state, during which the guest 4978 * might write to a doorbell. This doorbell write will 4979 * go unnoticed, so let's poll the whole controller to 4980 * pick that up. 4981 */ 4982 ctrlr_kick(vu_ctrlr); 4983 } 4984 sq->sq_state = VFIO_USER_SQ_ACTIVE; 4985 } 4986 4987 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 4988 pthread_mutex_unlock(&endpoint->lock); 4989 4990 free(req->req.data); 4991 req->req.data = NULL; 4992 4993 return 0; 4994 } 4995 4996 /* 4997 * Add the given qpair to the given poll group. New qpairs are added via 4998 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 4999 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 5000 * nvmf_transport_poll_group_add(). 5001 */ 5002 static int 5003 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 5004 struct spdk_nvmf_qpair *qpair) 5005 { 5006 struct nvmf_vfio_user_sq *sq; 5007 struct nvmf_vfio_user_req *vu_req; 5008 struct nvmf_vfio_user_ctrlr *ctrlr; 5009 struct spdk_nvmf_request *req; 5010 struct spdk_nvmf_fabric_connect_data *data; 5011 bool admin; 5012 5013 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5014 sq->group = group; 5015 ctrlr = sq->ctrlr; 5016 5017 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 5018 ctrlr_id(ctrlr), sq->qpair.qid, 5019 sq, qpair, group); 5020 5021 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 5022 5023 vu_req = get_nvmf_vfio_user_req(sq); 5024 if (vu_req == NULL) { 5025 return -1; 5026 } 5027 5028 req = &vu_req->req; 5029 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 5030 req->cmd->connect_cmd.cid = 0; 5031 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 5032 req->cmd->connect_cmd.recfmt = 0; 5033 req->cmd->connect_cmd.sqsize = sq->size - 1; 5034 req->cmd->connect_cmd.qid = admin ? 
0 : qpair->qid; 5035 5036 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 5037 req->data = calloc(1, req->length); 5038 if (req->data == NULL) { 5039 nvmf_vfio_user_req_free(req); 5040 return -ENOMEM; 5041 } 5042 5043 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 5044 data->cntlid = ctrlr->cntlid; 5045 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5046 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5047 5048 vu_req->cb_fn = handle_queue_connect_rsp; 5049 vu_req->cb_arg = sq; 5050 5051 SPDK_DEBUGLOG(nvmf_vfio, 5052 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5053 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5054 5055 spdk_nvmf_request_exec_fabrics(req); 5056 return 0; 5057 } 5058 5059 static int 5060 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5061 struct spdk_nvmf_qpair *qpair) 5062 { 5063 struct nvmf_vfio_user_sq *sq; 5064 struct nvmf_vfio_user_poll_group *vu_group; 5065 5066 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5067 5068 SPDK_DEBUGLOG(nvmf_vfio, 5069 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5070 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5071 5072 5073 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5074 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5075 5076 return 0; 5077 } 5078 5079 static void 5080 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5081 { 5082 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5083 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5084 vu_req->iovcnt = 0; 5085 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5086 5087 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5088 } 5089 5090 static int 5091 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5092 { 5093 struct nvmf_vfio_user_sq *sq; 5094 struct nvmf_vfio_user_req *vu_req; 5095 5096 assert(req != NULL); 5097 5098 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5099 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5100 5101 _nvmf_vfio_user_req_free(sq, vu_req); 5102 5103 return 0; 5104 } 5105 5106 static int 5107 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5108 { 5109 struct nvmf_vfio_user_sq *sq; 5110 struct nvmf_vfio_user_req *vu_req; 5111 5112 assert(req != NULL); 5113 5114 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5115 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5116 5117 if (vu_req->cb_fn != NULL) { 5118 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5119 fail_ctrlr(sq->ctrlr); 5120 } 5121 } 5122 5123 _nvmf_vfio_user_req_free(sq, vu_req); 5124 5125 return 0; 5126 } 5127 5128 static void 5129 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5130 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5131 { 5132 struct nvmf_vfio_user_sq *sq; 5133 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5134 struct nvmf_vfio_user_endpoint *endpoint; 5135 5136 assert(qpair != NULL); 5137 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5138 vu_ctrlr = sq->ctrlr; 5139 endpoint = vu_ctrlr->endpoint; 5140 5141 pthread_mutex_lock(&endpoint->lock); 5142 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5143 delete_sq_done(vu_ctrlr, sq); 5144 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5145 endpoint->ctrlr = NULL; 5146 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5147 /* The controller will be freed, we can resume the subsystem 5148 * now so that the endpoint can be ready to accept another 5149 * 
new connection. 5150 */ 5151 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5152 vfio_user_endpoint_resume_done, endpoint); 5153 } 5154 free_ctrlr(vu_ctrlr); 5155 } 5156 pthread_mutex_unlock(&endpoint->lock); 5157 5158 if (cb_fn) { 5159 cb_fn(cb_arg); 5160 } 5161 } 5162 5163 /** 5164 * Returns a preallocated request, or NULL if there isn't one available. 5165 */ 5166 static struct nvmf_vfio_user_req * 5167 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5168 { 5169 struct nvmf_vfio_user_req *req; 5170 5171 if (sq == NULL) { 5172 return NULL; 5173 } 5174 5175 req = TAILQ_FIRST(&sq->free_reqs); 5176 if (req == NULL) { 5177 return NULL; 5178 } 5179 5180 TAILQ_REMOVE(&sq->free_reqs, req, link); 5181 5182 return req; 5183 } 5184 5185 static int 5186 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5187 { 5188 uint16_t nr; 5189 uint32_t nlb, nsid; 5190 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5191 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5192 struct spdk_nvmf_ns *ns; 5193 5194 nsid = cmd->nsid; 5195 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5196 if (ns == NULL || ns->bdev == NULL) { 5197 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5198 return -EINVAL; 5199 } 5200 5201 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5202 nr = cmd->cdw10_bits.dsm.nr + 1; 5203 return nr * sizeof(struct spdk_nvme_dsm_range); 5204 } 5205 5206 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5207 return nlb * spdk_bdev_get_block_size(ns->bdev); 5208 } 5209 5210 static int 5211 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5212 { 5213 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5214 uint32_t len = 0; 5215 uint8_t fid; 5216 int iovcnt; 5217 5218 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5219 req->length = 0; 5220 req->data = NULL; 5221 5222 if (req->xfer == SPDK_NVME_DATA_NONE) { 5223 return 0; 5224 } 5225 5226 switch (cmd->opc) { 5227 case SPDK_NVME_OPC_IDENTIFY: 5228 len = 4096; 5229 break; 5230 case SPDK_NVME_OPC_GET_LOG_PAGE: 5231 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 5232 break; 5233 case SPDK_NVME_OPC_GET_FEATURES: 5234 case SPDK_NVME_OPC_SET_FEATURES: 5235 fid = cmd->cdw10_bits.set_features.fid; 5236 switch (fid) { 5237 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5238 len = 4096; 5239 break; 5240 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5241 len = 256; 5242 break; 5243 case SPDK_NVME_FEAT_TIMESTAMP: 5244 len = 8; 5245 break; 5246 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5247 len = 512; 5248 break; 5249 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5250 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5251 len = 16; 5252 } else { 5253 len = 8; 5254 } 5255 break; 5256 default: 5257 return 0; 5258 } 5259 break; 5260 default: 5261 return 0; 5262 } 5263 5264 /* ADMIN command will not use SGL */ 5265 if (cmd->psdt != 0) { 5266 return -EINVAL; 5267 } 5268 5269 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5270 if (iovcnt < 0) { 5271 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5272 ctrlr_id(ctrlr), cmd->opc); 5273 return -1; 5274 } 5275 req->length = len; 5276 req->data = req->iov[0].iov_base; 5277 req->iovcnt = iovcnt; 5278 5279 return 0; 5280 } 5281 5282 /* 5283 * Map an I/O command's buffers. 5284 * 5285 * Returns 0 on success and -errno on failure. 
5286 */ 5287 static int 5288 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5289 { 5290 int len, iovcnt; 5291 struct spdk_nvme_cmd *cmd; 5292 5293 assert(ctrlr != NULL); 5294 assert(req != NULL); 5295 5296 cmd = &req->cmd->nvme_cmd; 5297 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5298 req->length = 0; 5299 req->data = NULL; 5300 5301 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5302 return 0; 5303 } 5304 5305 len = get_nvmf_io_req_length(req); 5306 if (len < 0) { 5307 return -EINVAL; 5308 } 5309 req->length = len; 5310 5311 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5312 if (iovcnt < 0) { 5313 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5314 return -EFAULT; 5315 } 5316 req->data = req->iov[0].iov_base; 5317 req->iovcnt = iovcnt; 5318 5319 return 0; 5320 } 5321 5322 static int 5323 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5324 struct nvmf_vfio_user_sq *sq) 5325 { 5326 int err; 5327 struct nvmf_vfio_user_req *vu_req; 5328 struct spdk_nvmf_request *req; 5329 5330 assert(ctrlr != NULL); 5331 assert(cmd != NULL); 5332 5333 vu_req = get_nvmf_vfio_user_req(sq); 5334 if (spdk_unlikely(vu_req == NULL)) { 5335 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5336 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5337 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5338 5339 } 5340 req = &vu_req->req; 5341 5342 assert(req->qpair != NULL); 5343 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5344 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5345 5346 vu_req->cb_fn = handle_cmd_rsp; 5347 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5348 req->cmd->nvme_cmd = *cmd; 5349 5350 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5351 err = map_admin_cmd_req(ctrlr, req); 5352 } else { 5353 switch (cmd->opc) { 5354 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5355 case SPDK_NVME_OPC_RESERVATION_REPORT: 5356 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5357 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5358 err = -ENOTSUP; 5359 break; 5360 default: 5361 err = map_io_cmd_req(ctrlr, req); 5362 break; 5363 } 5364 } 5365 5366 if (spdk_unlikely(err < 0)) { 5367 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5368 ctrlr_id(ctrlr), cmd->opc); 5369 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5370 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5371 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5372 _nvmf_vfio_user_req_free(sq, vu_req); 5373 return err; 5374 } 5375 5376 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5377 spdk_nvmf_request_exec(req); 5378 5379 return 0; 5380 } 5381 5382 /* 5383 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5384 * here: if the host isn't up to date, and is apparently not actively processing 5385 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5386 */ 5387 static void 5388 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5389 struct nvmf_vfio_user_sq *sq) 5390 { 5391 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5392 uint32_t cq_head; 5393 uint32_t cq_tail; 5394 5395 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5396 return; 5397 } 5398 5399 cq_tail = *cq_tailp(cq); 5400 5401 /* Already sent? 
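 * If the CQ tail hasn't moved since the last interrupt we triggered,
 * there is nothing new to signal.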
*/
5402 if (cq_tail == cq->last_trigger_irq_tail) {
5403 return;
5404 }
5405
5406 spdk_ivdt_dcache(cq_dbl_headp(cq));
5407 cq_head = *cq_dbl_headp(cq);
5408
5409 if (cq_head != cq_tail && cq_head == cq->last_head) {
5410 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
5411 if (err != 0) {
5412 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
5413 ctrlr_id(ctrlr));
5414 } else {
5415 cq->last_trigger_irq_tail = cq_tail;
5416 }
5417 }
5418
5419 cq->last_head = cq_head;
5420 }
5421
5422 /* Returns the number of commands processed, or a negative value on error. */
5423 static int
5424 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq)
5425 {
5426 struct nvmf_vfio_user_ctrlr *ctrlr;
5427 uint32_t new_tail;
5428 int count = 0;
5429
5430 assert(sq != NULL);
5431
5432 ctrlr = sq->ctrlr;
5433
5434 /*
5435 * A quiesced, or migrating, controller should never process new
5436 * commands.
5437 */
5438 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
5439 return SPDK_POLLER_IDLE;
5440 }
5441
5442 if (ctrlr->adaptive_irqs_enabled) {
5443 handle_suppressed_irq(ctrlr, sq);
5444 }
5445
5446 /* On aarch64 platforms, a doorbell update from the guest VM may not be
5447 * visible on the SPDK target side. This is caused by a memory type
5448 * mismatch: on the guest side the doorbells are mapped as device
5449 * memory, while on the SPDK target side they are treated as normal
5450 * memory. Refer to "https://developer.arm.com/documentation/102376/0100/
5451 * Memory-aliasing-and-mismatched-memory-types".
5452 * A barrier such as spdk_mb() alone cannot fix this; invalidating the
5453 * cache line with "dc civac" does, which is what spdk_ivdt_dcache()
5454 * does below.
5455 */
5456 spdk_ivdt_dcache(sq_dbl_tailp(sq));
5457
5458 /* Load-Acquire. */
5459 new_tail = *sq_dbl_tailp(sq);
5460
5461 new_tail = new_tail & 0xffffu;
5462 if (spdk_unlikely(new_tail >= sq->size)) {
5463 union spdk_nvme_async_event_completion event = {};
5464
5465 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid,
5466 new_tail);
5467 event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR;
5468 event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE;
5469 nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event);
5470
5471 return -1;
5472 }
5473
5474 if (*sq_headp(sq) == new_tail) {
5475 return 0;
5476 }
5477
5478 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n",
5479 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail);
5480 if (ctrlr->sdbl != NULL) {
5481 SPDK_DEBUGLOG(nvmf_vfio,
5482 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n",
5483 ctrlr_id(ctrlr), sq->qid,
5484 ctrlr->bar0_doorbells[queue_index(sq->qid, false)],
5485 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)],
5486 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]);
5487 }
5488
5489 /*
5490 * Ensure that changes to the queue are visible to us.
5491 * The host driver should write the queue first, do a wmb(), and then
5492 * update the SQ tail doorbell (their Store-Release).
5493 */
5494 spdk_rmb();
5495
5496 count = handle_sq_tdbl_write(ctrlr, new_tail, sq);
5497 if (count < 0) {
5498 fail_ctrlr(ctrlr);
5499 }
5500
5501 return count;
5502 }
5503
5504 /*
5505 * vfio-user transport poll handler. Note that the library context is polled in
5506 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
5507 * active SQs.
5508 *
5509 * Returns the number of commands processed, or a negative value on error.
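 * Inactive or zero-sized SQs are skipped.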
5510 */ 5511 static int 5512 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 5513 { 5514 struct nvmf_vfio_user_poll_group *vu_group; 5515 struct nvmf_vfio_user_sq *sq, *tmp; 5516 int count = 0; 5517 5518 assert(group != NULL); 5519 5520 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5521 5522 SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n"); 5523 5524 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) { 5525 int ret; 5526 5527 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 5528 continue; 5529 } 5530 5531 ret = nvmf_vfio_user_sq_poll(sq); 5532 5533 if (ret < 0) { 5534 return ret; 5535 } 5536 5537 count += ret; 5538 } 5539 5540 return count; 5541 } 5542 5543 static int 5544 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5545 struct spdk_nvme_transport_id *trid) 5546 { 5547 struct nvmf_vfio_user_sq *sq; 5548 struct nvmf_vfio_user_ctrlr *ctrlr; 5549 5550 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5551 ctrlr = sq->ctrlr; 5552 5553 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5554 return 0; 5555 } 5556 5557 static int 5558 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 5559 struct spdk_nvme_transport_id *trid) 5560 { 5561 return 0; 5562 } 5563 5564 static int 5565 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5566 struct spdk_nvme_transport_id *trid) 5567 { 5568 struct nvmf_vfio_user_sq *sq; 5569 struct nvmf_vfio_user_ctrlr *ctrlr; 5570 5571 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5572 ctrlr = sq->ctrlr; 5573 5574 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5575 return 0; 5576 } 5577 5578 static void 5579 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 5580 struct spdk_nvmf_request *req) 5581 { 5582 struct spdk_nvmf_request *req_to_abort = NULL; 5583 struct spdk_nvmf_request *temp_req = NULL; 5584 uint16_t cid; 5585 5586 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5587 5588 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) { 5589 struct nvmf_vfio_user_req *vu_req; 5590 5591 vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req); 5592 5593 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 5594 req_to_abort = temp_req; 5595 break; 5596 } 5597 } 5598 5599 if (req_to_abort == NULL) { 5600 spdk_nvmf_request_complete(req); 5601 return; 5602 } 5603 5604 req->req_to_abort = req_to_abort; 5605 nvmf_ctrlr_abort_request(req); 5606 } 5607 5608 static void 5609 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 5610 { 5611 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 5612 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 5613 opts->in_capsule_data_size = 0; 5614 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 5615 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 5616 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 5617 opts->num_shared_buffers = 0; 5618 opts->buf_cache_size = 0; 5619 opts->association_timeout = 0; 5620 opts->transport_specific = NULL; 5621 } 5622 5623 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 5624 .name = "VFIOUSER", 5625 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 5626 .opts_init = nvmf_vfio_user_opts_init, 5627 .create = nvmf_vfio_user_create, 5628 .destroy = nvmf_vfio_user_destroy, 5629 5630 .listen = nvmf_vfio_user_listen, 5631 .stop_listen = nvmf_vfio_user_stop_listen, 5632 .cdata_init = nvmf_vfio_user_cdata_init, 5633 
.listen_associate = nvmf_vfio_user_listen_associate, 5634 5635 .listener_discover = nvmf_vfio_user_discover, 5636 5637 .poll_group_create = nvmf_vfio_user_poll_group_create, 5638 .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group, 5639 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 5640 .poll_group_add = nvmf_vfio_user_poll_group_add, 5641 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 5642 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 5643 5644 .req_free = nvmf_vfio_user_req_free, 5645 .req_complete = nvmf_vfio_user_req_complete, 5646 5647 .qpair_fini = nvmf_vfio_user_close_qpair, 5648 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 5649 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 5650 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 5651 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 5652 }; 5653 5654 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 5655 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 5656 SPDK_LOG_REGISTER_COMPONENT(vfio_user_db) 5657