1 /*- 2 * BSD LICENSE 3 * Copyright (c) Intel Corporation. All rights reserved. 4 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved. 5 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /* 35 * NVMe over vfio-user transport 36 */ 37 38 #include <vfio-user/libvfio-user.h> 39 #include <vfio-user/pci_defs.h> 40 41 #include "spdk/barrier.h" 42 #include "spdk/stdinc.h" 43 #include "spdk/assert.h" 44 #include "spdk/thread.h" 45 #include "spdk/nvmf_transport.h" 46 #include "spdk/sock.h" 47 #include "spdk/string.h" 48 #include "spdk/util.h" 49 #include "spdk/log.h" 50 51 #include "transport.h" 52 53 #include "nvmf_internal.h" 54 55 #define SWAP(x, y) \ 56 do \ 57 { \ 58 typeof(x) _tmp = x; \ 59 x = y; \ 60 y = _tmp; \ 61 } while (0) 62 63 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 64 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 65 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 66 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 67 68 #define NVME_DOORBELLS_OFFSET 0x1000 69 #define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000 70 #define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2 71 #define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3 72 #define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX 73 74 /* 75 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 76 * available on PCI-X 2.0 and PCI Express buses 77 */ 78 #define NVME_REG_CFG_SIZE 0x1000 79 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 80 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR ((NVMF_VFIO_USER_DOORBELLS_SIZE) / 8) 81 #define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 82 /* MSIX Table Size */ 83 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 84 /* MSIX Pending Bit Array Size */ 85 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / 8), 0x1000) 86 87 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR 
(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 88 89 struct nvmf_vfio_user_req; 90 91 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 92 93 /* 1 more for PRP2 list itself */ 94 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 95 96 enum nvmf_vfio_user_req_state { 97 VFIO_USER_REQUEST_STATE_FREE = 0, 98 VFIO_USER_REQUEST_STATE_EXECUTING, 99 }; 100 101 /* NVMe device state representation */ 102 struct nvme_migr_sq_state { 103 uint16_t sqid; 104 uint16_t cqid; 105 uint32_t head; 106 uint32_t size; 107 uint32_t reserved; 108 uint64_t dma_addr; 109 }; 110 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 111 112 struct nvme_migr_cq_state { 113 uint16_t cqid; 114 uint16_t phase; 115 uint32_t tail; 116 uint32_t size; 117 uint32_t iv; 118 uint32_t ien; 119 uint32_t reserved; 120 uint64_t dma_addr; 121 }; 122 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 123 124 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 125 126 /* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned. 127 * 128 * NVMe device migration region is defined as below: 129 * ------------------------------------------------------------------------- 130 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs | 131 * ------------------------------------------------------------------------- 132 * 133 * Keep vfio_user_nvme_migr_header as a fixed 0x1000 length, all new added fields 134 * can use the reserved space at the end of the data structure. 135 */ 136 struct vfio_user_nvme_migr_header { 137 /* Magic value to validate migration data */ 138 uint32_t magic; 139 /* Version to check the data is same from source to destination */ 140 uint32_t version; 141 142 /* The library uses this field to know how many fields in this 143 * structure are valid, starting at the beginning of this data 144 * structure. New added fields in future use `unused` memory 145 * spaces. 146 */ 147 uint32_t opts_size; 148 uint32_t reserved0; 149 150 /* BARs information */ 151 uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS]; 152 uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS]; 153 154 /* Queue pair start offset, starting at the beginning of this 155 * data structure. 156 */ 157 uint64_t qp_offset; 158 uint64_t qp_len; 159 160 /* Controller data structure */ 161 uint32_t num_io_queues; 162 uint32_t reserved1; 163 164 /* TODO: this part will be moved to common nvmf controller data */ 165 uint16_t reserved2[3]; 166 uint16_t nr_aers; 167 uint16_t aer_cids[NVMF_MIGR_MAX_PENDING_AERS]; 168 169 /* NVMf controller data offset and length if exist, starting at 170 * the beginning of this data structure. 171 */ 172 uint64_t nvmf_data_offset; 173 uint64_t nvmf_data_len; 174 175 /* 176 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA 177 * address. 178 */ 179 bool sdbl; 180 181 /* Shadow doorbell DMA addresses. */ 182 uint64_t shadow_doorbell_buffer; 183 uint64_t eventidx_buffer; 184 185 /* Reserved memory space for new added fields, the 186 * field is always at the end of this data structure. 
 */
	uint8_t unused[3336];
};
SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");

struct vfio_user_nvme_migr_qp {
	struct nvme_migr_sq_state sq;
	struct nvme_migr_cq_state cq;
};

/* NVMe state definition used to load/restore from/to NVMe migration BAR region */
struct vfio_user_nvme_migr_state {
	struct vfio_user_nvme_migr_header ctrlr_header;
	struct nvmf_ctrlr_migr_data nvmf_data;
	struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	uint8_t bar0[NVME_REG_BAR0_SIZE];
	uint8_t cfg[NVME_REG_CFG_SIZE];
};

struct nvmf_vfio_user_req {
	struct spdk_nvmf_request req;
	struct spdk_nvme_cpl rsp;
	struct spdk_nvme_cmd cmd;

	enum nvmf_vfio_user_req_state state;
	nvmf_vfio_user_req_cb_fn cb_fn;
	void *cb_arg;

	/* old CC before prop_set_cc fabric command */
	union spdk_nvme_cc_register cc;

	TAILQ_ENTRY(nvmf_vfio_user_req) link;

	struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS];
	uint8_t iovcnt;

	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
	uint8_t sg[];
};

/*
 * Mapping of an NVMe queue.
 *
 * This holds the information tracking a local process mapping of an NVMe queue
 * shared by the client.
 */
struct nvme_q_mapping {
	/* iov of local process mapping. */
	struct iovec iov;
	/* Stored sg, needed for unmap. */
	dma_sg_t *sg;
	/* Client PRP of queue. */
	uint64_t prp1;
};

enum nvmf_vfio_user_sq_state {
	VFIO_USER_SQ_UNUSED = 0,
	VFIO_USER_SQ_CREATED,
	VFIO_USER_SQ_DELETED,
	VFIO_USER_SQ_ACTIVE,
	VFIO_USER_SQ_INACTIVE
};

enum nvmf_vfio_user_cq_state {
	VFIO_USER_CQ_UNUSED = 0,
	VFIO_USER_CQ_CREATED,
	VFIO_USER_CQ_DELETED,
};

enum nvmf_vfio_user_ctrlr_state {
	VFIO_USER_CTRLR_CREATING = 0,
	VFIO_USER_CTRLR_RUNNING,
	/* Quiesce requested by libvfio-user */
	VFIO_USER_CTRLR_PAUSING,
	/* NVMf subsystem is paused, it's safe to do PCI reset, memory register,
	 * memory unregister, and vfio migration state transition in this state.
	 */
	VFIO_USER_CTRLR_PAUSED,
	/*
	 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI
	 * reset, memory register and unregister, controller in destination VM has
	 * been restored). NVMf subsystem resume has been requested.
	 */
	VFIO_USER_CTRLR_RESUMING,
	/*
	 * Implies that the NVMf subsystem is paused. Both the controller in the
	 * source VM and the controller in the destination VM are in this state
	 * during live migration.
	 */
	VFIO_USER_CTRLR_MIGRATING
};

/* Migration region to record NVMe device state data structure */
struct vfio_user_migration_region {
	uint64_t last_data_offset;
	uint64_t pending_bytes;
};

struct nvmf_vfio_user_sq {
	struct spdk_nvmf_qpair qpair;
	struct spdk_nvmf_transport_poll_group *group;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_sq_state sq_state;

	uint32_t head;
	volatile uint32_t *dbl_tailp;

	/* Whether a shadow doorbell eventidx needs setting.
*/ 299 bool need_rearm; 300 301 /* multiple SQs can be mapped to the same CQ */ 302 uint16_t cqid; 303 304 /* handle_queue_connect_rsp() can be used both for CREATE IO SQ response 305 * and SQ re-connect response in the destination VM, for the prior case, 306 * we will post a NVMe completion to VM, we will not set this flag when 307 * re-connecting SQs in the destination VM. 308 */ 309 bool post_create_io_sq_completion; 310 /* Copy of Create IO SQ command, this field is used together with 311 * `post_create_io_sq_completion` flag. 312 */ 313 struct spdk_nvme_cmd create_io_sq_cmd; 314 315 /* Currently unallocated reqs. */ 316 TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs; 317 /* Poll group entry */ 318 TAILQ_ENTRY(nvmf_vfio_user_sq) link; 319 /* Connected SQ entry */ 320 TAILQ_ENTRY(nvmf_vfio_user_sq) tailq; 321 }; 322 323 struct nvmf_vfio_user_cq { 324 struct spdk_nvmf_transport_poll_group *group; 325 struct spdk_thread *thread; 326 uint32_t cq_ref; 327 328 uint32_t qid; 329 /* Number of entries in queue. */ 330 uint32_t size; 331 struct nvme_q_mapping mapping; 332 enum nvmf_vfio_user_cq_state cq_state; 333 334 uint32_t tail; 335 volatile uint32_t *dbl_headp; 336 337 bool phase; 338 339 uint16_t iv; 340 bool ien; 341 342 uint32_t last_head; 343 uint32_t last_trigger_irq_tail; 344 }; 345 346 struct nvmf_vfio_user_poll_group { 347 struct spdk_nvmf_transport_poll_group group; 348 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 349 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 350 }; 351 352 struct nvmf_vfio_user_shadow_doorbells { 353 volatile uint32_t *shadow_doorbells; 354 volatile uint32_t *eventidxs; 355 dma_sg_t *sgs; 356 struct iovec *iovs; 357 }; 358 359 struct nvmf_vfio_user_ctrlr { 360 struct nvmf_vfio_user_endpoint *endpoint; 361 struct nvmf_vfio_user_transport *transport; 362 363 /* Connected SQs list */ 364 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 365 enum nvmf_vfio_user_ctrlr_state state; 366 367 struct vfio_user_migration_region migr_reg; 368 /* Controller is in source VM when doing live migration */ 369 bool in_source_vm; 370 371 struct spdk_thread *thread; 372 struct spdk_poller *vfu_ctx_poller; 373 struct spdk_interrupt *intr; 374 int intr_fd; 375 376 bool queued_quiesce; 377 378 bool reset_shn; 379 380 uint16_t cntlid; 381 struct spdk_nvmf_ctrlr *ctrlr; 382 383 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 384 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 385 386 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 387 388 volatile uint32_t *bar0_doorbells; 389 struct nvmf_vfio_user_shadow_doorbells *sdbl; 390 /* 391 * Shadow doorbells PRPs to provide during the stop-and-copy state. 392 */ 393 uint64_t shadow_doorbell_buffer; 394 uint64_t eventidx_buffer; 395 396 bool self_kick_requested; 397 }; 398 399 /* Endpoint in vfio-user is associated with a socket file, which 400 * is the representative of a PCI endpoint. 
 */
struct nvmf_vfio_user_endpoint {
	struct nvmf_vfio_user_transport *transport;
	vfu_ctx_t *vfu_ctx;
	struct spdk_poller *accept_poller;
	struct spdk_thread *accept_thread;
	bool interrupt_mode;
	struct msixcap *msix;
	vfu_pci_config_space_t *pci_config_space;
	int devmem_fd;
	int accept_intr_fd;
	struct spdk_interrupt *accept_intr;

	volatile uint32_t *bar0_doorbells;

	int migr_fd;
	void *migr_data;

	struct spdk_nvme_transport_id trid;
	const struct spdk_nvmf_subsystem *subsystem;

	/* The controller is associated with an active socket connection, so its
	 * lifecycle is the same as the VM's. We currently support only one
	 * active connection; as the NVMe specification allows, we may support
	 * multiple controllers per endpoint in the future, e.g. to support
	 * RESERVATION.
	 */
	struct nvmf_vfio_user_ctrlr *ctrlr;
	pthread_mutex_t lock;

	bool need_async_destroy;
	/* The subsystem is in PAUSED state and needs to be resumed; true only
	 * when migration has completed successfully and the controller is in
	 * the source VM.
	 */
	bool need_resume;

	TAILQ_ENTRY(nvmf_vfio_user_endpoint) link;
};

struct nvmf_vfio_user_transport_opts {
	bool disable_mappable_bar0;
	bool disable_adaptive_irq;
	bool disable_shadow_doorbells;
};

struct nvmf_vfio_user_transport {
	struct spdk_nvmf_transport transport;
	struct nvmf_vfio_user_transport_opts transport_opts;
	bool intr_mode_supported;
	pthread_mutex_t lock;
	TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints;

	pthread_mutex_t pg_lock;
	TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups;
	struct nvmf_vfio_user_poll_group *next_pg;
};

/*
 * function prototypes
 */
static int
nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);

static struct nvmf_vfio_user_req *
get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq);

/*
 * Local process virtual address of a queue.
470 */ 471 static inline void * 472 q_addr(struct nvme_q_mapping *mapping) 473 { 474 return mapping->iov.iov_base; 475 } 476 477 static inline int 478 queue_index(uint16_t qid, bool is_cq) 479 { 480 return (qid * 2) + is_cq; 481 } 482 483 static inline volatile uint32_t * 484 sq_headp(struct nvmf_vfio_user_sq *sq) 485 { 486 assert(sq != NULL); 487 return &sq->head; 488 } 489 490 static inline volatile uint32_t * 491 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 492 { 493 assert(sq != NULL); 494 return sq->dbl_tailp; 495 } 496 497 static inline volatile uint32_t * 498 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 499 { 500 assert(cq != NULL); 501 return cq->dbl_headp; 502 } 503 504 static inline volatile uint32_t * 505 cq_tailp(struct nvmf_vfio_user_cq *cq) 506 { 507 assert(cq != NULL); 508 return &cq->tail; 509 } 510 511 static inline void 512 sq_head_advance(struct nvmf_vfio_user_sq *sq) 513 { 514 assert(sq != NULL); 515 516 assert(*sq_headp(sq) < sq->size); 517 (*sq_headp(sq))++; 518 519 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 520 *sq_headp(sq) = 0; 521 } 522 } 523 524 static inline void 525 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 526 { 527 assert(cq != NULL); 528 529 assert(*cq_tailp(cq) < cq->size); 530 (*cq_tailp(cq))++; 531 532 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 533 *cq_tailp(cq) = 0; 534 cq->phase = !cq->phase; 535 } 536 } 537 538 static inline bool 539 cq_is_full(struct nvmf_vfio_user_cq *cq) 540 { 541 uint32_t qindex; 542 543 assert(cq != NULL); 544 545 qindex = *cq_tailp(cq) + 1; 546 if (spdk_unlikely(qindex == cq->size)) { 547 qindex = 0; 548 } 549 550 return qindex == *cq_dbl_headp(cq); 551 } 552 553 static bool 554 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 555 { 556 assert(vu_ctrlr != NULL); 557 558 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 559 return false; 560 } 561 562 if (is_cq) { 563 if (vu_ctrlr->cqs[qid] == NULL) { 564 return false; 565 } 566 567 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 568 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 569 } 570 571 if (vu_ctrlr->sqs[qid] == NULL) { 572 return false; 573 } 574 575 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 576 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 577 } 578 579 static inline size_t 580 vfio_user_migr_data_len(void) 581 { 582 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 583 } 584 585 static int 586 vfio_user_handle_intr(void *ctx); 587 588 /* 589 * Wrap vfio_user_handle_intr() such that it can be used with 590 * spdk_thread_send_msg(). 591 * Pollers have type int (*)(void *) while message functions should have type 592 * void (*)(void *), so simply discard the returned value. 593 */ 594 static void 595 vfio_user_handle_intr_wrapper(void *ctx) 596 { 597 vfio_user_handle_intr(ctx); 598 } 599 600 static inline int 601 self_kick(struct nvmf_vfio_user_ctrlr *ctrlr) 602 { 603 assert(ctrlr != NULL); 604 assert(ctrlr->thread != NULL); 605 606 if (ctrlr->self_kick_requested) { 607 return 0; 608 } 609 610 ctrlr->self_kick_requested = true; 611 612 return spdk_thread_send_msg(ctrlr->thread, 613 vfio_user_handle_intr_wrapper, 614 ctrlr); 615 } 616 617 /* 618 * Make the given DMA address and length available (locally mapped) via iov. 
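 *
 * A minimal usage sketch (illustrative only, not part of the transport):
 * map a hypothetical guest buffer at gpa/len for read/write access, use
 * it, then unmap it. The dma_sg_t must be allocated with dma_sg_size(),
 * since its layout is private to libvfio-user.
 *
 *	dma_sg_t *sg = calloc(1, dma_sg_size());
 *	struct iovec iov = { 0 };
 *	void *va = map_one(endpoint->vfu_ctx, gpa, len, sg, &iov,
 *			   PROT_READ | PROT_WRITE);
 *	if (va != NULL) {
 *		... access [va, va + len) directly ...
 *		vfu_unmap_sg(endpoint->vfu_ctx, sg, &iov, 1);
 *	}
 *	free(sg);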
 */
static void *
map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg,
	struct iovec *iov, int prot)
{
	int ret;

	assert(ctx != NULL);
	assert(sg != NULL);
	assert(iov != NULL);

	ret = vfu_addr_to_sg(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot);
	if (ret < 0) {
		return NULL;
	}

	ret = vfu_map_sg(ctx, sg, iov, 1, 0);
	if (ret != 0) {
		return NULL;
	}

	assert(iov->iov_base != NULL);
	return iov->iov_base;
}

static int
nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
		  uint32_t max_iovcnt, uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint64_t prp1, prp2;
	void *vva;
	uint32_t i;
	uint32_t residue_len, nents;
	uint64_t *prp_list;
	uint32_t iovcnt;

	assert(max_iovcnt > 0);

	prp1 = cmd->dptr.prp.prp1;
	prp2 = cmd->dptr.prp.prp2;

	/* PRP1 may start at an unaligned page address */
	residue_len = mps - (prp1 % mps);
	residue_len = spdk_min(len, residue_len);

	vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE);
	if (spdk_unlikely(vva == NULL)) {
		SPDK_ERRLOG("GPA to VVA failed\n");
		return -EINVAL;
	}
	len -= residue_len;
	if (len && max_iovcnt < 2) {
		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
		return -ERANGE;
	}
	iovs[0].iov_base = vva;
	iovs[0].iov_len = residue_len;

	if (len) {
		if (spdk_unlikely(prp2 == 0)) {
			SPDK_ERRLOG("no PRP2, %d remaining\n", len);
			return -EINVAL;
		}

		if (len <= mps) {
			/* 2 PRP used */
			iovcnt = 2;
			vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n",
					    prp2, len);
				return -EINVAL;
			}
			iovs[1].iov_base = vva;
			iovs[1].iov_len = len;
		} else {
			/* PRP list used */
			nents = (len + mps - 1) / mps;
			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
				SPDK_ERRLOG("Too many page entries\n");
				return -ERANGE;
			}

			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
					    prp2, nents);
				return -EINVAL;
			}
			prp_list = vva;
			i = 0;
			while (len != 0) {
				residue_len = spdk_min(len, mps);
				vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE);
				if (spdk_unlikely(vva == NULL)) {
					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
						    prp_list[i], residue_len);
					return -EINVAL;
				}
				iovs[i + 1].iov_base = vva;
				iovs[i + 1].iov_len = residue_len;
				len -= residue_len;
				i++;
			}
			iovcnt = i + 1;
		}
	} else {
		/* 1 PRP used */
		iovcnt = 1;
	}

	assert(iovcnt <= max_iovcnt);
	return iovcnt;
}

static int
nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
		       struct iovec *iovs, uint32_t max_iovcnt,
		       void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint32_t i;
	void *vva;

	if (spdk_unlikely(max_iovcnt < num_sgls)) {
		return -ERANGE;
	}

	for (i = 0; i < num_sgls; i++) {
		if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type);
			return -EINVAL;
		}
		vva = gpa_to_vva(prv, sgls[i].address,
sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 753 if (spdk_unlikely(vva == NULL)) { 754 SPDK_ERRLOG("GPA to VVA failed\n"); 755 return -EINVAL; 756 } 757 iovs[i].iov_base = vva; 758 iovs[i].iov_len = sgls[i].unkeyed.length; 759 } 760 761 return num_sgls; 762 } 763 764 static int 765 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 766 uint32_t len, size_t mps, 767 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 768 { 769 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 770 uint32_t num_sgls, seg_len; 771 void *vva; 772 int ret; 773 uint32_t total_iovcnt = 0; 774 775 /* SGL cases */ 776 sgl = &cmd->dptr.sgl1; 777 778 /* only one SGL segment */ 779 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 780 assert(max_iovcnt > 0); 781 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 782 if (spdk_unlikely(vva == NULL)) { 783 SPDK_ERRLOG("GPA to VVA failed\n"); 784 return -EINVAL; 785 } 786 iovs[0].iov_base = vva; 787 iovs[0].iov_len = sgl->unkeyed.length; 788 assert(sgl->unkeyed.length == len); 789 790 return 1; 791 } 792 793 for (;;) { 794 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 795 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 796 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 797 return -EINVAL; 798 } 799 800 seg_len = sgl->unkeyed.length; 801 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 802 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 803 return -EINVAL; 804 } 805 806 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 807 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 808 if (spdk_unlikely(vva == NULL)) { 809 SPDK_ERRLOG("GPA to VVA failed\n"); 810 return -EINVAL; 811 } 812 813 /* sgl point to the first segment */ 814 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 815 last_sgl = &sgl[num_sgls - 1]; 816 817 /* we are done */ 818 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 819 /* map whole sgl list */ 820 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 821 max_iovcnt - total_iovcnt, gpa_to_vva); 822 if (spdk_unlikely(ret < 0)) { 823 return ret; 824 } 825 total_iovcnt += ret; 826 827 return total_iovcnt; 828 } 829 830 if (num_sgls > 1) { 831 /* map whole sgl exclude last_sgl */ 832 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 833 max_iovcnt - total_iovcnt, gpa_to_vva); 834 if (spdk_unlikely(ret < 0)) { 835 return ret; 836 } 837 total_iovcnt += ret; 838 } 839 840 /* move to next level's segments */ 841 sgl = last_sgl; 842 } 843 844 return 0; 845 } 846 847 static int 848 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 849 uint32_t len, size_t mps, 850 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 851 { 852 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 853 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 854 } 855 856 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 857 } 858 859 static char * 860 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 861 { 862 return endpoint->trid.traddr; 863 } 864 865 static char * 866 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 867 { 868 if (!ctrlr || !ctrlr->endpoint) { 869 return "Null Ctrlr"; 870 } 871 872 return endpoint_id(ctrlr->endpoint); 873 } 874 875 /* 876 * For each queue, update the location of its doorbell to the correct location: 877 
* either our own BAR0, or the guest's configured shadow doorbell area. 878 * 879 * The Admin queue (qid: 0) does not ever use shadow doorbells. 880 */ 881 static void 882 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 883 { 884 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 885 ctrlr->bar0_doorbells; 886 887 assert(doorbells != NULL); 888 889 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 890 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 891 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 892 893 if (sq != NULL) { 894 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 895 } 896 897 if (cq != NULL) { 898 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 899 } 900 } 901 } 902 903 static void 904 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 905 { 906 assert(vfu_ctx != NULL); 907 assert(sdbl != NULL); 908 909 /* 910 * An allocation error would result in only one of the two being 911 * non-NULL. If that is the case, no memory should have been mapped. 912 */ 913 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 914 return; 915 } 916 917 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 918 struct iovec *iov; 919 dma_sg_t *sg; 920 921 if (!sdbl->iovs[i].iov_len) { 922 continue; 923 } 924 925 sg = (dma_sg_t *)((uintptr_t)sdbl->sgs + i * dma_sg_size()); 926 iov = sdbl->iovs + i; 927 928 vfu_unmap_sg(vfu_ctx, sg, iov, 1); 929 } 930 } 931 932 static void 933 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 934 { 935 if (sdbl == NULL) { 936 return; 937 } 938 939 unmap_sdbl(vfu_ctx, sdbl); 940 941 /* 942 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 943 * not allocated, so don't free() them. 944 */ 945 free(sdbl->sgs); 946 free(sdbl->iovs); 947 free(sdbl); 948 } 949 950 static struct nvmf_vfio_user_shadow_doorbells * 951 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 952 { 953 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 954 dma_sg_t *sg2 = NULL; 955 void *p; 956 957 assert(vfu_ctx != NULL); 958 959 sdbl = calloc(1, sizeof(*sdbl)); 960 if (sdbl == NULL) { 961 goto err; 962 } 963 964 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 965 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 966 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 967 goto err; 968 } 969 970 /* Map shadow doorbell buffer (PRP1). */ 971 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, 972 PROT_READ | PROT_WRITE); 973 974 if (p == NULL) { 975 goto err; 976 } 977 978 /* 979 * Map eventidx buffer (PRP2). 980 * Should only be written to by the controller. 981 */ 982 983 sg2 = (dma_sg_t *)((uintptr_t)sdbl->sgs + dma_sg_size()); 984 985 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, 986 PROT_READ | PROT_WRITE); 987 988 if (p == NULL) { 989 goto err; 990 } 991 992 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 993 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 994 995 return sdbl; 996 997 err: 998 free_sdbl(vfu_ctx, sdbl); 999 return NULL; 1000 } 1001 1002 /* 1003 * Copy doorbells from one buffer to the other, during switches betweeen BAR0 1004 * doorbells and shadow doorbells. 
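 *
 * For illustration (the callers live elsewhere in this file), the expected
 * directions are roughly:
 *
 *	- enabling shadow doorbells (Doorbell Buffer Config):
 *		copy_doorbells(ctrlr, ctrlr->bar0_doorbells,
 *			       ctrlr->sdbl->shadow_doorbells);
 *	- switching back to BAR0 doorbells:
 *		copy_doorbells(ctrlr, ctrlr->sdbl->shadow_doorbells,
 *			       ctrlr->bar0_doorbells);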
1005 */ 1006 static void 1007 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1008 const volatile uint32_t *from, volatile uint32_t *to) 1009 { 1010 assert(ctrlr != NULL); 1011 assert(from != NULL); 1012 assert(to != NULL); 1013 1014 SPDK_DEBUGLOG(vfio_user_db, 1015 "%s: migrating shadow doorbells from %p to %p\n", 1016 ctrlr_id(ctrlr), from, to); 1017 1018 /* Can't use memcpy because it doesn't respect volatile semantics. */ 1019 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1020 if (ctrlr->sqs[i] != NULL) { 1021 to[queue_index(i, false)] = from[queue_index(i, false)]; 1022 } 1023 1024 if (ctrlr->cqs[i] != NULL) { 1025 to[queue_index(i, true)] = from[queue_index(i, true)]; 1026 } 1027 } 1028 } 1029 1030 static void 1031 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1032 { 1033 const struct spdk_nvmf_registers *regs; 1034 1035 assert(vu_ctrlr != NULL); 1036 assert(vu_ctrlr->ctrlr != NULL); 1037 1038 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1039 if (regs->csts.bits.cfs == 0) { 1040 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1041 } 1042 1043 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1044 } 1045 1046 static inline bool 1047 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1048 { 1049 assert(vu_ctrlr != NULL); 1050 assert(vu_ctrlr->endpoint != NULL); 1051 1052 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1053 1054 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1055 } 1056 1057 static void 1058 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1059 { 1060 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1061 1062 spdk_interrupt_unregister(&endpoint->accept_intr); 1063 spdk_poller_unregister(&endpoint->accept_poller); 1064 1065 if (endpoint->bar0_doorbells) { 1066 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1067 } 1068 1069 if (endpoint->devmem_fd > 0) { 1070 close(endpoint->devmem_fd); 1071 } 1072 1073 if (endpoint->migr_data) { 1074 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1075 } 1076 1077 if (endpoint->migr_fd > 0) { 1078 close(endpoint->migr_fd); 1079 } 1080 1081 if (endpoint->vfu_ctx) { 1082 vfu_destroy_ctx(endpoint->vfu_ctx); 1083 } 1084 1085 pthread_mutex_destroy(&endpoint->lock); 1086 free(endpoint); 1087 } 1088 1089 /* called when process exits */ 1090 static int 1091 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1092 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1093 { 1094 struct nvmf_vfio_user_transport *vu_transport; 1095 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1096 1097 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1098 1099 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1100 transport); 1101 1102 pthread_mutex_destroy(&vu_transport->lock); 1103 pthread_mutex_destroy(&vu_transport->pg_lock); 1104 1105 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1106 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1107 nvmf_vfio_user_destroy_endpoint(endpoint); 1108 } 1109 1110 free(vu_transport); 1111 1112 if (cb_fn) { 1113 cb_fn(cb_arg); 1114 } 1115 1116 return 0; 1117 } 1118 1119 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1120 { 1121 "disable_mappable_bar0", 1122 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1123 spdk_json_decode_bool, true 1124 }, 1125 { 1126 "disable_adaptive_irq", 1127 offsetof(struct 
nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1128 spdk_json_decode_bool, true 1129 }, 1130 { 1131 "disable_shadow_doorbells", 1132 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1133 spdk_json_decode_bool, true 1134 }, 1135 }; 1136 1137 static struct spdk_nvmf_transport * 1138 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1139 { 1140 struct nvmf_vfio_user_transport *vu_transport; 1141 int err; 1142 1143 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1144 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1145 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1146 return NULL; 1147 } 1148 1149 vu_transport = calloc(1, sizeof(*vu_transport)); 1150 if (vu_transport == NULL) { 1151 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1152 return NULL; 1153 } 1154 1155 err = pthread_mutex_init(&vu_transport->lock, NULL); 1156 if (err != 0) { 1157 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1158 goto err; 1159 } 1160 TAILQ_INIT(&vu_transport->endpoints); 1161 1162 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1163 if (err != 0) { 1164 pthread_mutex_destroy(&vu_transport->lock); 1165 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1166 goto err; 1167 } 1168 TAILQ_INIT(&vu_transport->poll_groups); 1169 1170 if (opts->transport_specific != NULL && 1171 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1172 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1173 vu_transport)) { 1174 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1175 goto cleanup; 1176 } 1177 1178 /* 1179 * To support interrupt mode, the transport must be configured with 1180 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1181 * when a client writes new doorbell values to BAR0, via the 1182 * libvfio-user socket fd. 1183 */ 1184 vu_transport->intr_mode_supported = 1185 vu_transport->transport_opts.disable_mappable_bar0; 1186 1187 /* 1188 * If BAR0 is mappable, it doesn't make sense to support shadow 1189 * doorbells, so explicitly turn it off. 
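	 *
	 * For reference, the knobs checked here are decoded from
	 * opts->transport_specific (see vfio_user_transport_opts_decoder[]
	 * above) as a JSON object, e.g.:
	 *
	 *	{
	 *		"disable_mappable_bar0": true,
	 *		"disable_adaptive_irq": false,
	 *		"disable_shadow_doorbells": false
	 *	}
	 *
	 * All three keys are optional booleans.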
1190 */ 1191 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1192 vu_transport->transport_opts.disable_shadow_doorbells = true; 1193 } 1194 1195 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1196 vu_transport->transport_opts.disable_mappable_bar0); 1197 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1198 vu_transport->transport_opts.disable_adaptive_irq); 1199 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1200 vu_transport->transport_opts.disable_shadow_doorbells); 1201 1202 return &vu_transport->transport; 1203 1204 cleanup: 1205 pthread_mutex_destroy(&vu_transport->lock); 1206 pthread_mutex_destroy(&vu_transport->pg_lock); 1207 err: 1208 free(vu_transport); 1209 return NULL; 1210 } 1211 1212 static uint32_t 1213 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1214 { 1215 assert(vu_ctrlr != NULL); 1216 assert(vu_ctrlr->ctrlr != NULL); 1217 1218 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1219 } 1220 1221 static uint32_t 1222 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1223 { 1224 assert(vu_ctrlr != NULL); 1225 assert(vu_ctrlr->ctrlr != NULL); 1226 1227 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1228 } 1229 1230 static uintptr_t 1231 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1232 { 1233 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1234 return 1ul << memory_page_shift; 1235 } 1236 1237 static uintptr_t 1238 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1239 { 1240 return ~(memory_page_size(ctrlr) - 1); 1241 } 1242 1243 static int 1244 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1245 uint32_t q_size, bool is_cq, bool unmap) 1246 { 1247 uint64_t len; 1248 void *ret; 1249 1250 assert(q_size); 1251 assert(q_addr(mapping) == NULL); 1252 1253 if (is_cq) { 1254 len = q_size * sizeof(struct spdk_nvme_cpl); 1255 } else { 1256 len = q_size * sizeof(struct spdk_nvme_cmd); 1257 } 1258 1259 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1260 mapping->sg, &mapping->iov, 1261 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1262 if (ret == NULL) { 1263 return -EFAULT; 1264 } 1265 1266 if (unmap) { 1267 memset(q_addr(mapping), 0, len); 1268 } 1269 1270 return 0; 1271 } 1272 1273 static inline void 1274 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1275 { 1276 if (q_addr(mapping) != NULL) { 1277 vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1278 &mapping->iov, 1); 1279 mapping->iov.iov_base = NULL; 1280 } 1281 } 1282 1283 static int 1284 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1285 { 1286 struct nvmf_vfio_user_sq *sq; 1287 const struct spdk_nvmf_registers *regs; 1288 int ret; 1289 1290 assert(ctrlr != NULL); 1291 1292 sq = ctrlr->sqs[0]; 1293 1294 assert(sq != NULL); 1295 assert(q_addr(&sq->mapping) == NULL); 1296 /* XXX ctrlr->asq == 0 is a valid memory address */ 1297 1298 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1299 sq->qid = 0; 1300 sq->size = regs->aqa.bits.asqs + 1; 1301 sq->mapping.prp1 = regs->asq; 1302 *sq_headp(sq) = 0; 1303 sq->cqid = 0; 1304 1305 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1306 if (ret) { 1307 return ret; 1308 } 1309 1310 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
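	 *
	 * For reference, bar0_doorbells points at BAR0 offset
	 * NVME_DOORBELLS_OFFSET (0x1000) and is indexed in 32-bit units, so
	 * with the 4-byte doorbell stride this implies (CAP.DSTRD == 0):
	 *
	 *	queue_index(0, false) == 0  ->  BAR0 0x1000  admin SQ tail
	 *	queue_index(0, true)  == 1  ->  BAR0 0x1004  admin CQ head
	 *	queue_index(1, false) == 2  ->  BAR0 0x1008  SQ 1 tail
	 *	queue_index(1, true)  == 3  ->  BAR0 0x100c  CQ 1 head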
*/ 1311 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1312 1313 *sq_dbl_tailp(sq) = 0; 1314 1315 return 0; 1316 } 1317 1318 /* 1319 * Updates eventidx to set an SQ into interrupt or polling mode. 1320 * 1321 * Returns false if the current SQ tail does not match the SQ head, as 1322 * this means that the host has submitted more items to the queue while we were 1323 * not looking - or during the event index update. In that case, we must retry, 1324 * or otherwise make sure we are going to wake up again. 1325 */ 1326 static bool 1327 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1328 { 1329 struct nvmf_vfio_user_ctrlr *ctrlr; 1330 volatile uint32_t *sq_tail_eidx; 1331 uint32_t old_tail, new_tail; 1332 1333 assert(sq != NULL); 1334 assert(sq->ctrlr != NULL); 1335 assert(sq->ctrlr->sdbl != NULL); 1336 assert(sq->need_rearm); 1337 1338 ctrlr = sq->ctrlr; 1339 1340 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1341 ctrlr_id(ctrlr), sq->qid); 1342 1343 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1344 1345 assert(ctrlr->endpoint != NULL); 1346 1347 if (!ctrlr->endpoint->interrupt_mode) { 1348 /* No synchronisation necessary. */ 1349 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1350 return true; 1351 } 1352 1353 old_tail = *sq_dbl_tailp(sq); 1354 *sq_tail_eidx = old_tail; 1355 1356 /* 1357 * Ensure that the event index is updated before re-reading the tail 1358 * doorbell. If it's not, then the host might race us and update the 1359 * tail after the second read but before the event index is written, so 1360 * it won't write to BAR0 and we'll miss the update. 1361 * 1362 * The driver should provide similar ordering with an mb(). 1363 */ 1364 spdk_mb(); 1365 1366 /* 1367 * Check if the host has updated the tail doorbell after we've read it 1368 * for the first time, but before the event index was written. If that's 1369 * the case, then we've lost the race and we need to update the event 1370 * index again (after polling the queue, since the host won't write to 1371 * BAR0). 1372 */ 1373 new_tail = *sq_dbl_tailp(sq); 1374 1375 /* 1376 * We might poll the queue straight after this function returns if the 1377 * tail has been updated, so we need to ensure that any changes to the 1378 * queue will be visible to us if the doorbell has been updated. 1379 * 1380 * The driver should provide similar ordering with a wmb() to ensure 1381 * that the queue is written before it updates the tail doorbell. 1382 */ 1383 spdk_rmb(); 1384 1385 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1386 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1387 new_tail, *sq_headp(sq)); 1388 1389 if (new_tail == *sq_headp(sq)) { 1390 sq->need_rearm = false; 1391 return true; 1392 } 1393 1394 /* 1395 * We've lost the race: the tail was updated since we last polled, 1396 * including if it happened within this routine. 1397 * 1398 * The caller should retry after polling (think of this as a cmpxchg 1399 * loop); if we go to sleep while the SQ is not empty, then we won't 1400 * process the remaining events. 1401 */ 1402 return false; 1403 } 1404 1405 static int 1406 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1407 1408 /* 1409 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1410 * processed some SQ entries. 
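 *
 * For context: a shadow-doorbell-aware guest driver only writes the real
 * BAR0 doorbell when the new tail "passes" the eventidx we publish. The
 * check is roughly (shown for illustration; this mirrors what the Linux
 * driver does):
 *
 *	static bool
 *	need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
 *	{
 *		return (uint16_t)(new_idx - event_idx - 1) <
 *		       (uint16_t)(new_idx - old);
 *	}
 *
 * Setting the eventidx to the tail we last observed therefore guarantees
 * that the next submission produces a BAR0 write (and hence a vfio-user
 * message), unless the tail moved while we were updating the eventidx;
 * that is exactly the race set_sq_eventidx() reports and the retry loop
 * below handles.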
1411 */ 1412 static int 1413 set_sq_intr_mode(struct nvmf_vfio_user_ctrlr *ctrlr, 1414 struct nvmf_vfio_user_sq *sq) 1415 { 1416 int count = 0; 1417 size_t i; 1418 1419 if (!sq->need_rearm) { 1420 return 0; 1421 } 1422 1423 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1424 int ret; 1425 1426 if (set_sq_eventidx(sq)) { 1427 /* We won the race and set eventidx; done. */ 1428 return count; 1429 } 1430 1431 ret = nvmf_vfio_user_sq_poll(sq); 1432 1433 count += (ret < 0) ? 1 : ret; 1434 1435 /* 1436 * set_sq_eventidx() hit the race, so we expected 1437 * to process at least one command from this queue. 1438 * If there were no new commands waiting for us, then 1439 * we must have hit an unexpected race condition. 1440 */ 1441 if (ret == 0) { 1442 SPDK_ERRLOG("%s: unexpected race condition detected " 1443 "while updating the shadow doorbell buffer\n", 1444 ctrlr_id(ctrlr)); 1445 1446 fail_ctrlr(ctrlr); 1447 return count; 1448 } 1449 } 1450 1451 SPDK_DEBUGLOG(vfio_user_db, 1452 "%s: set_sq_eventidx() lost the race %zu times\n", 1453 ctrlr_id(ctrlr), i); 1454 1455 /* 1456 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1457 * we raced with the producer too many times; force ourselves to wake up 1458 * instead. We'll process all queues at that point. 1459 */ 1460 self_kick(ctrlr); 1461 1462 return count; 1463 } 1464 1465 /* 1466 * We're in interrupt mode, and potentially about to go to sleep. We need to 1467 * make sure any further I/O submissions are guaranteed to wake us up: for 1468 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1469 * every SQ that needs re-arming. 1470 * 1471 * Returns non-zero if we processed something. 1472 */ 1473 static int 1474 set_ctrlr_intr_mode(struct nvmf_vfio_user_ctrlr *ctrlr) 1475 { 1476 int count = 0; 1477 1478 assert(ctrlr != NULL); 1479 1480 if (ctrlr->sdbl == NULL) { 1481 return 0; 1482 } 1483 1484 /* 1485 * The admin queue (qid: 0) doesn't use the shadow doorbell buffer, so 1486 * skip it. 1487 */ 1488 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1489 if (!io_q_exists(ctrlr, i, false)) { 1490 continue; 1491 } 1492 1493 count += set_sq_intr_mode(ctrlr, ctrlr->sqs[i]); 1494 } 1495 1496 return count; 1497 } 1498 1499 static int 1500 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1501 { 1502 struct nvmf_vfio_user_cq *cq; 1503 const struct spdk_nvmf_registers *regs; 1504 int ret; 1505 1506 assert(ctrlr != NULL); 1507 1508 cq = ctrlr->cqs[0]; 1509 1510 assert(cq != NULL); 1511 1512 assert(q_addr(&cq->mapping) == NULL); 1513 1514 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1515 assert(regs != NULL); 1516 cq->qid = 0; 1517 cq->size = regs->aqa.bits.acqs + 1; 1518 cq->mapping.prp1 = regs->acq; 1519 *cq_tailp(cq) = 0; 1520 cq->ien = true; 1521 cq->phase = true; 1522 1523 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1524 if (ret) { 1525 return ret; 1526 } 1527 1528 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
*/ 1529 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1530 1531 *cq_dbl_headp(cq) = 0; 1532 1533 return 0; 1534 } 1535 1536 static inline dma_sg_t * 1537 vu_req_to_sg_t(struct nvmf_vfio_user_req *vu_req, uint32_t iovcnt) 1538 { 1539 return (dma_sg_t *)(vu_req->sg + iovcnt * dma_sg_size()); 1540 } 1541 1542 static void * 1543 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 1544 { 1545 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1546 struct spdk_nvmf_qpair *qpair; 1547 struct nvmf_vfio_user_req *vu_req; 1548 struct nvmf_vfio_user_sq *sq; 1549 void *ret; 1550 1551 assert(req != NULL); 1552 qpair = req->qpair; 1553 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1554 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1555 1556 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1557 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1558 vu_req_to_sg_t(vu_req, vu_req->iovcnt), 1559 &vu_req->iov[vu_req->iovcnt], prot); 1560 if (spdk_likely(ret != NULL)) { 1561 vu_req->iovcnt++; 1562 } 1563 return ret; 1564 } 1565 1566 static int 1567 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1568 struct iovec *iov, uint32_t length) 1569 { 1570 /* Map PRP list to from Guest physical memory to 1571 * virtual memory address. 1572 */ 1573 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1574 length, 4096, _map_one); 1575 } 1576 1577 static int 1578 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1579 struct nvmf_vfio_user_sq *sq); 1580 1581 static inline int 1582 adaptive_irq_enabled(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq) 1583 { 1584 return (!spdk_interrupt_mode_is_enabled() && cq->qid != 0 && 1585 !ctrlr->transport->transport_opts.disable_adaptive_irq); 1586 1587 } 1588 1589 /* 1590 * Posts a CQE in the completion queue. 1591 * 1592 * @ctrlr: the vfio-user controller 1593 * @cq: the completion queue 1594 * @cdw0: cdw0 as reported by NVMf 1595 * @sqid: submission queue ID 1596 * @cid: command identifier in NVMe command 1597 * @sc: the NVMe CQE status code 1598 * @sct: the NVMe CQE status code type 1599 */ 1600 static int 1601 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1602 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1603 { 1604 struct spdk_nvme_status cpl_status = { 0 }; 1605 struct spdk_nvme_cpl *cpl; 1606 int err; 1607 1608 assert(ctrlr != NULL); 1609 1610 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1611 return 0; 1612 } 1613 1614 if (cq_is_full(cq)) { 1615 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1616 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1617 *cq_dbl_headp(cq)); 1618 return -1; 1619 } 1620 1621 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1622 1623 assert(ctrlr->sqs[sqid] != NULL); 1624 SPDK_DEBUGLOG(nvmf_vfio, 1625 "%s: request complete sqid:%d cid=%d status=%#x " 1626 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1627 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1628 1629 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1630 cpl->sqid = sqid; 1631 cpl->cid = cid; 1632 cpl->cdw0 = cdw0; 1633 1634 /* 1635 * This is a bitfield: instead of setting the individual bits we need 1636 * directly in cpl->status, which would cause a read-modify-write cycle, 1637 * we'll avoid reading from the CPL altogether by filling in a local 1638 * cpl_status variable, then writing the whole thing. 
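	 *
	 * For reference, struct spdk_nvme_status packs into a single 16-bit
	 * word (roughly p:1, sc:8, sct:3, crd:2, m:1, dnr:1), so the
	 * assignment below amounts to one aligned 16-bit store into the
	 * client-visible CQE rather than several read-modify-write cycles.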
1639 */ 1640 cpl_status.sct = sct; 1641 cpl_status.sc = sc; 1642 cpl_status.p = cq->phase; 1643 cpl->status = cpl_status; 1644 1645 /* Ensure the Completion Queue Entry is visible. */ 1646 spdk_wmb(); 1647 cq_tail_advance(cq); 1648 1649 /* 1650 * this function now executes at SPDK thread context, we 1651 * might be triggering interrupts from vfio-user thread context so 1652 * check for race conditions. 1653 */ 1654 if (!adaptive_irq_enabled(ctrlr, cq) && 1655 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1656 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1657 if (err != 0) { 1658 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1659 ctrlr_id(ctrlr)); 1660 return err; 1661 } 1662 } 1663 1664 return 0; 1665 } 1666 1667 static void 1668 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1669 { 1670 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1671 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1672 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1673 free(vu_req); 1674 } 1675 } 1676 1677 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1678 * and the controller is being shut down or reset, then the CQ is 1679 * also deleted. 1680 */ 1681 static void 1682 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1683 { 1684 struct nvmf_vfio_user_cq *cq; 1685 uint16_t cqid; 1686 1687 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1688 sq->qid, sq); 1689 1690 /* Free SQ resources */ 1691 unmap_q(vu_ctrlr, &sq->mapping); 1692 1693 free_sq_reqs(sq); 1694 1695 sq->size = 0; 1696 1697 sq->sq_state = VFIO_USER_SQ_DELETED; 1698 1699 /* Controller RESET and SHUTDOWN are special cases, 1700 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1701 * will disconnect IO queue pairs. 
1702 */ 1703 if (vu_ctrlr->reset_shn) { 1704 cqid = sq->cqid; 1705 cq = vu_ctrlr->cqs[cqid]; 1706 1707 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1708 cq->qid, cq); 1709 1710 if (cq->cq_ref) { 1711 cq->cq_ref--; 1712 } 1713 if (cq->cq_ref == 0) { 1714 unmap_q(vu_ctrlr, &cq->mapping); 1715 cq->size = 0; 1716 cq->cq_state = VFIO_USER_CQ_DELETED; 1717 cq->group = NULL; 1718 } 1719 } 1720 } 1721 1722 static void 1723 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1724 { 1725 struct nvmf_vfio_user_sq *sq; 1726 struct nvmf_vfio_user_cq *cq; 1727 1728 if (ctrlr == NULL) { 1729 return; 1730 } 1731 1732 sq = ctrlr->sqs[qid]; 1733 if (sq) { 1734 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free SQ %u\n", ctrlr_id(ctrlr), qid); 1735 unmap_q(ctrlr, &sq->mapping); 1736 1737 free_sq_reqs(sq); 1738 1739 free(sq->mapping.sg); 1740 free(sq); 1741 ctrlr->sqs[qid] = NULL; 1742 } 1743 1744 cq = ctrlr->cqs[qid]; 1745 if (cq) { 1746 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1747 unmap_q(ctrlr, &cq->mapping); 1748 free(cq->mapping.sg); 1749 free(cq); 1750 ctrlr->cqs[qid] = NULL; 1751 } 1752 } 1753 1754 static int 1755 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1756 const uint16_t id) 1757 { 1758 struct nvmf_vfio_user_sq *sq; 1759 1760 assert(ctrlr != NULL); 1761 assert(transport != NULL); 1762 assert(ctrlr->sqs[id] == NULL); 1763 1764 sq = calloc(1, sizeof(*sq)); 1765 if (sq == NULL) { 1766 return -ENOMEM; 1767 } 1768 sq->mapping.sg = calloc(1, dma_sg_size()); 1769 if (sq->mapping.sg == NULL) { 1770 free(sq); 1771 return -ENOMEM; 1772 } 1773 1774 sq->qid = id; 1775 sq->qpair.qid = id; 1776 sq->qpair.transport = transport; 1777 sq->ctrlr = ctrlr; 1778 ctrlr->sqs[id] = sq; 1779 1780 TAILQ_INIT(&sq->free_reqs); 1781 1782 return 0; 1783 } 1784 1785 static int 1786 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1787 { 1788 struct nvmf_vfio_user_cq *cq; 1789 1790 assert(vu_ctrlr != NULL); 1791 assert(vu_ctrlr->cqs[id] == NULL); 1792 1793 cq = calloc(1, sizeof(*cq)); 1794 if (cq == NULL) { 1795 return -ENOMEM; 1796 } 1797 cq->mapping.sg = calloc(1, dma_sg_size()); 1798 if (cq->mapping.sg == NULL) { 1799 free(cq); 1800 return -ENOMEM; 1801 } 1802 1803 cq->qid = id; 1804 vu_ctrlr->cqs[id] = cq; 1805 1806 return 0; 1807 } 1808 1809 static int 1810 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1811 { 1812 struct nvmf_vfio_user_req *vu_req, *tmp; 1813 size_t req_size; 1814 uint32_t i; 1815 1816 req_size = sizeof(struct nvmf_vfio_user_req) + 1817 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1818 1819 for (i = 0; i < sq->size; i++) { 1820 struct spdk_nvmf_request *req; 1821 1822 vu_req = calloc(1, req_size); 1823 if (vu_req == NULL) { 1824 goto err; 1825 } 1826 1827 req = &vu_req->req; 1828 req->qpair = &sq->qpair; 1829 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1830 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1831 req->stripped_data = NULL; 1832 1833 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1834 } 1835 1836 return 0; 1837 1838 err: 1839 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1840 free(vu_req); 1841 } 1842 return -ENOMEM; 1843 } 1844 1845 static volatile uint32_t * 1846 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 1847 { 1848 return ctrlr->sdbl != NULL ? 
1849 ctrlr->sdbl->shadow_doorbells : 1850 ctrlr->bar0_doorbells; 1851 } 1852 1853 static uint16_t 1854 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 1855 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1856 { 1857 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1858 struct nvmf_vfio_user_sq *sq; 1859 uint32_t qsize; 1860 uint16_t cqid; 1861 uint16_t qid; 1862 int err; 1863 1864 qid = cmd->cdw10_bits.create_io_q.qid; 1865 cqid = cmd->cdw11_bits.create_io_sq.cqid; 1866 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1867 1868 if (ctrlr->sqs[qid] == NULL) { 1869 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 1870 if (err != 0) { 1871 *sct = SPDK_NVME_SCT_GENERIC; 1872 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1873 } 1874 } 1875 1876 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 1877 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 1878 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1879 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 1880 } 1881 1882 /* CQ must be created before SQ. */ 1883 if (!io_q_exists(ctrlr, cqid, true)) { 1884 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 1885 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1886 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 1887 } 1888 1889 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 1890 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 1891 *sct = SPDK_NVME_SCT_GENERIC; 1892 return SPDK_NVME_SC_INVALID_FIELD; 1893 } 1894 1895 sq = ctrlr->sqs[qid]; 1896 sq->size = qsize; 1897 1898 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 1899 qid, cqid); 1900 1901 sq->mapping.prp1 = cmd->dptr.prp.prp1; 1902 1903 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1904 if (err) { 1905 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 1906 *sct = SPDK_NVME_SCT_GENERIC; 1907 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1908 } 1909 1910 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 1911 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 1912 q_addr(&sq->mapping)); 1913 1914 err = alloc_sq_reqs(ctrlr, sq); 1915 if (err < 0) { 1916 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 1917 *sct = SPDK_NVME_SCT_GENERIC; 1918 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1919 } 1920 1921 sq->cqid = cqid; 1922 ctrlr->cqs[sq->cqid]->cq_ref++; 1923 sq->sq_state = VFIO_USER_SQ_CREATED; 1924 *sq_headp(sq) = 0; 1925 1926 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 1927 1928 /* 1929 * We should always reset the doorbells. 1930 * 1931 * The Specification prohibits the controller from writing to the shadow 1932 * doorbell buffer, however older versions of the Linux NVMe driver 1933 * don't reset the shadow doorbell buffer after a Queue-Level or 1934 * Controller-Level reset, which means that we're left with garbage 1935 * doorbell values. 1936 */ 1937 *sq_dbl_tailp(sq) = 0; 1938 1939 if (ctrlr->sdbl != NULL) { 1940 sq->need_rearm = true; 1941 1942 if (!set_sq_eventidx(sq)) { 1943 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 1944 "sqid:%hu was initialized\n", 1945 ctrlr_id(ctrlr), qid); 1946 fail_ctrlr(ctrlr); 1947 *sct = SPDK_NVME_SCT_GENERIC; 1948 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1949 } 1950 } 1951 1952 /* 1953 * Create our new I/O qpair. This asynchronously invokes, on a suitable 1954 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 1955 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 1956 * connect command. 
This command is then eventually completed via 1957 * handle_queue_connect_rsp(). 1958 */ 1959 sq->create_io_sq_cmd = *cmd; 1960 sq->post_create_io_sq_completion = true; 1961 1962 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 1963 &sq->qpair); 1964 1965 *sct = SPDK_NVME_SCT_GENERIC; 1966 return SPDK_NVME_SC_SUCCESS; 1967 } 1968 1969 static uint16_t 1970 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 1971 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1972 { 1973 struct nvmf_vfio_user_cq *cq; 1974 uint32_t qsize; 1975 uint16_t qid; 1976 int err; 1977 1978 qid = cmd->cdw10_bits.create_io_q.qid; 1979 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1980 1981 if (ctrlr->cqs[qid] == NULL) { 1982 err = init_cq(ctrlr, qid); 1983 if (err != 0) { 1984 *sct = SPDK_NVME_SCT_GENERIC; 1985 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 1986 } 1987 } 1988 1989 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 1990 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 1991 *sct = SPDK_NVME_SCT_GENERIC; 1992 return SPDK_NVME_SC_INVALID_FIELD; 1993 } 1994 1995 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 1996 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 1997 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 1998 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 1999 } 2000 2001 cq = ctrlr->cqs[qid]; 2002 cq->size = qsize; 2003 2004 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2005 2006 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2007 2008 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2009 if (err) { 2010 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2011 *sct = SPDK_NVME_SCT_GENERIC; 2012 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2013 } 2014 2015 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2016 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2017 q_addr(&cq->mapping)); 2018 2019 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2020 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2021 cq->phase = true; 2022 cq->cq_state = VFIO_USER_CQ_CREATED; 2023 2024 *cq_tailp(cq) = 0; 2025 2026 /* 2027 * We should always reset the doorbells. 2028 * 2029 * The Specification prohibits the controller from writing to the shadow 2030 * doorbell buffer, however older versions of the Linux NVMe driver 2031 * don't reset the shadow doorbell buffer after a Queue-Level or 2032 * Controller-Level reset, which means that we're left with garbage 2033 * doorbell values. 2034 */ 2035 *cq_dbl_headp(cq) = 0; 2036 2037 *sct = SPDK_NVME_SCT_GENERIC; 2038 return SPDK_NVME_SC_SUCCESS; 2039 } 2040 2041 /* 2042 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2043 * on error. 
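 *
 * For reference, the admin command fields consumed by the helpers above
 * are (per the NVMe specification):
 *
 *	Create I/O CQ:  cdw10.qid, cdw10.qsize (0's based),
 *	                cdw11.pc (must be 1), cdw11.ien, cdw11.iv,
 *	                dptr.prp.prp1 (queue base GPA)
 *	Create I/O SQ:  cdw10.qid, cdw10.qsize (0's based),
 *	                cdw11.pc (must be 1), cdw11.cqid,
 *	                dptr.prp.prp1 (queue base GPA)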
2044 */ 2045 static int 2046 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2047 struct spdk_nvme_cmd *cmd, const bool is_cq) 2048 { 2049 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2050 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2051 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2052 uint32_t qsize; 2053 uint16_t qid; 2054 2055 assert(ctrlr != NULL); 2056 assert(cmd != NULL); 2057 2058 qid = cmd->cdw10_bits.create_io_q.qid; 2059 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2060 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2061 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2062 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2063 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2064 goto out; 2065 } 2066 2067 if (io_q_exists(ctrlr, qid, is_cq)) { 2068 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2069 is_cq ? 'c' : 's', qid); 2070 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2071 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2072 goto out; 2073 } 2074 2075 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2076 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2077 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2078 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2079 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2080 goto out; 2081 } 2082 2083 if (is_cq) { 2084 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2085 } else { 2086 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2087 2088 if (sct == SPDK_NVME_SCT_GENERIC && 2089 sc == SPDK_NVME_SC_SUCCESS) { 2090 /* Completion posted asynchronously. */ 2091 return 0; 2092 } 2093 } 2094 2095 out: 2096 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2097 } 2098 2099 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2100 * queue pair, so save the command in a context. 2101 */ 2102 struct vfio_user_delete_sq_ctx { 2103 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2104 struct spdk_nvme_cmd delete_io_sq_cmd; 2105 }; 2106 2107 static void 2108 vfio_user_qpair_delete_cb(void *cb_arg) 2109 { 2110 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2111 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2112 2113 post_completion(vu_ctrlr, vu_ctrlr->cqs[0], 0, 0, ctx->delete_io_sq_cmd.cid, 2114 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2115 free(ctx); 2116 } 2117 2118 /* 2119 * Deletes a completion or submission I/O queue. 2120 */ 2121 static int 2122 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2123 struct spdk_nvme_cmd *cmd, const bool is_cq) 2124 { 2125 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2126 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2127 struct nvmf_vfio_user_sq *sq; 2128 struct nvmf_vfio_user_cq *cq; 2129 struct vfio_user_delete_sq_ctx *ctx; 2130 2131 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2132 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2133 cmd->cdw10_bits.delete_io_q.qid); 2134 2135 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2136 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2137 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2138 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2139 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2140 goto out; 2141 } 2142 2143 if (is_cq) { 2144 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2145 if (cq->cq_ref) { 2146 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2147 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2148 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2149 goto out; 2150 } 2151 2152 unmap_q(ctrlr, &cq->mapping); 2153 cq->size = 0; 2154 cq->cq_state = VFIO_USER_CQ_DELETED; 2155 cq->group = NULL; 2156 } else { 2157 ctx = calloc(1, sizeof(*ctx)); 2158 if (!ctx) { 2159 sct = SPDK_NVME_SCT_GENERIC; 2160 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2161 goto out; 2162 } 2163 ctx->vu_ctrlr = ctrlr; 2164 ctx->delete_io_sq_cmd = *cmd; 2165 2166 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2167 sq->sq_state = VFIO_USER_SQ_DELETED; 2168 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2169 ctrlr->cqs[sq->cqid]->cq_ref--; 2170 2171 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 2172 return 0; 2173 } 2174 2175 out: 2176 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2177 } 2178 2179 /* 2180 * Configures Shadow Doorbells. 2181 */ 2182 static int 2183 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2184 { 2185 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2186 uint32_t dstrd; 2187 uintptr_t page_size, page_mask; 2188 uint64_t prp1, prp2; 2189 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2190 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2191 2192 assert(ctrlr != NULL); 2193 assert(ctrlr->endpoint != NULL); 2194 assert(cmd != NULL); 2195 2196 dstrd = doorbell_stride(ctrlr); 2197 page_size = memory_page_size(ctrlr); 2198 page_mask = memory_page_mask(ctrlr); 2199 2200 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2201 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2202 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2203 ctrlr_id(ctrlr)); 2204 2205 goto out; 2206 } 2207 2208 /* Verify guest physical addresses passed as PRPs. */ 2209 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2210 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2211 ctrlr_id(ctrlr)); 2212 2213 goto out; 2214 } 2215 2216 prp1 = cmd->dptr.prp.prp1; 2217 prp2 = cmd->dptr.prp.prp2; 2218 2219 SPDK_DEBUGLOG(nvmf_vfio, 2220 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2221 ctrlr_id(ctrlr), prp1, prp2); 2222 2223 if (prp1 == prp2 2224 || prp1 != (prp1 & page_mask) 2225 || prp2 != (prp2 & page_mask)) { 2226 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2227 ctrlr_id(ctrlr)); 2228 2229 goto out; 2230 } 2231 2232 /* Map guest physical addresses to our virtual address space. 
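	 * For Doorbell Buffer Config, PRP1 is the shadow doorbell page and PRP2 the
	 * EventIdx page; both must be distinct, page-aligned GPAs (checked above).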
*/ 2233 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2234 if (sdbl == NULL) { 2235 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2236 ctrlr_id(ctrlr)); 2237 2238 goto out; 2239 } 2240 2241 ctrlr->shadow_doorbell_buffer = prp1; 2242 ctrlr->eventidx_buffer = prp2; 2243 2244 SPDK_DEBUGLOG(nvmf_vfio, 2245 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2246 ctrlr_id(ctrlr), 2247 sdbl->iovs[0].iov_base, 2248 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2249 sdbl->iovs[1].iov_base, 2250 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2251 2252 2253 /* 2254 * Set all possible CQ head doorbells to polling mode now, such that we 2255 * don't have to worry about it later if the host creates more queues. 2256 * 2257 * We only ever want interrupts for writes to the SQ tail doorbells 2258 * (which are initialised in set_ctrlr_intr_mode() below). 2259 */ 2260 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2261 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2262 if (ctrlr->sqs[i] != NULL) { 2263 ctrlr->sqs[i]->need_rearm = true; 2264 } 2265 } 2266 2267 /* Update controller. */ 2268 SWAP(ctrlr->sdbl, sdbl); 2269 2270 /* 2271 * Copy doorbells from either the previous shadow doorbell buffer or the 2272 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2273 * 2274 * This needs to account for older versions of the Linux NVMe driver, 2275 * which don't clear out the buffer after a controller reset. 2276 */ 2277 copy_doorbells(ctrlr, sdbl != NULL ? 2278 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2279 ctrlr->sdbl->shadow_doorbells); 2280 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2281 2282 /* Update event index buffer and poll queues if necessary. */ 2283 set_ctrlr_intr_mode(ctrlr); 2284 2285 sc = SPDK_NVME_SC_SUCCESS; 2286 2287 out: 2288 /* 2289 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2290 * more than once (pointless, but not prohibited by the spec), or 2291 * in case of an error. 2292 * 2293 * If this is the first time Doorbell Buffer Config was processed, 2294 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2295 * free_sdbl() becomes a noop. 2296 */ 2297 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2298 2299 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2300 } 2301 2302 /* Returns 0 on success and -errno on error. */ 2303 static int 2304 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2305 { 2306 assert(ctrlr != NULL); 2307 assert(cmd != NULL); 2308 2309 if (cmd->fuse != 0) { 2310 /* Fused admin commands are not supported. 
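	 * Fail them with Invalid Field in Command rather than executing either half.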
*/ 2311 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2312 SPDK_NVME_SC_INVALID_FIELD, 2313 SPDK_NVME_SCT_GENERIC); 2314 } 2315 2316 switch (cmd->opc) { 2317 case SPDK_NVME_OPC_CREATE_IO_CQ: 2318 case SPDK_NVME_OPC_CREATE_IO_SQ: 2319 return handle_create_io_q(ctrlr, cmd, 2320 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2321 case SPDK_NVME_OPC_DELETE_IO_SQ: 2322 case SPDK_NVME_OPC_DELETE_IO_CQ: 2323 return handle_del_io_q(ctrlr, cmd, 2324 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2325 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2326 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2327 return handle_doorbell_buffer_config(ctrlr, cmd); 2328 } 2329 /* FALLTHROUGH */ 2330 default: 2331 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2332 } 2333 } 2334 2335 static int 2336 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2337 { 2338 struct nvmf_vfio_user_sq *sq = cb_arg; 2339 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2340 uint16_t sqid, cqid; 2341 2342 assert(sq != NULL); 2343 assert(vu_req != NULL); 2344 assert(vu_ctrlr != NULL); 2345 2346 if (spdk_likely(vu_req->iovcnt)) { 2347 vfu_unmap_sg(vu_ctrlr->endpoint->vfu_ctx, 2348 vu_req_to_sg_t(vu_req, 0), 2349 vu_req->iov, vu_req->iovcnt); 2350 } 2351 sqid = sq->qid; 2352 cqid = sq->cqid; 2353 2354 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2355 vu_req->req.rsp->nvme_cpl.cdw0, 2356 sqid, 2357 vu_req->req.cmd->nvme_cmd.cid, 2358 vu_req->req.rsp->nvme_cpl.status.sc, 2359 vu_req->req.rsp->nvme_cpl.status.sct); 2360 } 2361 2362 static int 2363 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2364 struct spdk_nvme_cmd *cmd) 2365 { 2366 assert(sq != NULL); 2367 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 2368 return consume_admin_cmd(ctrlr, cmd); 2369 } 2370 2371 return handle_cmd_req(ctrlr, cmd, sq); 2372 } 2373 2374 /* Returns the number of commands processed, or a negative value on error. */ 2375 static int 2376 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2377 struct nvmf_vfio_user_sq *sq) 2378 { 2379 struct spdk_nvme_cmd *queue; 2380 int count = 0; 2381 2382 assert(ctrlr != NULL); 2383 assert(sq != NULL); 2384 2385 /* 2386 * Submission queue index has moved past the event index, so it needs to 2387 * be re-armed before we go to sleep. 2388 */ 2389 sq->need_rearm = true; 2390 2391 queue = q_addr(&sq->mapping); 2392 while (*sq_headp(sq) != new_tail) { 2393 int err; 2394 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 2395 2396 count++; 2397 2398 /* 2399 * SQHD must contain the new head pointer, so we must increase 2400 * it before we generate a completion. 2401 */ 2402 sq_head_advance(sq); 2403 2404 err = consume_cmd(ctrlr, sq, cmd); 2405 if (err != 0) { 2406 return err; 2407 } 2408 } 2409 2410 return count; 2411 } 2412 2413 static void 2414 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2415 { 2416 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2417 struct nvmf_vfio_user_ctrlr *ctrlr; 2418 struct nvmf_vfio_user_sq *sq; 2419 struct nvmf_vfio_user_cq *cq; 2420 void *map_start, *map_end; 2421 int ret; 2422 2423 /* 2424 * We're not interested in any DMA regions that aren't mappable (we don't 2425 * support clients that don't share their memory). 
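	 * Without a valid vaddr there is nothing we could register with SPDK or use
	 * to re-map inactive queues below.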
2426 */ 2427 if (!info->vaddr) { 2428 return; 2429 } 2430 2431 map_start = info->mapping.iov_base; 2432 map_end = info->mapping.iov_base + info->mapping.iov_len; 2433 2434 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2435 (info->mapping.iov_len & MASK_2MB)) { 2436 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2437 info->vaddr, map_start, map_end); 2438 return; 2439 } 2440 2441 assert(endpoint != NULL); 2442 if (endpoint->ctrlr == NULL) { 2443 return; 2444 } 2445 ctrlr = endpoint->ctrlr; 2446 2447 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2448 map_start, map_end); 2449 2450 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2451 * check the protection bits before registering. 2452 */ 2453 if (info->prot == (PROT_WRITE | PROT_READ)) { 2454 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2455 if (ret) { 2456 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2457 map_start, map_end, ret); 2458 } 2459 } 2460 2461 pthread_mutex_lock(&endpoint->lock); 2462 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2463 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2464 continue; 2465 } 2466 2467 cq = ctrlr->cqs[sq->cqid]; 2468 2469 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2470 if (cq->size && q_addr(&cq->mapping) == NULL) { 2471 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2472 if (ret) { 2473 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2474 cq->qid, cq->mapping.prp1, 2475 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2476 continue; 2477 } 2478 } 2479 2480 if (sq->size) { 2481 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2482 if (ret) { 2483 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2484 sq->qid, sq->mapping.prp1, 2485 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2486 continue; 2487 } 2488 } 2489 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2490 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2491 } 2492 pthread_mutex_unlock(&endpoint->lock); 2493 } 2494 2495 static void 2496 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2497 { 2498 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2499 struct nvmf_vfio_user_sq *sq; 2500 struct nvmf_vfio_user_cq *cq; 2501 void *map_start, *map_end; 2502 int ret = 0; 2503 2504 if (!info->vaddr) { 2505 return; 2506 } 2507 2508 map_start = info->mapping.iov_base; 2509 map_end = info->mapping.iov_base + info->mapping.iov_len; 2510 2511 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2512 (info->mapping.iov_len & MASK_2MB)) { 2513 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2514 info->vaddr, map_start, map_end); 2515 return; 2516 } 2517 2518 assert(endpoint != NULL); 2519 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2520 map_start, map_end); 2521 2522 if (endpoint->ctrlr != NULL) { 2523 struct nvmf_vfio_user_ctrlr *ctrlr; 2524 ctrlr = endpoint->ctrlr; 2525 2526 pthread_mutex_lock(&endpoint->lock); 2527 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2528 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2529 unmap_q(ctrlr, &sq->mapping); 2530 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2531 } 2532 2533 cq = ctrlr->cqs[sq->cqid]; 2534 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2535 unmap_q(ctrlr, 
&cq->mapping); 2536 } 2537 } 2538 2539 if (ctrlr->sdbl != NULL) { 2540 size_t i; 2541 2542 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2543 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2544 2545 if (iov_base >= map_start && iov_base < map_end) { 2546 copy_doorbells(ctrlr, 2547 ctrlr->sdbl->shadow_doorbells, 2548 ctrlr->bar0_doorbells); 2549 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2550 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2551 ctrlr->sdbl = NULL; 2552 break; 2553 } 2554 } 2555 } 2556 2557 pthread_mutex_unlock(&endpoint->lock); 2558 } 2559 2560 if (info->prot == (PROT_WRITE | PROT_READ)) { 2561 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2562 if (ret) { 2563 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2564 map_start, map_end, ret); 2565 } 2566 } 2567 } 2568 2569 /* Used to initiate a controller-level reset or a controller shutdown. */ 2570 static void 2571 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2572 { 2573 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2574 ctrlr_id(vu_ctrlr)); 2575 2576 /* Unmap Admin queue. */ 2577 2578 assert(vu_ctrlr->sqs[0] != NULL); 2579 assert(vu_ctrlr->cqs[0] != NULL); 2580 2581 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2582 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2583 2584 vu_ctrlr->sqs[0]->size = 0; 2585 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2586 2587 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2588 2589 vu_ctrlr->cqs[0]->size = 0; 2590 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2591 2592 /* 2593 * For PCIe controller reset or shutdown, we will drop all AER 2594 * responses. 2595 */ 2596 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2597 2598 /* Free the shadow doorbell buffer. */ 2599 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2600 vu_ctrlr->sdbl = NULL; 2601 } 2602 2603 /* Used to re-enable the controller after a controller-level reset. 
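 * This re-creates the admin CQ and SQ mappings (acq_setup()/asq_setup()) so that
 * admin commands can be processed again.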
*/ 2604 static int 2605 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2606 { 2607 int err; 2608 2609 assert(vu_ctrlr != NULL); 2610 2611 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2612 ctrlr_id(vu_ctrlr)); 2613 2614 err = acq_setup(vu_ctrlr); 2615 if (err != 0) { 2616 return err; 2617 } 2618 2619 err = asq_setup(vu_ctrlr); 2620 if (err != 0) { 2621 return err; 2622 } 2623 2624 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2625 2626 return 0; 2627 } 2628 2629 static int 2630 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2631 { 2632 struct nvmf_vfio_user_sq *sq = cb_arg; 2633 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2634 int ret; 2635 2636 assert(sq != NULL); 2637 assert(req != NULL); 2638 2639 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2640 assert(sq->ctrlr != NULL); 2641 assert(req != NULL); 2642 2643 memcpy(req->req.data, 2644 &req->req.rsp->prop_get_rsp.value.u64, 2645 req->req.length); 2646 } else { 2647 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2648 assert(sq->ctrlr != NULL); 2649 vu_ctrlr = sq->ctrlr; 2650 2651 if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) { 2652 union spdk_nvme_cc_register cc, diff; 2653 2654 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2655 diff.raw = cc.raw ^ req->cc.raw; 2656 2657 if (diff.bits.en) { 2658 if (cc.bits.en) { 2659 ret = enable_ctrlr(vu_ctrlr); 2660 if (ret) { 2661 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2662 return ret; 2663 } 2664 vu_ctrlr->reset_shn = false; 2665 } else { 2666 vu_ctrlr->reset_shn = true; 2667 } 2668 } 2669 2670 if (diff.bits.shn) { 2671 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2672 vu_ctrlr->reset_shn = true; 2673 } 2674 } 2675 2676 if (vu_ctrlr->reset_shn) { 2677 disable_ctrlr(vu_ctrlr); 2678 } 2679 } 2680 } 2681 2682 return 0; 2683 } 2684 2685 /* 2686 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2687 * doorbell is written via access_bar0_fn(). 2688 * 2689 * DSTRD is set to fixed value 0 for NVMf. 2690 * 2691 */ 2692 static int 2693 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2694 const size_t count, loff_t pos, const bool is_write) 2695 { 2696 assert(ctrlr != NULL); 2697 assert(buf != NULL); 2698 2699 if (!is_write) { 2700 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2701 ctrlr_id(ctrlr), pos); 2702 errno = EPERM; 2703 return -1; 2704 } 2705 2706 if (count != sizeof(uint32_t)) { 2707 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2708 ctrlr_id(ctrlr), count); 2709 errno = EINVAL; 2710 return -1; 2711 } 2712 2713 pos -= NVME_DOORBELLS_OFFSET; 2714 2715 /* pos must be dword aligned */ 2716 if ((pos & 0x3) != 0) { 2717 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2718 errno = EINVAL; 2719 return -1; 2720 } 2721 2722 /* convert byte offset to array index */ 2723 pos >>= 2; 2724 2725 if (pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2) { 2726 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2727 errno = EINVAL; 2728 return -1; 2729 } 2730 2731 ctrlr->bar0_doorbells[pos] = *buf; 2732 spdk_wmb(); 2733 2734 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2735 ctrlr_id(ctrlr), (pos & 1) ? 
"cqid" : "sqid", 2736 pos / 2, *buf); 2737 2738 2739 return 0; 2740 } 2741 2742 static size_t 2743 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2744 char *buf, size_t count, loff_t pos, 2745 bool is_write) 2746 { 2747 struct nvmf_vfio_user_req *req; 2748 const struct spdk_nvmf_registers *regs; 2749 2750 if ((count != 4) && (count != 8)) { 2751 errno = EINVAL; 2752 return -1; 2753 } 2754 2755 /* Construct a Fabric Property Get/Set command and send it */ 2756 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2757 if (req == NULL) { 2758 errno = ENOBUFS; 2759 return -1; 2760 } 2761 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2762 req->cc.raw = regs->cc.raw; 2763 2764 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2765 req->cb_arg = vu_ctrlr->sqs[0]; 2766 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2767 req->req.cmd->prop_set_cmd.cid = 0; 2768 if (count == 4) { 2769 req->req.cmd->prop_set_cmd.attrib.size = 0; 2770 } else { 2771 req->req.cmd->prop_set_cmd.attrib.size = 1; 2772 } 2773 req->req.cmd->prop_set_cmd.ofst = pos; 2774 if (is_write) { 2775 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2776 if (req->req.cmd->prop_set_cmd.attrib.size) { 2777 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2778 } else { 2779 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2780 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2781 } 2782 } else { 2783 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2784 } 2785 req->req.length = count; 2786 req->req.data = buf; 2787 2788 spdk_nvmf_request_exec_fabrics(&req->req); 2789 2790 return count; 2791 } 2792 2793 static ssize_t 2794 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2795 bool is_write) 2796 { 2797 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2798 struct nvmf_vfio_user_ctrlr *ctrlr; 2799 int ret; 2800 2801 ctrlr = endpoint->ctrlr; 2802 if (endpoint->need_async_destroy || !ctrlr) { 2803 errno = EIO; 2804 return -1; 2805 } 2806 2807 if (pos >= NVME_DOORBELLS_OFFSET) { 2808 /* 2809 * The fact that the doorbells can be memory mapped doesn't mean 2810 * that the client (VFIO in QEMU) is obliged to memory map them, 2811 * it might still elect to access them via regular read/write; 2812 * we might also have had disable_mappable_bar0 set. 
2813 */ 2814 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2815 pos, is_write); 2816 if (ret == 0) { 2817 return count; 2818 } 2819 return ret; 2820 } 2821 2822 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 2823 } 2824 2825 static ssize_t 2826 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 2827 bool is_write) 2828 { 2829 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2830 2831 if (is_write) { 2832 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 2833 endpoint_id(endpoint), offset, offset + count); 2834 errno = EINVAL; 2835 return -1; 2836 } 2837 2838 if (offset + count > NVME_REG_CFG_SIZE) { 2839 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 2840 endpoint_id(endpoint), offset, count, 2841 NVME_REG_CFG_SIZE); 2842 errno = ERANGE; 2843 return -1; 2844 } 2845 2846 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 2847 2848 return count; 2849 } 2850 2851 static void 2852 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 2853 { 2854 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2855 2856 if (level >= LOG_DEBUG) { 2857 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2858 } else if (level >= LOG_INFO) { 2859 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 2860 } else if (level >= LOG_NOTICE) { 2861 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 2862 } else if (level >= LOG_WARNING) { 2863 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 2864 } else { 2865 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 2866 } 2867 } 2868 2869 static int 2870 vfio_user_get_log_level(void) 2871 { 2872 int level; 2873 2874 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 2875 return LOG_DEBUG; 2876 } 2877 2878 level = spdk_log_to_syslog_level(spdk_log_get_level()); 2879 if (level < 0) { 2880 return LOG_ERR; 2881 } 2882 2883 return level; 2884 } 2885 2886 static void 2887 init_pci_config_space(vfu_pci_config_space_t *p) 2888 { 2889 /* MLBAR */ 2890 p->hdr.bars[0].raw = 0x0; 2891 /* MUBAR */ 2892 p->hdr.bars[1].raw = 0x0; 2893 2894 /* vendor specific, let's set them to zero for now */ 2895 p->hdr.bars[3].raw = 0x0; 2896 p->hdr.bars[4].raw = 0x0; 2897 p->hdr.bars[5].raw = 0x0; 2898 2899 /* enable INTx */ 2900 p->hdr.intr.ipin = 0x1; 2901 } 2902 2903 struct subsystem_pause_ctx { 2904 struct nvmf_vfio_user_ctrlr *ctrlr; 2905 int status; 2906 }; 2907 2908 static void 2909 vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem, 2910 void *cb_arg, int status); 2911 2912 static void 2913 _vfio_user_endpoint_resume_done_msg(void *ctx) 2914 { 2915 struct nvmf_vfio_user_endpoint *endpoint = ctx; 2916 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2917 int ret; 2918 2919 endpoint->need_resume = false; 2920 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 2921 2922 /* Basically, once we call `vfu_device_quiesced` the device is unquiesced from 2923 * libvfio-user's perspective so from the moment `vfio_user_dev_quiesce_done` returns 2924 * libvfio-user might quiesce the device again. However, because the NVMf subsytem is 2925 * an asynchronous operation, this quiesce might come _before_ the NVMf subsystem has 2926 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we need to check 2927 * whether a quiesce was requested. 
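	 * If one was queued, we immediately go back to PAUSING and pause the
	 * subsystem again below.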
2928 */ 2929 if (vu_ctrlr->queued_quiesce) { 2930 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, pause again\n", ctrlr_id(vu_ctrlr)); 2931 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 2932 ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0, 2933 vfio_user_dev_quiesce_done, vu_ctrlr); 2934 if (ret < 0) { 2935 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 2936 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret); 2937 } 2938 } 2939 } 2940 2941 static void 2942 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 2943 void *cb_arg, int status) 2944 { 2945 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 2946 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 2947 2948 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status); 2949 2950 if (!vu_ctrlr) { 2951 return; 2952 } 2953 2954 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 2955 } 2956 2957 static void 2958 _vfio_user_dev_quiesce_done_msg(void *ctx) 2959 { 2960 struct subsystem_pause_ctx *subsystem_ctx = ctx; 2961 struct nvmf_vfio_user_ctrlr *vu_ctrlr = subsystem_ctx->ctrlr; 2962 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 2963 int ret; 2964 2965 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 2966 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 2967 vfu_device_quiesced(endpoint->vfu_ctx, subsystem_ctx->status); 2968 vu_ctrlr->queued_quiesce = false; 2969 free(subsystem_ctx); 2970 2971 /* `vfu_device_quiesced` can change the migration state, 2972 * so we need to re-check `vu_ctrlr->state`. 2973 */ 2974 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 2975 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 2976 return; 2977 } 2978 2979 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 2980 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 2981 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 2982 vfio_user_endpoint_resume_done, endpoint); 2983 if (ret < 0) { 2984 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 2985 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 2986 } 2987 } 2988 2989 static void 2990 vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem, 2991 void *cb_arg, int status) 2992 { 2993 struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg; 2994 struct subsystem_pause_ctx *ctx; 2995 2996 SPDK_DEBUGLOG(nvmf_vfio, "%s paused done with status %d\n", ctrlr_id(vu_ctrlr), status); 2997 2998 ctx = calloc(1, sizeof(*ctx)); 2999 if (!ctx) { 3000 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3001 assert(false); 3002 return; 3003 } 3004 3005 ctx->ctrlr = vu_ctrlr; 3006 ctx->status = status; 3007 3008 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_dev_quiesce_done_msg, ctx); 3009 } 3010 3011 static int 3012 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3013 { 3014 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3015 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3016 int ret; 3017 3018 if (!vu_ctrlr) { 3019 return 0; 3020 } 3021 3022 /* NVMf library will destruct controller when no 3023 * connected queue pairs. 
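	 * If that has already happened, there is no controller left to quiesce, so
	 * report success right away.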
3024 */ 3025 if (!nvmf_subsystem_get_ctrlr((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3026 vu_ctrlr->cntlid)) { 3027 return 0; 3028 } 3029 3030 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3031 3032 /* There is no race condition here as device quiesce callback 3033 * and nvmf_prop_set_cc() are running in the same thread context. 3034 */ 3035 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3036 return 0; 3037 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3038 return 0; 3039 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3040 return 0; 3041 } 3042 3043 switch (vu_ctrlr->state) { 3044 case VFIO_USER_CTRLR_PAUSED: 3045 case VFIO_USER_CTRLR_MIGRATING: 3046 return 0; 3047 case VFIO_USER_CTRLR_RUNNING: 3048 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3049 ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0, 3050 vfio_user_dev_quiesce_done, vu_ctrlr); 3051 if (ret < 0) { 3052 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3053 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret); 3054 return 0; 3055 } 3056 break; 3057 case VFIO_USER_CTRLR_RESUMING: 3058 vu_ctrlr->queued_quiesce = true; 3059 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3060 vu_ctrlr->state); 3061 break; 3062 default: 3063 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3064 break; 3065 } 3066 3067 errno = EBUSY; 3068 return -1; 3069 } 3070 3071 static void 3072 vfio_user_ctrlr_dump_migr_data(const char *name, 3073 struct vfio_user_nvme_migr_state *migr_data, 3074 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3075 { 3076 struct spdk_nvme_registers *regs; 3077 struct nvme_migr_sq_state *sq; 3078 struct nvme_migr_cq_state *cq; 3079 uint32_t *doorbell_base; 3080 uint32_t i; 3081 3082 SPDK_NOTICELOG("Dump %s\n", name); 3083 3084 regs = (struct spdk_nvme_registers *)migr_data->bar0; 3085 doorbell_base = (uint32_t *)®s->doorbell[0].sq_tdbl; 3086 3087 SPDK_NOTICELOG("Registers\n"); 3088 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3089 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3090 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3091 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3092 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3093 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3094 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3095 3096 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3097 3098 if (sdbl != NULL) { 3099 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3100 migr_data->ctrlr_header.shadow_doorbell_buffer); 3101 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3102 migr_data->ctrlr_header.eventidx_buffer); 3103 } 3104 3105 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3106 sq = &migr_data->qps[i].sq; 3107 cq = &migr_data->qps[i].cq; 3108 3109 if (sq->size) { 3110 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3111 if (i > 0 && sdbl != NULL) { 3112 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3113 sq->sqid, 3114 sdbl->shadow_doorbells[queue_index(i, false)], 3115 sdbl->eventidxs[queue_index(i, false)]); 3116 } 3117 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3118 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3119 } 3120 3121 if (cq->size) { 3122 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3123 if (i > 0 && sdbl != NULL) { 3124 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, 
eventidx:%u\n", 3125 cq->cqid, 3126 sdbl->shadow_doorbells[queue_index(i, true)], 3127 sdbl->eventidxs[queue_index(i, true)]); 3128 } 3129 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3130 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3131 } 3132 } 3133 3134 SPDK_NOTICELOG("%s Dump Done\n", name); 3135 } 3136 3137 /* Read region 9 content and restore it to migration data structures */ 3138 static int 3139 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3140 struct vfio_user_nvme_migr_state *migr_state) 3141 { 3142 void *data_ptr = endpoint->migr_data; 3143 3144 /* Load vfio_user_nvme_migr_header first */ 3145 memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3146 /* TODO: version check */ 3147 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3148 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3149 return -EINVAL; 3150 } 3151 3152 /* Load nvmf controller data */ 3153 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3154 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3155 3156 /* Load queue pairs */ 3157 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3158 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3159 3160 /* Load BAR0 */ 3161 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3162 memcpy(&migr_state->bar0, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3163 3164 /* Load CFG */ 3165 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3166 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3167 3168 return 0; 3169 } 3170 3171 3172 static void 3173 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3174 { 3175 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3176 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3177 struct nvmf_vfio_user_sq *sq; 3178 struct nvmf_vfio_user_cq *cq; 3179 struct vfio_user_nvme_migr_state migr_state = {}; 3180 uint64_t data_offset; 3181 void *data_ptr; 3182 int num_aers; 3183 struct spdk_nvme_registers *regs; 3184 uint32_t *doorbell_base; 3185 uint32_t i = 0; 3186 uint16_t sqid, cqid; 3187 3188 /* Save all data to vfio_user_nvme_migr_state first, then we will 3189 * copy it to device migration region at last. 
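	 * The region is written below in this order: nvmf controller data, queue pair
	 * state, BAR0 (registers and doorbells), PCI config space, and the header itself last.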
	 */

	/* save magic number */
	migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC;

	/* save controller data */
	num_aers = nvmf_ctrlr_save_aers(ctrlr, migr_state.ctrlr_header.aer_cids,
					256);
	assert(num_aers >= 0);
	migr_state.ctrlr_header.nr_aers = num_aers;

	/* save nvmf controller data */
	nvmf_ctrlr_save_migr_data(ctrlr, (struct nvmf_ctrlr_migr_data *)&migr_state.nvmf_data);

	/* save connected queue pairs */
	TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) {
		/* save sq */
		sqid = sq->qid;
		migr_state.qps[sqid].sq.sqid = sq->qid;
		migr_state.qps[sqid].sq.cqid = sq->cqid;
		migr_state.qps[sqid].sq.head = *sq_headp(sq);
		migr_state.qps[sqid].sq.size = sq->size;
		migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1;

		/* save cq; in the shared CQ case, a CQ may be saved multiple times */
		cqid = sq->cqid;
		cq = vu_ctrlr->cqs[cqid];
		migr_state.qps[cqid].cq.cqid = cqid;
		migr_state.qps[cqid].cq.tail = *cq_tailp(cq);
		migr_state.qps[cqid].cq.ien = cq->ien;
		migr_state.qps[cqid].cq.iv = cq->iv;
		migr_state.qps[cqid].cq.size = cq->size;
		migr_state.qps[cqid].cq.phase = cq->phase;
		migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1;
		i++;
	}

	assert(i > 0);
	migr_state.ctrlr_header.num_io_queues = i - 1;

	regs = (struct spdk_nvme_registers *)&migr_state.bar0;
	/* Save mandatory registers to BAR0 */
	regs->csts.raw = ctrlr->vcprop.csts.raw;
	regs->cap.raw = ctrlr->vcprop.cap.raw;
	regs->vs.raw = ctrlr->vcprop.vs.raw;
	regs->cc.raw = ctrlr->vcprop.cc.raw;
	regs->aqa.raw = ctrlr->vcprop.aqa.raw;
	regs->asq = ctrlr->vcprop.asq;
	regs->acq = ctrlr->vcprop.acq;
	/* Save doorbells */
	doorbell_base = (uint32_t *)&regs->doorbell[0].sq_tdbl;
	memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);

	/* Save PCI configuration space */
	memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE);

	/* Save all data to device migration region */
	data_ptr = endpoint->migr_data;

	/* Copy nvmf controller data */
	data_offset = sizeof(struct vfio_user_nvme_migr_header);
	data_ptr += data_offset;
	migr_state.ctrlr_header.nvmf_data_offset = data_offset;
	migr_state.ctrlr_header.nvmf_data_len = sizeof(struct nvmf_ctrlr_migr_data);
	memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct nvmf_ctrlr_migr_data));

	/* Copy queue pairs */
	data_offset += sizeof(struct nvmf_ctrlr_migr_data);
	data_ptr += sizeof(struct nvmf_ctrlr_migr_data);
	migr_state.ctrlr_header.qp_offset = data_offset;
	migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof(
			struct nvme_migr_cq_state));
	memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len);

	/* Copy BAR0 */
	data_offset += migr_state.ctrlr_header.qp_len;
	data_ptr += migr_state.ctrlr_header.qp_len;
	migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset;
	migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVME_REG_BAR0_SIZE;
	memcpy(data_ptr, &migr_state.bar0, NVME_REG_BAR0_SIZE);

	/* Copy CFG */
	data_offset += NVME_REG_BAR0_SIZE;
	data_ptr += NVME_REG_BAR0_SIZE;
	migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset;
	migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] =
NVME_REG_CFG_SIZE; 3276 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3277 3278 /* copy shadow doorbells */ 3279 if (vu_ctrlr->sdbl != NULL) { 3280 migr_state.ctrlr_header.sdbl = true; 3281 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3282 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3283 } 3284 3285 /* Copy nvme migration header finally */ 3286 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3287 3288 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3289 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3290 } 3291 } 3292 3293 /* 3294 * If we are about to close the connection, we need to unregister the interrupt, 3295 * as the library will subsequently close the file descriptor we registered. 3296 */ 3297 static int 3298 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3299 { 3300 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3301 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3302 3303 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3304 3305 if (type == VFU_RESET_LOST_CONN) { 3306 if (ctrlr != NULL) { 3307 spdk_interrupt_unregister(&ctrlr->intr); 3308 ctrlr->intr_fd = -1; 3309 } 3310 return 0; 3311 } 3312 3313 /* FIXME: LOST_CONN case ? */ 3314 if (ctrlr->sdbl != NULL) { 3315 free_sdbl(vfu_ctx, ctrlr->sdbl); 3316 ctrlr->sdbl = NULL; 3317 } 3318 3319 /* FIXME: much more needed here. */ 3320 3321 return 0; 3322 } 3323 3324 static int 3325 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3326 struct vfio_user_nvme_migr_state *migr_state) 3327 { 3328 uint32_t i, qsize = 0; 3329 uint16_t sqid, cqid; 3330 struct vfio_user_nvme_migr_qp migr_qp; 3331 void *addr; 3332 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3333 int ret; 3334 3335 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3336 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3337 } 3338 3339 /* restore submission queues */ 3340 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3341 migr_qp = migr_state->qps[i]; 3342 3343 qsize = migr_qp.sq.size; 3344 if (qsize) { 3345 struct nvmf_vfio_user_sq *sq; 3346 3347 sqid = migr_qp.sq.sqid; 3348 if (sqid != i) { 3349 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3350 return -EINVAL; 3351 } 3352 3353 /* allocate sq if necessary */ 3354 if (vu_ctrlr->sqs[sqid] == NULL) { 3355 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3356 if (ret) { 3357 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3358 return -EFAULT; 3359 } 3360 } 3361 3362 sq = vu_ctrlr->sqs[sqid]; 3363 sq->size = qsize; 3364 3365 ret = alloc_sq_reqs(vu_ctrlr, sq); 3366 if (ret) { 3367 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3368 return -EFAULT; 3369 } 3370 3371 /* restore sq */ 3372 sq->sq_state = VFIO_USER_SQ_CREATED; 3373 sq->cqid = migr_qp.sq.cqid; 3374 *sq_headp(sq) = migr_qp.sq.head; 3375 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3376 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3377 sq->mapping.prp1, sq->size * 64, 3378 sq->mapping.sg, &sq->mapping.iov, 3379 PROT_READ); 3380 if (addr == NULL) { 3381 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3382 sqid, sq->mapping.prp1, sq->size); 3383 return -EFAULT; 3384 } 3385 cqs_ref[sq->cqid]++; 3386 } 3387 } 3388 3389 /* restore completion queues */ 3390 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3391 migr_qp = migr_state->qps[i]; 
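		/* A CQ size of zero means this slot was not in use on the source, so there
		 * is nothing to restore here.
		 */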
3392 3393 qsize = migr_qp.cq.size; 3394 if (qsize) { 3395 struct nvmf_vfio_user_cq *cq; 3396 3397 /* restore cq */ 3398 cqid = migr_qp.sq.cqid; 3399 assert(cqid == i); 3400 3401 /* allocate cq if necessary */ 3402 if (vu_ctrlr->cqs[cqid] == NULL) { 3403 ret = init_cq(vu_ctrlr, cqid); 3404 if (ret) { 3405 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3406 return -EFAULT; 3407 } 3408 } 3409 3410 cq = vu_ctrlr->cqs[cqid]; 3411 3412 cq->size = qsize; 3413 3414 cq->cq_state = VFIO_USER_CQ_CREATED; 3415 cq->cq_ref = cqs_ref[cqid]; 3416 *cq_tailp(cq) = migr_qp.cq.tail; 3417 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3418 cq->ien = migr_qp.cq.ien; 3419 cq->iv = migr_qp.cq.iv; 3420 cq->phase = migr_qp.cq.phase; 3421 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3422 cq->mapping.prp1, cq->size * 16, 3423 cq->mapping.sg, &cq->mapping.iov, 3424 PROT_READ | PROT_WRITE); 3425 if (addr == NULL) { 3426 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3427 cqid, cq->mapping.prp1, cq->size); 3428 return -EFAULT; 3429 } 3430 } 3431 } 3432 3433 return 0; 3434 } 3435 3436 static int 3437 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3438 { 3439 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3440 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3441 uint32_t *doorbell_base; 3442 struct vfio_user_nvme_migr_state migr_state = {}; 3443 struct spdk_nvme_registers *regs; 3444 struct spdk_nvme_cmd cmd; 3445 uint16_t i; 3446 int rc = 0; 3447 3448 assert(endpoint->migr_data != NULL); 3449 assert(ctrlr != NULL); 3450 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3451 if (rc) { 3452 return rc; 3453 } 3454 3455 /* restore shadow doorbells */ 3456 if (migr_state.ctrlr_header.sdbl) { 3457 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3458 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3459 migr_state.ctrlr_header.shadow_doorbell_buffer, 3460 migr_state.ctrlr_header.eventidx_buffer, 3461 memory_page_size(vu_ctrlr)); 3462 if (sdbl == NULL) { 3463 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3464 ctrlr_id(vu_ctrlr)); 3465 return -1; 3466 } 3467 3468 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3469 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3470 3471 SWAP(vu_ctrlr->sdbl, sdbl); 3472 } 3473 3474 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3475 if (rc) { 3476 return rc; 3477 } 3478 3479 /* restore PCI configuration space */ 3480 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3481 3482 regs = (struct spdk_nvme_registers *)&migr_state.bar0; 3483 doorbell_base = (uint32_t *)®s->doorbell[0].sq_tdbl; 3484 /* restore doorbells from saved registers */ 3485 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3486 3487 /* restore controller registers after ADMIN queue connection */ 3488 ctrlr->vcprop.csts.raw = regs->csts.raw; 3489 ctrlr->vcprop.cap.raw = regs->cap.raw; 3490 ctrlr->vcprop.vs.raw = regs->vs.raw; 3491 ctrlr->vcprop.cc.raw = regs->cc.raw; 3492 ctrlr->vcprop.aqa.raw = regs->aqa.raw; 3493 ctrlr->vcprop.asq = regs->asq; 3494 ctrlr->vcprop.acq = regs->acq; 3495 3496 /* restore nvmf controller data */ 3497 rc = nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3498 if (rc) { 3499 return rc; 3500 } 3501 3502 /* resubmit pending AERs */ 3503 for (i = 0; i < migr_state.ctrlr_header.nr_aers; i++) { 3504 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", 
			      ctrlr_id(vu_ctrlr),
			      migr_state.ctrlr_header.aer_cids[i]);
		memset(&cmd, 0, sizeof(cmd));
		cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST;
		cmd.cid = migr_state.ctrlr_header.aer_cids[i];
		rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]);
		if (rc) {
			break;
		}
	}

	return rc;
}

static void
vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
{
	uint32_t i;
	struct nvmf_vfio_user_sq *sq;

	/* The Admin queue (qid: 0) does not ever use shadow doorbells. */

	if (vu_ctrlr->sqs[0] != NULL) {
		vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells +
					      queue_index(0, false);
	}

	if (vu_ctrlr->cqs[0] != NULL) {
		vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells +
					      queue_index(0, true);
	}

	vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL);

	for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
		sq = vu_ctrlr->sqs[i];
		if (!sq || !sq->size) {
			continue;
		}

		if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
			/* ADMIN queue pair is always in the poll group, just enable it */
			sq->sq_state = VFIO_USER_SQ_ACTIVE;
		} else {
			spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair);
		}
	}
}

static int
vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
{
	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
	struct nvmf_vfio_user_sq *sq;
	int ret = 0;

	SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint),
		      vu_ctrlr->state, state);

	switch (state) {
	case VFU_MIGR_STATE_STOP_AND_COPY:
		vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
		vfio_user_migr_ctrlr_save_data(vu_ctrlr);
		break;
	case VFU_MIGR_STATE_STOP:
		vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
		/* The controller associated with the source VM is dead now; we will
		 * resume the subsystem after destroying the controller data structures,
		 * so that the subsystem can be re-used by a new client.
		 */
		if (vu_ctrlr->in_source_vm) {
			endpoint->need_resume = true;
		}
		break;
	case VFU_MIGR_STATE_PRE_COPY:
		assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED);
		vu_ctrlr->migr_reg.pending_bytes = vfio_user_migr_data_len();
		vu_ctrlr->migr_reg.last_data_offset = 0;
		vu_ctrlr->in_source_vm = true;
		break;
	case VFU_MIGR_STATE_RESUME:
		/*
		 * The destination ADMIN queue pair is connected when the VM starts,
		 * but it isn't enabled in the destination VM yet, so the poll group
		 * does nothing with it for now.
		 */
		if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
			break;
		}

		assert(!vu_ctrlr->in_source_vm);
		vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;

		sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs);
		assert(sq != NULL);
		assert(sq->qpair.qid == 0);
		sq->sq_state = VFIO_USER_SQ_INACTIVE;

		/* Free the ADMIN SQ resources first; SQ resources will be
		 * allocated based on the queue size from the source VM.
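		 * They are re-allocated by alloc_sq_reqs() when the saved state is read
		 * back in vfio_user_migr_ctrlr_construct_qps().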
3605 */ 3606 free_sq_reqs(sq); 3607 sq->size = 0; 3608 break; 3609 case VFU_MIGR_STATE_RUNNING: 3610 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3611 break; 3612 } 3613 3614 if (!vu_ctrlr->in_source_vm) { 3615 /* Restore destination VM from BAR9 */ 3616 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3617 if (ret) { 3618 break; 3619 } 3620 3621 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3622 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3623 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3624 } else { 3625 /* Rollback source VM */ 3626 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3627 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3628 vfio_user_endpoint_resume_done, endpoint); 3629 if (ret < 0) { 3630 /* TODO: fail controller with CFS bit set */ 3631 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3632 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3633 break; 3634 } 3635 } 3636 break; 3637 3638 default: 3639 return -EINVAL; 3640 } 3641 3642 return ret; 3643 } 3644 3645 static uint64_t 3646 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3647 { 3648 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3649 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3650 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 3651 3652 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u, pending bytes 0x%"PRIx64"\n", endpoint_id(endpoint), 3653 ctrlr->state, migr_reg->pending_bytes); 3654 3655 return migr_reg->pending_bytes; 3656 } 3657 3658 static int 3659 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 3660 { 3661 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3662 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3663 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 3664 3665 if (migr_reg->last_data_offset == vfio_user_migr_data_len()) { 3666 *offset = vfio_user_migr_data_len(); 3667 if (size) { 3668 *size = 0; 3669 } 3670 migr_reg->pending_bytes = 0; 3671 } else { 3672 *offset = 0; 3673 if (size) { 3674 *size = vfio_user_migr_data_len(); 3675 if (ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3676 vfio_user_migr_ctrlr_save_data(ctrlr); 3677 migr_reg->last_data_offset = vfio_user_migr_data_len(); 3678 } 3679 } 3680 } 3681 3682 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 3683 3684 return 0; 3685 } 3686 3687 static ssize_t 3688 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset) 3689 { 3690 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3691 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3692 struct vfio_user_migration_region *migr_reg = &ctrlr->migr_reg; 3693 3694 memcpy(buf, endpoint->migr_data, count); 3695 migr_reg->pending_bytes = 0; 3696 3697 return 0; 3698 } 3699 3700 static ssize_t 3701 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, uint64_t offset) 3702 { 3703 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3704 3705 memcpy(endpoint->migr_data, buf, count); 3706 3707 return 0; 3708 } 3709 3710 static int 3711 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx, uint64_t count) 3712 { 3713 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 3714 3715 return 0; 3716 } 3717 3718 static int 3719 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 3720 struct nvmf_vfio_user_endpoint *endpoint) 3721 { 3722 int ret; 3723 ssize_t 
cap_offset; 3724 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 3725 struct iovec migr_sparse_mmap = {}; 3726 3727 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 3728 struct pxcap pxcap = { 3729 .hdr.id = PCI_CAP_ID_EXP, 3730 .pxcaps.ver = 0x2, 3731 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 3732 .pxdcap2.ctds = 0x1 3733 }; 3734 3735 struct msixcap msixcap = { 3736 .hdr.id = PCI_CAP_ID_MSIX, 3737 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 3738 .mtab = {.tbir = 0x4, .to = 0x0}, 3739 .mpba = {.pbir = 0x5, .pbao = 0x0} 3740 }; 3741 3742 struct iovec sparse_mmap[] = { 3743 { 3744 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 3745 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 3746 }, 3747 }; 3748 3749 const vfu_migration_callbacks_t migr_callbacks = { 3750 .version = VFU_MIGR_CALLBACKS_VERS, 3751 .transition = &vfio_user_migration_device_state_transition, 3752 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 3753 .prepare_data = &vfio_user_migration_prepare_data, 3754 .read_data = &vfio_user_migration_read_data, 3755 .data_written = &vfio_user_migration_data_written, 3756 .write_data = &vfio_user_migration_write_data 3757 }; 3758 3759 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 3760 if (ret < 0) { 3761 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 3762 return ret; 3763 } 3764 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 3765 /* 3766 * 0x02, controller uses the NVM Express programming interface 3767 * 0x08, non-volatile memory controller 3768 * 0x01, mass storage controller 3769 */ 3770 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 3771 3772 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 3773 if (cap_offset < 0) { 3774 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 3775 return ret; 3776 } 3777 3778 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 3779 if (cap_offset < 0) { 3780 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 3781 return ret; 3782 } 3783 3784 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 3785 if (cap_offset < 0) { 3786 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 3787 return ret; 3788 } 3789 3790 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 3791 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3792 if (ret < 0) { 3793 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 3794 return ret; 3795 } 3796 3797 if (vu_transport->transport_opts.disable_mappable_bar0) { 3798 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3799 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3800 NULL, 0, -1, 0); 3801 } else { 3802 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 3803 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 3804 sparse_mmap, 1, endpoint->devmem_fd, 0); 3805 } 3806 3807 if (ret < 0) { 3808 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 3809 return ret; 3810 } 3811 3812 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 3813 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3814 if (ret < 0) { 3815 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 3816 return ret; 3817 } 3818 3819 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 3820 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 3821 if (ret < 0) { 3822 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 3823 return ret; 3824 } 3825 3826 ret = vfu_setup_device_dma(vfu_ctx, 
memory_region_add_cb, memory_region_remove_cb); 3827 if (ret < 0) { 3828 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 3829 return ret; 3830 } 3831 3832 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 3833 if (ret < 0) { 3834 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 3835 return ret; 3836 } 3837 3838 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 3839 if (ret < 0) { 3840 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 3841 return ret; 3842 } 3843 3844 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 3845 if (ret < 0) { 3846 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 3847 return ret; 3848 } 3849 3850 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 3851 3852 migr_sparse_mmap.iov_base = (void *)4096; 3853 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 3854 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 3855 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 3856 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 3857 1, endpoint->migr_fd, 0); 3858 if (ret < 0) { 3859 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 3860 return ret; 3861 } 3862 3863 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 3864 vfu_get_migr_register_area_size()); 3865 if (ret < 0) { 3866 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 3867 return ret; 3868 } 3869 3870 ret = vfu_realize_ctx(vfu_ctx); 3871 if (ret < 0) { 3872 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 3873 return ret; 3874 } 3875 3876 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 3877 assert(endpoint->pci_config_space != NULL); 3878 init_pci_config_space(endpoint->pci_config_space); 3879 3880 assert(cap_offset != 0); 3881 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 3882 3883 return 0; 3884 } 3885 3886 static int nvmf_vfio_user_accept(void *ctx); 3887 3888 static void 3889 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 3890 { 3891 /* Nothing for us to do here. */ 3892 } 3893 3894 /* 3895 * Register an "accept" poller: this is polling for incoming vfio-user socket 3896 * connections (on the listening socket). 3897 * 3898 * We need to do this on first listening, and also after destroying a 3899 * controller, so we can accept another connection. 
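 *
 * When interrupt mode is enabled, we additionally register the
 * libvfio-user poll fd (vfu_get_poll_fd()) as an SPDK interrupt source, so
 * that an incoming connection wakes up the accept thread instead of
 * relying on the accept poller alone.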
3900 */ 3901 static int 3902 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 3903 { 3904 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 3905 3906 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 3907 3908 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 3909 endpoint, poll_rate_us); 3910 3911 if (!endpoint->accept_poller) { 3912 return -1; 3913 } 3914 3915 endpoint->accept_thread = spdk_get_thread(); 3916 3917 if (!spdk_interrupt_mode_is_enabled()) { 3918 return 0; 3919 } 3920 3921 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 3922 assert(endpoint->accept_intr_fd != -1); 3923 3924 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 3925 nvmf_vfio_user_accept, endpoint); 3926 3927 assert(endpoint->accept_intr != NULL); 3928 3929 spdk_poller_register_interrupt(endpoint->accept_poller, 3930 set_intr_mode_noop, NULL); 3931 return 0; 3932 } 3933 3934 static void 3935 _vfio_user_relisten(void *ctx) 3936 { 3937 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3938 3939 vfio_user_register_accept_poller(endpoint); 3940 } 3941 3942 static void 3943 _free_ctrlr(void *ctx) 3944 { 3945 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 3946 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 3947 3948 free_sdbl(ctrlr->endpoint->vfu_ctx, ctrlr->sdbl); 3949 3950 spdk_interrupt_unregister(&ctrlr->intr); 3951 ctrlr->intr_fd = -1; 3952 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 3953 3954 free(ctrlr); 3955 3956 if (endpoint == NULL) { 3957 return; 3958 } 3959 3960 if (endpoint->need_async_destroy) { 3961 nvmf_vfio_user_destroy_endpoint(endpoint); 3962 } else { 3963 spdk_thread_send_msg(endpoint->accept_thread, 3964 _vfio_user_relisten, endpoint); 3965 } 3966 } 3967 3968 static void 3969 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 3970 { 3971 int i; 3972 assert(ctrlr != NULL); 3973 3974 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 3975 3976 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3977 free_qp(ctrlr, i); 3978 } 3979 3980 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 3981 } 3982 3983 static int 3984 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 3985 struct nvmf_vfio_user_endpoint *endpoint) 3986 { 3987 struct nvmf_vfio_user_ctrlr *ctrlr; 3988 int err = 0; 3989 3990 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 3991 3992 /* First, construct a vfio-user CUSTOM transport controller */ 3993 ctrlr = calloc(1, sizeof(*ctrlr)); 3994 if (ctrlr == NULL) { 3995 err = -ENOMEM; 3996 goto out; 3997 } 3998 /* We can only support one connection for now */ 3999 ctrlr->cntlid = 0x1; 4000 ctrlr->intr_fd = -1; 4001 ctrlr->transport = transport; 4002 ctrlr->endpoint = endpoint; 4003 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4004 TAILQ_INIT(&ctrlr->connected_sqs); 4005 4006 /* Then, construct an admin queue pair */ 4007 err = init_sq(ctrlr, &transport->transport, 0); 4008 if (err != 0) { 4009 free(ctrlr); 4010 goto out; 4011 } 4012 4013 err = init_cq(ctrlr, 0); 4014 if (err != 0) { 4015 free(ctrlr); 4016 goto out; 4017 } 4018 4019 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4020 4021 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4022 if (err != 0) { 4023 free(ctrlr); 4024 goto out; 4025 } 4026 endpoint->ctrlr = ctrlr; 4027 4028 /* Notify the generic layer about the new admin queue pair */ 4029 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4030 4031 out: 4032 if (err != 0) { 4033 
SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4034 endpoint_id(endpoint), strerror(-err)); 4035 } 4036 4037 return err; 4038 } 4039 4040 static int 4041 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4042 const struct spdk_nvme_transport_id *trid, 4043 struct spdk_nvmf_listen_opts *listen_opts) 4044 { 4045 struct nvmf_vfio_user_transport *vu_transport; 4046 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4047 char path[PATH_MAX] = {}; 4048 char uuid[PATH_MAX] = {}; 4049 int ret; 4050 4051 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4052 transport); 4053 4054 pthread_mutex_lock(&vu_transport->lock); 4055 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4056 /* Only compare traddr */ 4057 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4058 pthread_mutex_unlock(&vu_transport->lock); 4059 return -EEXIST; 4060 } 4061 } 4062 pthread_mutex_unlock(&vu_transport->lock); 4063 4064 endpoint = calloc(1, sizeof(*endpoint)); 4065 if (!endpoint) { 4066 return -ENOMEM; 4067 } 4068 4069 pthread_mutex_init(&endpoint->lock, NULL); 4070 endpoint->devmem_fd = -1; 4071 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4072 endpoint->transport = vu_transport; 4073 4074 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4075 if (ret < 0 || ret >= PATH_MAX) { 4076 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4077 ret = -1; 4078 goto out; 4079 } 4080 4081 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4082 if (ret == -1) { 4083 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4084 endpoint_id(endpoint), path, spdk_strerror(errno)); 4085 goto out; 4086 } 4087 unlink(path); 4088 4089 endpoint->devmem_fd = ret; 4090 ret = ftruncate(endpoint->devmem_fd, 4091 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4092 if (ret != 0) { 4093 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4094 spdk_strerror(errno)); 4095 goto out; 4096 } 4097 4098 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4099 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4100 if (endpoint->bar0_doorbells == MAP_FAILED) { 4101 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4102 endpoint->bar0_doorbells = NULL; 4103 ret = -1; 4104 goto out; 4105 } 4106 4107 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4108 if (ret < 0 || ret >= PATH_MAX) { 4109 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 4110 spdk_strerror(errno)); 4111 ret = -1; 4112 goto out; 4113 } 4114 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4115 if (ret == -1) { 4116 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4117 endpoint_id(endpoint), path, spdk_strerror(errno)); 4118 goto out; 4119 } 4120 unlink(path); 4121 4122 endpoint->migr_fd = ret; 4123 ret = ftruncate(endpoint->migr_fd, 4124 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4125 if (ret != 0) { 4126 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4127 spdk_strerror(errno)); 4128 goto out; 4129 } 4130 4131 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4132 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4133 if (endpoint->migr_data == MAP_FAILED) { 4134 SPDK_ERRLOG("%s: error to mmap 
file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4135 endpoint->migr_data = NULL; 4136 ret = -1; 4137 goto out; 4138 } 4139 4140 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 4141 if (ret < 0 || ret >= PATH_MAX) { 4142 SPDK_ERRLOG("%s: error to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 4143 ret = -1; 4144 goto out; 4145 } 4146 4147 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 4148 endpoint, VFU_DEV_TYPE_PCI); 4149 if (endpoint->vfu_ctx == NULL) { 4150 SPDK_ERRLOG("%s: error creating libmuser context: %m\n", 4151 endpoint_id(endpoint)); 4152 ret = -1; 4153 goto out; 4154 } 4155 vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, vfio_user_get_log_level()); 4156 4157 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 4158 if (ret < 0) { 4159 goto out; 4160 } 4161 4162 ret = vfio_user_register_accept_poller(endpoint); 4163 4164 if (ret != 0) { 4165 goto out; 4166 } 4167 4168 pthread_mutex_lock(&vu_transport->lock); 4169 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 4170 pthread_mutex_unlock(&vu_transport->lock); 4171 4172 out: 4173 if (ret != 0) { 4174 nvmf_vfio_user_destroy_endpoint(endpoint); 4175 } 4176 4177 return ret; 4178 } 4179 4180 static void 4181 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 4182 const struct spdk_nvme_transport_id *trid) 4183 { 4184 struct nvmf_vfio_user_transport *vu_transport; 4185 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4186 4187 assert(trid != NULL); 4188 assert(trid->traddr != NULL); 4189 4190 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 4191 4192 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4193 transport); 4194 4195 pthread_mutex_lock(&vu_transport->lock); 4196 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4197 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 4198 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 4199 /* Defer to free endpoint resources until the controller 4200 * is freed. There are two cases when running here: 4201 * 1. kill nvmf target while VM is connected 4202 * 2. remove listener via RPC call 4203 * nvmf library will disconnect all queue paris. 
4204 */ 4205 if (endpoint->ctrlr) { 4206 assert(!endpoint->need_async_destroy); 4207 endpoint->need_async_destroy = true; 4208 pthread_mutex_unlock(&vu_transport->lock); 4209 return; 4210 } 4211 4212 nvmf_vfio_user_destroy_endpoint(endpoint); 4213 pthread_mutex_unlock(&vu_transport->lock); 4214 return; 4215 } 4216 } 4217 pthread_mutex_unlock(&vu_transport->lock); 4218 4219 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4220 } 4221 4222 static void 4223 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4224 struct spdk_nvmf_subsystem *subsystem, 4225 struct spdk_nvmf_ctrlr_data *cdata) 4226 { 4227 struct nvmf_vfio_user_transport *vu_transport; 4228 4229 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4230 4231 cdata->vid = SPDK_PCI_VID_NUTANIX; 4232 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4233 cdata->ieee[0] = 0x8d; 4234 cdata->ieee[1] = 0x6b; 4235 cdata->ieee[2] = 0x50; 4236 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4237 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4238 /* libvfio-user can only support 1 connection for now */ 4239 cdata->oncs.reservations = 0; 4240 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4241 } 4242 4243 static int 4244 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4245 const struct spdk_nvmf_subsystem *subsystem, 4246 const struct spdk_nvme_transport_id *trid) 4247 { 4248 struct nvmf_vfio_user_transport *vu_transport; 4249 struct nvmf_vfio_user_endpoint *endpoint; 4250 4251 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4252 4253 pthread_mutex_lock(&vu_transport->lock); 4254 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4255 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4256 break; 4257 } 4258 } 4259 pthread_mutex_unlock(&vu_transport->lock); 4260 4261 if (endpoint == NULL) { 4262 return -ENOENT; 4263 } 4264 4265 endpoint->subsystem = subsystem; 4266 4267 return 0; 4268 } 4269 4270 /* 4271 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4272 * frequency. 4273 * 4274 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4275 * if we don't currently have a controller set up, peek to see if the socket is 4276 * able to accept a new connection. 4277 */ 4278 static int 4279 nvmf_vfio_user_accept(void *ctx) 4280 { 4281 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4282 struct nvmf_vfio_user_transport *vu_transport; 4283 int err; 4284 4285 vu_transport = endpoint->transport; 4286 4287 if (endpoint->ctrlr != NULL) { 4288 return SPDK_POLLER_IDLE; 4289 } 4290 4291 /* While we're here, the controller is already destroyed, 4292 * subsystem may still be in RESUMING state, we will wait 4293 * until the subsystem is in RUNNING state. 4294 */ 4295 if (endpoint->need_resume) { 4296 return SPDK_POLLER_IDLE; 4297 } 4298 4299 err = vfu_attach_ctx(endpoint->vfu_ctx); 4300 if (err == 0) { 4301 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4302 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4303 if (err == 0) { 4304 /* 4305 * Unregister ourselves: now we've accepted a 4306 * connection, there is nothing for us to poll for, and 4307 * we will poll the connection via vfu_run_ctx() 4308 * instead. 
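				 *
				 * The accept poller and interrupt are re-registered by
				 * _vfio_user_relisten(), which _free_ctrlr() schedules on
				 * the accept thread once this controller is freed.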
4309 */ 4310 spdk_interrupt_unregister(&endpoint->accept_intr); 4311 spdk_poller_unregister(&endpoint->accept_poller); 4312 } 4313 return SPDK_POLLER_BUSY; 4314 } 4315 4316 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4317 return SPDK_POLLER_IDLE; 4318 } 4319 4320 return SPDK_POLLER_BUSY; 4321 } 4322 4323 static void 4324 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4325 struct spdk_nvme_transport_id *trid, 4326 struct spdk_nvmf_discovery_log_page_entry *entry) 4327 { } 4328 4329 static struct spdk_nvmf_transport_poll_group * 4330 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4331 struct spdk_nvmf_poll_group *group) 4332 { 4333 struct nvmf_vfio_user_transport *vu_transport; 4334 struct nvmf_vfio_user_poll_group *vu_group; 4335 4336 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4337 4338 vu_group = calloc(1, sizeof(*vu_group)); 4339 if (vu_group == NULL) { 4340 SPDK_ERRLOG("Error allocating poll group: %m"); 4341 return NULL; 4342 } 4343 4344 TAILQ_INIT(&vu_group->sqs); 4345 4346 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4347 transport); 4348 pthread_mutex_lock(&vu_transport->pg_lock); 4349 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4350 if (vu_transport->next_pg == NULL) { 4351 vu_transport->next_pg = vu_group; 4352 } 4353 pthread_mutex_unlock(&vu_transport->pg_lock); 4354 4355 if (!spdk_interrupt_mode_is_enabled()) { 4356 return &vu_group->group; 4357 } 4358 4359 /* 4360 * Only allow the poll group to work in interrupt mode if the transport 4361 * supports it. It's our responsibility to register the actual interrupt 4362 * later (in handle_queue_connect_rsp()) that processes everything in 4363 * the poll group: for us, that's the libvfio-user context, and the 4364 * actual qpairs. 4365 * 4366 * Note that this only works in the case that nothing else shares the 4367 * spdk_nvmf_poll_group. 4368 * 4369 * If not supported, this will effectively always wake up to poll the 4370 * poll group. 
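	 *
	 * This is also why the poll group's poller is registered below with the
	 * no-op set_intr_mode_noop() callback: in interrupt mode the real
	 * wakeup source is the per-controller vfu_ctx poll fd that
	 * handle_queue_connect_rsp() registers later.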
4371 */ 4372 4373 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4374 transport); 4375 4376 if (!vu_transport->intr_mode_supported) { 4377 SPDK_WARNLOG("vfio-user interrupt mode not supported\n"); 4378 return &vu_group->group; 4379 } 4380 4381 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4382 NULL); 4383 4384 return &vu_group->group; 4385 } 4386 4387 static bool 4388 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 4389 { 4390 return spdk_interrupt_mode_is_enabled() && 4391 vu_transport->intr_mode_supported; 4392 } 4393 4394 static struct spdk_nvmf_transport_poll_group * 4395 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4396 { 4397 struct nvmf_vfio_user_transport *vu_transport; 4398 struct nvmf_vfio_user_poll_group **vu_group; 4399 struct nvmf_vfio_user_sq *sq; 4400 struct nvmf_vfio_user_cq *cq; 4401 4402 struct spdk_nvmf_transport_poll_group *result = NULL; 4403 4404 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4405 cq = sq->ctrlr->cqs[sq->cqid]; 4406 assert(cq != NULL); 4407 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4408 4409 pthread_mutex_lock(&vu_transport->pg_lock); 4410 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4411 goto out; 4412 } 4413 4414 if (!nvmf_qpair_is_admin_queue(qpair)) { 4415 /* 4416 * If this is shared IO CQ case, just return the used CQ's poll 4417 * group, so I/O completions don't have to use 4418 * spdk_thread_send_msg(). 4419 */ 4420 if (cq->group != NULL) { 4421 result = cq->group; 4422 goto out; 4423 } 4424 4425 /* 4426 * If we're in interrupt mode, align all qpairs for a controller 4427 * on the same poll group, to avoid complications in 4428 * vfio_user_handle_intr(). 
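		 *
		 * Everything else falls through to the round-robin selection below,
		 * which hands out vu_transport->next_pg and advances it, wrapping
		 * back to the head of the poll group list when it runs off the end.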
4429 */ 4430 if (in_interrupt_mode(vu_transport)) { 4431 result = sq->ctrlr->sqs[0]->group; 4432 goto out; 4433 } 4434 4435 } 4436 4437 vu_group = &vu_transport->next_pg; 4438 assert(*vu_group != NULL); 4439 4440 result = &(*vu_group)->group; 4441 *vu_group = TAILQ_NEXT(*vu_group, link); 4442 if (*vu_group == NULL) { 4443 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4444 } 4445 4446 out: 4447 if (cq->group == NULL) { 4448 cq->group = result; 4449 } 4450 4451 pthread_mutex_unlock(&vu_transport->pg_lock); 4452 return result; 4453 } 4454 4455 /* called when process exits */ 4456 static void 4457 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4458 { 4459 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup;; 4460 struct nvmf_vfio_user_transport *vu_transport; 4461 4462 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4463 4464 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4465 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4466 transport); 4467 4468 pthread_mutex_lock(&vu_transport->pg_lock); 4469 next_tgroup = TAILQ_NEXT(vu_group, link); 4470 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4471 if (next_tgroup == NULL) { 4472 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4473 } 4474 if (vu_transport->next_pg == vu_group) { 4475 vu_transport->next_pg = next_tgroup; 4476 } 4477 pthread_mutex_unlock(&vu_transport->pg_lock); 4478 4479 free(vu_group); 4480 } 4481 4482 static void 4483 _vfio_user_qpair_disconnect(void *ctx) 4484 { 4485 struct nvmf_vfio_user_sq *sq = ctx; 4486 4487 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4488 } 4489 4490 /* The function is used when socket connection is destroyed */ 4491 static int 4492 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4493 { 4494 struct nvmf_vfio_user_sq *sq; 4495 struct nvmf_vfio_user_endpoint *endpoint; 4496 4497 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4498 4499 endpoint = ctrlr->endpoint; 4500 assert(endpoint != NULL); 4501 4502 pthread_mutex_lock(&endpoint->lock); 4503 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4504 endpoint->ctrlr = NULL; 4505 free_ctrlr(ctrlr); 4506 pthread_mutex_unlock(&endpoint->lock); 4507 return 0; 4508 } 4509 4510 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4511 /* add another round thread poll to avoid recursive endpoint lock */ 4512 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4513 } 4514 pthread_mutex_unlock(&endpoint->lock); 4515 4516 return 0; 4517 } 4518 4519 /* 4520 * Poll for and process any incoming vfio-user messages. 4521 */ 4522 static int 4523 vfio_user_poll_vfu_ctx(void *ctx) 4524 { 4525 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4526 int ret; 4527 4528 assert(ctrlr != NULL); 4529 4530 /* This will call access_bar0_fn() if there are any writes 4531 * to the portion of the BAR that is not mmap'd */ 4532 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4533 if (spdk_unlikely(ret == -1)) { 4534 if (errno == EBUSY) { 4535 return SPDK_POLLER_IDLE; 4536 } 4537 4538 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4539 4540 /* 4541 * We lost the client; the reset callback will already have 4542 * unregistered the interrupt. 4543 */ 4544 if (errno == ENOTCONN) { 4545 vfio_user_destroy_ctrlr(ctrlr); 4546 return SPDK_POLLER_BUSY; 4547 } 4548 4549 /* 4550 * We might not have got a reset callback in this case, so 4551 * explicitly unregister the interrupt here. 
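		 * Unlike the ENOTCONN case above, the client may still be
		 * connected, so we fail the controller (fail_ctrlr()) below
		 * rather than destroying it.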
4552 */ 4553 spdk_interrupt_unregister(&ctrlr->intr); 4554 ctrlr->intr_fd = -1; 4555 fail_ctrlr(ctrlr); 4556 } 4557 4558 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4559 } 4560 4561 struct vfio_user_post_cpl_ctx { 4562 struct nvmf_vfio_user_ctrlr *ctrlr; 4563 struct nvmf_vfio_user_cq *cq; 4564 struct spdk_nvme_cpl cpl; 4565 }; 4566 4567 static void 4568 _post_completion_msg(void *ctx) 4569 { 4570 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4571 4572 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4573 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4574 free(cpl_ctx); 4575 } 4576 4577 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4578 4579 static int set_ctrlr_intr_mode(struct nvmf_vfio_user_ctrlr *ctrlr); 4580 4581 static int 4582 vfio_user_handle_intr(void *ctx) 4583 { 4584 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4585 int ret = 0; 4586 4587 assert(ctrlr != NULL); 4588 assert(ctrlr->sqs[0] != NULL); 4589 assert(ctrlr->sqs[0]->group != NULL); 4590 4591 ctrlr->self_kick_requested = false; 4592 4593 vfio_user_poll_vfu_ctx(ctrlr); 4594 4595 /* 4596 * See nvmf_vfio_user_get_optimal_poll_group() for why it's OK to only 4597 * poll this poll group. 4598 */ 4599 ret |= nvmf_vfio_user_poll_group_poll(ctrlr->sqs[0]->group); 4600 4601 /* Re-arm the event indexes. */ 4602 ret |= set_ctrlr_intr_mode(ctrlr); 4603 4604 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4605 } 4606 4607 static void 4608 vfio_user_set_intr_mode(struct spdk_poller *poller, void *arg, 4609 bool interrupt_mode) 4610 { 4611 struct nvmf_vfio_user_ctrlr *ctrlr = arg; 4612 assert(ctrlr != NULL); 4613 assert(ctrlr->endpoint != NULL); 4614 4615 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 4616 ctrlr_id(ctrlr), interrupt_mode); 4617 4618 /* 4619 * interrupt_mode needs to persist across controller resets, so store 4620 * it in the endpoint instead. 
4621 */ 4622 ctrlr->endpoint->interrupt_mode = interrupt_mode; 4623 set_ctrlr_intr_mode(ctrlr); 4624 } 4625 4626 static int 4627 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 4628 { 4629 struct nvmf_vfio_user_poll_group *vu_group; 4630 struct nvmf_vfio_user_sq *sq = cb_arg; 4631 struct nvmf_vfio_user_cq *admin_cq; 4632 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 4633 struct nvmf_vfio_user_endpoint *endpoint; 4634 4635 assert(sq != NULL); 4636 assert(req != NULL); 4637 4638 vu_ctrlr = sq->ctrlr; 4639 assert(vu_ctrlr != NULL); 4640 endpoint = vu_ctrlr->endpoint; 4641 assert(endpoint != NULL); 4642 4643 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 4644 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 4645 endpoint->ctrlr = NULL; 4646 free_ctrlr(vu_ctrlr); 4647 return -1; 4648 } 4649 4650 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 4651 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 4652 4653 admin_cq = vu_ctrlr->cqs[0]; 4654 assert(admin_cq != NULL); 4655 4656 pthread_mutex_lock(&endpoint->lock); 4657 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 4658 vu_ctrlr->cntlid = sq->qpair.ctrlr->cntlid; 4659 vu_ctrlr->thread = spdk_get_thread(); 4660 vu_ctrlr->ctrlr = sq->qpair.ctrlr; 4661 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 4662 4663 admin_cq->thread = spdk_get_thread(); 4664 4665 if (in_interrupt_mode(endpoint->transport)) { 4666 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4667 vu_ctrlr, 0); 4668 4669 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 4670 assert(vu_ctrlr->intr_fd != -1); 4671 4672 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 4673 vfio_user_handle_intr, 4674 vu_ctrlr); 4675 4676 assert(vu_ctrlr->intr != NULL); 4677 4678 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 4679 vfio_user_set_intr_mode, 4680 vu_ctrlr); 4681 } else { 4682 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 4683 vu_ctrlr, 1000); 4684 } 4685 } else { 4686 /* For I/O queues this command was generated in response to an 4687 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 4688 * been completed. Complete it now. 4689 */ 4690 if (sq->post_create_io_sq_completion) { 4691 assert(admin_cq->thread != NULL); 4692 if (admin_cq->thread != spdk_get_thread()) { 4693 struct vfio_user_post_cpl_ctx *cpl_ctx; 4694 4695 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 4696 if (!cpl_ctx) { 4697 return -ENOMEM; 4698 } 4699 cpl_ctx->ctrlr = vu_ctrlr; 4700 cpl_ctx->cq = admin_cq; 4701 cpl_ctx->cpl.sqid = 0; 4702 cpl_ctx->cpl.cdw0 = 0; 4703 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 4704 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 4705 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 4706 4707 spdk_thread_send_msg(admin_cq->thread, _post_completion_msg, 4708 cpl_ctx); 4709 } else { 4710 post_completion(vu_ctrlr, admin_cq, 0, 0, 4711 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 4712 } 4713 sq->post_create_io_sq_completion = false; 4714 } else if (in_interrupt_mode(endpoint->transport)) { 4715 /* 4716 * FIXME self_kick() ends up polling all queues on the 4717 * controller thread, and this will be wrong if we ever 4718 * support interrupt mode with I/O queues in a 4719 * different poll group than the controller's. 
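			 * For now this assumption holds: in interrupt mode,
			 * nvmf_vfio_user_get_optimal_poll_group() places all of a
			 * controller's qpairs on the admin queue's poll group.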
4720 */ 4721 self_kick(vu_ctrlr); 4722 } 4723 sq->sq_state = VFIO_USER_SQ_ACTIVE; 4724 } 4725 4726 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 4727 pthread_mutex_unlock(&endpoint->lock); 4728 4729 free(req->req.data); 4730 req->req.data = NULL; 4731 4732 return 0; 4733 } 4734 4735 /* 4736 * Add the given qpair to the given poll group. New qpairs are added via 4737 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 4738 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 4739 * nvmf_transport_poll_group_add(). 4740 */ 4741 static int 4742 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 4743 struct spdk_nvmf_qpair *qpair) 4744 { 4745 struct nvmf_vfio_user_sq *sq; 4746 struct nvmf_vfio_user_req *vu_req; 4747 struct nvmf_vfio_user_ctrlr *ctrlr; 4748 struct spdk_nvmf_request *req; 4749 struct spdk_nvmf_fabric_connect_data *data; 4750 bool admin; 4751 4752 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4753 sq->group = group; 4754 ctrlr = sq->ctrlr; 4755 4756 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 4757 ctrlr_id(ctrlr), sq->qpair.qid, 4758 sq, qpair, group); 4759 4760 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 4761 4762 vu_req = get_nvmf_vfio_user_req(sq); 4763 if (vu_req == NULL) { 4764 return -1; 4765 } 4766 4767 req = &vu_req->req; 4768 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 4769 req->cmd->connect_cmd.cid = 0; 4770 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 4771 req->cmd->connect_cmd.recfmt = 0; 4772 req->cmd->connect_cmd.sqsize = sq->size - 1; 4773 req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid; 4774 4775 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 4776 req->data = calloc(1, req->length); 4777 if (req->data == NULL) { 4778 nvmf_vfio_user_req_free(req); 4779 return -ENOMEM; 4780 } 4781 4782 data = (struct spdk_nvmf_fabric_connect_data *)req->data; 4783 data->cntlid = ctrlr->cntlid; 4784 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 4785 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 4786 4787 vu_req->cb_fn = handle_queue_connect_rsp; 4788 vu_req->cb_arg = sq; 4789 4790 SPDK_DEBUGLOG(nvmf_vfio, 4791 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 4792 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 4793 4794 spdk_nvmf_request_exec_fabrics(req); 4795 return 0; 4796 } 4797 4798 static int 4799 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 4800 struct spdk_nvmf_qpair *qpair) 4801 { 4802 struct nvmf_vfio_user_sq *sq; 4803 struct nvmf_vfio_user_poll_group *vu_group; 4804 4805 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4806 4807 SPDK_DEBUGLOG(nvmf_vfio, 4808 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 4809 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 4810 4811 4812 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4813 TAILQ_REMOVE(&vu_group->sqs, sq, link); 4814 4815 return 0; 4816 } 4817 4818 static void 4819 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 4820 { 4821 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 4822 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 4823 vu_req->iovcnt = 0; 4824 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 4825 4826 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 4827 } 4828 4829 static int 4830 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 4831 { 4832 struct nvmf_vfio_user_sq *sq; 4833 struct nvmf_vfio_user_req *vu_req; 4834 4835 
assert(req != NULL); 4836 4837 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 4838 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 4839 4840 _nvmf_vfio_user_req_free(sq, vu_req); 4841 4842 return 0; 4843 } 4844 4845 static int 4846 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 4847 { 4848 struct nvmf_vfio_user_sq *sq; 4849 struct nvmf_vfio_user_req *vu_req; 4850 4851 assert(req != NULL); 4852 4853 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 4854 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 4855 4856 if (vu_req->cb_fn != NULL) { 4857 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 4858 fail_ctrlr(sq->ctrlr); 4859 } 4860 } 4861 4862 _nvmf_vfio_user_req_free(sq, vu_req); 4863 4864 return 0; 4865 } 4866 4867 static void 4868 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 4869 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 4870 { 4871 struct nvmf_vfio_user_sq *sq; 4872 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 4873 struct nvmf_vfio_user_endpoint *endpoint; 4874 4875 assert(qpair != NULL); 4876 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4877 vu_ctrlr = sq->ctrlr; 4878 endpoint = vu_ctrlr->endpoint; 4879 4880 pthread_mutex_lock(&endpoint->lock); 4881 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 4882 delete_sq_done(vu_ctrlr, sq); 4883 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 4884 endpoint->ctrlr = NULL; 4885 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 4886 /* The controller will be freed, we can resume the subsystem 4887 * now so that the endpoint can be ready to accept another 4888 * new connection. 4889 */ 4890 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 4891 vfio_user_endpoint_resume_done, endpoint); 4892 } 4893 free_ctrlr(vu_ctrlr); 4894 } 4895 pthread_mutex_unlock(&endpoint->lock); 4896 4897 if (cb_fn) { 4898 cb_fn(cb_arg); 4899 } 4900 } 4901 4902 /** 4903 * Returns a preallocated request, or NULL if there isn't one available. 
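 *
 * Requests cycle through sq->free_reqs, which alloc_sq_reqs() populated when
 * the SQ was set up:
 *
 *   get_nvmf_vfio_user_req()   -> TAILQ_REMOVE(&sq->free_reqs, ...)
 *   _nvmf_vfio_user_req_free() -> TAILQ_INSERT_TAIL(&sq->free_reqs, ...)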
4904 */ 4905 static struct nvmf_vfio_user_req * 4906 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 4907 { 4908 struct nvmf_vfio_user_req *req; 4909 4910 if (sq == NULL) { 4911 return NULL; 4912 } 4913 4914 req = TAILQ_FIRST(&sq->free_reqs); 4915 if (req == NULL) { 4916 return NULL; 4917 } 4918 4919 TAILQ_REMOVE(&sq->free_reqs, req, link); 4920 4921 return req; 4922 } 4923 4924 static int 4925 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 4926 { 4927 uint16_t nr; 4928 uint32_t nlb, nsid; 4929 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 4930 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 4931 struct spdk_nvmf_ns *ns; 4932 4933 nsid = cmd->nsid; 4934 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 4935 if (ns == NULL || ns->bdev == NULL) { 4936 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 4937 return -EINVAL; 4938 } 4939 4940 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 4941 nr = cmd->cdw10_bits.dsm.nr + 1; 4942 return nr * sizeof(struct spdk_nvme_dsm_range); 4943 } 4944 4945 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 4946 return nlb * spdk_bdev_get_block_size(ns->bdev); 4947 } 4948 4949 static int 4950 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 4951 { 4952 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 4953 uint32_t len = 0; 4954 uint8_t fid; 4955 int iovcnt; 4956 4957 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 4958 req->length = 0; 4959 req->data = NULL; 4960 4961 if (req->xfer == SPDK_NVME_DATA_NONE) { 4962 return 0; 4963 } 4964 4965 switch (cmd->opc) { 4966 case SPDK_NVME_OPC_IDENTIFY: 4967 len = 4096; 4968 break; 4969 case SPDK_NVME_OPC_GET_LOG_PAGE: 4970 len = (((cmd->cdw11_bits.get_log_page.numdu << 16) | cmd->cdw10_bits.get_log_page.numdl) + 1) * 4; 4971 break; 4972 case SPDK_NVME_OPC_GET_FEATURES: 4973 case SPDK_NVME_OPC_SET_FEATURES: 4974 fid = cmd->cdw10_bits.set_features.fid; 4975 switch (fid) { 4976 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 4977 len = 4096; 4978 break; 4979 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 4980 len = 256; 4981 break; 4982 case SPDK_NVME_FEAT_TIMESTAMP: 4983 len = 8; 4984 break; 4985 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 4986 len = 512; 4987 break; 4988 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 4989 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 4990 len = 16; 4991 } else { 4992 len = 8; 4993 } 4994 break; 4995 default: 4996 return 0; 4997 } 4998 break; 4999 default: 5000 return 0; 5001 } 5002 5003 /* ADMIN command will not use SGL */ 5004 if (cmd->psdt != 0) { 5005 return -EINVAL; 5006 } 5007 5008 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5009 if (iovcnt < 0) { 5010 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5011 ctrlr_id(ctrlr), cmd->opc); 5012 return -1; 5013 } 5014 req->length = len; 5015 req->data = req->iov[0].iov_base; 5016 req->iovcnt = iovcnt; 5017 5018 return 0; 5019 } 5020 5021 /* 5022 * Map an I/O command's buffers. 5023 * 5024 * Returns 0 on success and -errno on failure. 
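 *
 * The transfer length comes from get_nvmf_io_req_length(), roughly:
 *
 *   DSM:       len = (cdw10.nr + 1) * sizeof(struct spdk_nvme_dsm_range)
 *   otherwise: len = (cdw12.nlb + 1) * block size of the namespace's bdev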
5025 */ 5026 static int 5027 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5028 { 5029 int len, iovcnt; 5030 struct spdk_nvme_cmd *cmd; 5031 5032 assert(ctrlr != NULL); 5033 assert(req != NULL); 5034 5035 cmd = &req->cmd->nvme_cmd; 5036 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5037 req->length = 0; 5038 req->data = NULL; 5039 5040 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5041 return 0; 5042 } 5043 5044 len = get_nvmf_io_req_length(req); 5045 if (len < 0) { 5046 return -EINVAL; 5047 } 5048 req->length = len; 5049 5050 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5051 if (iovcnt < 0) { 5052 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5053 return -EFAULT; 5054 } 5055 req->data = req->iov[0].iov_base; 5056 req->iovcnt = iovcnt; 5057 5058 return 0; 5059 } 5060 5061 static int 5062 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5063 struct nvmf_vfio_user_sq *sq) 5064 { 5065 int err; 5066 struct nvmf_vfio_user_req *vu_req; 5067 struct spdk_nvmf_request *req; 5068 5069 assert(ctrlr != NULL); 5070 assert(cmd != NULL); 5071 5072 vu_req = get_nvmf_vfio_user_req(sq); 5073 if (spdk_unlikely(vu_req == NULL)) { 5074 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5075 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5076 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5077 5078 } 5079 req = &vu_req->req; 5080 5081 assert(req->qpair != NULL); 5082 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5083 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5084 5085 vu_req->cb_fn = handle_cmd_rsp; 5086 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5087 req->cmd->nvme_cmd = *cmd; 5088 5089 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5090 err = map_admin_cmd_req(ctrlr, req); 5091 } else { 5092 switch (cmd->opc) { 5093 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5094 case SPDK_NVME_OPC_RESERVATION_REPORT: 5095 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5096 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5097 err = -ENOTSUP; 5098 break; 5099 default: 5100 err = map_io_cmd_req(ctrlr, req); 5101 break; 5102 } 5103 } 5104 5105 if (spdk_unlikely(err < 0)) { 5106 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5107 ctrlr_id(ctrlr), cmd->opc); 5108 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5109 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5110 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5111 _nvmf_vfio_user_req_free(sq, vu_req); 5112 return err; 5113 } 5114 5115 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5116 spdk_nvmf_request_exec(req); 5117 5118 return 0; 5119 } 5120 5121 /* 5122 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5123 * here: if the host isn't up to date, and is apparently not actively processing 5124 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5125 */ 5126 static void 5127 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5128 struct nvmf_vfio_user_sq *sq) 5129 { 5130 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5131 uint32_t cq_head; 5132 uint32_t cq_tail; 5133 5134 if (!cq->ien || !ctrlr_interrupt_enabled(ctrlr) || 5135 !adaptive_irq_enabled(ctrlr, cq)) { 5136 return; 5137 } 5138 5139 cq_tail = *cq_tailp(cq); 5140 5141 /* Already sent? 
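	 * (i.e. is the CQ tail unchanged since the last interrupt we triggered?)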
	 */
	if (cq_tail == cq->last_trigger_irq_tail) {
		return;
	}

	spdk_ivdt_dcache(cq_dbl_headp(cq));
	cq_head = *cq_dbl_headp(cq);

	if (cq_head != cq_tail && cq_head == cq->last_head) {
		int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
		if (err != 0) {
			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
				    ctrlr_id(ctrlr));
		} else {
			cq->last_trigger_irq_tail = cq_tail;
		}
	}

	cq->last_head = cq_head;
}

/* Returns the number of commands processed, or a negative value on error. */
static int
nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;
	uint32_t new_tail;
	int count = 0;

	assert(sq != NULL);

	ctrlr = sq->ctrlr;

	handle_suppressed_irq(ctrlr, sq);

	/* On aarch64 platforms, doorbell updates from the guest VM may not be
	 * visible on the SPDK target side because of a memory type mismatch:
	 * the guest maps the doorbells as device memory, while the SPDK target
	 * treats them as normal memory. See
	 * "https://developer.arm.com/documentation/102376/0100/
	 * Memory-aliasing-and-mismatched-memory-types". A plain spdk_mb() is
	 * not sufficient here; invalidating the cache line with "dc civac"
	 * (spdk_ivdt_dcache()) works around this.
	 */
	spdk_ivdt_dcache(sq_dbl_tailp(sq));

	/* Load-Acquire. */
	new_tail = *sq_dbl_tailp(sq);

	new_tail = new_tail & 0xffffu;
	if (spdk_unlikely(new_tail >= sq->size)) {
		union spdk_nvme_async_event_completion event = {};

		SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid,
			      new_tail);
		event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR;
		event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE;
		nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, event);

		return -1;
	}

	if (*sq_headp(sq) == new_tail) {
		return 0;
	}

	SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n",
		      ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail);
	if (ctrlr->sdbl != NULL) {
		SPDK_DEBUGLOG(nvmf_vfio,
			      "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n",
			      ctrlr_id(ctrlr), sq->qid,
			      ctrlr->bar0_doorbells[queue_index(sq->qid, false)],
			      ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)],
			      ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]);
	}

	/*
	 * Ensure that changes to the queue are visible to us.
	 * The host driver should write the queue first, do a wmb(), and then
	 * update the SQ tail doorbell (their Store-Release).
	 */
	spdk_rmb();

	count = handle_sq_tdbl_write(ctrlr, new_tail, sq);
	if (count < 0) {
		fail_ctrlr(ctrlr);
	}

	return count;
}

/*
 * vfio-user transport poll handler. Note that the library context is polled in
 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
 * active SQs.
 *
 * Returns the number of commands processed, or a negative value on error.
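 *
 * SQs that are not in VFIO_USER_SQ_ACTIVE state, or that have no size yet
 * (i.e. have not been created), are skipped rather than treated as errors.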
5240 */ 5241 static int 5242 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 5243 { 5244 struct nvmf_vfio_user_poll_group *vu_group; 5245 struct nvmf_vfio_user_sq *sq, *tmp; 5246 int count = 0; 5247 5248 assert(group != NULL); 5249 5250 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5251 5252 SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n"); 5253 5254 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) { 5255 int ret; 5256 5257 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 5258 continue; 5259 } 5260 5261 ret = nvmf_vfio_user_sq_poll(sq); 5262 5263 if (ret < 0) { 5264 return ret; 5265 } 5266 5267 count += ret; 5268 } 5269 5270 return count; 5271 } 5272 5273 static int 5274 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5275 struct spdk_nvme_transport_id *trid) 5276 { 5277 struct nvmf_vfio_user_sq *sq; 5278 struct nvmf_vfio_user_ctrlr *ctrlr; 5279 5280 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5281 ctrlr = sq->ctrlr; 5282 5283 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5284 return 0; 5285 } 5286 5287 static int 5288 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 5289 struct spdk_nvme_transport_id *trid) 5290 { 5291 return 0; 5292 } 5293 5294 static int 5295 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5296 struct spdk_nvme_transport_id *trid) 5297 { 5298 struct nvmf_vfio_user_sq *sq; 5299 struct nvmf_vfio_user_ctrlr *ctrlr; 5300 5301 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5302 ctrlr = sq->ctrlr; 5303 5304 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5305 return 0; 5306 } 5307 5308 static void 5309 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 5310 struct spdk_nvmf_request *req) 5311 { 5312 struct spdk_nvmf_request *req_to_abort = NULL; 5313 struct spdk_nvmf_request *temp_req = NULL; 5314 uint16_t cid; 5315 5316 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5317 5318 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) { 5319 struct nvmf_vfio_user_req *vu_req; 5320 5321 vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req); 5322 5323 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 5324 req_to_abort = temp_req; 5325 break; 5326 } 5327 } 5328 5329 if (req_to_abort == NULL) { 5330 spdk_nvmf_request_complete(req); 5331 return; 5332 } 5333 5334 req->req_to_abort = req_to_abort; 5335 nvmf_ctrlr_abort_request(req); 5336 } 5337 5338 static void 5339 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 5340 { 5341 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 5342 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 5343 opts->in_capsule_data_size = 0; 5344 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 5345 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 5346 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 5347 opts->num_shared_buffers = 0; 5348 opts->buf_cache_size = 0; 5349 opts->association_timeout = 0; 5350 opts->transport_specific = NULL; 5351 } 5352 5353 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 5354 .name = "VFIOUSER", 5355 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 5356 .opts_init = nvmf_vfio_user_opts_init, 5357 .create = nvmf_vfio_user_create, 5358 .destroy = nvmf_vfio_user_destroy, 5359 5360 .listen = nvmf_vfio_user_listen, 5361 .stop_listen = nvmf_vfio_user_stop_listen, 5362 .cdata_init = nvmf_vfio_user_cdata_init, 5363 
.listen_associate = nvmf_vfio_user_listen_associate, 5364 5365 .listener_discover = nvmf_vfio_user_discover, 5366 5367 .poll_group_create = nvmf_vfio_user_poll_group_create, 5368 .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group, 5369 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 5370 .poll_group_add = nvmf_vfio_user_poll_group_add, 5371 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 5372 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 5373 5374 .req_free = nvmf_vfio_user_req_free, 5375 .req_complete = nvmf_vfio_user_req_complete, 5376 5377 .qpair_fini = nvmf_vfio_user_close_qpair, 5378 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 5379 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 5380 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 5381 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 5382 }; 5383 5384 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 5385 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 5386 SPDK_LOG_REGISTER_COMPONENT(vfio_user_db) 5387