/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2020 Intel Corporation.
 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved.
 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

/*
 * NVMe over vfio-user transport
 */

#include <sys/param.h>

#include <vfio-user/libvfio-user.h>
#include <vfio-user/pci_defs.h>

#include "spdk/barrier.h"
#include "spdk/stdinc.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf_transport.h"
#include "spdk/sock.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"

#include "transport.h"

#include "nvmf_internal.h"

/*
 * Exchange the values of two lvalues. Both arguments are expanded more than
 * once and are not parenthesized, so pass plain lvalues without side effects.
 */
#define SWAP(x, y) \
	do \
	{ \
		typeof(x) _tmp = x; \
		x = y; \
		y = _tmp; \
	} while (0)

#define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
#define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
#define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
#define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE

#define NVME_DOORBELLS_OFFSET 0x1000
#define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2
#define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3
#define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX

/*
 * Hard upper bound on queue pairs per controller (it sizes the per-controller
 * queue arrays and the doorbell area below); the default is a quarter of it.
 */
#define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 512
#define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4)

/* NVMe spec 1.4, section 5.21.1.7 */
SPDK_STATIC_ASSERT(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR >= 2 &&
		   NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR <= SPDK_NVME_MAX_IO_QUEUES,
		   "bad number of queues");

/*
 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space
 * available on PCI-X 2.0 and PCI Express buses
 */
#define NVME_REG_CFG_SIZE 0x1000

/*
 * Doorbells must be page aligned so that they can be memory mapped.
 *
 * TODO does the NVMe spec also require this? Document it.
 */
/* Two doorbell registers (SQ tail, CQ head) per queue pair, rounded up to 4 KiB. */
#define NVMF_VFIO_USER_DOORBELLS_SIZE \
	SPDK_ALIGN_CEIL( \
		(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2 * SPDK_NVME_DOORBELL_REGISTER_SIZE), \
		0x1000)
#define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE)

/*
 * TODO check the PCI spec whether BAR4 and BAR5 really have to be at least one
 * page and a multiple of page size (maybe QEMU also needs this?). Document all
 * this.
 */

/*
 * MSI-X Pending Bit Array Size
 *
 * TODO according to the PCI spec we need one bit per vector, document the
 * relevant section.
 *
 * If the first argument to SPDK_ALIGN_CEIL is 0 then the result is 0, so we
 * would end up with a 0-size BAR5.
 */
#define NVME_IRQ_MSIX_NUM MAX(CHAR_BIT, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR)
#define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / CHAR_BIT), 0x1000)
SPDK_STATIC_ASSERT(NVME_BAR5_SIZE > 0, "Incorrect size");

/* MSI-X Table Size */
#define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000)
SPDK_STATIC_ASSERT(NVME_BAR4_SIZE > 0, "Incorrect size");

struct nvmf_vfio_user_req;

/* Per-request callback; receives the request and the cb_arg stored with it. */
typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);

/* 1 more for PRP2 list itself */
#define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1)

enum nvmf_vfio_user_req_state {
	VFIO_USER_REQUEST_STATE_FREE = 0,
	VFIO_USER_REQUEST_STATE_EXECUTING,
};

/*
 * Support for live migration in NVMf/vfio-user: live migration is implemented
 * by stopping the NVMf subsystem when the device is instructed to enter the
 * stop-and-copy state and then trivially, and most importantly safely,
 * collecting migration state and providing it to the vfio-user client. We
 * don't provide any migration state at the pre-copy state as that's too
 * complicated to do, we might support this in the future.
 */


/* NVMe device state representation */
/* Saved state of one submission queue; the static assert pins it at 0x18 bytes. */
struct nvme_migr_sq_state {
	uint16_t sqid;
	uint16_t cqid;
	uint32_t head;
	uint32_t size;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size");

/* Saved state of one completion queue; the static assert pins it at 0x20 bytes. */
struct nvme_migr_cq_state {
	uint16_t cqid;
	uint16_t phase;
	uint32_t tail;
	uint32_t size;
	uint32_t iv;
	uint32_t ien;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size");

#define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23

/* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned.
 *
 * NVMe device migration region is defined as below:
 * -------------------------------------------------------------------------
 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs |
 * -------------------------------------------------------------------------
 *
 * Keep vfio_user_nvme_migr_header as a fixed 0x1000 length, all new added fields
 * can use the reserved space at the end of the data structure.
 */
struct vfio_user_nvme_migr_header {
	/* Magic value to validate migration data */
	uint32_t magic;
	/* Version to check the data is same from source to destination */
	uint32_t version;

	/* The library uses this field to know how many fields in this
	 * structure are valid, starting at the beginning of this data
	 * structure. New added fields in future use `unused` memory
	 * spaces.
	 */
	uint32_t opts_size;
	uint32_t reserved0;

	/* BARs information */
	uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS];
	uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS];

	/* Queue pair start offset, starting at the beginning of this
	 * data structure.
	 */
	uint64_t qp_offset;
	uint64_t qp_len;

	/* Controller data structure */
	uint32_t num_io_queues;
	uint32_t reserved1;

	/* NVMf controller data offset and length if exist, starting at
	 * the beginning of this data structure.
	 */
	uint64_t nvmf_data_offset;
	uint64_t nvmf_data_len;

	/*
	 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA
	 * address.
	 */
	uint32_t sdbl;

	/* Shadow doorbell DMA addresses. */
	uint64_t shadow_doorbell_buffer;
	uint64_t eventidx_buffer;

	/* Reserved memory space for new added fields, the
	 * field is always at the end of this data structure.
	 */
	uint8_t unused[3856];
};
/* Any field added above must shrink `unused` so this assert keeps holding. */
SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");

/* Saved state of one queue pair: the SQ state followed by the CQ state. */
struct vfio_user_nvme_migr_qp {
	struct nvme_migr_sq_state sq;
	struct nvme_migr_cq_state cq;
};

/* NVMe state definition used to load/restore from/to NVMe migration BAR region */
struct vfio_user_nvme_migr_state {
	struct vfio_user_nvme_migr_header ctrlr_header;
	struct spdk_nvmf_ctrlr_migr_data nvmf_data;
	struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	uint8_t doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE];
	uint8_t cfg[NVME_REG_CFG_SIZE];
};

/* Transport-level request wrapping the generic NVMf request. */
struct nvmf_vfio_user_req {
	struct spdk_nvmf_request req;
	struct spdk_nvme_cpl rsp;
	struct spdk_nvme_cmd cmd;

	enum nvmf_vfio_user_req_state state;
	nvmf_vfio_user_req_cb_fn cb_fn;
	void *cb_arg;

	/* old CC before prop_set_cc fabric command */
	union spdk_nvme_cc_register cc;

	TAILQ_ENTRY(nvmf_vfio_user_req) link;

	struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS];
	uint8_t iovcnt;

	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
	uint8_t sg[];
};

/*
 * Mapping of an NVMe queue.
 *
 * This holds the information tracking a local process mapping of an NVMe queue
 * shared by the client.
 */
struct nvme_q_mapping {
	/* iov of local process mapping. */
	struct iovec iov;
	/* Stored sg, needed for unmap. */
	dma_sg_t *sg;
	/* Client PRP of queue. */
	uint64_t prp1;
};

enum nvmf_vfio_user_sq_state {
	VFIO_USER_SQ_UNUSED = 0,
	VFIO_USER_SQ_CREATED,
	VFIO_USER_SQ_DELETED,
	VFIO_USER_SQ_ACTIVE,
	VFIO_USER_SQ_INACTIVE
};

enum nvmf_vfio_user_cq_state {
	VFIO_USER_CQ_UNUSED = 0,
	VFIO_USER_CQ_CREATED,
	VFIO_USER_CQ_DELETED,
};

enum nvmf_vfio_user_ctrlr_state {
	VFIO_USER_CTRLR_CREATING = 0,
	VFIO_USER_CTRLR_RUNNING,
	/* Quiesce requested by libvfio-user */
	VFIO_USER_CTRLR_PAUSING,
	/* NVMf subsystem is paused, it's safe to do PCI reset, memory register,
	 * memory unregister, and vfio migration state transition in this state.
	 */
	VFIO_USER_CTRLR_PAUSED,
	/*
	 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI
	 * reset, memory register and unregister, controller in destination VM has
	 * been restored). NVMf subsystem resume has been requested.
	 */
	VFIO_USER_CTRLR_RESUMING,
	/*
	 * Implies that the NVMf subsystem is paused. Both controller in source VM and
	 * destination VM is in this state when doing live migration.
	 */
	VFIO_USER_CTRLR_MIGRATING
};

/* Submission queue state; embeds the generic NVMf qpair. */
struct nvmf_vfio_user_sq {
	struct spdk_nvmf_qpair qpair;
	struct spdk_nvmf_transport_poll_group *group;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_sq_state sq_state;

	uint32_t head;
	volatile uint32_t *dbl_tailp;

	/* Whether a shadow doorbell eventidx needs setting. */
	bool need_rearm;

	/* multiple SQs can be mapped to the same CQ */
	uint16_t cqid;

	/* handle_queue_connect_rsp() can be used both for CREATE IO SQ response
	 * and SQ re-connect response in the destination VM, for the prior case,
	 * we will post a NVMe completion to VM, we will not set this flag when
	 * re-connecting SQs in the destination VM.
	 */
	bool post_create_io_sq_completion;
	/* Copy of Create IO SQ command, this field is used together with
	 * `post_create_io_sq_completion` flag.
	 */
	struct spdk_nvme_cmd create_io_sq_cmd;

	/* Currently unallocated reqs. */
	TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs;
	/* Poll group entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) link;
	/* Connected SQ entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) tailq;
};

/* Completion queue state. */
struct nvmf_vfio_user_cq {
	struct spdk_nvmf_transport_poll_group *group;
	int cq_ref;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_cq_state cq_state;

	uint32_t tail;
	volatile uint32_t *dbl_headp;

	/* Toggled each time the tail wraps to 0 (see cq_tail_advance()). */
	bool phase;

	uint16_t iv;
	bool ien;

	/* Cached copy of the head doorbell, refreshed by cq_is_full(). */
	uint32_t last_head;
	uint32_t last_trigger_irq_tail;
};

struct nvmf_vfio_user_poll_group {
	struct spdk_nvmf_transport_poll_group group;
	TAILQ_ENTRY(nvmf_vfio_user_poll_group) link;
	TAILQ_HEAD(, nvmf_vfio_user_sq) sqs;
	struct spdk_interrupt *intr;
	int intr_fd;
	struct {

		/*
		 * ctrlr_intr and ctrlr_kicks will be zero for all other poll
		 * groups. However, they can be zero even for the poll group
		 * the controller belongs to if no vfio-user message has been
		 * received or the controller hasn't been kicked yet.
		 */

		/*
		 * Number of times vfio_user_ctrlr_intr() has run:
		 * vfio-user file descriptor has been ready or explicitly
		 * kicked (see below).
		 */
		uint64_t ctrlr_intr;

		/*
		 * Kicks to the controller by ctrlr_kick().
		 * ctrlr_intr - ctrlr_kicks is the number of times the
		 * vfio-user poll file descriptor has been ready.
		 */
		uint64_t ctrlr_kicks;

		/*
		 * How many times we won the race arming an SQ.
		 */
		uint64_t won;

		/*
		 * How many times we lost the race arming an SQ
		 */
		uint64_t lost;

		/*
		 * How many requests we processed in total each time we lost
		 * the rearm race.
		 */
		uint64_t lost_count;

		/*
		 * Number of times we attempted to rearm all the SQs in the
		 * poll group.
		 */
		uint64_t rearms;

		uint64_t pg_process_count;
		uint64_t intr;
		uint64_t polls;
		uint64_t polls_spurious;
		uint64_t poll_reqs;
		uint64_t poll_reqs_squared;
		uint64_t cqh_admin_writes;
		uint64_t cqh_io_writes;
	} stats;
};

/*
 * Local mappings of the two shadow doorbell buffers. sgs and iovs each hold
 * NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT entries (see map_sdbl()).
 */
struct nvmf_vfio_user_shadow_doorbells {
	volatile uint32_t *shadow_doorbells;
	volatile uint32_t *eventidxs;
	dma_sg_t *sgs;
	struct iovec *iovs;
};

struct nvmf_vfio_user_ctrlr {
	struct nvmf_vfio_user_endpoint *endpoint;
	struct nvmf_vfio_user_transport *transport;

	/* Connected SQs list */
	TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs;
	enum nvmf_vfio_user_ctrlr_state state;

	/*
	 * Tells whether live migration data have been prepared. This is used
	 * by the get_pending_bytes callback to tell whether or not the
	 * previous iteration finished.
	 */
	bool migr_data_prepared;

	/* Controller is in source VM when doing live migration */
	bool in_source_vm;

	struct spdk_thread *thread;
	struct spdk_poller *vfu_ctx_poller;
	struct spdk_interrupt *intr;
	int intr_fd;

	bool queued_quiesce;

	bool reset_shn;
	bool disconnect;

	uint16_t cntlid;
	struct spdk_nvmf_ctrlr *ctrlr;

	/* Indexed by queue ID; a NULL entry means no queue object for that ID. */
	struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];

	TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link;

	volatile uint32_t *bar0_doorbells;
	struct nvmf_vfio_user_shadow_doorbells *sdbl;
	/*
	 * Shadow doorbells PRPs to provide during the stop-and-copy state.
	 */
	uint64_t shadow_doorbell_buffer;
	uint64_t eventidx_buffer;

	bool adaptive_irqs_enabled;
};

/* Endpoint in vfio-user is associated with a socket file, which
 * is the representative of a PCI endpoint.
 */
struct nvmf_vfio_user_endpoint {
	struct nvmf_vfio_user_transport *transport;
	vfu_ctx_t *vfu_ctx;
	struct spdk_poller *accept_poller;
	struct spdk_thread *accept_thread;
	bool interrupt_mode;
	struct msixcap *msix;
	vfu_pci_config_space_t *pci_config_space;
	int devmem_fd;
	int accept_intr_fd;
	struct spdk_interrupt *accept_intr;

	volatile uint32_t *bar0_doorbells;

	int migr_fd;
	void *migr_data;

	struct spdk_nvme_transport_id trid;
	struct spdk_nvmf_subsystem *subsystem;

	/* Controller is associated with an active socket connection,
	 * the lifecycle of the controller is same as the VM.
	 * Currently we only support one active connection, as the NVMe
	 * specification defines, we may support multiple controllers in
	 * future, so that it can support e.g: RESERVATION.
	 */
	struct nvmf_vfio_user_ctrlr *ctrlr;
	pthread_mutex_t lock;

	bool need_async_destroy;
	/* The subsystem is in PAUSED state and need to be resumed, TRUE
	 * only when migration is done successfully and the controller is
	 * in source VM.
	 */
	bool need_resume;
	/* Start the accept poller again after destroying the controller */
	bool need_relisten;

	TAILQ_ENTRY(nvmf_vfio_user_endpoint) link;
};

/* Transport-specific boolean options (filled in from the JSON decoder table). */
struct nvmf_vfio_user_transport_opts {
	bool disable_mappable_bar0;
	bool disable_adaptive_irq;
	bool disable_shadow_doorbells;
	bool disable_compare;
	bool enable_intr_mode_sq_spreading;
};

struct nvmf_vfio_user_transport {
	struct spdk_nvmf_transport transport;
	struct nvmf_vfio_user_transport_opts transport_opts;
	bool intr_mode_supported;
	pthread_mutex_t lock;
	TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints;

	pthread_mutex_t pg_lock;
	TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups;
	struct nvmf_vfio_user_poll_group *next_pg;
};

/*
 * function prototypes
 */
static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);

static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq);

/*
 * Local process virtual address of a queue.
540 */ 541 static inline void * 542 q_addr(struct nvme_q_mapping *mapping) 543 { 544 return mapping->iov.iov_base; 545 } 546 547 static inline int 548 queue_index(uint16_t qid, bool is_cq) 549 { 550 return (qid * 2) + is_cq; 551 } 552 553 static inline volatile uint32_t * 554 sq_headp(struct nvmf_vfio_user_sq *sq) 555 { 556 assert(sq != NULL); 557 return &sq->head; 558 } 559 560 static inline volatile uint32_t * 561 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 562 { 563 assert(sq != NULL); 564 return sq->dbl_tailp; 565 } 566 567 static inline volatile uint32_t * 568 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 569 { 570 assert(cq != NULL); 571 return cq->dbl_headp; 572 } 573 574 static inline volatile uint32_t * 575 cq_tailp(struct nvmf_vfio_user_cq *cq) 576 { 577 assert(cq != NULL); 578 return &cq->tail; 579 } 580 581 static inline void 582 sq_head_advance(struct nvmf_vfio_user_sq *sq) 583 { 584 assert(sq != NULL); 585 586 assert(*sq_headp(sq) < sq->size); 587 (*sq_headp(sq))++; 588 589 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 590 *sq_headp(sq) = 0; 591 } 592 } 593 594 static inline void 595 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 596 { 597 assert(cq != NULL); 598 599 assert(*cq_tailp(cq) < cq->size); 600 (*cq_tailp(cq))++; 601 602 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 603 *cq_tailp(cq) = 0; 604 cq->phase = !cq->phase; 605 } 606 } 607 608 /* 609 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 610 * control: if there is no space in the CQ, we should wait until there is. 611 * 612 * In practice, we just fail the controller instead: as it happens, all host 613 * implementations we care about right-size the CQ: this is required anyway for 614 * NVMEoF support (see 3.3.2.8). 615 * 616 * Since reading the head doorbell is relatively expensive, we use the cached 617 * value, so we only have to read it for real if it appears that we are full. 
618 */ 619 static inline bool 620 cq_is_full(struct nvmf_vfio_user_cq *cq) 621 { 622 uint32_t qindex; 623 624 assert(cq != NULL); 625 626 qindex = *cq_tailp(cq) + 1; 627 if (spdk_unlikely(qindex == cq->size)) { 628 qindex = 0; 629 } 630 631 if (qindex != cq->last_head) { 632 return false; 633 } 634 635 cq->last_head = *cq_dbl_headp(cq); 636 637 return qindex == cq->last_head; 638 } 639 640 static bool 641 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 642 { 643 assert(vu_ctrlr != NULL); 644 645 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 646 return false; 647 } 648 649 if (is_cq) { 650 if (vu_ctrlr->cqs[qid] == NULL) { 651 return false; 652 } 653 654 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 655 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 656 } 657 658 if (vu_ctrlr->sqs[qid] == NULL) { 659 return false; 660 } 661 662 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 663 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 664 } 665 666 static char * 667 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 668 { 669 return endpoint->trid.traddr; 670 } 671 672 static char * 673 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 674 { 675 if (!ctrlr || !ctrlr->endpoint) { 676 return "Null Ctrlr"; 677 } 678 679 return endpoint_id(ctrlr->endpoint); 680 } 681 682 /* Return the poll group for the admin queue of the controller. 
*/ 683 static inline struct nvmf_vfio_user_poll_group * 684 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 685 { 686 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 687 struct nvmf_vfio_user_poll_group, 688 group); 689 } 690 691 static inline struct spdk_thread * 692 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 693 { 694 return vu_pg->group.group->thread; 695 } 696 697 static dma_sg_t * 698 index_to_sg_t(void *arr, size_t i) 699 { 700 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 701 } 702 703 static inline size_t 704 vfio_user_migr_data_len(void) 705 { 706 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 707 } 708 709 static inline bool 710 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 711 { 712 return spdk_interrupt_mode_is_enabled() && 713 vu_transport->intr_mode_supported; 714 } 715 716 static int vfio_user_ctrlr_intr(void *ctx); 717 718 static void 719 vfio_user_msg_ctrlr_intr(void *ctx) 720 { 721 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 722 struct nvmf_vfio_user_poll_group *vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 723 724 vu_ctrlr_group->stats.ctrlr_kicks++; 725 726 vfio_user_ctrlr_intr(ctx); 727 } 728 729 /* 730 * Kick (force a wakeup) of all poll groups for this controller. 731 * vfio_user_ctrlr_intr() itself arranges for kicking other poll groups if 732 * needed. 733 */ 734 static void 735 ctrlr_kick(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 736 { 737 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 738 739 SPDK_DEBUGLOG(vfio_user_db, "%s: kicked\n", ctrlr_id(vu_ctrlr)); 740 741 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 742 743 spdk_thread_send_msg(poll_group_to_thread(vu_ctrlr_group), 744 vfio_user_msg_ctrlr_intr, vu_ctrlr); 745 } 746 747 /* 748 * Make the given DMA address and length available (locally mapped) via iov. 
749 */ 750 static void * 751 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 752 struct iovec *iov, int prot) 753 { 754 int ret; 755 756 assert(ctx != NULL); 757 assert(sg != NULL); 758 assert(iov != NULL); 759 760 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 761 if (ret < 0) { 762 return NULL; 763 } 764 765 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 766 if (ret != 0) { 767 return NULL; 768 } 769 770 assert(iov->iov_base != NULL); 771 return iov->iov_base; 772 } 773 774 static int 775 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 776 uint32_t max_iovcnt, uint32_t len, size_t mps, 777 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 778 { 779 uint64_t prp1, prp2; 780 void *vva; 781 uint32_t i; 782 uint32_t residue_len, nents; 783 uint64_t *prp_list; 784 uint32_t iovcnt; 785 786 assert(max_iovcnt > 0); 787 788 prp1 = cmd->dptr.prp.prp1; 789 prp2 = cmd->dptr.prp.prp2; 790 791 /* PRP1 may started with unaligned page address */ 792 residue_len = mps - (prp1 % mps); 793 residue_len = spdk_min(len, residue_len); 794 795 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 796 if (spdk_unlikely(vva == NULL)) { 797 SPDK_ERRLOG("GPA to VVA failed\n"); 798 return -EINVAL; 799 } 800 len -= residue_len; 801 if (len && max_iovcnt < 2) { 802 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 803 return -ERANGE; 804 } 805 iovs[0].iov_base = vva; 806 iovs[0].iov_len = residue_len; 807 808 if (len) { 809 if (spdk_unlikely(prp2 == 0)) { 810 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 811 return -EINVAL; 812 } 813 814 if (len <= mps) { 815 /* 2 PRP used */ 816 iovcnt = 2; 817 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 818 if (spdk_unlikely(vva == NULL)) { 819 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 820 prp2, len); 821 return -EINVAL; 822 } 823 iovs[1].iov_base = vva; 824 iovs[1].iov_len = len; 825 } else { 826 /* PRP list used */ 827 
nents = (len + mps - 1) / mps; 828 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 829 SPDK_ERRLOG("Too many page entries\n"); 830 return -ERANGE; 831 } 832 833 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 834 if (spdk_unlikely(vva == NULL)) { 835 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 836 prp2, nents); 837 return -EINVAL; 838 } 839 prp_list = vva; 840 i = 0; 841 while (len != 0) { 842 residue_len = spdk_min(len, mps); 843 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 844 if (spdk_unlikely(vva == NULL)) { 845 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 846 prp_list[i], residue_len); 847 return -EINVAL; 848 } 849 iovs[i + 1].iov_base = vva; 850 iovs[i + 1].iov_len = residue_len; 851 len -= residue_len; 852 i++; 853 } 854 iovcnt = i + 1; 855 } 856 } else { 857 /* 1 PRP used */ 858 iovcnt = 1; 859 } 860 861 assert(iovcnt <= max_iovcnt); 862 return iovcnt; 863 } 864 865 static int 866 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 867 struct iovec *iovs, uint32_t max_iovcnt, 868 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 869 { 870 uint32_t i; 871 void *vva; 872 873 if (spdk_unlikely(max_iovcnt < num_sgls)) { 874 return -ERANGE; 875 } 876 877 for (i = 0; i < num_sgls; i++) { 878 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 879 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 880 return -EINVAL; 881 } 882 vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 883 if (spdk_unlikely(vva == NULL)) { 884 SPDK_ERRLOG("GPA to VVA failed\n"); 885 return -EINVAL; 886 } 887 iovs[i].iov_base = vva; 888 iovs[i].iov_len = sgls[i].unkeyed.length; 889 } 890 891 return num_sgls; 892 } 893 894 static int 895 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 896 uint32_t len, size_t mps, 897 void *(*gpa_to_vva)(void *prv, 
uint64_t addr, uint64_t len, int prot)) 898 { 899 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 900 uint32_t num_sgls, seg_len; 901 void *vva; 902 int ret; 903 uint32_t total_iovcnt = 0; 904 905 /* SGL cases */ 906 sgl = &cmd->dptr.sgl1; 907 908 /* only one SGL segment */ 909 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 910 assert(max_iovcnt > 0); 911 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 912 if (spdk_unlikely(vva == NULL)) { 913 SPDK_ERRLOG("GPA to VVA failed\n"); 914 return -EINVAL; 915 } 916 iovs[0].iov_base = vva; 917 iovs[0].iov_len = sgl->unkeyed.length; 918 assert(sgl->unkeyed.length == len); 919 920 return 1; 921 } 922 923 for (;;) { 924 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 925 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 926 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 927 return -EINVAL; 928 } 929 930 seg_len = sgl->unkeyed.length; 931 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 932 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 933 return -EINVAL; 934 } 935 936 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 937 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 938 if (spdk_unlikely(vva == NULL)) { 939 SPDK_ERRLOG("GPA to VVA failed\n"); 940 return -EINVAL; 941 } 942 943 /* sgl point to the first segment */ 944 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 945 last_sgl = &sgl[num_sgls - 1]; 946 947 /* we are done */ 948 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 949 /* map whole sgl list */ 950 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 951 max_iovcnt - total_iovcnt, gpa_to_vva); 952 if (spdk_unlikely(ret < 0)) { 953 return ret; 954 } 955 total_iovcnt += ret; 956 957 return total_iovcnt; 958 } 959 960 if (num_sgls > 1) { 961 /* map whole sgl exclude last_sgl */ 962 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, 
&iovs[total_iovcnt], 963 max_iovcnt - total_iovcnt, gpa_to_vva); 964 if (spdk_unlikely(ret < 0)) { 965 return ret; 966 } 967 total_iovcnt += ret; 968 } 969 970 /* move to next level's segments */ 971 sgl = last_sgl; 972 } 973 974 return 0; 975 } 976 977 static int 978 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 979 uint32_t len, size_t mps, 980 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 981 { 982 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 983 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 984 } 985 986 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 987 } 988 989 /* 990 * For each queue, update the location of its doorbell to the correct location: 991 * either our own BAR0, or the guest's configured shadow doorbell area. 992 * 993 * The Admin queue (qid: 0) does not ever use shadow doorbells. 994 */ 995 static void 996 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 997 { 998 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 999 ctrlr->bar0_doorbells; 1000 1001 assert(doorbells != NULL); 1002 1003 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 1004 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 1005 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 1006 1007 if (sq != NULL) { 1008 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 1009 1010 ctrlr->sqs[i]->need_rearm = shadow; 1011 } 1012 1013 if (cq != NULL) { 1014 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 1015 } 1016 } 1017 } 1018 1019 static void 1020 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1021 { 1022 assert(vfu_ctx != NULL); 1023 assert(sdbl != NULL); 1024 1025 /* 1026 * An allocation error would result in only one of the two being 1027 * non-NULL. If that is the case, no memory should have been mapped. 
1028 */ 1029 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 1030 return; 1031 } 1032 1033 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 1034 struct iovec *iov; 1035 dma_sg_t *sg; 1036 1037 if (!sdbl->iovs[i].iov_len) { 1038 continue; 1039 } 1040 1041 sg = index_to_sg_t(sdbl->sgs, i); 1042 iov = sdbl->iovs + i; 1043 1044 vfu_sgl_put(vfu_ctx, sg, iov, 1); 1045 } 1046 } 1047 1048 static void 1049 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1050 { 1051 if (sdbl == NULL) { 1052 return; 1053 } 1054 1055 unmap_sdbl(vfu_ctx, sdbl); 1056 1057 /* 1058 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 1059 * not allocated, so don't free() them. 1060 */ 1061 free(sdbl->sgs); 1062 free(sdbl->iovs); 1063 free(sdbl); 1064 } 1065 1066 static struct nvmf_vfio_user_shadow_doorbells * 1067 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 1068 { 1069 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 1070 dma_sg_t *sg2 = NULL; 1071 void *p; 1072 1073 assert(vfu_ctx != NULL); 1074 1075 sdbl = calloc(1, sizeof(*sdbl)); 1076 if (sdbl == NULL) { 1077 goto err; 1078 } 1079 1080 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 1081 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 1082 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 1083 goto err; 1084 } 1085 1086 /* Map shadow doorbell buffer (PRP1). */ 1087 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, 1088 PROT_READ | PROT_WRITE); 1089 1090 if (p == NULL) { 1091 goto err; 1092 } 1093 1094 /* 1095 * Map eventidx buffer (PRP2). 1096 * Should only be written to by the controller. 
1097 */ 1098 1099 sg2 = index_to_sg_t(sdbl->sgs, 1); 1100 1101 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, 1102 PROT_READ | PROT_WRITE); 1103 1104 if (p == NULL) { 1105 goto err; 1106 } 1107 1108 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1109 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1110 1111 return sdbl; 1112 1113 err: 1114 free_sdbl(vfu_ctx, sdbl); 1115 return NULL; 1116 } 1117 1118 /* 1119 * Copy doorbells from one buffer to the other, during switches betweeen BAR0 1120 * doorbells and shadow doorbells. 1121 */ 1122 static void 1123 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1124 const volatile uint32_t *from, volatile uint32_t *to) 1125 { 1126 assert(ctrlr != NULL); 1127 assert(from != NULL); 1128 assert(to != NULL); 1129 1130 SPDK_DEBUGLOG(vfio_user_db, 1131 "%s: migrating shadow doorbells from %p to %p\n", 1132 ctrlr_id(ctrlr), from, to); 1133 1134 /* Can't use memcpy because it doesn't respect volatile semantics. */ 1135 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1136 if (ctrlr->sqs[i] != NULL) { 1137 to[queue_index(i, false)] = from[queue_index(i, false)]; 1138 } 1139 1140 if (ctrlr->cqs[i] != NULL) { 1141 to[queue_index(i, true)] = from[queue_index(i, true)]; 1142 } 1143 } 1144 } 1145 1146 static void 1147 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1148 { 1149 const struct spdk_nvmf_registers *regs; 1150 1151 assert(vu_ctrlr != NULL); 1152 assert(vu_ctrlr->ctrlr != NULL); 1153 1154 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1155 if (regs->csts.bits.cfs == 0) { 1156 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1157 } 1158 1159 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1160 } 1161 1162 static inline bool 1163 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1164 { 1165 assert(vu_ctrlr != NULL); 1166 assert(vu_ctrlr->endpoint != NULL); 1167 1168 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1169 1170 return 
(!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1171 } 1172 1173 static void 1174 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1175 { 1176 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1177 1178 spdk_interrupt_unregister(&endpoint->accept_intr); 1179 spdk_poller_unregister(&endpoint->accept_poller); 1180 1181 if (endpoint->bar0_doorbells) { 1182 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1183 } 1184 1185 if (endpoint->devmem_fd > 0) { 1186 close(endpoint->devmem_fd); 1187 } 1188 1189 if (endpoint->migr_data) { 1190 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1191 } 1192 1193 if (endpoint->migr_fd > 0) { 1194 close(endpoint->migr_fd); 1195 } 1196 1197 if (endpoint->vfu_ctx) { 1198 vfu_destroy_ctx(endpoint->vfu_ctx); 1199 } 1200 1201 pthread_mutex_destroy(&endpoint->lock); 1202 free(endpoint); 1203 } 1204 1205 /* called when process exits */ 1206 static int 1207 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1208 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1209 { 1210 struct nvmf_vfio_user_transport *vu_transport; 1211 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1212 1213 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1214 1215 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1216 transport); 1217 1218 pthread_mutex_destroy(&vu_transport->lock); 1219 pthread_mutex_destroy(&vu_transport->pg_lock); 1220 1221 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1222 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1223 nvmf_vfio_user_destroy_endpoint(endpoint); 1224 } 1225 1226 free(vu_transport); 1227 1228 if (cb_fn) { 1229 cb_fn(cb_arg); 1230 } 1231 1232 return 0; 1233 } 1234 1235 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1236 { 1237 "disable_mappable_bar0", 1238 offsetof(struct nvmf_vfio_user_transport, 
transport_opts.disable_mappable_bar0), 1239 spdk_json_decode_bool, true 1240 }, 1241 { 1242 "disable_adaptive_irq", 1243 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1244 spdk_json_decode_bool, true 1245 }, 1246 { 1247 "disable_shadow_doorbells", 1248 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1249 spdk_json_decode_bool, true 1250 }, 1251 { 1252 "disable_compare", 1253 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1254 spdk_json_decode_bool, true 1255 }, 1256 { 1257 "enable_intr_mode_sq_spreading", 1258 offsetof(struct nvmf_vfio_user_transport, transport_opts.enable_intr_mode_sq_spreading), 1259 spdk_json_decode_bool, true 1260 }, 1261 }; 1262 1263 static struct spdk_nvmf_transport * 1264 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1265 { 1266 struct nvmf_vfio_user_transport *vu_transport; 1267 int err; 1268 1269 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1270 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1271 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1272 return NULL; 1273 } 1274 1275 vu_transport = calloc(1, sizeof(*vu_transport)); 1276 if (vu_transport == NULL) { 1277 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1278 return NULL; 1279 } 1280 1281 err = pthread_mutex_init(&vu_transport->lock, NULL); 1282 if (err != 0) { 1283 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1284 goto err; 1285 } 1286 TAILQ_INIT(&vu_transport->endpoints); 1287 1288 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1289 if (err != 0) { 1290 pthread_mutex_destroy(&vu_transport->lock); 1291 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1292 goto err; 1293 } 1294 TAILQ_INIT(&vu_transport->poll_groups); 1295 1296 if (opts->transport_specific != NULL && 1297 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1298 
SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1299 vu_transport)) { 1300 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1301 goto cleanup; 1302 } 1303 1304 /* 1305 * To support interrupt mode, the transport must be configured with 1306 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1307 * when a client writes new doorbell values to BAR0, via the 1308 * libvfio-user socket fd. 1309 */ 1310 vu_transport->intr_mode_supported = 1311 vu_transport->transport_opts.disable_mappable_bar0; 1312 1313 /* 1314 * If BAR0 is mappable, it doesn't make sense to support shadow 1315 * doorbells, so explicitly turn it off. 1316 */ 1317 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1318 vu_transport->transport_opts.disable_shadow_doorbells = true; 1319 } 1320 1321 if (spdk_interrupt_mode_is_enabled()) { 1322 if (!vu_transport->intr_mode_supported) { 1323 SPDK_ERRLOG("interrupt mode not supported\n"); 1324 goto cleanup; 1325 } 1326 1327 /* 1328 * If we are in interrupt mode, we cannot support adaptive IRQs, 1329 * as there is no guarantee the SQ poller will run subsequently 1330 * to send pending IRQs. 
1331 */ 1332 vu_transport->transport_opts.disable_adaptive_irq = true; 1333 } 1334 1335 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1336 vu_transport->transport_opts.disable_mappable_bar0); 1337 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1338 vu_transport->transport_opts.disable_adaptive_irq); 1339 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1340 vu_transport->transport_opts.disable_shadow_doorbells); 1341 1342 return &vu_transport->transport; 1343 1344 cleanup: 1345 pthread_mutex_destroy(&vu_transport->lock); 1346 pthread_mutex_destroy(&vu_transport->pg_lock); 1347 err: 1348 free(vu_transport); 1349 return NULL; 1350 } 1351 1352 static uint32_t 1353 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1354 { 1355 assert(vu_ctrlr != NULL); 1356 assert(vu_ctrlr->ctrlr != NULL); 1357 1358 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1359 } 1360 1361 static uint32_t 1362 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1363 { 1364 assert(vu_ctrlr != NULL); 1365 assert(vu_ctrlr->ctrlr != NULL); 1366 1367 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1368 } 1369 1370 static uintptr_t 1371 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1372 { 1373 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1374 return 1ul << memory_page_shift; 1375 } 1376 1377 static uintptr_t 1378 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1379 { 1380 return ~(memory_page_size(ctrlr) - 1); 1381 } 1382 1383 static int 1384 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1385 uint32_t q_size, bool is_cq, bool unmap) 1386 { 1387 uint64_t len; 1388 void *ret; 1389 1390 assert(q_size); 1391 assert(q_addr(mapping) == NULL); 1392 1393 if (is_cq) { 1394 len = q_size * sizeof(struct spdk_nvme_cpl); 1395 } else { 1396 len = q_size * sizeof(struct spdk_nvme_cmd); 1397 } 1398 1399 ret = 
map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1400 mapping->sg, &mapping->iov, 1401 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1402 if (ret == NULL) { 1403 return -EFAULT; 1404 } 1405 1406 if (unmap) { 1407 memset(q_addr(mapping), 0, len); 1408 } 1409 1410 return 0; 1411 } 1412 1413 static inline void 1414 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1415 { 1416 if (q_addr(mapping) != NULL) { 1417 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1418 &mapping->iov, 1); 1419 mapping->iov.iov_base = NULL; 1420 } 1421 } 1422 1423 static int 1424 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1425 { 1426 struct nvmf_vfio_user_sq *sq; 1427 const struct spdk_nvmf_registers *regs; 1428 int ret; 1429 1430 assert(ctrlr != NULL); 1431 1432 sq = ctrlr->sqs[0]; 1433 1434 assert(sq != NULL); 1435 assert(q_addr(&sq->mapping) == NULL); 1436 /* XXX ctrlr->asq == 0 is a valid memory address */ 1437 1438 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1439 sq->qid = 0; 1440 sq->size = regs->aqa.bits.asqs + 1; 1441 sq->mapping.prp1 = regs->asq; 1442 *sq_headp(sq) = 0; 1443 sq->cqid = 0; 1444 1445 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1446 if (ret) { 1447 return ret; 1448 } 1449 1450 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1451 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1452 1453 *sq_dbl_tailp(sq) = 0; 1454 1455 return 0; 1456 } 1457 1458 /* 1459 * Updates eventidx to set an SQ into interrupt or polling mode. 1460 * 1461 * Returns false if the current SQ tail does not match the SQ head, as 1462 * this means that the host has submitted more items to the queue while we were 1463 * not looking - or during the event index update. In that case, we must retry, 1464 * or otherwise make sure we are going to wake up again. 
*/
static bool
set_sq_eventidx(struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_ctrlr *ctrlr;
	volatile uint32_t *sq_tail_eidx;
	uint32_t old_tail, new_tail;

	/* Only I/O SQs flagged for re-arming on a shadow-doorbell controller. */
	assert(sq != NULL);
	assert(sq->ctrlr != NULL);
	assert(sq->ctrlr->sdbl != NULL);
	assert(sq->need_rearm);
	assert(sq->qid != 0);

	ctrlr = sq->ctrlr;

	SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n",
		      ctrlr_id(ctrlr), sq->qid);

	/* The eventidx slot that pairs with this SQ's tail doorbell. */
	sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false);

	assert(ctrlr->endpoint != NULL);

	if (!ctrlr->endpoint->interrupt_mode) {
		/* No synchronisation necessary. */
		*sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL;
		return true;
	}

	/* Publish the tail value we have seen so far as the event index. */
	old_tail = *sq_dbl_tailp(sq);
	*sq_tail_eidx = old_tail;

	/*
	 * Ensure that the event index is updated before re-reading the tail
	 * doorbell. If it's not, then the host might race us and update the
	 * tail after the second read but before the event index is written, so
	 * it won't write to BAR0 and we'll miss the update.
	 *
	 * The driver should provide similar ordering with an mb().
	 */
	spdk_mb();

	/*
	 * Check if the host has updated the tail doorbell after we've read it
	 * for the first time, but before the event index was written. If that's
	 * the case, then we've lost the race and we need to update the event
	 * index again (after polling the queue, since the host won't write to
	 * BAR0).
	 */
	new_tail = *sq_dbl_tailp(sq);

	/*
	 * We might poll the queue straight after this function returns if the
	 * tail has been updated, so we need to ensure that any changes to the
	 * queue will be visible to us if the doorbell has been updated.
	 *
	 * The driver should provide similar ordering with a wmb() to ensure
	 * that the queue is written before it updates the tail doorbell.
	 */
	spdk_rmb();

	SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, "
		      "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail,
		      new_tail, *sq_headp(sq));

	/* Queue is drained up to the re-read tail: the re-arm took effect. */
	if (new_tail == *sq_headp(sq)) {
		sq->need_rearm = false;
		return true;
	}

	/*
	 * We've lost the race: the tail was updated since we last polled,
	 * including if it happened within this routine.
	 *
	 * The caller should retry after polling (think of this as a cmpxchg
	 * loop); if we go to sleep while the SQ is not empty, then we won't
	 * process the remaining events.
	 */
	return false;
}

/* Defined further down in this file; needed by vfio_user_sq_rearm(). */
static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq);

/*
 * Arrange for an SQ to interrupt us if written. Returns non-zero if we
 * processed some SQ entries.
 *
 * Makes up to NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS attempts, polling the
 * SQ after each lost race. If every attempt loses, ctrlr_kick() is called
 * instead. A lost race followed by a poll that finds nothing fails the
 * controller. The outcome is recorded in vu_group->stats (won / lost /
 * lost_count).
 */
static int
vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr,
		   struct nvmf_vfio_user_sq *sq,
		   struct nvmf_vfio_user_poll_group *vu_group)
{
	int count = 0;
	size_t i;

	assert(sq->need_rearm);

	for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) {
		int ret;

		if (set_sq_eventidx(sq)) {
			/* We won the race and set eventidx; done. */
			vu_group->stats.won++;
			return count;
		}

		ret = nvmf_vfio_user_sq_poll(sq);

		/* A negative (error) return from the poll is counted as 1. */
		count += (ret < 0) ? 1 : ret;

		/*
		 * set_sq_eventidx() hit the race, so we expected
		 * to process at least one command from this queue.
		 * If there were no new commands waiting for us, then
		 * we must have hit an unexpected race condition.
		 */
		if (ret == 0) {
			SPDK_ERRLOG("%s: unexpected race condition detected "
				    "while updating the shadow doorbell buffer\n",
				    ctrlr_id(ctrlr));

			fail_ctrlr(ctrlr);
			return count;
		}
	}

	SPDK_DEBUGLOG(vfio_user_db,
		      "%s: set_sq_eventidx() lost the race %zu times\n",
		      ctrlr_id(ctrlr), i);

	vu_group->stats.lost++;
	vu_group->stats.lost_count += count;

	/*
	 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as
	 * we raced with the producer too many times; force ourselves to wake up
	 * instead. We'll process all queues at that point.
	 */
	ctrlr_kick(ctrlr);

	return count;
}

/*
 * We're in interrupt mode, and potentially about to go to sleep. We need to
 * make sure any further I/O submissions are guaranteed to wake us up: for
 * shadow doorbells that means we may need to go through set_sq_eventidx() for
 * every SQ that needs re-arming.
 *
 * Returns non-zero if we processed something.
1615 */ 1616 static int 1617 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1618 { 1619 struct nvmf_vfio_user_sq *sq; 1620 int count = 0; 1621 1622 vu_group->stats.rearms++; 1623 1624 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1625 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1626 continue; 1627 } 1628 1629 if (sq->need_rearm) { 1630 count += vfio_user_sq_rearm(sq->ctrlr, sq, vu_group); 1631 } 1632 } 1633 1634 return count; 1635 } 1636 1637 static int 1638 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1639 { 1640 struct nvmf_vfio_user_cq *cq; 1641 const struct spdk_nvmf_registers *regs; 1642 int ret; 1643 1644 assert(ctrlr != NULL); 1645 1646 cq = ctrlr->cqs[0]; 1647 1648 assert(cq != NULL); 1649 1650 assert(q_addr(&cq->mapping) == NULL); 1651 1652 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1653 assert(regs != NULL); 1654 cq->qid = 0; 1655 cq->size = regs->aqa.bits.acqs + 1; 1656 cq->mapping.prp1 = regs->acq; 1657 *cq_tailp(cq) = 0; 1658 cq->ien = true; 1659 cq->phase = true; 1660 1661 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1662 if (ret) { 1663 return ret; 1664 } 1665 1666 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
*/ 1667 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1668 1669 *cq_dbl_headp(cq) = 0; 1670 1671 return 0; 1672 } 1673 1674 static void * 1675 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 1676 { 1677 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1678 struct spdk_nvmf_qpair *qpair; 1679 struct nvmf_vfio_user_req *vu_req; 1680 struct nvmf_vfio_user_sq *sq; 1681 void *ret; 1682 1683 assert(req != NULL); 1684 qpair = req->qpair; 1685 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1686 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1687 1688 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1689 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1690 index_to_sg_t(vu_req->sg, vu_req->iovcnt), 1691 &vu_req->iov[vu_req->iovcnt], prot); 1692 if (spdk_likely(ret != NULL)) { 1693 vu_req->iovcnt++; 1694 } 1695 return ret; 1696 } 1697 1698 static int 1699 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1700 struct iovec *iov, uint32_t length) 1701 { 1702 /* Map PRP list to from Guest physical memory to 1703 * virtual memory address. 1704 */ 1705 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1706 length, 4096, _map_one); 1707 } 1708 1709 static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1710 struct nvmf_vfio_user_sq *sq); 1711 1712 /* 1713 * Posts a CQE in the completion queue. 
1714 * 1715 * @ctrlr: the vfio-user controller 1716 * @cq: the completion queue 1717 * @cdw0: cdw0 as reported by NVMf 1718 * @sqid: submission queue ID 1719 * @cid: command identifier in NVMe command 1720 * @sc: the NVMe CQE status code 1721 * @sct: the NVMe CQE status code type 1722 */ 1723 static int 1724 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1725 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1726 { 1727 struct spdk_nvme_status cpl_status = { 0 }; 1728 struct spdk_nvme_cpl *cpl; 1729 int err; 1730 1731 assert(ctrlr != NULL); 1732 1733 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1734 return 0; 1735 } 1736 1737 if (cq->qid == 0) { 1738 assert(spdk_get_thread() == cq->group->group->thread); 1739 } 1740 1741 if (cq_is_full(cq)) { 1742 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1743 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1744 *cq_dbl_headp(cq)); 1745 return -1; 1746 } 1747 1748 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1749 1750 assert(ctrlr->sqs[sqid] != NULL); 1751 SPDK_DEBUGLOG(nvmf_vfio, 1752 "%s: request complete sqid:%d cid=%d status=%#x " 1753 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1754 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1755 1756 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1757 cpl->sqid = sqid; 1758 cpl->cid = cid; 1759 cpl->cdw0 = cdw0; 1760 1761 /* 1762 * This is a bitfield: instead of setting the individual bits we need 1763 * directly in cpl->status, which would cause a read-modify-write cycle, 1764 * we'll avoid reading from the CPL altogether by filling in a local 1765 * cpl_status variable, then writing the whole thing. 1766 */ 1767 cpl_status.sct = sct; 1768 cpl_status.sc = sc; 1769 cpl_status.p = cq->phase; 1770 cpl->status = cpl_status; 1771 1772 /* Ensure the Completion Queue Entry is visible. 
*/ 1773 spdk_wmb(); 1774 cq_tail_advance(cq); 1775 1776 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1777 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1778 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1779 if (err != 0) { 1780 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1781 ctrlr_id(ctrlr)); 1782 return err; 1783 } 1784 } 1785 1786 return 0; 1787 } 1788 1789 static void 1790 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1791 { 1792 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1793 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1794 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1795 free(vu_req); 1796 } 1797 } 1798 1799 static void 1800 delete_cq_done(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq) 1801 { 1802 assert(cq->cq_ref == 0); 1803 unmap_q(ctrlr, &cq->mapping); 1804 cq->size = 0; 1805 cq->cq_state = VFIO_USER_CQ_DELETED; 1806 cq->group = NULL; 1807 } 1808 1809 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1810 * and the controller is being shut down/reset or vfio-user client disconnects, 1811 * then the CQ is also deleted. 1812 */ 1813 static void 1814 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1815 { 1816 struct nvmf_vfio_user_cq *cq; 1817 uint16_t cqid; 1818 1819 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1820 sq->qid, sq); 1821 1822 /* Free SQ resources */ 1823 unmap_q(vu_ctrlr, &sq->mapping); 1824 1825 free_sq_reqs(sq); 1826 1827 sq->size = 0; 1828 1829 sq->sq_state = VFIO_USER_SQ_DELETED; 1830 1831 /* Controller RESET and SHUTDOWN are special cases, 1832 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1833 * will disconnect IO queue pairs. 
1834 */ 1835 if (vu_ctrlr->reset_shn || vu_ctrlr->disconnect) { 1836 cqid = sq->cqid; 1837 cq = vu_ctrlr->cqs[cqid]; 1838 1839 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1840 cq->qid, cq); 1841 1842 assert(cq->cq_ref > 0); 1843 if (--cq->cq_ref == 0) { 1844 delete_cq_done(vu_ctrlr, cq); 1845 } 1846 } 1847 } 1848 1849 static void 1850 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1851 { 1852 struct nvmf_vfio_user_sq *sq; 1853 struct nvmf_vfio_user_cq *cq; 1854 1855 if (ctrlr == NULL) { 1856 return; 1857 } 1858 1859 sq = ctrlr->sqs[qid]; 1860 if (sq) { 1861 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free sqid:%u\n", ctrlr_id(ctrlr), qid); 1862 unmap_q(ctrlr, &sq->mapping); 1863 1864 free_sq_reqs(sq); 1865 1866 free(sq->mapping.sg); 1867 free(sq); 1868 ctrlr->sqs[qid] = NULL; 1869 } 1870 1871 cq = ctrlr->cqs[qid]; 1872 if (cq) { 1873 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1874 unmap_q(ctrlr, &cq->mapping); 1875 free(cq->mapping.sg); 1876 free(cq); 1877 ctrlr->cqs[qid] = NULL; 1878 } 1879 } 1880 1881 static int 1882 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1883 const uint16_t id) 1884 { 1885 struct nvmf_vfio_user_sq *sq; 1886 1887 assert(ctrlr != NULL); 1888 assert(transport != NULL); 1889 assert(ctrlr->sqs[id] == NULL); 1890 1891 sq = calloc(1, sizeof(*sq)); 1892 if (sq == NULL) { 1893 return -ENOMEM; 1894 } 1895 sq->mapping.sg = calloc(1, dma_sg_size()); 1896 if (sq->mapping.sg == NULL) { 1897 free(sq); 1898 return -ENOMEM; 1899 } 1900 1901 sq->qid = id; 1902 sq->qpair.qid = id; 1903 sq->qpair.transport = transport; 1904 sq->ctrlr = ctrlr; 1905 ctrlr->sqs[id] = sq; 1906 1907 TAILQ_INIT(&sq->free_reqs); 1908 1909 return 0; 1910 } 1911 1912 static int 1913 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1914 { 1915 struct nvmf_vfio_user_cq *cq; 1916 1917 assert(vu_ctrlr != NULL); 1918 assert(vu_ctrlr->cqs[id] == NULL); 1919 1920 cq = calloc(1, 
sizeof(*cq)); 1921 if (cq == NULL) { 1922 return -ENOMEM; 1923 } 1924 cq->mapping.sg = calloc(1, dma_sg_size()); 1925 if (cq->mapping.sg == NULL) { 1926 free(cq); 1927 return -ENOMEM; 1928 } 1929 1930 cq->qid = id; 1931 vu_ctrlr->cqs[id] = cq; 1932 1933 return 0; 1934 } 1935 1936 static int 1937 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1938 { 1939 struct nvmf_vfio_user_req *vu_req, *tmp; 1940 size_t req_size; 1941 uint32_t i; 1942 1943 req_size = sizeof(struct nvmf_vfio_user_req) + 1944 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1945 1946 for (i = 0; i < sq->size; i++) { 1947 struct spdk_nvmf_request *req; 1948 1949 vu_req = calloc(1, req_size); 1950 if (vu_req == NULL) { 1951 goto err; 1952 } 1953 1954 req = &vu_req->req; 1955 req->qpair = &sq->qpair; 1956 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1957 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1958 req->stripped_data = NULL; 1959 1960 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1961 } 1962 1963 return 0; 1964 1965 err: 1966 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1967 free(vu_req); 1968 } 1969 return -ENOMEM; 1970 } 1971 1972 static volatile uint32_t * 1973 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 1974 { 1975 return ctrlr->sdbl != NULL ? 
1976 ctrlr->sdbl->shadow_doorbells : 1977 ctrlr->bar0_doorbells; 1978 } 1979 1980 static uint16_t 1981 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 1982 struct spdk_nvme_cmd *cmd, uint16_t *sct) 1983 { 1984 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 1985 struct nvmf_vfio_user_sq *sq; 1986 uint32_t qsize; 1987 uint16_t cqid; 1988 uint16_t qid; 1989 int err; 1990 1991 qid = cmd->cdw10_bits.create_io_q.qid; 1992 cqid = cmd->cdw11_bits.create_io_sq.cqid; 1993 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 1994 1995 if (ctrlr->sqs[qid] == NULL) { 1996 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 1997 if (err != 0) { 1998 *sct = SPDK_NVME_SCT_GENERIC; 1999 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2000 } 2001 } 2002 2003 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2004 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 2005 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2006 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2007 } 2008 2009 /* CQ must be created before SQ. 
*/ 2010 if (!io_q_exists(ctrlr, cqid, true)) { 2011 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 2012 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2013 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 2014 } 2015 2016 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 2017 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 2018 *sct = SPDK_NVME_SCT_GENERIC; 2019 return SPDK_NVME_SC_INVALID_FIELD; 2020 } 2021 2022 sq = ctrlr->sqs[qid]; 2023 sq->size = qsize; 2024 2025 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 2026 qid, cqid); 2027 2028 sq->mapping.prp1 = cmd->dptr.prp.prp1; 2029 2030 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 2031 if (err) { 2032 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2033 *sct = SPDK_NVME_SCT_GENERIC; 2034 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2035 } 2036 2037 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 2038 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2039 q_addr(&sq->mapping)); 2040 2041 err = alloc_sq_reqs(ctrlr, sq); 2042 if (err < 0) { 2043 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 2044 *sct = SPDK_NVME_SCT_GENERIC; 2045 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2046 } 2047 2048 sq->cqid = cqid; 2049 ctrlr->cqs[sq->cqid]->cq_ref++; 2050 sq->sq_state = VFIO_USER_SQ_CREATED; 2051 *sq_headp(sq) = 0; 2052 2053 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 2054 2055 /* 2056 * We should always reset the doorbells. 2057 * 2058 * The Specification prohibits the controller from writing to the shadow 2059 * doorbell buffer, however older versions of the Linux NVMe driver 2060 * don't reset the shadow doorbell buffer after a Queue-Level or 2061 * Controller-Level reset, which means that we're left with garbage 2062 * doorbell values. 
2063 */ 2064 *sq_dbl_tailp(sq) = 0; 2065 2066 if (ctrlr->sdbl != NULL) { 2067 sq->need_rearm = true; 2068 2069 if (!set_sq_eventidx(sq)) { 2070 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 2071 "sqid:%hu was initialized\n", 2072 ctrlr_id(ctrlr), qid); 2073 fail_ctrlr(ctrlr); 2074 *sct = SPDK_NVME_SCT_GENERIC; 2075 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2076 } 2077 } 2078 2079 /* 2080 * Create our new I/O qpair. This asynchronously invokes, on a suitable 2081 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 2082 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 2083 * connect command. This command is then eventually completed via 2084 * handle_queue_connect_rsp(). 2085 */ 2086 sq->create_io_sq_cmd = *cmd; 2087 sq->post_create_io_sq_completion = true; 2088 2089 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 2090 &sq->qpair); 2091 2092 *sct = SPDK_NVME_SCT_GENERIC; 2093 return SPDK_NVME_SC_SUCCESS; 2094 } 2095 2096 static uint16_t 2097 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 2098 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2099 { 2100 struct nvmf_vfio_user_cq *cq; 2101 uint32_t qsize; 2102 uint16_t qid; 2103 int err; 2104 2105 qid = cmd->cdw10_bits.create_io_q.qid; 2106 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2107 2108 if (ctrlr->cqs[qid] == NULL) { 2109 err = init_cq(ctrlr, qid); 2110 if (err != 0) { 2111 *sct = SPDK_NVME_SCT_GENERIC; 2112 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2113 } 2114 } 2115 2116 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2117 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2118 *sct = SPDK_NVME_SCT_GENERIC; 2119 return SPDK_NVME_SC_INVALID_FIELD; 2120 } 2121 2122 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 2123 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2124 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2125 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2126 } 2127 2128 cq = ctrlr->cqs[qid]; 2129 cq->size = qsize; 2130 
2131 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2132 2133 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2134 2135 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2136 if (err) { 2137 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2138 *sct = SPDK_NVME_SCT_GENERIC; 2139 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2140 } 2141 2142 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2143 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2144 q_addr(&cq->mapping)); 2145 2146 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2147 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2148 cq->phase = true; 2149 cq->cq_state = VFIO_USER_CQ_CREATED; 2150 2151 *cq_tailp(cq) = 0; 2152 2153 /* 2154 * We should always reset the doorbells. 2155 * 2156 * The Specification prohibits the controller from writing to the shadow 2157 * doorbell buffer, however older versions of the Linux NVMe driver 2158 * don't reset the shadow doorbell buffer after a Queue-Level or 2159 * Controller-Level reset, which means that we're left with garbage 2160 * doorbell values. 2161 */ 2162 *cq_dbl_headp(cq) = 0; 2163 2164 *sct = SPDK_NVME_SCT_GENERIC; 2165 return SPDK_NVME_SC_SUCCESS; 2166 } 2167 2168 /* 2169 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2170 * on error. 
2171 */ 2172 static int 2173 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2174 struct spdk_nvme_cmd *cmd, const bool is_cq) 2175 { 2176 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2177 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2178 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2179 uint32_t qsize; 2180 uint16_t qid; 2181 2182 assert(ctrlr != NULL); 2183 assert(cmd != NULL); 2184 2185 qid = cmd->cdw10_bits.create_io_q.qid; 2186 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2187 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2188 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2189 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2190 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2191 goto out; 2192 } 2193 2194 if (io_q_exists(ctrlr, qid, is_cq)) { 2195 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2196 is_cq ? 'c' : 's', qid); 2197 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2198 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2199 goto out; 2200 } 2201 2202 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2203 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2204 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2205 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2206 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2207 goto out; 2208 } 2209 2210 if (is_cq) { 2211 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2212 } else { 2213 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2214 2215 if (sct == SPDK_NVME_SCT_GENERIC && 2216 sc == SPDK_NVME_SC_SUCCESS) { 2217 /* Completion posted asynchronously. */ 2218 return 0; 2219 } 2220 } 2221 2222 out: 2223 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2224 } 2225 2226 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2227 * queue pair, so save the command in a context. 
2228 */ 2229 struct vfio_user_delete_sq_ctx { 2230 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2231 struct spdk_nvme_cmd delete_io_sq_cmd; 2232 }; 2233 2234 static void 2235 vfio_user_qpair_delete_cb(void *cb_arg) 2236 { 2237 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2238 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2239 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2240 2241 assert(admin_cq != NULL); 2242 assert(admin_cq->group != NULL); 2243 assert(admin_cq->group->group->thread != NULL); 2244 if (admin_cq->group->group->thread != spdk_get_thread()) { 2245 spdk_thread_send_msg(admin_cq->group->group->thread, 2246 vfio_user_qpair_delete_cb, 2247 cb_arg); 2248 } else { 2249 post_completion(vu_ctrlr, admin_cq, 0, 0, 2250 ctx->delete_io_sq_cmd.cid, 2251 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2252 free(ctx); 2253 } 2254 } 2255 2256 /* 2257 * Deletes a completion or submission I/O queue. 2258 */ 2259 static int 2260 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2261 struct spdk_nvme_cmd *cmd, const bool is_cq) 2262 { 2263 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2264 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2265 struct nvmf_vfio_user_sq *sq; 2266 struct nvmf_vfio_user_cq *cq; 2267 struct vfio_user_delete_sq_ctx *ctx; 2268 2269 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2270 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2271 cmd->cdw10_bits.delete_io_q.qid); 2272 2273 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2274 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2275 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2276 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2277 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2278 goto out; 2279 } 2280 2281 if (is_cq) { 2282 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2283 if (cq->cq_ref) { 2284 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2285 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2286 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2287 goto out; 2288 } 2289 delete_cq_done(ctrlr, cq); 2290 } else { 2291 /* 2292 * Deletion of the CQ is only deferred to delete_sq_done() on 2293 * VM reboot or CC.EN change, so we have to delete it in all 2294 * other cases. 2295 */ 2296 ctx = calloc(1, sizeof(*ctx)); 2297 if (!ctx) { 2298 sct = SPDK_NVME_SCT_GENERIC; 2299 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2300 goto out; 2301 } 2302 ctx->vu_ctrlr = ctrlr; 2303 ctx->delete_io_sq_cmd = *cmd; 2304 2305 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2306 sq->sq_state = VFIO_USER_SQ_DELETED; 2307 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2308 ctrlr->cqs[sq->cqid]->cq_ref--; 2309 2310 spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx); 2311 return 0; 2312 } 2313 2314 out: 2315 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2316 } 2317 2318 /* 2319 * Configures Shadow Doorbells. 2320 */ 2321 static int 2322 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2323 { 2324 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2325 uint32_t dstrd; 2326 uintptr_t page_size, page_mask; 2327 uint64_t prp1, prp2; 2328 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2329 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2330 2331 assert(ctrlr != NULL); 2332 assert(ctrlr->endpoint != NULL); 2333 assert(cmd != NULL); 2334 2335 dstrd = doorbell_stride(ctrlr); 2336 page_size = memory_page_size(ctrlr); 2337 page_mask = memory_page_mask(ctrlr); 2338 2339 /* FIXME: we don't check doorbell stride when setting queue doorbells. 
*/ 2340 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2341 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2342 ctrlr_id(ctrlr)); 2343 2344 goto out; 2345 } 2346 2347 /* Verify guest physical addresses passed as PRPs. */ 2348 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2349 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2350 ctrlr_id(ctrlr)); 2351 2352 goto out; 2353 } 2354 2355 prp1 = cmd->dptr.prp.prp1; 2356 prp2 = cmd->dptr.prp.prp2; 2357 2358 SPDK_DEBUGLOG(nvmf_vfio, 2359 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2360 ctrlr_id(ctrlr), prp1, prp2); 2361 2362 if (prp1 == prp2 2363 || prp1 != (prp1 & page_mask) 2364 || prp2 != (prp2 & page_mask)) { 2365 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2366 ctrlr_id(ctrlr)); 2367 2368 goto out; 2369 } 2370 2371 /* Map guest physical addresses to our virtual address space. */ 2372 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2373 if (sdbl == NULL) { 2374 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2375 ctrlr_id(ctrlr)); 2376 2377 goto out; 2378 } 2379 2380 ctrlr->shadow_doorbell_buffer = prp1; 2381 ctrlr->eventidx_buffer = prp2; 2382 2383 SPDK_DEBUGLOG(nvmf_vfio, 2384 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2385 ctrlr_id(ctrlr), 2386 sdbl->iovs[0].iov_base, 2387 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2388 sdbl->iovs[1].iov_base, 2389 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2390 2391 2392 /* 2393 * Set all possible CQ head doorbells to polling mode now, such that we 2394 * don't have to worry about it later if the host creates more queues. 2395 * 2396 * We only ever want interrupts for writes to the SQ tail doorbells 2397 * (which are initialised in set_ctrlr_intr_mode() below). 
2398 */ 2399 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2400 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2401 } 2402 2403 /* Update controller. */ 2404 SWAP(ctrlr->sdbl, sdbl); 2405 2406 /* 2407 * Copy doorbells from either the previous shadow doorbell buffer or the 2408 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2409 * 2410 * This needs to account for older versions of the Linux NVMe driver, 2411 * which don't clear out the buffer after a controller reset. 2412 */ 2413 copy_doorbells(ctrlr, sdbl != NULL ? 2414 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2415 ctrlr->sdbl->shadow_doorbells); 2416 2417 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2418 2419 ctrlr_kick(ctrlr); 2420 2421 sc = SPDK_NVME_SC_SUCCESS; 2422 2423 out: 2424 /* 2425 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2426 * more than once (pointless, but not prohibited by the spec), or 2427 * in case of an error. 2428 * 2429 * If this is the first time Doorbell Buffer Config was processed, 2430 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2431 * free_sdbl() becomes a noop. 2432 */ 2433 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2434 2435 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2436 } 2437 2438 /* Returns 0 on success and -errno on error. */ 2439 static int 2440 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2441 { 2442 assert(ctrlr != NULL); 2443 assert(cmd != NULL); 2444 2445 if (cmd->fuse != 0) { 2446 /* Fused admin commands are not supported. 
*/ 2447 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2448 SPDK_NVME_SC_INVALID_FIELD, 2449 SPDK_NVME_SCT_GENERIC); 2450 } 2451 2452 switch (cmd->opc) { 2453 case SPDK_NVME_OPC_CREATE_IO_CQ: 2454 case SPDK_NVME_OPC_CREATE_IO_SQ: 2455 return handle_create_io_q(ctrlr, cmd, 2456 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2457 case SPDK_NVME_OPC_DELETE_IO_SQ: 2458 case SPDK_NVME_OPC_DELETE_IO_CQ: 2459 return handle_del_io_q(ctrlr, cmd, 2460 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2461 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2462 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2463 return handle_doorbell_buffer_config(ctrlr, cmd); 2464 } 2465 /* FALLTHROUGH */ 2466 default: 2467 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2468 } 2469 } 2470 2471 static int 2472 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2473 { 2474 struct nvmf_vfio_user_sq *sq = cb_arg; 2475 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2476 uint16_t sqid, cqid; 2477 2478 assert(sq != NULL); 2479 assert(vu_req != NULL); 2480 assert(vu_ctrlr != NULL); 2481 2482 if (spdk_likely(vu_req->iovcnt)) { 2483 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2484 index_to_sg_t(vu_req->sg, 0), 2485 vu_req->iov, vu_req->iovcnt); 2486 } 2487 sqid = sq->qid; 2488 cqid = sq->cqid; 2489 2490 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2491 vu_req->req.rsp->nvme_cpl.cdw0, 2492 sqid, 2493 vu_req->req.cmd->nvme_cmd.cid, 2494 vu_req->req.rsp->nvme_cpl.status.sc, 2495 vu_req->req.rsp->nvme_cpl.status.sct); 2496 } 2497 2498 static int 2499 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2500 struct spdk_nvme_cmd *cmd) 2501 { 2502 assert(sq != NULL); 2503 if (spdk_unlikely(nvmf_qpair_is_admin_queue(&sq->qpair))) { 2504 return consume_admin_cmd(ctrlr, cmd); 2505 } 2506 2507 return handle_cmd_req(ctrlr, cmd, sq); 2508 } 2509 2510 /* Returns the number of commands processed, or a negative value on error. 
*/ 2511 static int 2512 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2513 struct nvmf_vfio_user_sq *sq) 2514 { 2515 struct spdk_nvme_cmd *queue; 2516 int count = 0; 2517 2518 assert(ctrlr != NULL); 2519 assert(sq != NULL); 2520 2521 if (ctrlr->sdbl != NULL && sq->qid != 0) { 2522 /* 2523 * Submission queue index has moved past the event index, so it 2524 * needs to be re-armed before we go to sleep. 2525 */ 2526 sq->need_rearm = true; 2527 } 2528 2529 queue = q_addr(&sq->mapping); 2530 while (*sq_headp(sq) != new_tail) { 2531 int err; 2532 struct spdk_nvme_cmd *cmd = &queue[*sq_headp(sq)]; 2533 2534 count++; 2535 2536 /* 2537 * SQHD must contain the new head pointer, so we must increase 2538 * it before we generate a completion. 2539 */ 2540 sq_head_advance(sq); 2541 2542 err = consume_cmd(ctrlr, sq, cmd); 2543 if (spdk_unlikely(err != 0)) { 2544 return err; 2545 } 2546 } 2547 2548 return count; 2549 } 2550 2551 /* Checks whether endpoint is connected from the same process */ 2552 static bool 2553 is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint) 2554 { 2555 struct ucred ucred; 2556 socklen_t ucredlen = sizeof(ucred); 2557 2558 if (endpoint == NULL) { 2559 return false; 2560 } 2561 2562 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred, 2563 &ucredlen) < 0) { 2564 SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno)); 2565 return false; 2566 } 2567 2568 return ucred.pid == getpid(); 2569 } 2570 2571 static void 2572 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2573 { 2574 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2575 struct nvmf_vfio_user_ctrlr *ctrlr; 2576 struct nvmf_vfio_user_sq *sq; 2577 struct nvmf_vfio_user_cq *cq; 2578 void *map_start, *map_end; 2579 int ret; 2580 2581 /* 2582 * We're not interested in any DMA regions that aren't mappable (we don't 2583 * support clients that don't share their memory). 
2584 */ 2585 if (!info->vaddr) { 2586 return; 2587 } 2588 2589 map_start = info->mapping.iov_base; 2590 map_end = info->mapping.iov_base + info->mapping.iov_len; 2591 2592 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2593 (info->mapping.iov_len & MASK_2MB)) { 2594 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2595 info->vaddr, map_start, map_end); 2596 return; 2597 } 2598 2599 assert(endpoint != NULL); 2600 if (endpoint->ctrlr == NULL) { 2601 return; 2602 } 2603 ctrlr = endpoint->ctrlr; 2604 2605 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2606 map_start, map_end); 2607 2608 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2609 * check the protection bits before registering. When vfio client and server are run in same process 2610 * there is no need to register the same memory again. 2611 */ 2612 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2613 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2614 if (ret) { 2615 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2616 map_start, map_end, ret); 2617 } 2618 } 2619 2620 pthread_mutex_lock(&endpoint->lock); 2621 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2622 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2623 continue; 2624 } 2625 2626 cq = ctrlr->cqs[sq->cqid]; 2627 2628 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2629 if (cq->size && q_addr(&cq->mapping) == NULL) { 2630 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2631 if (ret) { 2632 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2633 cq->qid, cq->mapping.prp1, 2634 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2635 continue; 2636 } 2637 } 2638 2639 if (sq->size) { 2640 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2641 if (ret) { 2642 SPDK_DEBUGLOG(nvmf_vfio, "Memory 
isn't ready to remap sqid:%d %#lx-%#lx\n", 2643 sq->qid, sq->mapping.prp1, 2644 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2645 continue; 2646 } 2647 } 2648 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2649 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2650 } 2651 pthread_mutex_unlock(&endpoint->lock); 2652 } 2653 2654 static void 2655 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2656 { 2657 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2658 struct nvmf_vfio_user_sq *sq; 2659 struct nvmf_vfio_user_cq *cq; 2660 void *map_start, *map_end; 2661 int ret = 0; 2662 2663 if (!info->vaddr) { 2664 return; 2665 } 2666 2667 map_start = info->mapping.iov_base; 2668 map_end = info->mapping.iov_base + info->mapping.iov_len; 2669 2670 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2671 (info->mapping.iov_len & MASK_2MB)) { 2672 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2673 info->vaddr, map_start, map_end); 2674 return; 2675 } 2676 2677 assert(endpoint != NULL); 2678 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2679 map_start, map_end); 2680 2681 if (endpoint->ctrlr != NULL) { 2682 struct nvmf_vfio_user_ctrlr *ctrlr; 2683 ctrlr = endpoint->ctrlr; 2684 2685 pthread_mutex_lock(&endpoint->lock); 2686 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2687 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2688 unmap_q(ctrlr, &sq->mapping); 2689 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2690 } 2691 2692 cq = ctrlr->cqs[sq->cqid]; 2693 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2694 unmap_q(ctrlr, &cq->mapping); 2695 } 2696 } 2697 2698 if (ctrlr->sdbl != NULL) { 2699 size_t i; 2700 2701 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2702 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2703 2704 if (iov_base >= map_start && iov_base < map_end) { 2705 
copy_doorbells(ctrlr, 2706 ctrlr->sdbl->shadow_doorbells, 2707 ctrlr->bar0_doorbells); 2708 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2709 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2710 ctrlr->sdbl = NULL; 2711 break; 2712 } 2713 } 2714 } 2715 2716 pthread_mutex_unlock(&endpoint->lock); 2717 } 2718 2719 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2720 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2721 if (ret) { 2722 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2723 map_start, map_end, ret); 2724 } 2725 } 2726 } 2727 2728 /* Used to initiate a controller-level reset or a controller shutdown. */ 2729 static void 2730 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2731 { 2732 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2733 ctrlr_id(vu_ctrlr)); 2734 2735 /* Unmap Admin queue. */ 2736 2737 assert(vu_ctrlr->sqs[0] != NULL); 2738 assert(vu_ctrlr->cqs[0] != NULL); 2739 2740 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2741 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2742 2743 vu_ctrlr->sqs[0]->size = 0; 2744 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2745 2746 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2747 2748 vu_ctrlr->cqs[0]->size = 0; 2749 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2750 2751 /* 2752 * For PCIe controller reset or shutdown, we will drop all AER 2753 * responses. 2754 */ 2755 nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2756 2757 /* Free the shadow doorbell buffer. */ 2758 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 2759 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2760 vu_ctrlr->sdbl = NULL; 2761 } 2762 2763 /* Used to re-enable the controller after a controller-level reset. 
*/ 2764 static int 2765 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2766 { 2767 int err; 2768 2769 assert(vu_ctrlr != NULL); 2770 2771 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2772 ctrlr_id(vu_ctrlr)); 2773 2774 err = acq_setup(vu_ctrlr); 2775 if (err != 0) { 2776 return err; 2777 } 2778 2779 err = asq_setup(vu_ctrlr); 2780 if (err != 0) { 2781 return err; 2782 } 2783 2784 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2785 2786 return 0; 2787 } 2788 2789 static int 2790 nvmf_vfio_user_prop_req_rsp_set(struct nvmf_vfio_user_req *req, 2791 struct nvmf_vfio_user_sq *sq) 2792 { 2793 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2794 union spdk_nvme_cc_register cc, diff; 2795 2796 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2797 assert(sq->ctrlr != NULL); 2798 vu_ctrlr = sq->ctrlr; 2799 2800 if (req->req.cmd->prop_set_cmd.ofst != offsetof(struct spdk_nvme_registers, cc)) { 2801 return 0; 2802 } 2803 2804 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2805 diff.raw = cc.raw ^ req->cc.raw; 2806 2807 if (diff.bits.en) { 2808 if (cc.bits.en) { 2809 int ret = enable_ctrlr(vu_ctrlr); 2810 if (ret) { 2811 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2812 return ret; 2813 } 2814 vu_ctrlr->reset_shn = false; 2815 } else { 2816 vu_ctrlr->reset_shn = true; 2817 } 2818 } 2819 2820 if (diff.bits.shn) { 2821 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2822 vu_ctrlr->reset_shn = true; 2823 } 2824 } 2825 2826 if (vu_ctrlr->reset_shn) { 2827 disable_ctrlr(vu_ctrlr); 2828 } 2829 return 0; 2830 } 2831 2832 static int 2833 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2834 { 2835 struct nvmf_vfio_user_sq *sq = cb_arg; 2836 2837 assert(sq != NULL); 2838 assert(req != NULL); 2839 2840 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2841 assert(sq->ctrlr != NULL); 2842 assert(req != NULL); 2843 2844 
memcpy(req->req.iov[0].iov_base, 2845 &req->req.rsp->prop_get_rsp.value.u64, 2846 req->req.length); 2847 return 0; 2848 } 2849 2850 return nvmf_vfio_user_prop_req_rsp_set(req, sq); 2851 } 2852 2853 /* 2854 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2855 * doorbell is written via access_bar0_fn(). 2856 * 2857 * DSTRD is set to fixed value 0 for NVMf. 2858 * 2859 */ 2860 static int 2861 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2862 const size_t count, loff_t pos, const bool is_write) 2863 { 2864 struct nvmf_vfio_user_poll_group *group; 2865 2866 assert(ctrlr != NULL); 2867 assert(buf != NULL); 2868 2869 if (spdk_unlikely(!is_write)) { 2870 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2871 ctrlr_id(ctrlr), pos); 2872 errno = EPERM; 2873 return -1; 2874 } 2875 2876 if (spdk_unlikely(count != sizeof(uint32_t))) { 2877 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2878 ctrlr_id(ctrlr), count); 2879 errno = EINVAL; 2880 return -1; 2881 } 2882 2883 pos -= NVME_DOORBELLS_OFFSET; 2884 2885 /* pos must be dword aligned */ 2886 if (spdk_unlikely((pos & 0x3) != 0)) { 2887 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2888 errno = EINVAL; 2889 return -1; 2890 } 2891 2892 /* convert byte offset to array index */ 2893 pos >>= 2; 2894 2895 if (spdk_unlikely(pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2)) { 2896 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2897 errno = EINVAL; 2898 return -1; 2899 } 2900 2901 ctrlr->bar0_doorbells[pos] = *buf; 2902 spdk_wmb(); 2903 2904 group = ctrlr_to_poll_group(ctrlr); 2905 if (pos == 1) { 2906 group->stats.cqh_admin_writes++; 2907 } else if (pos & 1) { 2908 group->stats.cqh_io_writes++; 2909 } 2910 2911 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2912 ctrlr_id(ctrlr), (pos & 1) ? 
"cqid" : "sqid", 2913 pos / 2, *buf); 2914 2915 2916 return 0; 2917 } 2918 2919 static size_t 2920 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2921 char *buf, size_t count, loff_t pos, 2922 bool is_write) 2923 { 2924 struct nvmf_vfio_user_req *req; 2925 const struct spdk_nvmf_registers *regs; 2926 2927 if ((count != 4) && (count != 8)) { 2928 errno = EINVAL; 2929 return -1; 2930 } 2931 2932 /* Construct a Fabric Property Get/Set command and send it */ 2933 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2934 if (req == NULL) { 2935 errno = ENOBUFS; 2936 return -1; 2937 } 2938 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2939 req->cc.raw = regs->cc.raw; 2940 2941 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2942 req->cb_arg = vu_ctrlr->sqs[0]; 2943 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2944 req->req.cmd->prop_set_cmd.cid = 0; 2945 if (count == 4) { 2946 req->req.cmd->prop_set_cmd.attrib.size = 0; 2947 } else { 2948 req->req.cmd->prop_set_cmd.attrib.size = 1; 2949 } 2950 req->req.cmd->prop_set_cmd.ofst = pos; 2951 if (is_write) { 2952 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2953 if (req->req.cmd->prop_set_cmd.attrib.size) { 2954 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 2955 } else { 2956 req->req.cmd->prop_set_cmd.value.u32.high = 0; 2957 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 2958 } 2959 } else { 2960 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 2961 } 2962 req->req.length = count; 2963 spdk_iov_one(req->req.iov, &req->req.iovcnt, buf, req->req.length); 2964 req->req.data = buf; 2965 2966 spdk_nvmf_request_exec_fabrics(&req->req); 2967 2968 return count; 2969 } 2970 2971 static ssize_t 2972 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 2973 bool is_write) 2974 { 2975 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2976 struct nvmf_vfio_user_ctrlr *ctrlr; 2977 int ret; 2978 
2979 ctrlr = endpoint->ctrlr; 2980 if (spdk_unlikely(endpoint->need_async_destroy || !ctrlr)) { 2981 errno = EIO; 2982 return -1; 2983 } 2984 2985 if (pos >= NVME_DOORBELLS_OFFSET) { 2986 /* 2987 * The fact that the doorbells can be memory mapped doesn't mean 2988 * that the client (VFIO in QEMU) is obliged to memory map them, 2989 * it might still elect to access them via regular read/write; 2990 * we might also have had disable_mappable_bar0 set. 2991 */ 2992 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 2993 pos, is_write); 2994 if (ret == 0) { 2995 return count; 2996 } 2997 return ret; 2998 } 2999 3000 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 3001 } 3002 3003 static ssize_t 3004 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 3005 bool is_write) 3006 { 3007 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3008 3009 if (is_write) { 3010 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 3011 endpoint_id(endpoint), offset, offset + count); 3012 errno = EINVAL; 3013 return -1; 3014 } 3015 3016 if (offset + count > NVME_REG_CFG_SIZE) { 3017 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 3018 endpoint_id(endpoint), offset, count, 3019 NVME_REG_CFG_SIZE); 3020 errno = ERANGE; 3021 return -1; 3022 } 3023 3024 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 3025 3026 return count; 3027 } 3028 3029 static void 3030 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 3031 { 3032 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3033 3034 if (level >= LOG_DEBUG) { 3035 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3036 } else if (level >= LOG_INFO) { 3037 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3038 } else if (level >= LOG_NOTICE) { 3039 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 3040 } else if (level >= LOG_WARNING) { 3041 
SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 3042 } else { 3043 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 3044 } 3045 } 3046 3047 static int 3048 vfio_user_get_log_level(void) 3049 { 3050 int level; 3051 3052 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3053 return LOG_DEBUG; 3054 } 3055 3056 level = spdk_log_to_syslog_level(spdk_log_get_level()); 3057 if (level < 0) { 3058 return LOG_ERR; 3059 } 3060 3061 return level; 3062 } 3063 3064 static void 3065 init_pci_config_space(vfu_pci_config_space_t *p) 3066 { 3067 /* MLBAR */ 3068 p->hdr.bars[0].raw = 0x0; 3069 /* MUBAR */ 3070 p->hdr.bars[1].raw = 0x0; 3071 3072 /* vendor specific, let's set them to zero for now */ 3073 p->hdr.bars[3].raw = 0x0; 3074 p->hdr.bars[4].raw = 0x0; 3075 p->hdr.bars[5].raw = 0x0; 3076 3077 /* enable INTx */ 3078 p->hdr.intr.ipin = 0x1; 3079 } 3080 3081 struct ctrlr_quiesce_ctx { 3082 struct nvmf_vfio_user_endpoint *endpoint; 3083 struct nvmf_vfio_user_poll_group *group; 3084 int status; 3085 }; 3086 3087 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 3088 3089 static void 3090 _vfio_user_endpoint_resume_done_msg(void *ctx) 3091 { 3092 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3093 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3094 3095 endpoint->need_resume = false; 3096 3097 if (!vu_ctrlr) { 3098 return; 3099 } 3100 3101 if (!vu_ctrlr->queued_quiesce) { 3102 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3103 3104 /* 3105 * We might have ignored new SQ entries while we were quiesced: 3106 * kick ourselves so we'll definitely check again while in 3107 * VFIO_USER_CTRLR_RUNNING state. 3108 */ 3109 if (in_interrupt_mode(endpoint->transport)) { 3110 ctrlr_kick(vu_ctrlr); 3111 } 3112 return; 3113 } 3114 3115 3116 /* 3117 * Basically, once we call `vfu_device_quiesced` the device is 3118 * unquiesced from libvfio-user's perspective so from the moment 3119 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device 3120 * again. 
However, because the NVMf subsytem is an asynchronous 3121 * operation, this quiesce might come _before_ the NVMf subsystem has 3122 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 3123 * need to check whether a quiesce was requested. 3124 */ 3125 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 3126 ctrlr_id(vu_ctrlr)); 3127 ctrlr_quiesce(vu_ctrlr); 3128 } 3129 3130 static void 3131 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 3132 void *cb_arg, int status) 3133 { 3134 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 3135 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3136 3137 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status); 3138 3139 if (!vu_ctrlr) { 3140 return; 3141 } 3142 3143 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 3144 } 3145 3146 static void 3147 vfio_user_quiesce_done(void *ctx) 3148 { 3149 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3150 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3151 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3152 int ret; 3153 3154 if (!vu_ctrlr) { 3155 free(quiesce_ctx); 3156 return; 3157 } 3158 3159 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 3160 3161 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 3162 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3163 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 3164 vu_ctrlr->queued_quiesce = false; 3165 free(quiesce_ctx); 3166 3167 /* `vfu_device_quiesced` can change the migration state, 3168 * so we need to re-check `vu_ctrlr->state`. 
3169 */ 3170 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3171 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 3172 return; 3173 } 3174 3175 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 3176 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3177 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3178 vfio_user_endpoint_resume_done, endpoint); 3179 if (ret < 0) { 3180 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3181 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3182 } 3183 } 3184 3185 static void 3186 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3187 void *ctx, int status) 3188 { 3189 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3190 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3191 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3192 3193 if (!vu_ctrlr) { 3194 free(quiesce_ctx); 3195 return; 3196 } 3197 3198 quiesce_ctx->status = status; 3199 3200 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3201 ctrlr_id(vu_ctrlr), status); 3202 3203 spdk_thread_send_msg(vu_ctrlr->thread, 3204 vfio_user_quiesce_done, ctx); 3205 } 3206 3207 /* 3208 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3209 * we've already set ctrlr->state, so we won't process new entries, but we need 3210 * to ensure that this PG is quiesced. This only works because there's no 3211 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3212 * 3213 * Once we've walked all PGs, we need to pause any submitted I/O via 3214 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 
3215 */ 3216 static void 3217 vfio_user_quiesce_pg(void *ctx) 3218 { 3219 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3220 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3221 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3222 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3223 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3224 int ret; 3225 3226 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3227 3228 if (!vu_ctrlr) { 3229 free(quiesce_ctx); 3230 return; 3231 } 3232 3233 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3234 if (quiesce_ctx->group != NULL) { 3235 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3236 vfio_user_quiesce_pg, quiesce_ctx); 3237 return; 3238 } 3239 3240 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3241 vfio_user_pause_done, quiesce_ctx); 3242 if (ret < 0) { 3243 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3244 endpoint_id(endpoint), ret); 3245 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3246 fail_ctrlr(vu_ctrlr); 3247 free(quiesce_ctx); 3248 } 3249 } 3250 3251 static void 3252 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3253 { 3254 struct ctrlr_quiesce_ctx *quiesce_ctx; 3255 3256 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3257 3258 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3259 if (!quiesce_ctx) { 3260 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3261 assert(false); 3262 return; 3263 } 3264 3265 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3266 quiesce_ctx->status = 0; 3267 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3268 3269 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3270 vfio_user_quiesce_pg, quiesce_ctx); 3271 } 3272 3273 static int 3274 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3275 { 3276 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3277 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3278 struct 
nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3279 3280 if (!vu_ctrlr) { 3281 return 0; 3282 } 3283 3284 /* NVMf library will destruct controller when no 3285 * connected queue pairs. 3286 */ 3287 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3288 return 0; 3289 } 3290 3291 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3292 3293 /* There is no race condition here as device quiesce callback 3294 * and nvmf_prop_set_cc() are running in the same thread context. 3295 */ 3296 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3297 return 0; 3298 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3299 return 0; 3300 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3301 return 0; 3302 } 3303 3304 switch (vu_ctrlr->state) { 3305 case VFIO_USER_CTRLR_PAUSED: 3306 case VFIO_USER_CTRLR_MIGRATING: 3307 return 0; 3308 case VFIO_USER_CTRLR_RUNNING: 3309 ctrlr_quiesce(vu_ctrlr); 3310 break; 3311 case VFIO_USER_CTRLR_RESUMING: 3312 vu_ctrlr->queued_quiesce = true; 3313 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3314 vu_ctrlr->state); 3315 break; 3316 default: 3317 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3318 break; 3319 } 3320 3321 errno = EBUSY; 3322 return -1; 3323 } 3324 3325 static void 3326 vfio_user_ctrlr_dump_migr_data(const char *name, 3327 struct vfio_user_nvme_migr_state *migr_data, 3328 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3329 { 3330 struct spdk_nvmf_registers *regs; 3331 struct nvme_migr_sq_state *sq; 3332 struct nvme_migr_cq_state *cq; 3333 uint32_t *doorbell_base; 3334 uint32_t i; 3335 3336 SPDK_NOTICELOG("Dump %s\n", name); 3337 3338 regs = &migr_data->nvmf_data.regs; 3339 doorbell_base = (uint32_t *)&migr_data->doorbells; 3340 3341 SPDK_NOTICELOG("Registers\n"); 3342 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3343 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3344 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3345 
SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3346 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3347 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3348 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3349 3350 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3351 3352 if (sdbl != NULL) { 3353 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3354 migr_data->ctrlr_header.shadow_doorbell_buffer); 3355 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3356 migr_data->ctrlr_header.eventidx_buffer); 3357 } 3358 3359 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3360 sq = &migr_data->qps[i].sq; 3361 cq = &migr_data->qps[i].cq; 3362 3363 if (sq->size) { 3364 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3365 if (i > 0 && sdbl != NULL) { 3366 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3367 sq->sqid, 3368 sdbl->shadow_doorbells[queue_index(i, false)], 3369 sdbl->eventidxs[queue_index(i, false)]); 3370 } 3371 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3372 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3373 } 3374 3375 if (cq->size) { 3376 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3377 if (i > 0 && sdbl != NULL) { 3378 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3379 cq->cqid, 3380 sdbl->shadow_doorbells[queue_index(i, true)], 3381 sdbl->eventidxs[queue_index(i, true)]); 3382 } 3383 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3384 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3385 } 3386 } 3387 3388 SPDK_NOTICELOG("%s Dump Done\n", name); 3389 } 3390 3391 /* Read region 9 content and restore it to migration data structures */ 3392 static int 3393 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3394 struct vfio_user_nvme_migr_state *migr_state) 3395 { 3396 void *data_ptr = 
endpoint->migr_data; 3397 3398 /* Load vfio_user_nvme_migr_header first */ 3399 memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3400 /* TODO: version check */ 3401 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3402 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3403 return -EINVAL; 3404 } 3405 3406 /* Load nvmf controller data */ 3407 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3408 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3409 3410 /* Load queue pairs */ 3411 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3412 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3413 3414 /* Load doorbells */ 3415 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3416 memcpy(&migr_state->doorbells, data_ptr, 3417 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3418 3419 /* Load CFG */ 3420 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3421 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3422 3423 return 0; 3424 } 3425 3426 3427 static void 3428 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3429 { 3430 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3431 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3432 struct nvmf_vfio_user_sq *sq; 3433 struct nvmf_vfio_user_cq *cq; 3434 uint64_t data_offset; 3435 void *data_ptr; 3436 uint32_t *doorbell_base; 3437 uint32_t i = 0; 3438 uint16_t sqid, cqid; 3439 struct vfio_user_nvme_migr_state migr_state = { 3440 .nvmf_data = { 3441 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3442 .regs_size = sizeof(struct spdk_nvmf_registers), 3443 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3444 } 3445 }; 3446 
3447 /* Save all data to vfio_user_nvme_migr_state first, then we will 3448 * copy it to device migration region at last. 3449 */ 3450 3451 /* save magic number */ 3452 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3453 3454 /* save controller data */ 3455 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3456 3457 /* save connected queue pairs */ 3458 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3459 /* save sq */ 3460 sqid = sq->qid; 3461 migr_state.qps[sqid].sq.sqid = sq->qid; 3462 migr_state.qps[sqid].sq.cqid = sq->cqid; 3463 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3464 migr_state.qps[sqid].sq.size = sq->size; 3465 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3466 3467 /* save cq, for shared cq case, cq may be saved multiple times */ 3468 cqid = sq->cqid; 3469 cq = vu_ctrlr->cqs[cqid]; 3470 migr_state.qps[cqid].cq.cqid = cqid; 3471 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3472 migr_state.qps[cqid].cq.ien = cq->ien; 3473 migr_state.qps[cqid].cq.iv = cq->iv; 3474 migr_state.qps[cqid].cq.size = cq->size; 3475 migr_state.qps[cqid].cq.phase = cq->phase; 3476 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3477 i++; 3478 } 3479 3480 assert(i > 0); 3481 migr_state.ctrlr_header.num_io_queues = i - 1; 3482 3483 /* Save doorbells */ 3484 doorbell_base = (uint32_t *)&migr_state.doorbells; 3485 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3486 3487 /* Save PCI configuration space */ 3488 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3489 3490 /* Save all data to device migration region */ 3491 data_ptr = endpoint->migr_data; 3492 3493 /* Copy nvmf controller data */ 3494 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3495 data_ptr += data_offset; 3496 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3497 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3498 memcpy(data_ptr, 
&migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3499 3500 /* Copy queue pairs */ 3501 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3502 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3503 migr_state.ctrlr_header.qp_offset = data_offset; 3504 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3505 struct nvme_migr_cq_state)); 3506 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3507 3508 /* Copy doorbells */ 3509 data_offset += migr_state.ctrlr_header.qp_len; 3510 data_ptr += migr_state.ctrlr_header.qp_len; 3511 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3512 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3513 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3514 3515 /* Copy CFG */ 3516 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3517 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3518 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3519 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3520 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3521 3522 /* copy shadow doorbells */ 3523 if (vu_ctrlr->sdbl != NULL) { 3524 migr_state.ctrlr_header.sdbl = true; 3525 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3526 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3527 } 3528 3529 /* Copy nvme migration header finally */ 3530 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3531 3532 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3533 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3534 } 3535 } 3536 3537 /* 3538 * If we are about to close the connection, we need to unregister the interrupt, 3539 * as the library will subsequently close the file descriptor we registered. 
3540 */ 3541 static int 3542 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3543 { 3544 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3545 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3546 3547 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3548 3549 if (type == VFU_RESET_LOST_CONN) { 3550 if (ctrlr != NULL) { 3551 spdk_interrupt_unregister(&ctrlr->intr); 3552 ctrlr->intr_fd = -1; 3553 } 3554 return 0; 3555 } 3556 3557 /* FIXME: LOST_CONN case ? */ 3558 if (ctrlr->sdbl != NULL) { 3559 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 3560 free_sdbl(vfu_ctx, ctrlr->sdbl); 3561 ctrlr->sdbl = NULL; 3562 } 3563 3564 /* FIXME: much more needed here. */ 3565 3566 return 0; 3567 } 3568 3569 static int 3570 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3571 struct vfio_user_nvme_migr_state *migr_state) 3572 { 3573 uint32_t i, qsize = 0; 3574 uint16_t sqid, cqid; 3575 struct vfio_user_nvme_migr_qp migr_qp; 3576 void *addr; 3577 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3578 int ret; 3579 3580 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3581 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3582 } 3583 3584 /* restore submission queues */ 3585 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3586 migr_qp = migr_state->qps[i]; 3587 3588 qsize = migr_qp.sq.size; 3589 if (qsize) { 3590 struct nvmf_vfio_user_sq *sq; 3591 3592 sqid = migr_qp.sq.sqid; 3593 if (sqid != i) { 3594 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3595 return -EINVAL; 3596 } 3597 3598 /* allocate sq if necessary */ 3599 if (vu_ctrlr->sqs[sqid] == NULL) { 3600 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3601 if (ret) { 3602 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3603 return -EFAULT; 3604 } 3605 } 3606 3607 sq = vu_ctrlr->sqs[sqid]; 3608 sq->size = qsize; 3609 3610 ret = alloc_sq_reqs(vu_ctrlr, sq); 3611 if (ret) { 3612 
SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3613 return -EFAULT; 3614 } 3615 3616 /* restore sq */ 3617 sq->sq_state = VFIO_USER_SQ_CREATED; 3618 sq->cqid = migr_qp.sq.cqid; 3619 *sq_headp(sq) = migr_qp.sq.head; 3620 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3621 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3622 sq->mapping.prp1, sq->size * 64, 3623 sq->mapping.sg, &sq->mapping.iov, 3624 PROT_READ); 3625 if (addr == NULL) { 3626 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3627 sqid, sq->mapping.prp1, sq->size); 3628 return -EFAULT; 3629 } 3630 cqs_ref[sq->cqid]++; 3631 } 3632 } 3633 3634 /* restore completion queues */ 3635 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3636 migr_qp = migr_state->qps[i]; 3637 3638 qsize = migr_qp.cq.size; 3639 if (qsize) { 3640 struct nvmf_vfio_user_cq *cq; 3641 3642 /* restore cq */ 3643 cqid = migr_qp.sq.cqid; 3644 assert(cqid == i); 3645 3646 /* allocate cq if necessary */ 3647 if (vu_ctrlr->cqs[cqid] == NULL) { 3648 ret = init_cq(vu_ctrlr, cqid); 3649 if (ret) { 3650 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3651 return -EFAULT; 3652 } 3653 } 3654 3655 cq = vu_ctrlr->cqs[cqid]; 3656 3657 cq->size = qsize; 3658 3659 cq->cq_state = VFIO_USER_CQ_CREATED; 3660 cq->cq_ref = cqs_ref[cqid]; 3661 *cq_tailp(cq) = migr_qp.cq.tail; 3662 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3663 cq->ien = migr_qp.cq.ien; 3664 cq->iv = migr_qp.cq.iv; 3665 cq->phase = migr_qp.cq.phase; 3666 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3667 cq->mapping.prp1, cq->size * 16, 3668 cq->mapping.sg, &cq->mapping.iov, 3669 PROT_READ | PROT_WRITE); 3670 if (addr == NULL) { 3671 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3672 cqid, cq->mapping.prp1, cq->size); 3673 return -EFAULT; 3674 } 3675 } 3676 } 3677 3678 return 0; 3679 } 3680 3681 static int 3682 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3683 { 3684 struct nvmf_vfio_user_endpoint 
*endpoint = vu_ctrlr->endpoint; 3685 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3686 uint32_t *doorbell_base; 3687 struct spdk_nvme_cmd cmd; 3688 uint16_t i; 3689 int rc = 0; 3690 struct vfio_user_nvme_migr_state migr_state = { 3691 .nvmf_data = { 3692 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3693 .regs_size = sizeof(struct spdk_nvmf_registers), 3694 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3695 } 3696 }; 3697 3698 assert(endpoint->migr_data != NULL); 3699 assert(ctrlr != NULL); 3700 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3701 if (rc) { 3702 return rc; 3703 } 3704 3705 /* restore shadow doorbells */ 3706 if (migr_state.ctrlr_header.sdbl) { 3707 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3708 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3709 migr_state.ctrlr_header.shadow_doorbell_buffer, 3710 migr_state.ctrlr_header.eventidx_buffer, 3711 memory_page_size(vu_ctrlr)); 3712 if (sdbl == NULL) { 3713 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3714 ctrlr_id(vu_ctrlr)); 3715 return -1; 3716 } 3717 3718 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3719 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3720 3721 SWAP(vu_ctrlr->sdbl, sdbl); 3722 } 3723 3724 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3725 if (rc) { 3726 return rc; 3727 } 3728 3729 /* restore PCI configuration space */ 3730 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3731 3732 doorbell_base = (uint32_t *)&migr_state.doorbells; 3733 /* restore doorbells from saved registers */ 3734 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3735 3736 /* restore nvmf controller data */ 3737 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3738 if (rc) { 3739 return rc; 3740 } 3741 3742 /* resubmit pending AERs */ 3743 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) 
{ 3744 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3745 migr_state.nvmf_data.aer_cids[i]); 3746 memset(&cmd, 0, sizeof(cmd)); 3747 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3748 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3749 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3750 if (spdk_unlikely(rc)) { 3751 break; 3752 } 3753 } 3754 3755 return rc; 3756 } 3757 3758 static void 3759 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3760 { 3761 uint32_t i; 3762 struct nvmf_vfio_user_sq *sq; 3763 3764 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 3765 3766 if (vu_ctrlr->sqs[0] != NULL) { 3767 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3768 queue_index(0, false); 3769 } 3770 3771 if (vu_ctrlr->cqs[0] != NULL) { 3772 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3773 queue_index(0, true); 3774 } 3775 3776 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3777 3778 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3779 sq = vu_ctrlr->sqs[i]; 3780 if (!sq || !sq->size) { 3781 continue; 3782 } 3783 3784 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3785 /* ADMIN queue pair is always in the poll group, just enable it */ 3786 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3787 } else { 3788 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3789 } 3790 } 3791 } 3792 3793 /* 3794 * We are in stop-and-copy state, but still potentially have some current dirty 3795 * sgls: while we're quiesced and thus should have no active requests, we still 3796 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3797 * mapped read only). 3798 * 3799 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3800 * mark them dirty now. 
3801 */ 3802 static void 3803 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3804 { 3805 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3806 3807 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3808 3809 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3810 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3811 3812 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3813 continue; 3814 } 3815 3816 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3817 } 3818 3819 if (vu_ctrlr->sdbl != NULL) { 3820 dma_sg_t *sg; 3821 size_t i; 3822 3823 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3824 ++i) { 3825 3826 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3827 continue; 3828 } 3829 3830 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3831 3832 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3833 } 3834 } 3835 } 3836 3837 static int 3838 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3839 { 3840 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3841 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3842 struct nvmf_vfio_user_sq *sq; 3843 int ret = 0; 3844 3845 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3846 vu_ctrlr->state, state); 3847 3848 switch (state) { 3849 case VFU_MIGR_STATE_STOP_AND_COPY: 3850 vu_ctrlr->in_source_vm = true; 3851 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3852 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3853 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3854 break; 3855 case VFU_MIGR_STATE_STOP: 3856 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3857 /* The controller associates with source VM is dead now, we will resume 3858 * the subsystem after destroying the controller data structure, then the 3859 * subsystem can be re-used for another new client. 
3860 */ 3861 if (vu_ctrlr->in_source_vm) { 3862 endpoint->need_resume = true; 3863 } 3864 break; 3865 case VFU_MIGR_STATE_PRE_COPY: 3866 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3867 break; 3868 case VFU_MIGR_STATE_RESUME: 3869 /* 3870 * Destination ADMIN queue pair is connected when starting the VM, 3871 * but the ADMIN queue pair isn't enabled in destination VM, the poll 3872 * group will do nothing to ADMIN queue pair for now. 3873 */ 3874 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3875 break; 3876 } 3877 3878 assert(!vu_ctrlr->in_source_vm); 3879 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3880 3881 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3882 assert(sq != NULL); 3883 assert(sq->qpair.qid == 0); 3884 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3885 3886 /* Free ADMIN SQ resources first, SQ resources will be 3887 * allocated based on queue size from source VM. 3888 */ 3889 free_sq_reqs(sq); 3890 sq->size = 0; 3891 break; 3892 case VFU_MIGR_STATE_RUNNING: 3893 3894 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3895 break; 3896 } 3897 3898 if (!vu_ctrlr->in_source_vm) { 3899 /* Restore destination VM from BAR9 */ 3900 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3901 if (ret) { 3902 break; 3903 } 3904 3905 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3906 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3907 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3908 /* FIXME where do we resume nvmf? 
*/ 3909 } else { 3910 /* Rollback source VM */ 3911 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3912 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3913 vfio_user_endpoint_resume_done, endpoint); 3914 if (ret < 0) { 3915 /* TODO: fail controller with CFS bit set */ 3916 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3917 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3918 } 3919 } 3920 vu_ctrlr->migr_data_prepared = false; 3921 vu_ctrlr->in_source_vm = false; 3922 break; 3923 3924 default: 3925 return -EINVAL; 3926 } 3927 3928 return ret; 3929 } 3930 3931 static uint64_t 3932 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3933 { 3934 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3935 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3936 uint64_t pending_bytes; 3937 3938 if (ctrlr->migr_data_prepared) { 3939 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3940 pending_bytes = 0; 3941 } else { 3942 pending_bytes = vfio_user_migr_data_len(); 3943 } 3944 3945 SPDK_DEBUGLOG(nvmf_vfio, 3946 "%s current state %u, pending bytes 0x%"PRIx64"\n", 3947 endpoint_id(endpoint), ctrlr->state, pending_bytes); 3948 3949 return pending_bytes; 3950 } 3951 3952 static int 3953 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 3954 { 3955 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3956 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3957 3958 /* 3959 * When transitioning to pre-copy state we set pending_bytes to 0, 3960 * so the vfio-user client shouldn't attempt to read any migration 3961 * data. This is not yet guaranteed by libvfio-user. 
3962 */ 3963 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3964 assert(size != NULL); 3965 *offset = 0; 3966 *size = 0; 3967 return 0; 3968 } 3969 3970 if (ctrlr->in_source_vm) { /* migration source */ 3971 assert(size != NULL); 3972 *size = vfio_user_migr_data_len(); 3973 vfio_user_migr_ctrlr_save_data(ctrlr); 3974 } else { /* migration destination */ 3975 assert(size == NULL); 3976 assert(!ctrlr->migr_data_prepared); 3977 } 3978 *offset = 0; 3979 ctrlr->migr_data_prepared = true; 3980 3981 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 3982 3983 return 0; 3984 } 3985 3986 static ssize_t 3987 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 3988 void *buf __attribute__((unused)), 3989 uint64_t count __attribute__((unused)), 3990 uint64_t offset __attribute__((unused))) 3991 { 3992 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 3993 endpoint_id(vfu_get_private(vfu_ctx))); 3994 errno = ENOTSUP; 3995 return -1; 3996 } 3997 3998 static ssize_t 3999 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4000 void *buf __attribute__((unused)), 4001 uint64_t count __attribute__((unused)), 4002 uint64_t offset __attribute__((unused))) 4003 { 4004 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 4005 endpoint_id(vfu_get_private(vfu_ctx))); 4006 errno = ENOTSUP; 4007 return -1; 4008 } 4009 4010 static int 4011 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4012 uint64_t count) 4013 { 4014 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 4015 4016 if (count != vfio_user_migr_data_len()) { 4017 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 4018 endpoint_id(vfu_get_private(vfu_ctx)), count); 4019 errno = EINVAL; 4020 return -1; 4021 } 4022 4023 return 0; 4024 } 4025 4026 static int 4027 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 4028 struct nvmf_vfio_user_endpoint 
*endpoint) 4029 { 4030 int ret; 4031 ssize_t cap_offset; 4032 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 4033 struct iovec migr_sparse_mmap = {}; 4034 4035 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 4036 struct pxcap pxcap = { 4037 .hdr.id = PCI_CAP_ID_EXP, 4038 .pxcaps.ver = 0x2, 4039 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 4040 .pxdcap2.ctds = 0x1 4041 }; 4042 4043 struct msixcap msixcap = { 4044 .hdr.id = PCI_CAP_ID_MSIX, 4045 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 4046 .mtab = {.tbir = 0x4, .to = 0x0}, 4047 .mpba = {.pbir = 0x5, .pbao = 0x0} 4048 }; 4049 4050 struct iovec sparse_mmap[] = { 4051 { 4052 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 4053 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 4054 }, 4055 }; 4056 4057 const vfu_migration_callbacks_t migr_callbacks = { 4058 .version = VFU_MIGR_CALLBACKS_VERS, 4059 .transition = &vfio_user_migration_device_state_transition, 4060 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 4061 .prepare_data = &vfio_user_migration_prepare_data, 4062 .read_data = &vfio_user_migration_read_data, 4063 .data_written = &vfio_user_migration_data_written, 4064 .write_data = &vfio_user_migration_write_data 4065 }; 4066 4067 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 4068 if (ret < 0) { 4069 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 4070 return ret; 4071 } 4072 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 4073 /* 4074 * 0x02, controller uses the NVM Express programming interface 4075 * 0x08, non-volatile memory controller 4076 * 0x01, mass storage controller 4077 */ 4078 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 4079 4080 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 4081 if (cap_offset < 0) { 4082 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 4083 return ret; 4084 } 4085 4086 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 4087 if (cap_offset < 0) { 4088 SPDK_ERRLOG("vfu_ctx %p 
failed add pxcap\n", vfu_ctx); 4089 return ret; 4090 } 4091 4092 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 4093 if (cap_offset < 0) { 4094 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 4095 return ret; 4096 } 4097 4098 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 4099 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4100 if (ret < 0) { 4101 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 4102 return ret; 4103 } 4104 4105 if (vu_transport->transport_opts.disable_mappable_bar0) { 4106 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4107 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4108 NULL, 0, -1, 0); 4109 } else { 4110 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4111 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4112 sparse_mmap, 1, endpoint->devmem_fd, 0); 4113 } 4114 4115 if (ret < 0) { 4116 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 4117 return ret; 4118 } 4119 4120 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 4121 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4122 if (ret < 0) { 4123 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 4124 return ret; 4125 } 4126 4127 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 4128 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4129 if (ret < 0) { 4130 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 4131 return ret; 4132 } 4133 4134 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 4135 if (ret < 0) { 4136 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 4137 return ret; 4138 } 4139 4140 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 4141 if (ret < 0) { 4142 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 4143 return ret; 4144 } 4145 4146 ret = 
vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 4147 if (ret < 0) { 4148 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 4149 return ret; 4150 } 4151 4152 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 4153 if (ret < 0) { 4154 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 4155 return ret; 4156 } 4157 4158 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 4159 4160 migr_sparse_mmap.iov_base = (void *)4096; 4161 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 4162 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 4163 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 4164 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 4165 1, endpoint->migr_fd, 0); 4166 if (ret < 0) { 4167 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 4168 return ret; 4169 } 4170 4171 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 4172 vfu_get_migr_register_area_size()); 4173 if (ret < 0) { 4174 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 4175 return ret; 4176 } 4177 4178 ret = vfu_realize_ctx(vfu_ctx); 4179 if (ret < 0) { 4180 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4181 return ret; 4182 } 4183 4184 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4185 assert(endpoint->pci_config_space != NULL); 4186 init_pci_config_space(endpoint->pci_config_space); 4187 4188 assert(cap_offset != 0); 4189 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4190 4191 return 0; 4192 } 4193 4194 static int nvmf_vfio_user_accept(void *ctx); 4195 4196 static void 4197 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4198 { 4199 /* Nothing for us to do here. */ 4200 } 4201 4202 /* 4203 * Register an "accept" poller: this is polling for incoming vfio-user socket 4204 * connections (on the listening socket). 
4205 * 4206 * We need to do this on first listening, and also after destroying a 4207 * controller, so we can accept another connection. 4208 */ 4209 static int 4210 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4211 { 4212 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4213 4214 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4215 4216 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4217 endpoint, poll_rate_us); 4218 4219 if (!endpoint->accept_poller) { 4220 return -1; 4221 } 4222 4223 endpoint->accept_thread = spdk_get_thread(); 4224 endpoint->need_relisten = false; 4225 4226 if (!spdk_interrupt_mode_is_enabled()) { 4227 return 0; 4228 } 4229 4230 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4231 assert(endpoint->accept_intr_fd != -1); 4232 4233 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4234 nvmf_vfio_user_accept, endpoint); 4235 4236 assert(endpoint->accept_intr != NULL); 4237 4238 spdk_poller_register_interrupt(endpoint->accept_poller, 4239 set_intr_mode_noop, NULL); 4240 return 0; 4241 } 4242 4243 static void 4244 _vfio_user_relisten(void *ctx) 4245 { 4246 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4247 4248 vfio_user_register_accept_poller(endpoint); 4249 } 4250 4251 static void 4252 _free_ctrlr(void *ctx) 4253 { 4254 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4255 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4256 4257 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 4258 4259 spdk_interrupt_unregister(&ctrlr->intr); 4260 ctrlr->intr_fd = -1; 4261 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4262 4263 free(ctrlr); 4264 4265 if (endpoint->need_async_destroy) { 4266 nvmf_vfio_user_destroy_endpoint(endpoint); 4267 } else if (endpoint->need_relisten) { 4268 spdk_thread_send_msg(endpoint->accept_thread, 4269 _vfio_user_relisten, endpoint); 4270 } 4271 } 4272 4273 static void 4274 free_ctrlr(struct 
nvmf_vfio_user_ctrlr *ctrlr) 4275 { 4276 int i; 4277 assert(ctrlr != NULL); 4278 4279 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4280 4281 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4282 free_qp(ctrlr, i); 4283 } 4284 4285 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 4286 } 4287 4288 static int 4289 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4290 struct nvmf_vfio_user_endpoint *endpoint) 4291 { 4292 struct nvmf_vfio_user_ctrlr *ctrlr; 4293 int err = 0; 4294 4295 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4296 4297 /* First, construct a vfio-user CUSTOM transport controller */ 4298 ctrlr = calloc(1, sizeof(*ctrlr)); 4299 if (ctrlr == NULL) { 4300 err = -ENOMEM; 4301 goto out; 4302 } 4303 /* We can only support one connection for now */ 4304 ctrlr->cntlid = 0x1; 4305 ctrlr->intr_fd = -1; 4306 ctrlr->transport = transport; 4307 ctrlr->endpoint = endpoint; 4308 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4309 TAILQ_INIT(&ctrlr->connected_sqs); 4310 4311 ctrlr->adaptive_irqs_enabled = 4312 !transport->transport_opts.disable_adaptive_irq; 4313 4314 /* Then, construct an admin queue pair */ 4315 err = init_sq(ctrlr, &transport->transport, 0); 4316 if (err != 0) { 4317 free(ctrlr); 4318 goto out; 4319 } 4320 4321 err = init_cq(ctrlr, 0); 4322 if (err != 0) { 4323 free(ctrlr); 4324 goto out; 4325 } 4326 4327 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4328 4329 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4330 if (err != 0) { 4331 free(ctrlr); 4332 goto out; 4333 } 4334 endpoint->ctrlr = ctrlr; 4335 4336 /* Notify the generic layer about the new admin queue pair */ 4337 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4338 4339 out: 4340 if (err != 0) { 4341 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4342 endpoint_id(endpoint), strerror(-err)); 4343 } 4344 4345 return err; 4346 } 4347 4348 static int 4349 
nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4350 const struct spdk_nvme_transport_id *trid, 4351 struct spdk_nvmf_listen_opts *listen_opts) 4352 { 4353 struct nvmf_vfio_user_transport *vu_transport; 4354 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4355 char path[PATH_MAX] = {}; 4356 char uuid[PATH_MAX] = {}; 4357 int ret; 4358 4359 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4360 transport); 4361 4362 pthread_mutex_lock(&vu_transport->lock); 4363 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4364 /* Only compare traddr */ 4365 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4366 pthread_mutex_unlock(&vu_transport->lock); 4367 return -EEXIST; 4368 } 4369 } 4370 pthread_mutex_unlock(&vu_transport->lock); 4371 4372 endpoint = calloc(1, sizeof(*endpoint)); 4373 if (!endpoint) { 4374 return -ENOMEM; 4375 } 4376 4377 pthread_mutex_init(&endpoint->lock, NULL); 4378 endpoint->devmem_fd = -1; 4379 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4380 endpoint->transport = vu_transport; 4381 4382 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4383 if (ret < 0 || ret >= PATH_MAX) { 4384 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4385 ret = -1; 4386 goto out; 4387 } 4388 4389 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4390 if (ret == -1) { 4391 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4392 endpoint_id(endpoint), path, spdk_strerror(errno)); 4393 goto out; 4394 } 4395 unlink(path); 4396 4397 endpoint->devmem_fd = ret; 4398 ret = ftruncate(endpoint->devmem_fd, 4399 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4400 if (ret != 0) { 4401 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4402 spdk_strerror(errno)); 4403 goto out; 4404 } 4405 4406 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4407 
PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4408 if (endpoint->bar0_doorbells == MAP_FAILED) { 4409 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4410 endpoint->bar0_doorbells = NULL; 4411 ret = -1; 4412 goto out; 4413 } 4414 4415 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4416 if (ret < 0 || ret >= PATH_MAX) { 4417 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 4418 spdk_strerror(errno)); 4419 ret = -1; 4420 goto out; 4421 } 4422 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4423 if (ret == -1) { 4424 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4425 endpoint_id(endpoint), path, spdk_strerror(errno)); 4426 goto out; 4427 } 4428 unlink(path); 4429 4430 endpoint->migr_fd = ret; 4431 ret = ftruncate(endpoint->migr_fd, 4432 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4433 if (ret != 0) { 4434 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4435 spdk_strerror(errno)); 4436 goto out; 4437 } 4438 4439 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4440 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4441 if (endpoint->migr_data == MAP_FAILED) { 4442 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4443 endpoint->migr_data = NULL; 4444 ret = -1; 4445 goto out; 4446 } 4447 4448 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 4449 if (ret < 0 || ret >= PATH_MAX) { 4450 SPDK_ERRLOG("%s: error to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 4451 ret = -1; 4452 goto out; 4453 } 4454 4455 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 4456 endpoint, VFU_DEV_TYPE_PCI); 4457 if (endpoint->vfu_ctx == NULL) { 4458 SPDK_ERRLOG("%s: error creating libmuser 
context: %m\n", 4459 endpoint_id(endpoint)); 4460 ret = -1; 4461 goto out; 4462 } 4463 4464 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, 4465 vfio_user_get_log_level()); 4466 if (ret < 0) { 4467 goto out; 4468 } 4469 4470 4471 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 4472 if (ret < 0) { 4473 goto out; 4474 } 4475 4476 ret = vfio_user_register_accept_poller(endpoint); 4477 4478 if (ret != 0) { 4479 goto out; 4480 } 4481 4482 pthread_mutex_lock(&vu_transport->lock); 4483 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 4484 pthread_mutex_unlock(&vu_transport->lock); 4485 4486 out: 4487 if (ret != 0) { 4488 nvmf_vfio_user_destroy_endpoint(endpoint); 4489 } 4490 4491 return ret; 4492 } 4493 4494 static void 4495 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 4496 const struct spdk_nvme_transport_id *trid) 4497 { 4498 struct nvmf_vfio_user_transport *vu_transport; 4499 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4500 4501 assert(trid != NULL); 4502 assert(trid->traddr != NULL); 4503 4504 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 4505 4506 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4507 transport); 4508 4509 pthread_mutex_lock(&vu_transport->lock); 4510 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4511 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 4512 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 4513 /* Defer to free endpoint resources until the controller 4514 * is freed. There are two cases when running here: 4515 * 1. kill nvmf target while VM is connected 4516 * 2. remove listener via RPC call 4517 * nvmf library will disconnect all queue paris. 
4518 */ 4519 if (endpoint->ctrlr) { 4520 assert(!endpoint->need_async_destroy); 4521 endpoint->need_async_destroy = true; 4522 pthread_mutex_unlock(&vu_transport->lock); 4523 return; 4524 } 4525 4526 nvmf_vfio_user_destroy_endpoint(endpoint); 4527 pthread_mutex_unlock(&vu_transport->lock); 4528 return; 4529 } 4530 } 4531 pthread_mutex_unlock(&vu_transport->lock); 4532 4533 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4534 } 4535 4536 static void 4537 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4538 struct spdk_nvmf_subsystem *subsystem, 4539 struct spdk_nvmf_ctrlr_data *cdata) 4540 { 4541 struct nvmf_vfio_user_transport *vu_transport; 4542 4543 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4544 4545 cdata->vid = SPDK_PCI_VID_NUTANIX; 4546 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4547 cdata->ieee[0] = 0x8d; 4548 cdata->ieee[1] = 0x6b; 4549 cdata->ieee[2] = 0x50; 4550 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4551 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4552 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4553 /* libvfio-user can only support 1 connection for now */ 4554 cdata->oncs.reservations = 0; 4555 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4556 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4557 } 4558 4559 static int 4560 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4561 const struct spdk_nvmf_subsystem *subsystem, 4562 const struct spdk_nvme_transport_id *trid) 4563 { 4564 struct nvmf_vfio_user_transport *vu_transport; 4565 struct nvmf_vfio_user_endpoint *endpoint; 4566 4567 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4568 4569 pthread_mutex_lock(&vu_transport->lock); 4570 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4571 if (strncmp(endpoint->trid.traddr, 
trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4572 break; 4573 } 4574 } 4575 pthread_mutex_unlock(&vu_transport->lock); 4576 4577 if (endpoint == NULL) { 4578 return -ENOENT; 4579 } 4580 4581 /* Drop const - we will later need to pause/unpause. */ 4582 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4583 4584 return 0; 4585 } 4586 4587 /* 4588 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4589 * frequency. 4590 * 4591 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4592 * if we don't currently have a controller set up, peek to see if the socket is 4593 * able to accept a new connection. 4594 */ 4595 static int 4596 nvmf_vfio_user_accept(void *ctx) 4597 { 4598 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4599 struct nvmf_vfio_user_transport *vu_transport; 4600 int err; 4601 4602 vu_transport = endpoint->transport; 4603 4604 if (endpoint->ctrlr != NULL) { 4605 return SPDK_POLLER_IDLE; 4606 } 4607 4608 /* While we're here, the controller is already destroyed, 4609 * subsystem may still be in RESUMING state, we will wait 4610 * until the subsystem is in RUNNING state. 4611 */ 4612 if (endpoint->need_resume) { 4613 return SPDK_POLLER_IDLE; 4614 } 4615 4616 err = vfu_attach_ctx(endpoint->vfu_ctx); 4617 if (err == 0) { 4618 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4619 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4620 if (err == 0) { 4621 /* 4622 * Unregister ourselves: now we've accepted a 4623 * connection, there is nothing for us to poll for, and 4624 * we will poll the connection via vfu_run_ctx() 4625 * instead. 
4626 */ 4627 spdk_interrupt_unregister(&endpoint->accept_intr); 4628 spdk_poller_unregister(&endpoint->accept_poller); 4629 } 4630 return SPDK_POLLER_BUSY; 4631 } 4632 4633 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4634 return SPDK_POLLER_IDLE; 4635 } 4636 4637 return SPDK_POLLER_BUSY; 4638 } 4639 4640 static void 4641 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4642 struct spdk_nvme_transport_id *trid, 4643 struct spdk_nvmf_discovery_log_page_entry *entry) 4644 { } 4645 4646 static int vfio_user_poll_group_intr(void *ctx); 4647 4648 static void 4649 vfio_user_poll_group_add_intr(struct nvmf_vfio_user_poll_group *vu_group, 4650 struct spdk_nvmf_poll_group *group) 4651 { 4652 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK); 4653 assert(vu_group->intr_fd != -1); 4654 4655 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd, 4656 vfio_user_poll_group_intr, vu_group); 4657 assert(vu_group->intr != NULL); 4658 4659 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4660 vu_group); 4661 } 4662 4663 static struct spdk_nvmf_transport_poll_group * 4664 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4665 struct spdk_nvmf_poll_group *group) 4666 { 4667 struct nvmf_vfio_user_transport *vu_transport; 4668 struct nvmf_vfio_user_poll_group *vu_group; 4669 4670 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4671 transport); 4672 4673 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4674 4675 vu_group = calloc(1, sizeof(*vu_group)); 4676 if (vu_group == NULL) { 4677 SPDK_ERRLOG("Error allocating poll group: %m"); 4678 return NULL; 4679 } 4680 4681 if (in_interrupt_mode(vu_transport)) { 4682 vfio_user_poll_group_add_intr(vu_group, group); 4683 } 4684 4685 TAILQ_INIT(&vu_group->sqs); 4686 4687 pthread_mutex_lock(&vu_transport->pg_lock); 4688 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4689 if (vu_transport->next_pg == NULL) { 4690 vu_transport->next_pg = vu_group; 4691 
} 4692 pthread_mutex_unlock(&vu_transport->pg_lock); 4693 4694 return &vu_group->group; 4695 } 4696 4697 static struct spdk_nvmf_transport_poll_group * 4698 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4699 { 4700 struct nvmf_vfio_user_transport *vu_transport; 4701 struct nvmf_vfio_user_poll_group **vu_group; 4702 struct nvmf_vfio_user_sq *sq; 4703 struct nvmf_vfio_user_cq *cq; 4704 4705 struct spdk_nvmf_transport_poll_group *result = NULL; 4706 4707 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4708 cq = sq->ctrlr->cqs[sq->cqid]; 4709 assert(cq != NULL); 4710 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4711 4712 pthread_mutex_lock(&vu_transport->pg_lock); 4713 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4714 goto out; 4715 } 4716 4717 if (!nvmf_qpair_is_admin_queue(qpair)) { 4718 /* 4719 * If this is shared IO CQ case, just return the used CQ's poll 4720 * group, so I/O completions don't have to use 4721 * spdk_thread_send_msg(). 4722 */ 4723 if (cq->group != NULL) { 4724 result = cq->group; 4725 goto out; 4726 } 4727 4728 /* 4729 * If we're in interrupt mode, align all qpairs for a controller 4730 * on the same poll group by default, unless requested. This can 4731 * be lower in performance than running on a single poll group, 4732 * so we disable spreading by default. 
4733 */ 4734 if (in_interrupt_mode(vu_transport) && 4735 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) { 4736 result = sq->ctrlr->sqs[0]->group; 4737 goto out; 4738 } 4739 4740 } 4741 4742 vu_group = &vu_transport->next_pg; 4743 assert(*vu_group != NULL); 4744 4745 result = &(*vu_group)->group; 4746 *vu_group = TAILQ_NEXT(*vu_group, link); 4747 if (*vu_group == NULL) { 4748 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4749 } 4750 4751 out: 4752 if (cq->group == NULL) { 4753 cq->group = result; 4754 } 4755 4756 pthread_mutex_unlock(&vu_transport->pg_lock); 4757 return result; 4758 } 4759 4760 static void 4761 vfio_user_poll_group_del_intr(struct nvmf_vfio_user_poll_group *vu_group) 4762 { 4763 assert(vu_group->intr_fd != -1); 4764 4765 spdk_interrupt_unregister(&vu_group->intr); 4766 4767 close(vu_group->intr_fd); 4768 vu_group->intr_fd = -1; 4769 } 4770 4771 /* called when process exits */ 4772 static void 4773 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4774 { 4775 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4776 struct nvmf_vfio_user_transport *vu_transport; 4777 4778 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4779 4780 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4781 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4782 transport); 4783 4784 if (in_interrupt_mode(vu_transport)) { 4785 vfio_user_poll_group_del_intr(vu_group); 4786 } 4787 4788 pthread_mutex_lock(&vu_transport->pg_lock); 4789 next_tgroup = TAILQ_NEXT(vu_group, link); 4790 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4791 if (next_tgroup == NULL) { 4792 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4793 } 4794 if (vu_transport->next_pg == vu_group) { 4795 vu_transport->next_pg = next_tgroup; 4796 } 4797 pthread_mutex_unlock(&vu_transport->pg_lock); 4798 4799 free(vu_group); 4800 } 4801 4802 static void 4803 
_vfio_user_qpair_disconnect(void *ctx) 4804 { 4805 struct nvmf_vfio_user_sq *sq = ctx; 4806 4807 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4808 } 4809 4810 /* The function is used when socket connection is destroyed */ 4811 static int 4812 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4813 { 4814 struct nvmf_vfio_user_sq *sq; 4815 struct nvmf_vfio_user_endpoint *endpoint; 4816 4817 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4818 4819 endpoint = ctrlr->endpoint; 4820 assert(endpoint != NULL); 4821 4822 pthread_mutex_lock(&endpoint->lock); 4823 endpoint->need_relisten = true; 4824 ctrlr->disconnect = true; 4825 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4826 endpoint->ctrlr = NULL; 4827 free_ctrlr(ctrlr); 4828 pthread_mutex_unlock(&endpoint->lock); 4829 return 0; 4830 } 4831 4832 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4833 /* add another round thread poll to avoid recursive endpoint lock */ 4834 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4835 } 4836 pthread_mutex_unlock(&endpoint->lock); 4837 4838 return 0; 4839 } 4840 4841 /* 4842 * Poll for and process any incoming vfio-user messages. 4843 */ 4844 static int 4845 vfio_user_poll_vfu_ctx(void *ctx) 4846 { 4847 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4848 int ret; 4849 4850 assert(ctrlr != NULL); 4851 4852 /* This will call access_bar0_fn() if there are any writes 4853 * to the portion of the BAR that is not mmap'd */ 4854 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4855 if (spdk_unlikely(ret == -1)) { 4856 if (errno == EBUSY) { 4857 return SPDK_POLLER_IDLE; 4858 } 4859 4860 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4861 4862 /* 4863 * We lost the client; the reset callback will already have 4864 * unregistered the interrupt. 
4865 */ 4866 if (errno == ENOTCONN) { 4867 vfio_user_destroy_ctrlr(ctrlr); 4868 return SPDK_POLLER_BUSY; 4869 } 4870 4871 /* 4872 * We might not have got a reset callback in this case, so 4873 * explicitly unregister the interrupt here. 4874 */ 4875 spdk_interrupt_unregister(&ctrlr->intr); 4876 ctrlr->intr_fd = -1; 4877 fail_ctrlr(ctrlr); 4878 } 4879 4880 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4881 } 4882 4883 struct vfio_user_post_cpl_ctx { 4884 struct nvmf_vfio_user_ctrlr *ctrlr; 4885 struct nvmf_vfio_user_cq *cq; 4886 struct spdk_nvme_cpl cpl; 4887 }; 4888 4889 static void 4890 _post_completion_msg(void *ctx) 4891 { 4892 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4893 4894 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4895 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4896 free(cpl_ctx); 4897 } 4898 4899 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4900 4901 static int 4902 vfio_user_poll_group_process(void *ctx) 4903 { 4904 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4905 int ret = 0; 4906 4907 SPDK_DEBUGLOG(vfio_user_db, "pg:%p got intr\n", vu_group); 4908 4909 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4910 4911 /* 4912 * Re-arm the event indexes. NB: this also could rearm other 4913 * controller's SQs. 4914 */ 4915 ret |= vfio_user_poll_group_rearm(vu_group); 4916 4917 vu_group->stats.pg_process_count++; 4918 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4919 } 4920 4921 static int 4922 vfio_user_poll_group_intr(void *ctx) 4923 { 4924 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4925 eventfd_t val; 4926 4927 eventfd_read(vu_group->intr_fd, &val); 4928 4929 vu_group->stats.intr++; 4930 4931 return vfio_user_poll_group_process(ctx); 4932 } 4933 4934 /* 4935 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4936 * the SQs assigned to our own poll group. 
Other poll groups are handled via 4937 * vfio_user_poll_group_intr(). 4938 */ 4939 static int 4940 vfio_user_ctrlr_intr(void *ctx) 4941 { 4942 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 4943 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 4944 struct nvmf_vfio_user_poll_group *vu_group; 4945 int ret = SPDK_POLLER_IDLE; 4946 4947 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 4948 4949 SPDK_DEBUGLOG(vfio_user_db, "ctrlr pg:%p got intr\n", vu_ctrlr_group); 4950 4951 vu_ctrlr_group->stats.ctrlr_intr++; 4952 4953 /* 4954 * Poll vfio-user for this controller. We need to do this before polling 4955 * any SQs, as this is where doorbell writes may be handled. 4956 */ 4957 ret = vfio_user_poll_vfu_ctx(vu_ctrlr); 4958 4959 /* 4960 * `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, 4961 * just return for this case. 4962 */ 4963 if (vu_ctrlr->sqs[0] == NULL) { 4964 return ret; 4965 } 4966 4967 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) { 4968 /* 4969 * We may have just written to a doorbell owned by another 4970 * reactor: we need to prod them to make sure its SQs are polled 4971 * *after* the doorbell value is updated. 4972 */ 4973 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) { 4974 if (vu_group != vu_ctrlr_group) { 4975 SPDK_DEBUGLOG(vfio_user_db, "prodding pg:%p\n", vu_group); 4976 eventfd_write(vu_group->intr_fd, 1); 4977 } 4978 } 4979 } 4980 4981 ret |= vfio_user_poll_group_process(vu_ctrlr_group); 4982 4983 return ret; 4984 } 4985 4986 static void 4987 vfio_user_ctrlr_set_intr_mode(struct spdk_poller *poller, void *ctx, 4988 bool interrupt_mode) 4989 { 4990 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4991 assert(ctrlr != NULL); 4992 assert(ctrlr->endpoint != NULL); 4993 4994 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 4995 ctrlr_id(ctrlr), interrupt_mode); 4996 4997 /* 4998 * interrupt_mode needs to persist across controller resets, so store 4999 * it in the endpoint instead. 
5000 */ 5001 ctrlr->endpoint->interrupt_mode = interrupt_mode; 5002 5003 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 5004 } 5005 5006 /* 5007 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 5008 * set up and we can start operating on this controller. 5009 */ 5010 static void 5011 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 5012 struct spdk_nvmf_ctrlr *ctrlr) 5013 { 5014 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 5015 5016 vu_ctrlr->ctrlr = ctrlr; 5017 vu_ctrlr->cntlid = ctrlr->cntlid; 5018 vu_ctrlr->thread = spdk_get_thread(); 5019 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 5020 5021 if (!in_interrupt_mode(endpoint->transport)) { 5022 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5023 vu_ctrlr, 1000); 5024 return; 5025 } 5026 5027 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5028 vu_ctrlr, 0); 5029 5030 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 5031 assert(vu_ctrlr->intr_fd != -1); 5032 5033 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 5034 vfio_user_ctrlr_intr, vu_ctrlr); 5035 5036 assert(vu_ctrlr->intr != NULL); 5037 5038 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 5039 vfio_user_ctrlr_set_intr_mode, 5040 vu_ctrlr); 5041 } 5042 5043 static int 5044 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 5045 { 5046 struct nvmf_vfio_user_poll_group *vu_group; 5047 struct nvmf_vfio_user_sq *sq = cb_arg; 5048 struct nvmf_vfio_user_cq *admin_cq; 5049 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5050 struct nvmf_vfio_user_endpoint *endpoint; 5051 5052 assert(sq != NULL); 5053 assert(req != NULL); 5054 5055 vu_ctrlr = sq->ctrlr; 5056 assert(vu_ctrlr != NULL); 5057 endpoint = vu_ctrlr->endpoint; 5058 assert(endpoint != NULL); 5059 5060 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 5061 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, 
req->req.rsp->nvme_cpl.status.sct); 5062 endpoint->ctrlr = NULL; 5063 free_ctrlr(vu_ctrlr); 5064 return -1; 5065 } 5066 5067 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 5068 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 5069 5070 admin_cq = vu_ctrlr->cqs[0]; 5071 assert(admin_cq != NULL); 5072 assert(admin_cq->group != NULL); 5073 assert(admin_cq->group->group->thread != NULL); 5074 5075 pthread_mutex_lock(&endpoint->lock); 5076 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 5077 assert(admin_cq->group->group->thread == spdk_get_thread()); 5078 /* 5079 * The admin queue is special as SQ0 and CQ0 are created 5080 * together. 5081 */ 5082 admin_cq->cq_ref = 1; 5083 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 5084 } else { 5085 /* For I/O queues this command was generated in response to an 5086 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 5087 * been completed. Complete it now. 5088 */ 5089 if (sq->post_create_io_sq_completion) { 5090 if (admin_cq->group->group->thread != spdk_get_thread()) { 5091 struct vfio_user_post_cpl_ctx *cpl_ctx; 5092 5093 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 5094 if (!cpl_ctx) { 5095 return -ENOMEM; 5096 } 5097 cpl_ctx->ctrlr = vu_ctrlr; 5098 cpl_ctx->cq = admin_cq; 5099 cpl_ctx->cpl.sqid = 0; 5100 cpl_ctx->cpl.cdw0 = 0; 5101 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 5102 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 5103 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5104 5105 spdk_thread_send_msg(admin_cq->group->group->thread, 5106 _post_completion_msg, 5107 cpl_ctx); 5108 } else { 5109 post_completion(vu_ctrlr, admin_cq, 0, 0, 5110 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 5111 } 5112 sq->post_create_io_sq_completion = false; 5113 } else if (in_interrupt_mode(endpoint->transport)) { 5114 /* 5115 * If we're live migrating a guest, there is a window 5116 * where the I/O queues haven't been set up but the 5117 * device is in running state, during which the 
guest 5118 * might write to a doorbell. This doorbell write will 5119 * go unnoticed, so let's poll the whole controller to 5120 * pick that up. 5121 */ 5122 ctrlr_kick(vu_ctrlr); 5123 } 5124 sq->sq_state = VFIO_USER_SQ_ACTIVE; 5125 } 5126 5127 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 5128 pthread_mutex_unlock(&endpoint->lock); 5129 5130 free(req->req.iov[0].iov_base); 5131 req->req.iov[0].iov_base = NULL; 5132 req->req.iovcnt = 0; 5133 req->req.data = NULL; 5134 5135 return 0; 5136 } 5137 5138 /* 5139 * Add the given qpair to the given poll group. New qpairs are added via 5140 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 5141 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 5142 * nvmf_transport_poll_group_add(). 5143 */ 5144 static int 5145 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 5146 struct spdk_nvmf_qpair *qpair) 5147 { 5148 struct nvmf_vfio_user_sq *sq; 5149 struct nvmf_vfio_user_req *vu_req; 5150 struct nvmf_vfio_user_ctrlr *ctrlr; 5151 struct spdk_nvmf_request *req; 5152 struct spdk_nvmf_fabric_connect_data *data; 5153 bool admin; 5154 5155 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5156 sq->group = group; 5157 ctrlr = sq->ctrlr; 5158 5159 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 5160 ctrlr_id(ctrlr), sq->qpair.qid, 5161 sq, qpair, group); 5162 5163 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 5164 5165 vu_req = get_nvmf_vfio_user_req(sq); 5166 if (vu_req == NULL) { 5167 return -1; 5168 } 5169 5170 req = &vu_req->req; 5171 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 5172 req->cmd->connect_cmd.cid = 0; 5173 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 5174 req->cmd->connect_cmd.recfmt = 0; 5175 req->cmd->connect_cmd.sqsize = sq->size - 1; 5176 req->cmd->connect_cmd.qid = admin ? 
0 : qpair->qid; 5177 5178 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 5179 5180 data = calloc(1, req->length); 5181 if (data == NULL) { 5182 nvmf_vfio_user_req_free(req); 5183 return -ENOMEM; 5184 } 5185 5186 spdk_iov_one(req->iov, &req->iovcnt, data, req->length); 5187 req->data = data; 5188 5189 data->cntlid = ctrlr->cntlid; 5190 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5191 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5192 5193 vu_req->cb_fn = handle_queue_connect_rsp; 5194 vu_req->cb_arg = sq; 5195 5196 SPDK_DEBUGLOG(nvmf_vfio, 5197 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5198 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5199 5200 spdk_nvmf_request_exec_fabrics(req); 5201 return 0; 5202 } 5203 5204 static int 5205 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5206 struct spdk_nvmf_qpair *qpair) 5207 { 5208 struct nvmf_vfio_user_sq *sq; 5209 struct nvmf_vfio_user_poll_group *vu_group; 5210 5211 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5212 5213 SPDK_DEBUGLOG(nvmf_vfio, 5214 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5215 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5216 5217 5218 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5219 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5220 5221 return 0; 5222 } 5223 5224 static void 5225 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5226 { 5227 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5228 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5229 vu_req->iovcnt = 0; 5230 vu_req->req.iovcnt = 0; 5231 vu_req->req.data = NULL; 5232 vu_req->req.length = 0; 5233 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5234 5235 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5236 } 5237 5238 static int 5239 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5240 { 5241 struct nvmf_vfio_user_sq *sq; 5242 struct nvmf_vfio_user_req *vu_req; 
5243 5244 assert(req != NULL); 5245 5246 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5247 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5248 5249 _nvmf_vfio_user_req_free(sq, vu_req); 5250 5251 return 0; 5252 } 5253 5254 static int 5255 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5256 { 5257 struct nvmf_vfio_user_sq *sq; 5258 struct nvmf_vfio_user_req *vu_req; 5259 5260 assert(req != NULL); 5261 5262 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5263 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5264 5265 if (vu_req->cb_fn != NULL) { 5266 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5267 fail_ctrlr(sq->ctrlr); 5268 } 5269 } 5270 5271 _nvmf_vfio_user_req_free(sq, vu_req); 5272 5273 return 0; 5274 } 5275 5276 static void 5277 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5278 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5279 { 5280 struct nvmf_vfio_user_sq *sq; 5281 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5282 struct nvmf_vfio_user_endpoint *endpoint; 5283 5284 assert(qpair != NULL); 5285 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5286 vu_ctrlr = sq->ctrlr; 5287 endpoint = vu_ctrlr->endpoint; 5288 5289 pthread_mutex_lock(&endpoint->lock); 5290 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5291 delete_sq_done(vu_ctrlr, sq); 5292 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5293 endpoint->ctrlr = NULL; 5294 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5295 /* The controller will be freed, we can resume the subsystem 5296 * now so that the endpoint can be ready to accept another 5297 * new connection. 
5298 */ 5299 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5300 vfio_user_endpoint_resume_done, endpoint); 5301 } 5302 free_ctrlr(vu_ctrlr); 5303 } 5304 pthread_mutex_unlock(&endpoint->lock); 5305 5306 if (cb_fn) { 5307 cb_fn(cb_arg); 5308 } 5309 } 5310 5311 /** 5312 * Returns a preallocated request, or NULL if there isn't one available. 5313 */ 5314 static struct nvmf_vfio_user_req * 5315 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5316 { 5317 struct nvmf_vfio_user_req *req; 5318 5319 if (sq == NULL) { 5320 return NULL; 5321 } 5322 5323 req = TAILQ_FIRST(&sq->free_reqs); 5324 if (req == NULL) { 5325 return NULL; 5326 } 5327 5328 TAILQ_REMOVE(&sq->free_reqs, req, link); 5329 5330 return req; 5331 } 5332 5333 static int 5334 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5335 { 5336 uint16_t nr; 5337 uint32_t nlb, nsid; 5338 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5339 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5340 struct spdk_nvmf_ns *ns; 5341 5342 nsid = cmd->nsid; 5343 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5344 if (ns == NULL || ns->bdev == NULL) { 5345 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5346 return -EINVAL; 5347 } 5348 5349 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5350 nr = cmd->cdw10_bits.dsm.nr + 1; 5351 return nr * sizeof(struct spdk_nvme_dsm_range); 5352 } 5353 5354 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5355 return nlb * spdk_bdev_get_block_size(ns->bdev); 5356 } 5357 5358 static int 5359 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5360 { 5361 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5362 uint32_t len = 0, numdw = 0; 5363 uint8_t fid; 5364 int iovcnt; 5365 5366 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5367 5368 if (req->xfer == SPDK_NVME_DATA_NONE) { 5369 return 0; 5370 } 5371 5372 switch (cmd->opc) { 5373 case SPDK_NVME_OPC_IDENTIFY: 5374 len = 4096; 5375 break; 5376 case 
SPDK_NVME_OPC_GET_LOG_PAGE: 5377 numdw = ((((uint32_t)cmd->cdw11_bits.get_log_page.numdu << 16) | 5378 cmd->cdw10_bits.get_log_page.numdl) + 1); 5379 if (numdw > UINT32_MAX / 4) { 5380 return -EINVAL; 5381 } 5382 len = numdw * 4; 5383 break; 5384 case SPDK_NVME_OPC_GET_FEATURES: 5385 case SPDK_NVME_OPC_SET_FEATURES: 5386 fid = cmd->cdw10_bits.set_features.fid; 5387 switch (fid) { 5388 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5389 len = 4096; 5390 break; 5391 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5392 len = 256; 5393 break; 5394 case SPDK_NVME_FEAT_TIMESTAMP: 5395 len = 8; 5396 break; 5397 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5398 len = 512; 5399 break; 5400 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5401 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5402 len = 16; 5403 } else { 5404 len = 8; 5405 } 5406 break; 5407 default: 5408 return 0; 5409 } 5410 break; 5411 default: 5412 return 0; 5413 } 5414 5415 /* ADMIN command will not use SGL */ 5416 if (cmd->psdt != 0) { 5417 return -EINVAL; 5418 } 5419 5420 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5421 if (iovcnt < 0) { 5422 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5423 ctrlr_id(ctrlr), cmd->opc); 5424 return -1; 5425 } 5426 req->length = len; 5427 req->data = req->iov[0].iov_base; 5428 req->iovcnt = iovcnt; 5429 5430 return 0; 5431 } 5432 5433 /* 5434 * Map an I/O command's buffers. 5435 * 5436 * Returns 0 on success and -errno on failure. 
5437 */ 5438 static int 5439 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5440 { 5441 int len, iovcnt; 5442 struct spdk_nvme_cmd *cmd; 5443 5444 assert(ctrlr != NULL); 5445 assert(req != NULL); 5446 5447 cmd = &req->cmd->nvme_cmd; 5448 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5449 5450 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5451 return 0; 5452 } 5453 5454 len = get_nvmf_io_req_length(req); 5455 if (len < 0) { 5456 return -EINVAL; 5457 } 5458 req->length = len; 5459 5460 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5461 if (iovcnt < 0) { 5462 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5463 return -EFAULT; 5464 } 5465 req->data = req->iov[0].iov_base; 5466 req->iovcnt = iovcnt; 5467 5468 return 0; 5469 } 5470 5471 static int 5472 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5473 struct nvmf_vfio_user_sq *sq) 5474 { 5475 int err; 5476 struct nvmf_vfio_user_req *vu_req; 5477 struct spdk_nvmf_request *req; 5478 5479 assert(ctrlr != NULL); 5480 assert(cmd != NULL); 5481 5482 vu_req = get_nvmf_vfio_user_req(sq); 5483 if (spdk_unlikely(vu_req == NULL)) { 5484 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5485 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5486 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5487 5488 } 5489 req = &vu_req->req; 5490 5491 assert(req->qpair != NULL); 5492 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5493 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5494 5495 vu_req->cb_fn = handle_cmd_rsp; 5496 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5497 req->cmd->nvme_cmd = *cmd; 5498 5499 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5500 err = map_admin_cmd_req(ctrlr, req); 5501 } else { 5502 switch (cmd->opc) { 5503 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5504 case 
SPDK_NVME_OPC_RESERVATION_REPORT: 5505 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5506 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5507 err = -ENOTSUP; 5508 break; 5509 default: 5510 err = map_io_cmd_req(ctrlr, req); 5511 break; 5512 } 5513 } 5514 5515 if (spdk_unlikely(err < 0)) { 5516 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5517 ctrlr_id(ctrlr), cmd->opc); 5518 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5519 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5520 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5521 _nvmf_vfio_user_req_free(sq, vu_req); 5522 return err; 5523 } 5524 5525 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5526 spdk_nvmf_request_exec(req); 5527 5528 return 0; 5529 } 5530 5531 /* 5532 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5533 * here: if the host isn't up to date, and is apparently not actively processing 5534 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5535 */ 5536 static void 5537 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5538 struct nvmf_vfio_user_sq *sq) 5539 { 5540 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5541 uint32_t cq_head; 5542 uint32_t cq_tail; 5543 5544 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5545 return; 5546 } 5547 5548 cq_tail = *cq_tailp(cq); 5549 5550 /* Already sent? */ 5551 if (cq_tail == cq->last_trigger_irq_tail) { 5552 return; 5553 } 5554 5555 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5556 cq_head = *cq_dbl_headp(cq); 5557 5558 if (cq_head != cq_tail && cq_head == cq->last_head) { 5559 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5560 if (err != 0) { 5561 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5562 ctrlr_id(ctrlr)); 5563 } else { 5564 cq->last_trigger_irq_tail = cq_tail; 5565 } 5566 } 5567 5568 cq->last_head = cq_head; 5569 } 5570 5571 /* Returns the number of commands processed, or a negative value on error. 
*/
static int
nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_ctrlr *vu_ctrlr;
	uint32_t tail;
	int nr_cmds = 0;

	assert(sq != NULL);

	vu_ctrlr = sq->ctrlr;

	/* Never pick up new commands while the controller is quiesced or migrating. */
	if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
		return SPDK_POLLER_IDLE;
	}

	if (vu_ctrlr->adaptive_irqs_enabled) {
		handle_suppressed_irq(vu_ctrlr, sq);
	}

	/*
	 * On aarch64 the guest maps the doorbells as device memory while the
	 * SPDK target maps them as normal memory; with such mismatched memory
	 * types a doorbell update from the guest may not become visible here,
	 * and spdk_mb() alone does not help. See
	 * "https://developer.arm.com/documentation/102376/0100/
	 * Memory-aliasing-and-mismatched-memory-types". Invalidating the cache
	 * line ("dc civac") before reading may solve this.
	 */
	spdk_ivdt_dcache(sq_dbl_tailp(sq));

	/* Load-Acquire of the tail doorbell; only the low 16 bits are used. */
	tail = *sq_dbl_tailp(sq) & 0xffffu;

	if (spdk_unlikely(tail >= sq->size)) {
		union spdk_nvme_async_event_completion event = {};

		SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(vu_ctrlr), sq->qid,
			      tail);
		event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_ERROR;
		event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE;
		nvmf_ctrlr_async_event_error_event(vu_ctrlr->ctrlr, event);

		return -1;
	}

	/* Head already equals the tail: the queue is empty. */
	if (*sq_headp(sq) == tail) {
		return 0;
	}

	SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n",
		      ctrlr_id(vu_ctrlr), sq->qid, *sq_headp(sq), tail);
	if (vu_ctrlr->sdbl != NULL) {
		SPDK_DEBUGLOG(nvmf_vfio,
			      "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n",
			      ctrlr_id(vu_ctrlr), sq->qid,
			      vu_ctrlr->bar0_doorbells[queue_index(sq->qid, false)],
			      vu_ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)],
			      vu_ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]);
	}

	/*
	 * Pairs with the host's Store-Release: the driver is expected to fill
	 * in the queue entries, issue a wmb(), and only then write the SQ tail
	 * doorbell, so a read barrier here makes those entries visible to us.
	 */
	spdk_rmb();

	nr_cmds = handle_sq_tdbl_write(vu_ctrlr, tail, sq);
	if (spdk_unlikely(nr_cmds < 0)) {
		fail_ctrlr(vu_ctrlr);
	}

	return nr_cmds;
}

/*
 * vfio-user transport poll handler. Note that the library context is polled in
 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
 * active SQs.
 *
 * Returns the number of commands processed, or a negative value on error.
5659 */ 5660 static int 5661 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 5662 { 5663 struct nvmf_vfio_user_poll_group *vu_group; 5664 struct nvmf_vfio_user_sq *sq, *tmp; 5665 int count = 0; 5666 5667 assert(group != NULL); 5668 5669 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5670 5671 SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n"); 5672 5673 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) { 5674 int ret; 5675 5676 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 5677 continue; 5678 } 5679 5680 ret = nvmf_vfio_user_sq_poll(sq); 5681 5682 if (spdk_unlikely(ret < 0)) { 5683 return ret; 5684 } 5685 5686 count += ret; 5687 } 5688 5689 vu_group->stats.polls++; 5690 vu_group->stats.poll_reqs += count; 5691 vu_group->stats.poll_reqs_squared += count * count; 5692 if (count == 0) { 5693 vu_group->stats.polls_spurious++; 5694 } 5695 5696 return count; 5697 } 5698 5699 static int 5700 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5701 struct spdk_nvme_transport_id *trid) 5702 { 5703 struct nvmf_vfio_user_sq *sq; 5704 struct nvmf_vfio_user_ctrlr *ctrlr; 5705 5706 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5707 ctrlr = sq->ctrlr; 5708 5709 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5710 return 0; 5711 } 5712 5713 static int 5714 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 5715 struct spdk_nvme_transport_id *trid) 5716 { 5717 return 0; 5718 } 5719 5720 static int 5721 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5722 struct spdk_nvme_transport_id *trid) 5723 { 5724 struct nvmf_vfio_user_sq *sq; 5725 struct nvmf_vfio_user_ctrlr *ctrlr; 5726 5727 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5728 ctrlr = sq->ctrlr; 5729 5730 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5731 return 0; 5732 } 5733 5734 static void 5735 nvmf_vfio_user_qpair_abort_request(struct 
spdk_nvmf_qpair *qpair, 5736 struct spdk_nvmf_request *req) 5737 { 5738 struct spdk_nvmf_request *req_to_abort = NULL; 5739 struct spdk_nvmf_request *temp_req = NULL; 5740 uint16_t cid; 5741 5742 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5743 5744 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) { 5745 struct nvmf_vfio_user_req *vu_req; 5746 5747 vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req); 5748 5749 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 5750 req_to_abort = temp_req; 5751 break; 5752 } 5753 } 5754 5755 if (req_to_abort == NULL) { 5756 spdk_nvmf_request_complete(req); 5757 return; 5758 } 5759 5760 req->req_to_abort = req_to_abort; 5761 nvmf_ctrlr_abort_request(req); 5762 } 5763 5764 static void 5765 nvmf_vfio_user_poll_group_dump_stat(struct spdk_nvmf_transport_poll_group *group, 5766 struct spdk_json_write_ctx *w) 5767 { 5768 struct nvmf_vfio_user_poll_group *vu_group = SPDK_CONTAINEROF(group, 5769 struct nvmf_vfio_user_poll_group, group); 5770 uint64_t polls_denom; 5771 5772 spdk_json_write_named_uint64(w, "ctrlr_intr", vu_group->stats.ctrlr_intr); 5773 spdk_json_write_named_uint64(w, "ctrlr_kicks", vu_group->stats.ctrlr_kicks); 5774 spdk_json_write_named_uint64(w, "won", vu_group->stats.won); 5775 spdk_json_write_named_uint64(w, "lost", vu_group->stats.lost); 5776 spdk_json_write_named_uint64(w, "lost_count", vu_group->stats.lost_count); 5777 spdk_json_write_named_uint64(w, "rearms", vu_group->stats.rearms); 5778 spdk_json_write_named_uint64(w, "pg_process_count", vu_group->stats.pg_process_count); 5779 spdk_json_write_named_uint64(w, "intr", vu_group->stats.intr); 5780 spdk_json_write_named_uint64(w, "polls", vu_group->stats.polls); 5781 spdk_json_write_named_uint64(w, "polls_spurious", vu_group->stats.polls_spurious); 5782 spdk_json_write_named_uint64(w, "poll_reqs", vu_group->stats.poll_reqs); 5783 polls_denom = vu_group->stats.polls * (vu_group->stats.polls - 1); 5784 if 
(polls_denom) { 5785 uint64_t n = vu_group->stats.polls * vu_group->stats.poll_reqs_squared - vu_group->stats.poll_reqs * 5786 vu_group->stats.poll_reqs; 5787 spdk_json_write_named_double(w, "poll_reqs_variance", sqrt(n / polls_denom)); 5788 } 5789 5790 spdk_json_write_named_uint64(w, "cqh_admin_writes", vu_group->stats.cqh_admin_writes); 5791 spdk_json_write_named_uint64(w, "cqh_io_writes", vu_group->stats.cqh_io_writes); 5792 } 5793 5794 static void 5795 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 5796 { 5797 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 5798 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 5799 opts->in_capsule_data_size = 0; 5800 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 5801 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 5802 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 5803 opts->num_shared_buffers = 0; 5804 opts->buf_cache_size = 0; 5805 opts->association_timeout = 0; 5806 opts->transport_specific = NULL; 5807 } 5808 5809 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 5810 .name = "VFIOUSER", 5811 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 5812 .opts_init = nvmf_vfio_user_opts_init, 5813 .create = nvmf_vfio_user_create, 5814 .destroy = nvmf_vfio_user_destroy, 5815 5816 .listen = nvmf_vfio_user_listen, 5817 .stop_listen = nvmf_vfio_user_stop_listen, 5818 .cdata_init = nvmf_vfio_user_cdata_init, 5819 .listen_associate = nvmf_vfio_user_listen_associate, 5820 5821 .listener_discover = nvmf_vfio_user_discover, 5822 5823 .poll_group_create = nvmf_vfio_user_poll_group_create, 5824 .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group, 5825 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 5826 .poll_group_add = nvmf_vfio_user_poll_group_add, 5827 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 5828 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 5829 5830 .req_free = nvmf_vfio_user_req_free, 5831 
.req_complete = nvmf_vfio_user_req_complete, 5832 5833 .qpair_fini = nvmf_vfio_user_close_qpair, 5834 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 5835 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 5836 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 5837 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 5838 5839 .poll_group_dump_stat = nvmf_vfio_user_poll_group_dump_stat, 5840 }; 5841 5842 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 5843 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 5844 SPDK_LOG_REGISTER_COMPONENT(vfio_user_db) 5845