/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2020 Intel Corporation.
 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved.
 * Copyright (c) 2022, 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

/*
 * NVMe over vfio-user transport
 */

#include <sys/param.h>

#include <vfio-user/libvfio-user.h>
#include <vfio-user/pci_defs.h>

#include "spdk/barrier.h"
#include "spdk/stdinc.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf_transport.h"
#include "spdk/sock.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"

#include "transport.h"

#include "nvmf_internal.h"

#define SWAP(x, y) \
	do \
	{ \
		typeof(x) _tmp = x; \
		x = y; \
		y = _tmp; \
	} while (0)

#define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
#define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
#define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
#define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE

#define NVME_DOORBELLS_OFFSET 0x1000
#define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2
#define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3
#define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX

#define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 512
#define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4)

/* NVMe spec 1.4, section 5.21.1.7 */
SPDK_STATIC_ASSERT(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR >= 2 &&
		   NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR <= SPDK_NVME_MAX_IO_QUEUES,
		   "bad number of queues");

/*
 * The NVMe driver reads 4096 bytes, which is the extended PCI configuration
 * space available on PCI-X 2.0 and PCI Express buses.
 */
#define NVME_REG_CFG_SIZE 0x1000

/*
 * Doorbells must be page aligned so that they can be memory mapped.
 *
 * TODO does the NVMe spec also require this? Document it.
 */
#define NVMF_VFIO_USER_DOORBELLS_SIZE \
	SPDK_ALIGN_CEIL( \
		(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2 * SPDK_NVME_DOORBELL_REGISTER_SIZE), \
		0x1000)
#define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE)

/*
 * TODO check the PCI spec whether BAR4 and BAR5 really have to be at least one
 * page and a multiple of page size (maybe QEMU also needs this?). Document all
 * this.
 */

/*
 * MSI-X Pending Bit Array Size
 *
 * TODO according to the PCI spec we need one bit per vector, document the
 * relevant section.
 *
 * If the first argument to SPDK_ALIGN_CEIL is 0 then the result is 0, so we
 * would end up with a 0-size BAR5.
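 *
 * Worked example (illustrative only) with the current
 * NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR of 512: the PBA needs 512 bits
 * (64 bytes), so BAR5 below rounds up to a single 0x1000-byte page, while the
 * MSI-X table needs 512 * 16 = 8192 bytes, so BAR4 ends up as two pages.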
 */
#define NVME_IRQ_MSIX_NUM MAX(CHAR_BIT, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR)
#define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / CHAR_BIT), 0x1000)
SPDK_STATIC_ASSERT(NVME_BAR5_SIZE > 0, "Incorrect size");

/* MSI-X Table Size */
#define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000)
SPDK_STATIC_ASSERT(NVME_BAR4_SIZE > 0, "Incorrect size");

struct nvmf_vfio_user_req;

typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);

/* 1 more for the PRP2 list itself */
#define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1)

enum nvmf_vfio_user_req_state {
	VFIO_USER_REQUEST_STATE_FREE = 0,
	VFIO_USER_REQUEST_STATE_EXECUTING,
};

/*
 * Support for live migration in NVMf/vfio-user: live migration is implemented
 * by stopping the NVMf subsystem when the device is instructed to enter the
 * stop-and-copy state, and then trivially, and most importantly safely,
 * collecting migration state and providing it to the vfio-user client. We
 * don't provide any migration state during the pre-copy state, as that's too
 * complicated to do; we might support it in the future.
 */


/* NVMe device state representation */
struct nvme_migr_sq_state {
	uint16_t sqid;
	uint16_t cqid;
	uint32_t head;
	uint32_t size;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size");

struct nvme_migr_cq_state {
	uint16_t cqid;
	uint16_t phase;
	uint32_t tail;
	uint32_t size;
	uint32_t iv;
	uint32_t ien;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size");

#define VFIO_USER_MIGR_CALLBACK_VERS 1
#define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23

/* The device state lives in the VFIO MIGRATION BAR(9) region; keep the device
 * state page aligned.
 *
 * The NVMe device migration region is laid out as follows:
 * -------------------------------------------------------------------------
 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs |
 * -------------------------------------------------------------------------
 *
 * Keep vfio_user_nvme_migr_header at a fixed 0x1000 length; newly added fields
 * can use the reserved space at the end of the data structure.
 */
struct vfio_user_nvme_migr_header {
	/* Magic value to validate migration data */
	uint32_t magic;
	/* Version used to check that the data format matches on source and destination */
	uint32_t version;

	/* The library uses this field to know how many fields in this
	 * structure are valid, starting at the beginning of this data
	 * structure. Fields added in the future should use the `unused`
	 * space.
	 */
	uint32_t opts_size;
	uint32_t reserved0;

	/* BARs information */
	uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS];
	uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS];

	/* Queue pair start offset, starting at the beginning of this
	 * data structure.
	 */
	uint64_t qp_offset;
	uint64_t qp_len;

	/* Controller data structure */
	uint32_t num_io_queues;
	uint32_t reserved1;

	/* NVMf controller data offset and length, if present, starting at
	 * the beginning of this data structure.
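	 *
	 * For example (illustrative of how these offset/length pairs are meant
	 * to be consumed), a reader of the migration region finds the NVMf
	 * controller data at ((uint8_t *)header) + header->nvmf_data_offset,
	 * spanning header->nvmf_data_len bytes.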
	 */
	uint64_t nvmf_data_offset;
	uint64_t nvmf_data_len;

	/*
	 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA
	 * address.
	 */
	uint32_t sdbl;

	/* Shadow doorbell DMA addresses. */
	uint64_t shadow_doorbell_buffer;
	uint64_t eventidx_buffer;

	/* Reserved memory space for newly added fields; this field is always
	 * at the end of this data structure.
	 */
	uint8_t unused[3856];
};
SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");

struct vfio_user_nvme_migr_qp {
	struct nvme_migr_sq_state sq;
	struct nvme_migr_cq_state cq;
};

/* NVMe state definition used to load/restore from/to the NVMe migration BAR region */
struct vfio_user_nvme_migr_state {
	struct vfio_user_nvme_migr_header ctrlr_header;
	struct spdk_nvmf_ctrlr_migr_data nvmf_data;
	struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	uint8_t doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE];
	uint8_t cfg[NVME_REG_CFG_SIZE];
};

struct nvmf_vfio_user_req {
	struct spdk_nvmf_request req;
	struct spdk_nvme_cpl rsp;
	struct spdk_nvme_cmd cmd;

	enum nvmf_vfio_user_req_state state;
	nvmf_vfio_user_req_cb_fn cb_fn;
	void *cb_arg;

	/* old CC before prop_set_cc fabric command */
	union spdk_nvme_cc_register cc;

	TAILQ_ENTRY(nvmf_vfio_user_req) link;

	struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS];
	uint8_t iovcnt;

	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
	uint8_t sg[];
};

#define MAP_R (0)
#define MAP_RW (1 << 0)
#define MAP_INITIALIZE (1 << 1)
#define MAP_QUIET (1 << 2)

/*
 * Mapping of an NVMe queue.
 *
 * This holds the information tracking a local process mapping of an NVMe queue
 * shared by the client.
 */
struct nvme_q_mapping {
	/* iov of local process mapping. */
	struct iovec iov;
	/* Stored sg, needed for unmap. */
	dma_sg_t *sg;
	/* Client PRP of queue. */
	uint64_t prp1;
	/* Total length in bytes. */
	uint64_t len;
};

enum nvmf_vfio_user_sq_state {
	VFIO_USER_SQ_UNUSED = 0,
	VFIO_USER_SQ_CREATED,
	VFIO_USER_SQ_DELETED,
	VFIO_USER_SQ_ACTIVE,
	VFIO_USER_SQ_INACTIVE
};

enum nvmf_vfio_user_cq_state {
	VFIO_USER_CQ_UNUSED = 0,
	VFIO_USER_CQ_CREATED,
	VFIO_USER_CQ_DELETED,
};

enum nvmf_vfio_user_ctrlr_state {
	VFIO_USER_CTRLR_CREATING = 0,
	VFIO_USER_CTRLR_RUNNING,
	/* Quiesce requested by libvfio-user */
	VFIO_USER_CTRLR_PAUSING,
	/* NVMf subsystem is paused; it's safe to do PCI reset, memory register,
	 * memory unregister, and vfio migration state transition in this state.
	 */
	VFIO_USER_CTRLR_PAUSED,
	/*
	 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI
	 * reset, memory register and unregister, controller in destination VM has
	 * been restored). NVMf subsystem resume has been requested.
	 */
	VFIO_USER_CTRLR_RESUMING,
	/*
	 * Implies that the NVMf subsystem is paused. Both the controller in the
	 * source VM and the one in the destination VM are in this state while
	 * doing live migration.
	 */
	VFIO_USER_CTRLR_MIGRATING
};

struct nvmf_vfio_user_sq {
	struct spdk_nvmf_qpair qpair;
	struct spdk_nvmf_transport_poll_group *group;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_sq_state sq_state;

	uint32_t head;
	volatile uint32_t *dbl_tailp;

	/* Whether a shadow doorbell eventidx needs setting. */
	bool need_rearm;

	/* multiple SQs can be mapped to the same CQ */
	uint16_t cqid;

	/* handle_queue_connect_rsp() is used both for the CREATE IO SQ response
	 * and for the SQ re-connect response in the destination VM. In the former
	 * case we post an NVMe completion to the VM; we do not set this flag when
	 * re-connecting SQs in the destination VM.
	 */
	bool post_create_io_sq_completion;
	/* Copy of the Create IO SQ command; this field is used together with the
	 * `post_create_io_sq_completion` flag.
	 */
	struct spdk_nvme_cmd create_io_sq_cmd;

	struct vfio_user_delete_sq_ctx *delete_ctx;

	/* Currently unallocated reqs. */
	TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs;
	/* Poll group entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) link;
	/* Connected SQ entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) tailq;
};

struct nvmf_vfio_user_cq {
	struct spdk_nvmf_transport_poll_group *group;
	int cq_ref;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_cq_state cq_state;

	uint32_t tail;
	volatile uint32_t *dbl_headp;

	bool phase;

	uint16_t iv;
	bool ien;

	uint32_t last_head;
	uint32_t last_trigger_irq_tail;
};

struct nvmf_vfio_user_poll_group {
	struct spdk_nvmf_transport_poll_group group;
	TAILQ_ENTRY(nvmf_vfio_user_poll_group) link;
	TAILQ_HEAD(, nvmf_vfio_user_sq) sqs;
	struct spdk_interrupt *intr;
	int intr_fd;
	struct {

		/*
		 * ctrlr_intr and ctrlr_kicks will be zero for all other poll
		 * groups. However, they can be zero even for the poll group
		 * the controller belongs to if no vfio-user message has been
		 * received or the controller hasn't been kicked yet.
		 */

		/*
		 * Number of times vfio_user_ctrlr_intr() has run:
		 * vfio-user file descriptor has been ready or explicitly
		 * kicked (see below).
		 */
		uint64_t ctrlr_intr;

		/*
		 * Kicks to the controller by ctrlr_kick().
		 * ctrlr_intr - ctrlr_kicks is the number of times the
		 * vfio-user poll file descriptor has been ready.
		 */
		uint64_t ctrlr_kicks;

		/*
		 * How many times we won the race arming an SQ.
		 */
		uint64_t won;

		/*
		 * How many times we lost the race arming an SQ.
		 */
		uint64_t lost;

		/*
		 * How many requests we processed in total each time we lost
		 * the rearm race.
		 */
		uint64_t lost_count;

		/*
		 * Number of times we attempted to rearm all the SQs in the
		 * poll group.
409 */ 410 uint64_t rearms; 411 412 uint64_t pg_process_count; 413 uint64_t intr; 414 uint64_t polls; 415 uint64_t polls_spurious; 416 uint64_t poll_reqs; 417 uint64_t poll_reqs_squared; 418 uint64_t cqh_admin_writes; 419 uint64_t cqh_io_writes; 420 } stats; 421 }; 422 423 struct nvmf_vfio_user_shadow_doorbells { 424 volatile uint32_t *shadow_doorbells; 425 volatile uint32_t *eventidxs; 426 dma_sg_t *sgs; 427 struct iovec *iovs; 428 }; 429 430 struct nvmf_vfio_user_ctrlr { 431 struct nvmf_vfio_user_endpoint *endpoint; 432 struct nvmf_vfio_user_transport *transport; 433 434 /* Connected SQs list */ 435 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 436 enum nvmf_vfio_user_ctrlr_state state; 437 438 /* 439 * Tells whether live migration data have been prepared. This is used 440 * by the get_pending_bytes callback to tell whether or not the 441 * previous iteration finished. 442 */ 443 bool migr_data_prepared; 444 445 /* Controller is in source VM when doing live migration */ 446 bool in_source_vm; 447 448 struct spdk_thread *thread; 449 struct spdk_poller *vfu_ctx_poller; 450 struct spdk_interrupt *intr; 451 int intr_fd; 452 453 bool queued_quiesce; 454 455 bool reset_shn; 456 bool disconnect; 457 458 uint16_t cntlid; 459 struct spdk_nvmf_ctrlr *ctrlr; 460 461 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 462 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 463 464 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 465 466 volatile uint32_t *bar0_doorbells; 467 struct nvmf_vfio_user_shadow_doorbells *sdbl; 468 /* 469 * Shadow doorbells PRPs to provide during the stop-and-copy state. 470 */ 471 uint64_t shadow_doorbell_buffer; 472 uint64_t eventidx_buffer; 473 474 bool adaptive_irqs_enabled; 475 }; 476 477 /* Endpoint in vfio-user is associated with a socket file, which 478 * is the representative of a PCI endpoint. 479 */ 480 struct nvmf_vfio_user_endpoint { 481 struct nvmf_vfio_user_transport *transport; 482 vfu_ctx_t *vfu_ctx; 483 struct spdk_poller *accept_poller; 484 struct spdk_thread *accept_thread; 485 bool interrupt_mode; 486 struct msixcap *msix; 487 vfu_pci_config_space_t *pci_config_space; 488 int devmem_fd; 489 int accept_intr_fd; 490 struct spdk_interrupt *accept_intr; 491 492 volatile uint32_t *bar0_doorbells; 493 494 int migr_fd; 495 void *migr_data; 496 497 struct spdk_nvme_transport_id trid; 498 struct spdk_nvmf_subsystem *subsystem; 499 500 /* Controller is associated with an active socket connection, 501 * the lifecycle of the controller is same as the VM. 502 * Currently we only support one active connection, as the NVMe 503 * specification defines, we may support multiple controllers in 504 * future, so that it can support e.g: RESERVATION. 505 */ 506 struct nvmf_vfio_user_ctrlr *ctrlr; 507 pthread_mutex_t lock; 508 509 bool need_async_destroy; 510 /* The subsystem is in PAUSED state and need to be resumed, TRUE 511 * only when migration is done successfully and the controller is 512 * in source VM. 
513 */ 514 bool need_resume; 515 /* Start the accept poller again after destroying the controller */ 516 bool need_relisten; 517 518 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 519 }; 520 521 struct nvmf_vfio_user_transport_opts { 522 bool disable_mappable_bar0; 523 bool disable_adaptive_irq; 524 bool disable_shadow_doorbells; 525 bool disable_compare; 526 bool enable_intr_mode_sq_spreading; 527 }; 528 529 struct nvmf_vfio_user_transport { 530 struct spdk_nvmf_transport transport; 531 struct nvmf_vfio_user_transport_opts transport_opts; 532 bool intr_mode_supported; 533 pthread_mutex_t lock; 534 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 535 536 pthread_mutex_t pg_lock; 537 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 538 struct nvmf_vfio_user_poll_group *next_pg; 539 }; 540 541 /* 542 * function prototypes 543 */ 544 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 545 546 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 547 548 /* 549 * Local process virtual address of a queue. 550 */ 551 static inline void * 552 q_addr(struct nvme_q_mapping *mapping) 553 { 554 return mapping->iov.iov_base; 555 } 556 557 static inline int 558 queue_index(uint16_t qid, bool is_cq) 559 { 560 return (qid * 2) + is_cq; 561 } 562 563 static inline volatile uint32_t * 564 sq_headp(struct nvmf_vfio_user_sq *sq) 565 { 566 assert(sq != NULL); 567 return &sq->head; 568 } 569 570 static inline volatile uint32_t * 571 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 572 { 573 assert(sq != NULL); 574 return sq->dbl_tailp; 575 } 576 577 static inline volatile uint32_t * 578 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 579 { 580 assert(cq != NULL); 581 return cq->dbl_headp; 582 } 583 584 static inline volatile uint32_t * 585 cq_tailp(struct nvmf_vfio_user_cq *cq) 586 { 587 assert(cq != NULL); 588 return &cq->tail; 589 } 590 591 static inline void 592 sq_head_advance(struct nvmf_vfio_user_sq *sq) 593 { 594 assert(sq != NULL); 595 596 assert(*sq_headp(sq) < sq->size); 597 (*sq_headp(sq))++; 598 599 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 600 *sq_headp(sq) = 0; 601 } 602 } 603 604 static inline void 605 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 606 { 607 assert(cq != NULL); 608 609 assert(*cq_tailp(cq) < cq->size); 610 (*cq_tailp(cq))++; 611 612 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 613 *cq_tailp(cq) = 0; 614 cq->phase = !cq->phase; 615 } 616 } 617 618 static bool 619 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 620 { 621 assert(vu_ctrlr != NULL); 622 623 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 624 return false; 625 } 626 627 if (is_cq) { 628 if (vu_ctrlr->cqs[qid] == NULL) { 629 return false; 630 } 631 632 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 633 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 634 } 635 636 if (vu_ctrlr->sqs[qid] == NULL) { 637 return false; 638 } 639 640 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 641 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 642 } 643 644 static char * 645 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 646 { 647 return endpoint->trid.traddr; 648 } 649 650 static char * 651 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 652 { 653 if (!ctrlr || !ctrlr->endpoint) { 654 return "Null Ctrlr"; 655 } 656 657 return endpoint_id(ctrlr->endpoint); 658 } 659 660 /* Return the poll group for the admin queue of the controller. 
*/ 661 static inline struct nvmf_vfio_user_poll_group * 662 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 663 { 664 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 665 struct nvmf_vfio_user_poll_group, 666 group); 667 } 668 669 static inline struct spdk_thread * 670 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 671 { 672 return vu_pg->group.group->thread; 673 } 674 675 static dma_sg_t * 676 index_to_sg_t(void *arr, size_t i) 677 { 678 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 679 } 680 681 static inline size_t 682 vfio_user_migr_data_len(void) 683 { 684 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 685 } 686 687 static inline bool 688 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 689 { 690 return spdk_interrupt_mode_is_enabled() && 691 vu_transport->intr_mode_supported; 692 } 693 694 static int vfio_user_ctrlr_intr(void *ctx); 695 696 static void 697 vfio_user_msg_ctrlr_intr(void *ctx) 698 { 699 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 700 struct nvmf_vfio_user_poll_group *vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 701 702 vu_ctrlr_group->stats.ctrlr_kicks++; 703 704 vfio_user_ctrlr_intr(ctx); 705 } 706 707 /* 708 * Kick (force a wakeup) of all poll groups for this controller. 709 * vfio_user_ctrlr_intr() itself arranges for kicking other poll groups if 710 * needed. 711 */ 712 static void 713 ctrlr_kick(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 714 { 715 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 716 717 SPDK_DEBUGLOG(vfio_user_db, "%s: kicked\n", ctrlr_id(vu_ctrlr)); 718 719 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 720 721 spdk_thread_send_msg(poll_group_to_thread(vu_ctrlr_group), 722 vfio_user_msg_ctrlr_intr, vu_ctrlr); 723 } 724 725 /* 726 * Make the given DMA address and length available (locally mapped) via iov. 
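 *
 * Returns the local virtual address on success, or NULL on failure. A typical
 * caller pairing, as a minimal illustrative sketch (the real users are
 * unmap_q() and unmap_sdbl() further below; gpa, len and sg stand for the
 * caller's DMA address, length and scatter-gather slot):
 *
 *	struct iovec iov;
 *	void *va = map_one(ctx, gpa, len, sg, &iov, MAP_RW);
 *
 *	if (va != NULL) {
 *		... access [va, va + len) ...
 *		vfu_sgl_put(ctx, sg, &iov, 1);
 *	}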
727 */ 728 static void * 729 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 730 struct iovec *iov, int32_t flags) 731 { 732 int prot = PROT_READ; 733 int ret; 734 735 if (flags & MAP_RW) { 736 prot |= PROT_WRITE; 737 } 738 739 assert(ctx != NULL); 740 assert(sg != NULL); 741 assert(iov != NULL); 742 743 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 744 if (ret < 0) { 745 if (ret == -1) { 746 if (!(flags & MAP_QUIET)) { 747 SPDK_ERRLOG("failed to translate IOVA [%#lx, %#lx) (prot=%d) to local VA: %m\n", 748 addr, addr + len, prot); 749 } 750 } else { 751 SPDK_ERRLOG("failed to translate IOVA [%#lx, %#lx) (prot=%d) to local VA: %d segments needed\n", 752 addr, addr + len, prot, -(ret + 1)); 753 } 754 return NULL; 755 } 756 757 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 758 if (ret != 0) { 759 SPDK_ERRLOG("failed to get iovec for IOVA [%#lx, %#lx): %m\n", 760 addr, addr + len); 761 return NULL; 762 } 763 764 assert(iov->iov_base != NULL); 765 return iov->iov_base; 766 } 767 768 static int 769 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 770 uint32_t max_iovcnt, uint32_t len, size_t mps, 771 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 772 { 773 uint64_t prp1, prp2; 774 void *vva; 775 uint32_t i; 776 uint32_t residue_len, nents; 777 uint64_t *prp_list; 778 uint32_t iovcnt; 779 780 assert(max_iovcnt > 0); 781 782 prp1 = cmd->dptr.prp.prp1; 783 prp2 = cmd->dptr.prp.prp2; 784 785 /* PRP1 may started with unaligned page address */ 786 residue_len = mps - (prp1 % mps); 787 residue_len = spdk_min(len, residue_len); 788 789 vva = gpa_to_vva(prv, prp1, residue_len, MAP_RW); 790 if (spdk_unlikely(vva == NULL)) { 791 SPDK_ERRLOG("GPA to VVA failed\n"); 792 return -EINVAL; 793 } 794 len -= residue_len; 795 if (len && max_iovcnt < 2) { 796 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 797 return -ERANGE; 798 } 799 iovs[0].iov_base = vva; 800 iovs[0].iov_len = residue_len; 801 802 if (len) { 803 if (spdk_unlikely(prp2 == 0)) { 804 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 805 return -EINVAL; 806 } 807 808 if (len <= mps) { 809 /* 2 PRP used */ 810 iovcnt = 2; 811 vva = gpa_to_vva(prv, prp2, len, MAP_RW); 812 if (spdk_unlikely(vva == NULL)) { 813 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 814 prp2, len); 815 return -EINVAL; 816 } 817 iovs[1].iov_base = vva; 818 iovs[1].iov_len = len; 819 } else { 820 /* PRP list used */ 821 nents = (len + mps - 1) / mps; 822 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 823 SPDK_ERRLOG("Too many page entries\n"); 824 return -ERANGE; 825 } 826 827 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), MAP_R); 828 if (spdk_unlikely(vva == NULL)) { 829 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 830 prp2, nents); 831 return -EINVAL; 832 } 833 prp_list = vva; 834 i = 0; 835 while (len != 0) { 836 residue_len = spdk_min(len, mps); 837 vva = gpa_to_vva(prv, prp_list[i], residue_len, MAP_RW); 838 if (spdk_unlikely(vva == NULL)) { 839 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 840 prp_list[i], residue_len); 841 return -EINVAL; 842 } 843 iovs[i + 1].iov_base = vva; 844 iovs[i + 1].iov_len = residue_len; 845 len -= residue_len; 846 i++; 847 } 848 iovcnt = i + 1; 849 } 850 } else { 851 /* 1 PRP used */ 852 iovcnt = 1; 853 } 854 855 assert(iovcnt <= max_iovcnt); 856 return iovcnt; 857 } 858 859 static int 860 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 861 struct iovec *iovs, 
uint32_t max_iovcnt, 862 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 863 { 864 uint32_t i; 865 void *vva; 866 867 if (spdk_unlikely(max_iovcnt < num_sgls)) { 868 return -ERANGE; 869 } 870 871 for (i = 0; i < num_sgls; i++) { 872 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 873 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 874 return -EINVAL; 875 } 876 vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, MAP_RW); 877 if (spdk_unlikely(vva == NULL)) { 878 SPDK_ERRLOG("GPA to VVA failed\n"); 879 return -EINVAL; 880 } 881 iovs[i].iov_base = vva; 882 iovs[i].iov_len = sgls[i].unkeyed.length; 883 } 884 885 return num_sgls; 886 } 887 888 static int 889 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 890 uint32_t len, size_t mps, 891 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 892 { 893 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 894 uint32_t num_sgls, seg_len; 895 void *vva; 896 int ret; 897 uint32_t total_iovcnt = 0; 898 899 /* SGL cases */ 900 sgl = &cmd->dptr.sgl1; 901 902 /* only one SGL segment */ 903 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 904 assert(max_iovcnt > 0); 905 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, MAP_RW); 906 if (spdk_unlikely(vva == NULL)) { 907 SPDK_ERRLOG("GPA to VVA failed\n"); 908 return -EINVAL; 909 } 910 iovs[0].iov_base = vva; 911 iovs[0].iov_len = sgl->unkeyed.length; 912 assert(sgl->unkeyed.length == len); 913 914 return 1; 915 } 916 917 for (;;) { 918 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 919 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 920 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 921 return -EINVAL; 922 } 923 924 seg_len = sgl->unkeyed.length; 925 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 926 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 927 return -EINVAL; 928 } 929 930 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 931 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, MAP_R); 932 if (spdk_unlikely(vva == NULL)) { 933 SPDK_ERRLOG("GPA to VVA failed\n"); 934 return -EINVAL; 935 } 936 937 /* sgl point to the first segment */ 938 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 939 last_sgl = &sgl[num_sgls - 1]; 940 941 /* we are done */ 942 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 943 /* map whole sgl list */ 944 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 945 max_iovcnt - total_iovcnt, gpa_to_vva); 946 if (spdk_unlikely(ret < 0)) { 947 return ret; 948 } 949 total_iovcnt += ret; 950 951 return total_iovcnt; 952 } 953 954 if (num_sgls > 1) { 955 /* map whole sgl exclude last_sgl */ 956 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 957 max_iovcnt - total_iovcnt, gpa_to_vva); 958 if (spdk_unlikely(ret < 0)) { 959 return ret; 960 } 961 total_iovcnt += ret; 962 } 963 964 /* move to next level's segments */ 965 sgl = last_sgl; 966 } 967 968 return 0; 969 } 970 971 static int 972 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 973 uint32_t len, size_t mps, 974 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 975 { 976 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 977 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 978 } 979 980 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, 
len, mps, gpa_to_vva); 981 } 982 983 /* 984 * For each queue, update the location of its doorbell to the correct location: 985 * either our own BAR0, or the guest's configured shadow doorbell area. 986 * 987 * The Admin queue (qid: 0) does not ever use shadow doorbells. 988 */ 989 static void 990 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 991 { 992 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 993 ctrlr->bar0_doorbells; 994 995 assert(doorbells != NULL); 996 997 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 998 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 999 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 1000 1001 if (sq != NULL) { 1002 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 1003 1004 ctrlr->sqs[i]->need_rearm = shadow; 1005 } 1006 1007 if (cq != NULL) { 1008 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 1009 } 1010 } 1011 } 1012 1013 static void 1014 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1015 { 1016 assert(vfu_ctx != NULL); 1017 assert(sdbl != NULL); 1018 1019 /* 1020 * An allocation error would result in only one of the two being 1021 * non-NULL. If that is the case, no memory should have been mapped. 1022 */ 1023 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 1024 return; 1025 } 1026 1027 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 1028 struct iovec *iov; 1029 dma_sg_t *sg; 1030 1031 if (!sdbl->iovs[i].iov_len) { 1032 continue; 1033 } 1034 1035 sg = index_to_sg_t(sdbl->sgs, i); 1036 iov = sdbl->iovs + i; 1037 1038 vfu_sgl_put(vfu_ctx, sg, iov, 1); 1039 } 1040 } 1041 1042 static void 1043 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1044 { 1045 if (sdbl == NULL) { 1046 return; 1047 } 1048 1049 unmap_sdbl(vfu_ctx, sdbl); 1050 1051 /* 1052 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 1053 * not allocated, so don't free() them. 1054 */ 1055 free(sdbl->sgs); 1056 free(sdbl->iovs); 1057 free(sdbl); 1058 } 1059 1060 static struct nvmf_vfio_user_shadow_doorbells * 1061 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 1062 { 1063 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 1064 dma_sg_t *sg2 = NULL; 1065 void *p; 1066 1067 assert(vfu_ctx != NULL); 1068 1069 sdbl = calloc(1, sizeof(*sdbl)); 1070 if (sdbl == NULL) { 1071 goto err; 1072 } 1073 1074 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 1075 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 1076 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 1077 goto err; 1078 } 1079 1080 /* Map shadow doorbell buffer (PRP1). */ 1081 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, MAP_RW); 1082 1083 if (p == NULL) { 1084 goto err; 1085 } 1086 1087 /* 1088 * Map eventidx buffer (PRP2). 1089 * Should only be written to by the controller. 1090 */ 1091 1092 sg2 = index_to_sg_t(sdbl->sgs, 1); 1093 1094 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, MAP_RW); 1095 1096 if (p == NULL) { 1097 goto err; 1098 } 1099 1100 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1101 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1102 1103 return sdbl; 1104 1105 err: 1106 free_sdbl(vfu_ctx, sdbl); 1107 return NULL; 1108 } 1109 1110 /* 1111 * Copy doorbells from one buffer to the other, during switches between BAR0 1112 * doorbells and shadow doorbells. 
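 *
 * Both buffers are flat arrays of 32-bit doorbell values indexed by
 * queue_index(): for example, assuming the 4-byte doorbell stride used here,
 * the SQ tail doorbell of qid 3 sits at index 6 and the CQ head doorbell of
 * qid 3 at index 7.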
1113 */ 1114 static void 1115 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1116 const volatile uint32_t *from, volatile uint32_t *to) 1117 { 1118 assert(ctrlr != NULL); 1119 assert(from != NULL); 1120 assert(to != NULL); 1121 1122 SPDK_DEBUGLOG(vfio_user_db, 1123 "%s: migrating shadow doorbells from %p to %p\n", 1124 ctrlr_id(ctrlr), from, to); 1125 1126 /* Can't use memcpy because it doesn't respect volatile semantics. */ 1127 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1128 if (ctrlr->sqs[i] != NULL) { 1129 to[queue_index(i, false)] = from[queue_index(i, false)]; 1130 } 1131 1132 if (ctrlr->cqs[i] != NULL) { 1133 to[queue_index(i, true)] = from[queue_index(i, true)]; 1134 } 1135 } 1136 } 1137 1138 static void 1139 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1140 { 1141 const struct spdk_nvmf_registers *regs; 1142 1143 assert(vu_ctrlr != NULL); 1144 assert(vu_ctrlr->ctrlr != NULL); 1145 1146 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1147 if (regs->csts.bits.cfs == 0) { 1148 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1149 } 1150 1151 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1152 } 1153 1154 static inline bool 1155 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1156 { 1157 assert(vu_ctrlr != NULL); 1158 assert(vu_ctrlr->endpoint != NULL); 1159 1160 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1161 1162 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1163 } 1164 1165 static void 1166 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1167 { 1168 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1169 1170 spdk_interrupt_unregister(&endpoint->accept_intr); 1171 spdk_poller_unregister(&endpoint->accept_poller); 1172 1173 if (endpoint->bar0_doorbells) { 1174 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1175 } 1176 1177 if (endpoint->devmem_fd > 0) { 1178 close(endpoint->devmem_fd); 1179 } 1180 1181 if (endpoint->migr_data) { 1182 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1183 } 1184 1185 if (endpoint->migr_fd > 0) { 1186 close(endpoint->migr_fd); 1187 } 1188 1189 if (endpoint->vfu_ctx) { 1190 vfu_destroy_ctx(endpoint->vfu_ctx); 1191 } 1192 1193 pthread_mutex_destroy(&endpoint->lock); 1194 free(endpoint); 1195 } 1196 1197 /* called when process exits */ 1198 static int 1199 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1200 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1201 { 1202 struct nvmf_vfio_user_transport *vu_transport; 1203 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1204 1205 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1206 1207 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1208 transport); 1209 1210 pthread_mutex_destroy(&vu_transport->lock); 1211 pthread_mutex_destroy(&vu_transport->pg_lock); 1212 1213 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1214 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1215 nvmf_vfio_user_destroy_endpoint(endpoint); 1216 } 1217 1218 free(vu_transport); 1219 1220 if (cb_fn) { 1221 cb_fn(cb_arg); 1222 } 1223 1224 return 0; 1225 } 1226 1227 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1228 { 1229 "disable_mappable_bar0", 1230 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1231 spdk_json_decode_bool, true 1232 }, 1233 { 1234 "disable_adaptive_irq", 1235 offsetof(struct 
nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1236 spdk_json_decode_bool, true 1237 }, 1238 { 1239 "disable_shadow_doorbells", 1240 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1241 spdk_json_decode_bool, true 1242 }, 1243 { 1244 "disable_compare", 1245 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1246 spdk_json_decode_bool, true 1247 }, 1248 { 1249 "enable_intr_mode_sq_spreading", 1250 offsetof(struct nvmf_vfio_user_transport, transport_opts.enable_intr_mode_sq_spreading), 1251 spdk_json_decode_bool, true 1252 }, 1253 }; 1254 1255 static struct spdk_nvmf_transport * 1256 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1257 { 1258 struct nvmf_vfio_user_transport *vu_transport; 1259 int err; 1260 1261 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1262 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1263 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1264 return NULL; 1265 } 1266 1267 vu_transport = calloc(1, sizeof(*vu_transport)); 1268 if (vu_transport == NULL) { 1269 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1270 return NULL; 1271 } 1272 1273 err = pthread_mutex_init(&vu_transport->lock, NULL); 1274 if (err != 0) { 1275 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1276 goto err; 1277 } 1278 TAILQ_INIT(&vu_transport->endpoints); 1279 1280 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1281 if (err != 0) { 1282 pthread_mutex_destroy(&vu_transport->lock); 1283 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1284 goto err; 1285 } 1286 TAILQ_INIT(&vu_transport->poll_groups); 1287 1288 if (opts->transport_specific != NULL && 1289 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1290 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1291 vu_transport)) { 1292 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1293 goto cleanup; 1294 } 1295 1296 /* 1297 * To support interrupt mode, the transport must be configured with 1298 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1299 * when a client writes new doorbell values to BAR0, via the 1300 * libvfio-user socket fd. 1301 */ 1302 vu_transport->intr_mode_supported = 1303 vu_transport->transport_opts.disable_mappable_bar0; 1304 1305 /* 1306 * If BAR0 is mappable, it doesn't make sense to support shadow 1307 * doorbells, so explicitly turn it off. 1308 */ 1309 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1310 vu_transport->transport_opts.disable_shadow_doorbells = true; 1311 } 1312 1313 if (spdk_interrupt_mode_is_enabled()) { 1314 if (!vu_transport->intr_mode_supported) { 1315 SPDK_ERRLOG("interrupt mode not supported\n"); 1316 goto cleanup; 1317 } 1318 1319 /* 1320 * If we are in interrupt mode, we cannot support adaptive IRQs, 1321 * as there is no guarantee the SQ poller will run subsequently 1322 * to send pending IRQs. 
1323 */ 1324 vu_transport->transport_opts.disable_adaptive_irq = true; 1325 } 1326 1327 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1328 vu_transport->transport_opts.disable_mappable_bar0); 1329 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1330 vu_transport->transport_opts.disable_adaptive_irq); 1331 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1332 vu_transport->transport_opts.disable_shadow_doorbells); 1333 1334 return &vu_transport->transport; 1335 1336 cleanup: 1337 pthread_mutex_destroy(&vu_transport->lock); 1338 pthread_mutex_destroy(&vu_transport->pg_lock); 1339 err: 1340 free(vu_transport); 1341 return NULL; 1342 } 1343 1344 static uint32_t 1345 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1346 { 1347 assert(vu_ctrlr != NULL); 1348 assert(vu_ctrlr->ctrlr != NULL); 1349 1350 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1351 } 1352 1353 static uint32_t 1354 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1355 { 1356 assert(vu_ctrlr != NULL); 1357 assert(vu_ctrlr->ctrlr != NULL); 1358 1359 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1360 } 1361 1362 static uintptr_t 1363 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1364 { 1365 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1366 return 1ul << memory_page_shift; 1367 } 1368 1369 static uintptr_t 1370 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1371 { 1372 return ~(memory_page_size(ctrlr) - 1); 1373 } 1374 1375 static int 1376 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1377 uint32_t flags) 1378 { 1379 void *ret; 1380 1381 assert(mapping->len != 0); 1382 assert(q_addr(mapping) == NULL); 1383 1384 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, mapping->len, 1385 mapping->sg, &mapping->iov, flags); 1386 if (ret == NULL) { 1387 return -EFAULT; 1388 } 1389 1390 if (flags & MAP_INITIALIZE) { 1391 memset(q_addr(mapping), 0, mapping->len); 1392 } 1393 1394 return 0; 1395 } 1396 1397 static inline void 1398 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1399 { 1400 if (q_addr(mapping) != NULL) { 1401 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1402 &mapping->iov, 1); 1403 mapping->iov.iov_base = NULL; 1404 } 1405 } 1406 1407 static int 1408 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1409 { 1410 struct nvmf_vfio_user_sq *sq; 1411 const struct spdk_nvmf_registers *regs; 1412 int ret; 1413 1414 assert(ctrlr != NULL); 1415 1416 sq = ctrlr->sqs[0]; 1417 1418 assert(sq != NULL); 1419 assert(q_addr(&sq->mapping) == NULL); 1420 /* XXX ctrlr->asq == 0 is a valid memory address */ 1421 1422 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1423 sq->qid = 0; 1424 sq->size = regs->aqa.bits.asqs + 1; 1425 sq->mapping.prp1 = regs->asq; 1426 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 1427 *sq_headp(sq) = 0; 1428 sq->cqid = 0; 1429 1430 ret = map_q(ctrlr, &sq->mapping, MAP_INITIALIZE); 1431 if (ret) { 1432 return ret; 1433 } 1434 1435 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1436 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1437 1438 *sq_dbl_tailp(sq) = 0; 1439 1440 return 0; 1441 } 1442 1443 /* 1444 * Updates eventidx to set an SQ into interrupt or polling mode. 
1445 * 1446 * Returns false if the current SQ tail does not match the SQ head, as 1447 * this means that the host has submitted more items to the queue while we were 1448 * not looking - or during the event index update. In that case, we must retry, 1449 * or otherwise make sure we are going to wake up again. 1450 */ 1451 static bool 1452 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1453 { 1454 struct nvmf_vfio_user_ctrlr *ctrlr; 1455 volatile uint32_t *sq_tail_eidx; 1456 uint32_t old_tail, new_tail; 1457 1458 assert(sq != NULL); 1459 assert(sq->ctrlr != NULL); 1460 assert(sq->ctrlr->sdbl != NULL); 1461 assert(sq->need_rearm); 1462 assert(sq->qid != 0); 1463 1464 ctrlr = sq->ctrlr; 1465 1466 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1467 ctrlr_id(ctrlr), sq->qid); 1468 1469 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1470 1471 assert(ctrlr->endpoint != NULL); 1472 1473 if (!ctrlr->endpoint->interrupt_mode) { 1474 /* No synchronisation necessary. */ 1475 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1476 return true; 1477 } 1478 1479 old_tail = *sq_dbl_tailp(sq); 1480 *sq_tail_eidx = old_tail; 1481 1482 /* 1483 * Ensure that the event index is updated before re-reading the tail 1484 * doorbell. If it's not, then the host might race us and update the 1485 * tail after the second read but before the event index is written, so 1486 * it won't write to BAR0 and we'll miss the update. 1487 * 1488 * The driver should provide similar ordering with an mb(). 1489 */ 1490 spdk_mb(); 1491 1492 /* 1493 * Check if the host has updated the tail doorbell after we've read it 1494 * for the first time, but before the event index was written. If that's 1495 * the case, then we've lost the race and we need to update the event 1496 * index again (after polling the queue, since the host won't write to 1497 * BAR0). 1498 */ 1499 new_tail = *sq_dbl_tailp(sq); 1500 1501 /* 1502 * We might poll the queue straight after this function returns if the 1503 * tail has been updated, so we need to ensure that any changes to the 1504 * queue will be visible to us if the doorbell has been updated. 1505 * 1506 * The driver should provide similar ordering with a wmb() to ensure 1507 * that the queue is written before it updates the tail doorbell. 1508 */ 1509 spdk_rmb(); 1510 1511 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1512 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1513 new_tail, *sq_headp(sq)); 1514 1515 if (new_tail == *sq_headp(sq)) { 1516 sq->need_rearm = false; 1517 return true; 1518 } 1519 1520 /* 1521 * We've lost the race: the tail was updated since we last polled, 1522 * including if it happened within this routine. 1523 * 1524 * The caller should retry after polling (think of this as a cmpxchg 1525 * loop); if we go to sleep while the SQ is not empty, then we won't 1526 * process the remaining events. 1527 */ 1528 return false; 1529 } 1530 1531 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1532 1533 /* 1534 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1535 * processed some SQ entries. 
1536 */ 1537 static int 1538 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1539 struct nvmf_vfio_user_sq *sq, 1540 struct nvmf_vfio_user_poll_group *vu_group) 1541 { 1542 int count = 0; 1543 size_t i; 1544 1545 assert(sq->need_rearm); 1546 1547 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1548 int ret; 1549 1550 if (set_sq_eventidx(sq)) { 1551 /* We won the race and set eventidx; done. */ 1552 vu_group->stats.won++; 1553 return count; 1554 } 1555 1556 ret = nvmf_vfio_user_sq_poll(sq); 1557 1558 count += (ret < 0) ? 1 : ret; 1559 1560 /* 1561 * set_sq_eventidx() hit the race, so we expected 1562 * to process at least one command from this queue. 1563 * If there were no new commands waiting for us, then 1564 * we must have hit an unexpected race condition. 1565 */ 1566 if (ret == 0) { 1567 SPDK_ERRLOG("%s: unexpected race condition detected " 1568 "while updating the shadow doorbell buffer\n", 1569 ctrlr_id(ctrlr)); 1570 1571 fail_ctrlr(ctrlr); 1572 return count; 1573 } 1574 } 1575 1576 SPDK_DEBUGLOG(vfio_user_db, 1577 "%s: set_sq_eventidx() lost the race %zu times\n", 1578 ctrlr_id(ctrlr), i); 1579 1580 vu_group->stats.lost++; 1581 vu_group->stats.lost_count += count; 1582 1583 /* 1584 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1585 * we raced with the producer too many times; force ourselves to wake up 1586 * instead. We'll process all queues at that point. 1587 */ 1588 ctrlr_kick(ctrlr); 1589 1590 return count; 1591 } 1592 1593 /* 1594 * We're in interrupt mode, and potentially about to go to sleep. We need to 1595 * make sure any further I/O submissions are guaranteed to wake us up: for 1596 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1597 * every SQ that needs re-arming. 1598 * 1599 * Returns non-zero if we processed something. 1600 */ 1601 static int 1602 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1603 { 1604 struct nvmf_vfio_user_sq *sq; 1605 int count = 0; 1606 1607 vu_group->stats.rearms++; 1608 1609 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1610 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1611 continue; 1612 } 1613 1614 if (sq->need_rearm) { 1615 count += vfio_user_sq_rearm(sq->ctrlr, sq, vu_group); 1616 } 1617 } 1618 1619 return count; 1620 } 1621 1622 static int 1623 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1624 { 1625 struct nvmf_vfio_user_cq *cq; 1626 const struct spdk_nvmf_registers *regs; 1627 int ret; 1628 1629 assert(ctrlr != NULL); 1630 1631 cq = ctrlr->cqs[0]; 1632 1633 assert(cq != NULL); 1634 1635 assert(q_addr(&cq->mapping) == NULL); 1636 1637 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1638 assert(regs != NULL); 1639 cq->qid = 0; 1640 cq->size = regs->aqa.bits.acqs + 1; 1641 cq->mapping.prp1 = regs->acq; 1642 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 1643 *cq_tailp(cq) = 0; 1644 cq->ien = true; 1645 cq->phase = true; 1646 1647 ret = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_INITIALIZE); 1648 if (ret) { 1649 return ret; 1650 } 1651 1652 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
*/ 1653 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1654 1655 *cq_dbl_headp(cq) = 0; 1656 1657 return 0; 1658 } 1659 1660 static void * 1661 _map_one(void *prv, uint64_t addr, uint64_t len, uint32_t flags) 1662 { 1663 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1664 struct spdk_nvmf_qpair *qpair; 1665 struct nvmf_vfio_user_req *vu_req; 1666 struct nvmf_vfio_user_sq *sq; 1667 void *ret; 1668 1669 assert(req != NULL); 1670 qpair = req->qpair; 1671 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1672 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1673 1674 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1675 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1676 index_to_sg_t(vu_req->sg, vu_req->iovcnt), 1677 &vu_req->iov[vu_req->iovcnt], flags); 1678 if (spdk_likely(ret != NULL)) { 1679 vu_req->iovcnt++; 1680 } 1681 return ret; 1682 } 1683 1684 static int 1685 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1686 struct iovec *iov, uint32_t length) 1687 { 1688 /* Map PRP list to from Guest physical memory to 1689 * virtual memory address. 1690 */ 1691 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1692 length, 4096, _map_one); 1693 } 1694 1695 static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1696 struct nvmf_vfio_user_sq *sq); 1697 1698 static uint32_t 1699 cq_free_slots(struct nvmf_vfio_user_cq *cq) 1700 { 1701 uint32_t free_slots; 1702 1703 assert(cq != NULL); 1704 1705 if (cq->tail == cq->last_head) { 1706 free_slots = cq->size; 1707 } else if (cq->tail > cq->last_head) { 1708 free_slots = cq->size - (cq->tail - cq->last_head); 1709 } else { 1710 free_slots = cq->last_head - cq->tail; 1711 } 1712 assert(free_slots > 0); 1713 1714 return free_slots - 1; 1715 } 1716 1717 /* 1718 * Since reading the head doorbell is relatively expensive, we use the cached 1719 * value, so we only have to read it for real if it appears that we are full. 1720 */ 1721 static inline bool 1722 cq_is_full(struct nvmf_vfio_user_cq *cq) 1723 { 1724 uint32_t free_cq_slots; 1725 1726 assert(cq != NULL); 1727 1728 free_cq_slots = cq_free_slots(cq); 1729 1730 if (spdk_unlikely(free_cq_slots == 0)) { 1731 cq->last_head = *cq_dbl_headp(cq); 1732 free_cq_slots = cq_free_slots(cq); 1733 } 1734 1735 return free_cq_slots == 0; 1736 } 1737 1738 /* 1739 * Posts a CQE in the completion queue. 1740 * 1741 * @ctrlr: the vfio-user controller 1742 * @cq: the completion queue 1743 * @cdw0: cdw0 as reported by NVMf 1744 * @sqid: submission queue ID 1745 * @cid: command identifier in NVMe command 1746 * @sc: the NVMe CQE status code 1747 * @sct: the NVMe CQE status code type 1748 */ 1749 static int 1750 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1751 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1752 { 1753 struct spdk_nvme_status cpl_status = { 0 }; 1754 struct spdk_nvme_cpl *cpl; 1755 int err; 1756 1757 assert(ctrlr != NULL); 1758 1759 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1760 return 0; 1761 } 1762 1763 if (cq->qid == 0) { 1764 assert(spdk_get_thread() == cq->group->group->thread); 1765 } 1766 1767 /* 1768 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 1769 * control: if there is no space in the CQ, we should wait until there is. 
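	 *
	 * (cq_is_full() above relies on cq_free_slots(), which keeps one slot
	 * unused so that a full ring can be told apart from an empty one: for
	 * example, with size=8, tail=5 and last_head=2 there are
	 * 8 - (5 - 2) - 1 = 4 slots still available for new completions.)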
1770 * 1771 * In practice, we just fail the controller instead: as it happens, all host 1772 * implementations we care about right-size the CQ: this is required anyway for 1773 * NVMEoF support (see 3.3.2.8). 1774 */ 1775 if (cq_is_full(cq)) { 1776 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1777 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1778 *cq_dbl_headp(cq)); 1779 return -1; 1780 } 1781 1782 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1783 1784 assert(ctrlr->sqs[sqid] != NULL); 1785 SPDK_DEBUGLOG(nvmf_vfio, 1786 "%s: request complete sqid:%d cid=%d status=%#x " 1787 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1788 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1789 1790 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1791 cpl->sqid = sqid; 1792 cpl->cid = cid; 1793 cpl->cdw0 = cdw0; 1794 1795 /* 1796 * This is a bitfield: instead of setting the individual bits we need 1797 * directly in cpl->status, which would cause a read-modify-write cycle, 1798 * we'll avoid reading from the CPL altogether by filling in a local 1799 * cpl_status variable, then writing the whole thing. 1800 */ 1801 cpl_status.sct = sct; 1802 cpl_status.sc = sc; 1803 cpl_status.p = cq->phase; 1804 cpl->status = cpl_status; 1805 1806 /* Ensure the Completion Queue Entry is visible. */ 1807 spdk_wmb(); 1808 cq_tail_advance(cq); 1809 1810 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1811 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1812 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1813 if (err != 0) { 1814 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1815 ctrlr_id(ctrlr)); 1816 return err; 1817 } 1818 } 1819 1820 return 0; 1821 } 1822 1823 static void 1824 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1825 { 1826 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1827 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1828 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1829 free(vu_req); 1830 } 1831 } 1832 1833 static void 1834 delete_cq_done(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq) 1835 { 1836 assert(cq->cq_ref == 0); 1837 unmap_q(ctrlr, &cq->mapping); 1838 cq->size = 0; 1839 cq->cq_state = VFIO_USER_CQ_DELETED; 1840 cq->group = NULL; 1841 } 1842 1843 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1844 * and the controller is being shut down/reset or vfio-user client disconnects, 1845 * then the CQ is also deleted. 1846 */ 1847 static void 1848 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1849 { 1850 struct nvmf_vfio_user_cq *cq; 1851 uint16_t cqid; 1852 1853 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1854 sq->qid, sq); 1855 1856 /* Free SQ resources */ 1857 unmap_q(vu_ctrlr, &sq->mapping); 1858 1859 free_sq_reqs(sq); 1860 1861 sq->size = 0; 1862 1863 sq->sq_state = VFIO_USER_SQ_DELETED; 1864 1865 /* Controller RESET and SHUTDOWN are special cases, 1866 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1867 * will disconnect IO queue pairs. 
1868 */ 1869 if (vu_ctrlr->reset_shn || vu_ctrlr->disconnect) { 1870 cqid = sq->cqid; 1871 cq = vu_ctrlr->cqs[cqid]; 1872 1873 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1874 cq->qid, cq); 1875 1876 assert(cq->cq_ref > 0); 1877 if (--cq->cq_ref == 0) { 1878 delete_cq_done(vu_ctrlr, cq); 1879 } 1880 } 1881 } 1882 1883 static void 1884 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1885 { 1886 struct nvmf_vfio_user_sq *sq; 1887 struct nvmf_vfio_user_cq *cq; 1888 1889 if (ctrlr == NULL) { 1890 return; 1891 } 1892 1893 sq = ctrlr->sqs[qid]; 1894 if (sq) { 1895 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free sqid:%u\n", ctrlr_id(ctrlr), qid); 1896 unmap_q(ctrlr, &sq->mapping); 1897 1898 free_sq_reqs(sq); 1899 1900 free(sq->mapping.sg); 1901 free(sq); 1902 ctrlr->sqs[qid] = NULL; 1903 } 1904 1905 cq = ctrlr->cqs[qid]; 1906 if (cq) { 1907 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1908 unmap_q(ctrlr, &cq->mapping); 1909 free(cq->mapping.sg); 1910 free(cq); 1911 ctrlr->cqs[qid] = NULL; 1912 } 1913 } 1914 1915 static int 1916 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1917 const uint16_t id) 1918 { 1919 struct nvmf_vfio_user_sq *sq; 1920 1921 assert(ctrlr != NULL); 1922 assert(transport != NULL); 1923 assert(ctrlr->sqs[id] == NULL); 1924 1925 sq = calloc(1, sizeof(*sq)); 1926 if (sq == NULL) { 1927 return -ENOMEM; 1928 } 1929 sq->mapping.sg = calloc(1, dma_sg_size()); 1930 if (sq->mapping.sg == NULL) { 1931 free(sq); 1932 return -ENOMEM; 1933 } 1934 1935 sq->qid = id; 1936 sq->qpair.qid = id; 1937 sq->qpair.transport = transport; 1938 sq->ctrlr = ctrlr; 1939 ctrlr->sqs[id] = sq; 1940 1941 TAILQ_INIT(&sq->free_reqs); 1942 1943 return 0; 1944 } 1945 1946 static int 1947 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1948 { 1949 struct nvmf_vfio_user_cq *cq; 1950 1951 assert(vu_ctrlr != NULL); 1952 assert(vu_ctrlr->cqs[id] == NULL); 1953 1954 cq = calloc(1, sizeof(*cq)); 1955 if (cq == NULL) { 1956 return -ENOMEM; 1957 } 1958 cq->mapping.sg = calloc(1, dma_sg_size()); 1959 if (cq->mapping.sg == NULL) { 1960 free(cq); 1961 return -ENOMEM; 1962 } 1963 1964 cq->qid = id; 1965 vu_ctrlr->cqs[id] = cq; 1966 1967 return 0; 1968 } 1969 1970 static int 1971 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1972 { 1973 struct nvmf_vfio_user_req *vu_req, *tmp; 1974 size_t req_size; 1975 uint32_t i; 1976 1977 req_size = sizeof(struct nvmf_vfio_user_req) + 1978 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1979 1980 for (i = 0; i < sq->size; i++) { 1981 struct spdk_nvmf_request *req; 1982 1983 vu_req = calloc(1, req_size); 1984 if (vu_req == NULL) { 1985 goto err; 1986 } 1987 1988 req = &vu_req->req; 1989 req->qpair = &sq->qpair; 1990 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1991 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1992 req->stripped_data = NULL; 1993 1994 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1995 } 1996 1997 return 0; 1998 1999 err: 2000 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 2001 free(vu_req); 2002 } 2003 return -ENOMEM; 2004 } 2005 2006 static volatile uint32_t * 2007 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 2008 { 2009 return ctrlr->sdbl != NULL ? 
2010 ctrlr->sdbl->shadow_doorbells : 2011 ctrlr->bar0_doorbells; 2012 } 2013 2014 static uint16_t 2015 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 2016 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2017 { 2018 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2019 struct nvmf_vfio_user_sq *sq; 2020 uint32_t qsize; 2021 uint16_t cqid; 2022 uint16_t qid; 2023 int err; 2024 2025 qid = cmd->cdw10_bits.create_io_q.qid; 2026 cqid = cmd->cdw11_bits.create_io_sq.cqid; 2027 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2028 2029 if (ctrlr->sqs[qid] == NULL) { 2030 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 2031 if (err != 0) { 2032 *sct = SPDK_NVME_SCT_GENERIC; 2033 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2034 } 2035 } 2036 2037 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2038 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 2039 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2040 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2041 } 2042 2043 /* CQ must be created before SQ. */ 2044 if (!io_q_exists(ctrlr, cqid, true)) { 2045 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 2046 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2047 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 2048 } 2049 2050 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 2051 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 2052 *sct = SPDK_NVME_SCT_GENERIC; 2053 return SPDK_NVME_SC_INVALID_FIELD; 2054 } 2055 2056 sq = ctrlr->sqs[qid]; 2057 sq->size = qsize; 2058 2059 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 2060 qid, cqid); 2061 2062 sq->mapping.prp1 = cmd->dptr.prp.prp1; 2063 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 2064 2065 err = map_q(ctrlr, &sq->mapping, MAP_INITIALIZE); 2066 if (err) { 2067 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2068 *sct = SPDK_NVME_SCT_GENERIC; 2069 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2070 } 2071 2072 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 2073 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2074 q_addr(&sq->mapping)); 2075 2076 err = alloc_sq_reqs(ctrlr, sq); 2077 if (err < 0) { 2078 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 2079 *sct = SPDK_NVME_SCT_GENERIC; 2080 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2081 } 2082 2083 sq->cqid = cqid; 2084 ctrlr->cqs[sq->cqid]->cq_ref++; 2085 sq->sq_state = VFIO_USER_SQ_CREATED; 2086 *sq_headp(sq) = 0; 2087 2088 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 2089 2090 /* 2091 * We should always reset the doorbells. 2092 * 2093 * The Specification prohibits the controller from writing to the shadow 2094 * doorbell buffer, however older versions of the Linux NVMe driver 2095 * don't reset the shadow doorbell buffer after a Queue-Level or 2096 * Controller-Level reset, which means that we're left with garbage 2097 * doorbell values. 2098 */ 2099 *sq_dbl_tailp(sq) = 0; 2100 2101 if (ctrlr->sdbl != NULL) { 2102 sq->need_rearm = true; 2103 2104 if (!set_sq_eventidx(sq)) { 2105 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 2106 "sqid:%hu was initialized\n", 2107 ctrlr_id(ctrlr), qid); 2108 fail_ctrlr(ctrlr); 2109 *sct = SPDK_NVME_SCT_GENERIC; 2110 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2111 } 2112 } 2113 2114 /* 2115 * Create our new I/O qpair. 
This asynchronously invokes, on a suitable 2116 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 2117 * call spdk_nvmf_request_exec() with a generated fabrics 2118 * connect command. This command is then eventually completed via 2119 * handle_queue_connect_rsp(). 2120 */ 2121 sq->create_io_sq_cmd = *cmd; 2122 sq->post_create_io_sq_completion = true; 2123 2124 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 2125 &sq->qpair); 2126 2127 *sct = SPDK_NVME_SCT_GENERIC; 2128 return SPDK_NVME_SC_SUCCESS; 2129 } 2130 2131 static uint16_t 2132 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 2133 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2134 { 2135 struct nvmf_vfio_user_cq *cq; 2136 uint32_t qsize; 2137 uint16_t qid; 2138 int err; 2139 2140 qid = cmd->cdw10_bits.create_io_q.qid; 2141 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2142 2143 if (ctrlr->cqs[qid] == NULL) { 2144 err = init_cq(ctrlr, qid); 2145 if (err != 0) { 2146 *sct = SPDK_NVME_SCT_GENERIC; 2147 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2148 } 2149 } 2150 2151 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2152 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2153 *sct = SPDK_NVME_SCT_GENERIC; 2154 return SPDK_NVME_SC_INVALID_FIELD; 2155 } 2156 2157 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 2158 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2159 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2160 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2161 } 2162 2163 cq = ctrlr->cqs[qid]; 2164 cq->size = qsize; 2165 2166 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2167 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 2168 2169 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2170 2171 err = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_INITIALIZE); 2172 if (err) { 2173 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2174 *sct = SPDK_NVME_SCT_GENERIC; 2175 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2176 } 2177 2178 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2179 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2180 q_addr(&cq->mapping)); 2181 2182 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2183 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2184 cq->phase = true; 2185 cq->cq_state = VFIO_USER_CQ_CREATED; 2186 2187 *cq_tailp(cq) = 0; 2188 2189 /* 2190 * We should always reset the doorbells. 2191 * 2192 * The Specification prohibits the controller from writing to the shadow 2193 * doorbell buffer, however older versions of the Linux NVMe driver 2194 * don't reset the shadow doorbell buffer after a Queue-Level or 2195 * Controller-Level reset, which means that we're left with garbage 2196 * doorbell values. 2197 */ 2198 *cq_dbl_headp(cq) = 0; 2199 2200 *sct = SPDK_NVME_SCT_GENERIC; 2201 return SPDK_NVME_SC_SUCCESS; 2202 } 2203 2204 /* 2205 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2206 * on error. 
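 *
 * Note that a CQ is created, and its completion posted, synchronously
 * in this call, whereas a successful SQ creation only completes later,
 * once the new qpair has been connected (see the "Completion posted
 * asynchronously" path below).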
2207 */ 2208 static int 2209 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2210 struct spdk_nvme_cmd *cmd, const bool is_cq) 2211 { 2212 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2213 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2214 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2215 uint32_t qsize; 2216 uint16_t qid; 2217 2218 assert(ctrlr != NULL); 2219 assert(cmd != NULL); 2220 2221 qid = cmd->cdw10_bits.create_io_q.qid; 2222 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2223 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2224 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2225 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2226 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2227 goto out; 2228 } 2229 2230 if (io_q_exists(ctrlr, qid, is_cq)) { 2231 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2232 is_cq ? 'c' : 's', qid); 2233 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2234 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2235 goto out; 2236 } 2237 2238 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2239 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2240 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2241 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2242 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2243 goto out; 2244 } 2245 2246 if (is_cq) { 2247 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2248 } else { 2249 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2250 2251 if (sct == SPDK_NVME_SCT_GENERIC && 2252 sc == SPDK_NVME_SC_SUCCESS) { 2253 /* Completion posted asynchronously. */ 2254 return 0; 2255 } 2256 } 2257 2258 out: 2259 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2260 } 2261 2262 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2263 * queue pair, so save the command id and controller in a context. 2264 */ 2265 struct vfio_user_delete_sq_ctx { 2266 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2267 uint16_t cid; 2268 }; 2269 2270 static void 2271 vfio_user_qpair_delete_cb(void *cb_arg) 2272 { 2273 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2274 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2275 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2276 2277 assert(admin_cq != NULL); 2278 assert(admin_cq->group != NULL); 2279 assert(admin_cq->group->group->thread != NULL); 2280 if (admin_cq->group->group->thread != spdk_get_thread()) { 2281 spdk_thread_send_msg(admin_cq->group->group->thread, 2282 vfio_user_qpair_delete_cb, 2283 cb_arg); 2284 } else { 2285 post_completion(vu_ctrlr, admin_cq, 0, 0, 2286 ctx->cid, 2287 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2288 free(ctx); 2289 } 2290 } 2291 2292 /* 2293 * Deletes a completion or submission I/O queue. 2294 */ 2295 static int 2296 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2297 struct spdk_nvme_cmd *cmd, const bool is_cq) 2298 { 2299 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2300 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2301 struct nvmf_vfio_user_sq *sq; 2302 struct nvmf_vfio_user_cq *cq; 2303 2304 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2305 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2306 cmd->cdw10_bits.delete_io_q.qid); 2307 2308 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2309 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2310 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2311 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2312 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2313 goto out; 2314 } 2315 2316 if (is_cq) { 2317 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2318 if (cq->cq_ref) { 2319 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2320 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2321 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2322 goto out; 2323 } 2324 delete_cq_done(ctrlr, cq); 2325 } else { 2326 /* 2327 * Deletion of the CQ is only deferred to delete_sq_done() on 2328 * VM reboot or CC.EN change, so we have to delete it in all 2329 * other cases. 2330 */ 2331 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2332 sq->delete_ctx = calloc(1, sizeof(*sq->delete_ctx)); 2333 if (!sq->delete_ctx) { 2334 sct = SPDK_NVME_SCT_GENERIC; 2335 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2336 goto out; 2337 } 2338 sq->delete_ctx->vu_ctrlr = ctrlr; 2339 sq->delete_ctx->cid = cmd->cid; 2340 sq->sq_state = VFIO_USER_SQ_DELETED; 2341 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2342 ctrlr->cqs[sq->cqid]->cq_ref--; 2343 2344 spdk_nvmf_qpair_disconnect(&sq->qpair); 2345 return 0; 2346 } 2347 2348 out: 2349 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2350 } 2351 2352 /* 2353 * Configures Shadow Doorbells. 2354 */ 2355 static int 2356 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2357 { 2358 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2359 uint32_t dstrd; 2360 uintptr_t page_size, page_mask; 2361 uint64_t prp1, prp2; 2362 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2363 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2364 2365 assert(ctrlr != NULL); 2366 assert(ctrlr->endpoint != NULL); 2367 assert(cmd != NULL); 2368 2369 dstrd = doorbell_stride(ctrlr); 2370 page_size = memory_page_size(ctrlr); 2371 page_mask = memory_page_mask(ctrlr); 2372 2373 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2374 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2375 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2376 ctrlr_id(ctrlr)); 2377 2378 goto out; 2379 } 2380 2381 /* Verify guest physical addresses passed as PRPs. */ 2382 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2383 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2384 ctrlr_id(ctrlr)); 2385 2386 goto out; 2387 } 2388 2389 prp1 = cmd->dptr.prp.prp1; 2390 prp2 = cmd->dptr.prp.prp2; 2391 2392 SPDK_DEBUGLOG(nvmf_vfio, 2393 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2394 ctrlr_id(ctrlr), prp1, prp2); 2395 2396 if (prp1 == prp2 2397 || prp1 != (prp1 & page_mask) 2398 || prp2 != (prp2 & page_mask)) { 2399 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2400 ctrlr_id(ctrlr)); 2401 2402 goto out; 2403 } 2404 2405 /* Map guest physical addresses to our virtual address space. 
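 *
 * PRP1 is the guest's shadow doorbell buffer and PRP2 its EventIdx
 * buffer; per the Doorbell Buffer Config definition each is expected to
 * be a single memory page, which is why we only checked page alignment
 * against page_mask above.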
*/ 2406 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2407 if (sdbl == NULL) { 2408 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2409 ctrlr_id(ctrlr)); 2410 2411 goto out; 2412 } 2413 2414 ctrlr->shadow_doorbell_buffer = prp1; 2415 ctrlr->eventidx_buffer = prp2; 2416 2417 SPDK_DEBUGLOG(nvmf_vfio, 2418 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2419 ctrlr_id(ctrlr), 2420 sdbl->iovs[0].iov_base, 2421 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2422 sdbl->iovs[1].iov_base, 2423 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2424 2425 2426 /* 2427 * Set all possible CQ head doorbells to polling mode now, such that we 2428 * don't have to worry about it later if the host creates more queues. 2429 * 2430 * We only ever want interrupts for writes to the SQ tail doorbells 2431 * (which are initialised in set_ctrlr_intr_mode() below). 2432 */ 2433 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2434 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2435 } 2436 2437 /* Update controller. */ 2438 SWAP(ctrlr->sdbl, sdbl); 2439 2440 /* 2441 * Copy doorbells from either the previous shadow doorbell buffer or the 2442 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2443 * 2444 * This needs to account for older versions of the Linux NVMe driver, 2445 * which don't clear out the buffer after a controller reset. 2446 */ 2447 copy_doorbells(ctrlr, sdbl != NULL ? 2448 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2449 ctrlr->sdbl->shadow_doorbells); 2450 2451 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2452 2453 ctrlr_kick(ctrlr); 2454 2455 sc = SPDK_NVME_SC_SUCCESS; 2456 2457 out: 2458 /* 2459 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2460 * more than once (pointless, but not prohibited by the spec), or 2461 * in case of an error. 2462 * 2463 * If this is the first time Doorbell Buffer Config was processed, 2464 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2465 * free_sdbl() becomes a noop. 2466 */ 2467 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2468 2469 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2470 } 2471 2472 /* Returns 0 on success and -errno on error. */ 2473 static int 2474 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2475 { 2476 assert(ctrlr != NULL); 2477 assert(cmd != NULL); 2478 2479 if (cmd->fuse != 0) { 2480 /* Fused admin commands are not supported. 
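 * Complete them with Invalid Field in Command instead of passing them
 * down to the NVMf layer.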
*/ 2481 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2482 SPDK_NVME_SC_INVALID_FIELD, 2483 SPDK_NVME_SCT_GENERIC); 2484 } 2485 2486 switch (cmd->opc) { 2487 case SPDK_NVME_OPC_CREATE_IO_CQ: 2488 case SPDK_NVME_OPC_CREATE_IO_SQ: 2489 return handle_create_io_q(ctrlr, cmd, 2490 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2491 case SPDK_NVME_OPC_DELETE_IO_SQ: 2492 case SPDK_NVME_OPC_DELETE_IO_CQ: 2493 return handle_del_io_q(ctrlr, cmd, 2494 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2495 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2496 SPDK_NOTICELOG("%s: requested shadow doorbells (supported: %d)\n", 2497 ctrlr_id(ctrlr), 2498 !ctrlr->transport->transport_opts.disable_shadow_doorbells); 2499 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2500 return handle_doorbell_buffer_config(ctrlr, cmd); 2501 } 2502 /* FALLTHROUGH */ 2503 default: 2504 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2505 } 2506 } 2507 2508 static int 2509 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2510 { 2511 struct nvmf_vfio_user_sq *sq = cb_arg; 2512 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2513 uint16_t sqid, cqid; 2514 2515 assert(sq != NULL); 2516 assert(vu_req != NULL); 2517 assert(vu_ctrlr != NULL); 2518 2519 if (spdk_likely(vu_req->iovcnt)) { 2520 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2521 index_to_sg_t(vu_req->sg, 0), 2522 vu_req->iov, vu_req->iovcnt); 2523 } 2524 sqid = sq->qid; 2525 cqid = sq->cqid; 2526 2527 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2528 vu_req->req.rsp->nvme_cpl.cdw0, 2529 sqid, 2530 vu_req->req.cmd->nvme_cmd.cid, 2531 vu_req->req.rsp->nvme_cpl.status.sc, 2532 vu_req->req.rsp->nvme_cpl.status.sct); 2533 } 2534 2535 static int 2536 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2537 struct spdk_nvme_cmd *cmd) 2538 { 2539 assert(sq != NULL); 2540 if (spdk_unlikely(nvmf_qpair_is_admin_queue(&sq->qpair))) { 2541 return consume_admin_cmd(ctrlr, cmd); 2542 } 2543 2544 return handle_cmd_req(ctrlr, cmd, sq); 2545 } 2546 2547 /* Returns the number of commands processed, or a negative value on error. */ 2548 static int 2549 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2550 struct nvmf_vfio_user_sq *sq) 2551 { 2552 struct spdk_nvme_cmd *queue; 2553 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 2554 int count = 0; 2555 uint32_t free_cq_slots; 2556 2557 assert(ctrlr != NULL); 2558 assert(sq != NULL); 2559 2560 if (ctrlr->sdbl != NULL && sq->qid != 0) { 2561 /* 2562 * Submission queue index has moved past the event index, so it 2563 * needs to be re-armed before we go to sleep. 2564 */ 2565 sq->need_rearm = true; 2566 } 2567 2568 free_cq_slots = cq_free_slots(cq); 2569 queue = q_addr(&sq->mapping); 2570 while (*sq_headp(sq) != new_tail) { 2571 int err; 2572 struct spdk_nvme_cmd *cmd; 2573 2574 /* 2575 * Linux host nvme driver can submit cmd's more than free cq slots 2576 * available. So process only those who have cq slots available. 2577 */ 2578 if (free_cq_slots-- == 0) { 2579 cq->last_head = *cq_dbl_headp(cq); 2580 2581 free_cq_slots = cq_free_slots(cq); 2582 if (free_cq_slots > 0) { 2583 continue; 2584 } 2585 2586 /* 2587 * If there are no free cq slots then kick interrupt FD to loop 2588 * again to process remaining sq cmds. 2589 * In case of polling mode we will process remaining sq cmds during 2590 * next polling iteration. 2591 * sq head is advanced only for consumed commands. 
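 *
 * The eventfd write below is the same self-kick mechanism used by
 * ctrlr_kick(): it wakes our own controller interrupt handler so the
 * leftover submissions are picked up without another doorbell write
 * from the host.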
2592 */ 2593 if (in_interrupt_mode(ctrlr->transport)) { 2594 eventfd_write(ctrlr->intr_fd, 1); 2595 } 2596 break; 2597 } 2598 2599 cmd = &queue[*sq_headp(sq)]; 2600 count++; 2601 2602 /* 2603 * SQHD must contain the new head pointer, so we must increase 2604 * it before we generate a completion. 2605 */ 2606 sq_head_advance(sq); 2607 2608 err = consume_cmd(ctrlr, sq, cmd); 2609 if (spdk_unlikely(err != 0)) { 2610 return err; 2611 } 2612 } 2613 2614 return count; 2615 } 2616 2617 /* Checks whether endpoint is connected from the same process */ 2618 static bool 2619 is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint) 2620 { 2621 struct ucred ucred; 2622 socklen_t ucredlen = sizeof(ucred); 2623 2624 if (endpoint == NULL) { 2625 return false; 2626 } 2627 2628 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred, 2629 &ucredlen) < 0) { 2630 SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno)); 2631 return false; 2632 } 2633 2634 return ucred.pid == getpid(); 2635 } 2636 2637 static void 2638 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2639 { 2640 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2641 struct nvmf_vfio_user_ctrlr *ctrlr; 2642 struct nvmf_vfio_user_sq *sq; 2643 struct nvmf_vfio_user_cq *cq; 2644 void *map_start, *map_end; 2645 int ret; 2646 2647 /* 2648 * We're not interested in any DMA regions that aren't mappable (we don't 2649 * support clients that don't share their memory). 2650 */ 2651 if (!info->vaddr) { 2652 return; 2653 } 2654 2655 map_start = info->mapping.iov_base; 2656 map_end = info->mapping.iov_base + info->mapping.iov_len; 2657 2658 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2659 (info->mapping.iov_len & MASK_2MB)) { 2660 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2661 info->vaddr, map_start, map_end); 2662 return; 2663 } 2664 2665 assert(endpoint != NULL); 2666 if (endpoint->ctrlr == NULL) { 2667 return; 2668 } 2669 ctrlr = endpoint->ctrlr; 2670 2671 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2672 map_start, map_end); 2673 2674 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2675 * check the protection bits before registering. When vfio client and server are run in same process 2676 * there is no need to register the same memory again. 
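 *
 * spdk_mem_register() adds the newly shared region to SPDK's memory
 * map so it can be used as a DMA target; the loop below then only has
 * to re-activate SQs whose queue memory can be mapped again.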
2677 */ 2678 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2679 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2680 if (ret) { 2681 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2682 map_start, map_end, ret); 2683 } 2684 } 2685 2686 pthread_mutex_lock(&endpoint->lock); 2687 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2688 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2689 continue; 2690 } 2691 2692 cq = ctrlr->cqs[sq->cqid]; 2693 2694 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2695 if (cq->size && q_addr(&cq->mapping) == NULL) { 2696 ret = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_QUIET); 2697 if (ret) { 2698 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2699 cq->qid, cq->mapping.prp1, 2700 cq->mapping.prp1 + cq->mapping.len); 2701 continue; 2702 } 2703 } 2704 2705 if (sq->size) { 2706 ret = map_q(ctrlr, &sq->mapping, MAP_R | MAP_QUIET); 2707 if (ret) { 2708 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2709 sq->qid, sq->mapping.prp1, 2710 sq->mapping.prp1 + sq->mapping.len); 2711 continue; 2712 } 2713 } 2714 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2715 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2716 } 2717 pthread_mutex_unlock(&endpoint->lock); 2718 } 2719 2720 static void 2721 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2722 { 2723 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2724 struct nvmf_vfio_user_sq *sq; 2725 struct nvmf_vfio_user_cq *cq; 2726 void *map_start, *map_end; 2727 int ret = 0; 2728 2729 if (!info->vaddr) { 2730 return; 2731 } 2732 2733 map_start = info->mapping.iov_base; 2734 map_end = info->mapping.iov_base + info->mapping.iov_len; 2735 2736 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2737 (info->mapping.iov_len & MASK_2MB)) { 2738 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2739 info->vaddr, map_start, map_end); 2740 return; 2741 } 2742 2743 assert(endpoint != NULL); 2744 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2745 map_start, map_end); 2746 2747 if (endpoint->ctrlr != NULL) { 2748 struct nvmf_vfio_user_ctrlr *ctrlr; 2749 ctrlr = endpoint->ctrlr; 2750 2751 pthread_mutex_lock(&endpoint->lock); 2752 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2753 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2754 unmap_q(ctrlr, &sq->mapping); 2755 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2756 } 2757 2758 cq = ctrlr->cqs[sq->cqid]; 2759 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2760 unmap_q(ctrlr, &cq->mapping); 2761 } 2762 } 2763 2764 if (ctrlr->sdbl != NULL) { 2765 size_t i; 2766 2767 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2768 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2769 2770 if (iov_base >= map_start && iov_base < map_end) { 2771 copy_doorbells(ctrlr, 2772 ctrlr->sdbl->shadow_doorbells, 2773 ctrlr->bar0_doorbells); 2774 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2775 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2776 ctrlr->sdbl = NULL; 2777 break; 2778 } 2779 } 2780 } 2781 2782 pthread_mutex_unlock(&endpoint->lock); 2783 } 2784 2785 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2786 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2787 if (ret) { 2788 SPDK_ERRLOG("Memory region unregister %p-%p 
failed, ret=%d\n", 2789 map_start, map_end, ret); 2790 } 2791 } 2792 } 2793 2794 /* Used to initiate a controller-level reset or a controller shutdown. */ 2795 static void 2796 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2797 { 2798 SPDK_NOTICELOG("%s: disabling controller\n", ctrlr_id(vu_ctrlr)); 2799 2800 /* Unmap Admin queue. */ 2801 2802 assert(vu_ctrlr->sqs[0] != NULL); 2803 assert(vu_ctrlr->cqs[0] != NULL); 2804 2805 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2806 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2807 2808 vu_ctrlr->sqs[0]->size = 0; 2809 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2810 2811 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2812 2813 vu_ctrlr->cqs[0]->size = 0; 2814 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2815 2816 /* 2817 * For PCIe controller reset or shutdown, we will drop all AER 2818 * responses. 2819 */ 2820 spdk_nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2821 2822 /* Free the shadow doorbell buffer. */ 2823 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 2824 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2825 vu_ctrlr->sdbl = NULL; 2826 } 2827 2828 /* Used to re-enable the controller after a controller-level reset. */ 2829 static int 2830 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2831 { 2832 int err; 2833 2834 assert(vu_ctrlr != NULL); 2835 2836 SPDK_NOTICELOG("%s: enabling controller\n", ctrlr_id(vu_ctrlr)); 2837 2838 err = acq_setup(vu_ctrlr); 2839 if (err != 0) { 2840 return err; 2841 } 2842 2843 err = asq_setup(vu_ctrlr); 2844 if (err != 0) { 2845 return err; 2846 } 2847 2848 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2849 2850 return 0; 2851 } 2852 2853 static int 2854 nvmf_vfio_user_prop_req_rsp_set(struct nvmf_vfio_user_req *req, 2855 struct nvmf_vfio_user_sq *sq) 2856 { 2857 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2858 union spdk_nvme_cc_register cc, diff; 2859 2860 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2861 assert(sq->ctrlr != NULL); 2862 vu_ctrlr = sq->ctrlr; 2863 2864 if (req->req.cmd->prop_set_cmd.ofst != offsetof(struct spdk_nvme_registers, cc)) { 2865 return 0; 2866 } 2867 2868 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2869 diff.raw = cc.raw ^ req->cc.raw; 2870 2871 if (diff.bits.en) { 2872 if (cc.bits.en) { 2873 int ret = enable_ctrlr(vu_ctrlr); 2874 if (ret) { 2875 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2876 return ret; 2877 } 2878 vu_ctrlr->reset_shn = false; 2879 } else { 2880 vu_ctrlr->reset_shn = true; 2881 } 2882 } 2883 2884 if (diff.bits.shn) { 2885 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2886 vu_ctrlr->reset_shn = true; 2887 } 2888 } 2889 2890 if (vu_ctrlr->reset_shn) { 2891 disable_ctrlr(vu_ctrlr); 2892 } 2893 return 0; 2894 } 2895 2896 static int 2897 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2898 { 2899 struct nvmf_vfio_user_sq *sq = cb_arg; 2900 2901 assert(sq != NULL); 2902 assert(req != NULL); 2903 2904 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2905 assert(sq->ctrlr != NULL); 2906 assert(req != NULL); 2907 2908 memcpy(req->req.iov[0].iov_base, 2909 &req->req.rsp->prop_get_rsp.value.u64, 2910 req->req.length); 2911 return 0; 2912 } 2913 2914 return nvmf_vfio_user_prop_req_rsp_set(req, sq); 2915 } 2916 2917 /* 2918 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2919 * doorbell is written via access_bar0_fn(). 2920 * 2921 * DSTRD is set to fixed value 0 for NVMf. 
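 * With DSTRD 0 each doorbell register is 4 bytes wide, so the array
 * index computed below is simply (pos - NVME_DOORBELLS_OFFSET) / 4:
 * even indices are SQ tail doorbells, odd indices are CQ head
 * doorbells.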
2922 * 2923 */ 2924 static int 2925 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2926 const size_t count, loff_t pos, const bool is_write) 2927 { 2928 struct nvmf_vfio_user_poll_group *group; 2929 2930 assert(ctrlr != NULL); 2931 assert(buf != NULL); 2932 2933 if (spdk_unlikely(!is_write)) { 2934 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2935 ctrlr_id(ctrlr), pos); 2936 errno = EPERM; 2937 return -1; 2938 } 2939 2940 if (spdk_unlikely(count != sizeof(uint32_t))) { 2941 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2942 ctrlr_id(ctrlr), count); 2943 errno = EINVAL; 2944 return -1; 2945 } 2946 2947 pos -= NVME_DOORBELLS_OFFSET; 2948 2949 /* pos must be dword aligned */ 2950 if (spdk_unlikely((pos & 0x3) != 0)) { 2951 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2952 errno = EINVAL; 2953 return -1; 2954 } 2955 2956 /* convert byte offset to array index */ 2957 pos >>= 2; 2958 2959 if (spdk_unlikely(pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2)) { 2960 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2961 errno = EINVAL; 2962 return -1; 2963 } 2964 2965 ctrlr->bar0_doorbells[pos] = *buf; 2966 spdk_wmb(); 2967 2968 group = ctrlr_to_poll_group(ctrlr); 2969 if (pos == 1) { 2970 group->stats.cqh_admin_writes++; 2971 } else if (pos & 1) { 2972 group->stats.cqh_io_writes++; 2973 } 2974 2975 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2976 ctrlr_id(ctrlr), (pos & 1) ? "cqid" : "sqid", 2977 pos / 2, *buf); 2978 2979 2980 return 0; 2981 } 2982 2983 static size_t 2984 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2985 char *buf, size_t count, loff_t pos, 2986 bool is_write) 2987 { 2988 struct nvmf_vfio_user_req *req; 2989 const struct spdk_nvmf_registers *regs; 2990 2991 if ((count != 4) && (count != 8)) { 2992 errno = EINVAL; 2993 return -1; 2994 } 2995 2996 /* Construct a Fabric Property Get/Set command and send it */ 2997 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2998 if (req == NULL) { 2999 errno = ENOBUFS; 3000 return -1; 3001 } 3002 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 3003 req->cc.raw = regs->cc.raw; 3004 3005 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 3006 req->cb_arg = vu_ctrlr->sqs[0]; 3007 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 3008 req->req.cmd->prop_set_cmd.cid = 0; 3009 if (count == 4) { 3010 req->req.cmd->prop_set_cmd.attrib.size = 0; 3011 } else { 3012 req->req.cmd->prop_set_cmd.attrib.size = 1; 3013 } 3014 req->req.cmd->prop_set_cmd.ofst = pos; 3015 if (is_write) { 3016 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 3017 if (req->req.cmd->prop_set_cmd.attrib.size) { 3018 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 3019 } else { 3020 req->req.cmd->prop_set_cmd.value.u32.high = 0; 3021 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 3022 } 3023 } else { 3024 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 3025 } 3026 req->req.length = count; 3027 SPDK_IOV_ONE(req->req.iov, &req->req.iovcnt, buf, req->req.length); 3028 3029 spdk_nvmf_request_exec(&req->req); 3030 3031 return count; 3032 } 3033 3034 static ssize_t 3035 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 3036 bool is_write) 3037 { 3038 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3039 struct nvmf_vfio_user_ctrlr *ctrlr; 3040 int ret; 3041 3042 ctrlr = endpoint->ctrlr; 3043 if (spdk_unlikely(endpoint->need_async_destroy || 
!ctrlr)) { 3044 errno = EIO; 3045 return -1; 3046 } 3047 3048 if (pos >= NVME_DOORBELLS_OFFSET) { 3049 /* 3050 * The fact that the doorbells can be memory mapped doesn't mean 3051 * that the client (VFIO in QEMU) is obliged to memory map them, 3052 * it might still elect to access them via regular read/write; 3053 * we might also have had disable_mappable_bar0 set. 3054 */ 3055 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 3056 pos, is_write); 3057 if (ret == 0) { 3058 return count; 3059 } 3060 return ret; 3061 } 3062 3063 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 3064 } 3065 3066 static ssize_t 3067 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 3068 bool is_write) 3069 { 3070 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3071 3072 if (is_write) { 3073 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 3074 endpoint_id(endpoint), offset, offset + count); 3075 errno = EINVAL; 3076 return -1; 3077 } 3078 3079 if (offset + count > NVME_REG_CFG_SIZE) { 3080 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 3081 endpoint_id(endpoint), offset, count, 3082 NVME_REG_CFG_SIZE); 3083 errno = ERANGE; 3084 return -1; 3085 } 3086 3087 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 3088 3089 return count; 3090 } 3091 3092 static void 3093 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 3094 { 3095 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3096 3097 if (level >= LOG_DEBUG) { 3098 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3099 } else if (level >= LOG_INFO) { 3100 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3101 } else if (level >= LOG_NOTICE) { 3102 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 3103 } else if (level >= LOG_WARNING) { 3104 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 3105 } else { 3106 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 3107 } 3108 } 3109 3110 static int 3111 vfio_user_get_log_level(void) 3112 { 3113 int level; 3114 3115 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3116 return LOG_DEBUG; 3117 } 3118 3119 level = spdk_log_to_syslog_level(spdk_log_get_level()); 3120 if (level < 0) { 3121 return LOG_ERR; 3122 } 3123 3124 return level; 3125 } 3126 3127 static void 3128 init_pci_config_space(vfu_pci_config_space_t *p) 3129 { 3130 /* MLBAR */ 3131 p->hdr.bars[0].raw = 0x0; 3132 /* MUBAR */ 3133 p->hdr.bars[1].raw = 0x0; 3134 3135 /* vendor specific, let's set them to zero for now */ 3136 p->hdr.bars[3].raw = 0x0; 3137 p->hdr.bars[4].raw = 0x0; 3138 p->hdr.bars[5].raw = 0x0; 3139 3140 /* enable INTx */ 3141 p->hdr.intr.ipin = 0x1; 3142 } 3143 3144 struct ctrlr_quiesce_ctx { 3145 struct nvmf_vfio_user_endpoint *endpoint; 3146 struct nvmf_vfio_user_poll_group *group; 3147 int status; 3148 }; 3149 3150 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 3151 3152 static void 3153 _vfio_user_endpoint_resume_done_msg(void *ctx) 3154 { 3155 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3156 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3157 3158 endpoint->need_resume = false; 3159 3160 if (!vu_ctrlr) { 3161 return; 3162 } 3163 3164 if (!vu_ctrlr->queued_quiesce) { 3165 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3166 3167 /* 3168 * We might have ignored new SQ entries while we were quiesced: 3169 * kick ourselves so we'll definitely check again while in 3170 * VFIO_USER_CTRLR_RUNNING state. 
3171 */ 3172 if (in_interrupt_mode(endpoint->transport)) { 3173 ctrlr_kick(vu_ctrlr); 3174 } 3175 return; 3176 } 3177 3178 3179 /* 3180 * Basically, once we call `vfu_device_quiesced` the device is 3181 * unquiesced from libvfio-user's perspective so from the moment 3182 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device 3183 * again. However, because the NVMf subsystem is an asynchronous 3184 * operation, this quiesce might come _before_ the NVMf subsystem has 3185 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 3186 * need to check whether a quiesce was requested. 3187 */ 3188 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 3189 ctrlr_id(vu_ctrlr)); 3190 ctrlr_quiesce(vu_ctrlr); 3191 } 3192 3193 static void 3194 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 3195 void *cb_arg, int status) 3196 { 3197 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 3198 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3199 3200 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status); 3201 3202 if (!vu_ctrlr) { 3203 return; 3204 } 3205 3206 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 3207 } 3208 3209 static void 3210 vfio_user_quiesce_done(void *ctx) 3211 { 3212 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3213 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3214 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3215 int ret; 3216 3217 if (!vu_ctrlr) { 3218 free(quiesce_ctx); 3219 return; 3220 } 3221 3222 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 3223 3224 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 3225 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3226 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 3227 vu_ctrlr->queued_quiesce = false; 3228 free(quiesce_ctx); 3229 3230 /* `vfu_device_quiesced` can change the migration state, 3231 * so we need to re-check `vu_ctrlr->state`. 3232 */ 3233 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3234 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 3235 return; 3236 } 3237 3238 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 3239 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3240 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3241 vfio_user_endpoint_resume_done, endpoint); 3242 if (ret < 0) { 3243 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3244 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3245 } 3246 } 3247 3248 static void 3249 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3250 void *ctx, int status) 3251 { 3252 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3253 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3254 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3255 3256 if (!vu_ctrlr) { 3257 free(quiesce_ctx); 3258 return; 3259 } 3260 3261 quiesce_ctx->status = status; 3262 3263 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3264 ctrlr_id(vu_ctrlr), status); 3265 3266 spdk_thread_send_msg(vu_ctrlr->thread, 3267 vfio_user_quiesce_done, ctx); 3268 } 3269 3270 /* 3271 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3272 * we've already set ctrlr->state, so we won't process new entries, but we need 3273 * to ensure that this PG is quiesced. 
This only works because there's no 3274 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3275 * 3276 * Once we've walked all PGs, we need to pause any submitted I/O via 3277 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3278 */ 3279 static void 3280 vfio_user_quiesce_pg(void *ctx) 3281 { 3282 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3283 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3284 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3285 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3286 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3287 int ret; 3288 3289 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3290 3291 if (!vu_ctrlr) { 3292 free(quiesce_ctx); 3293 return; 3294 } 3295 3296 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3297 if (quiesce_ctx->group != NULL) { 3298 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3299 vfio_user_quiesce_pg, quiesce_ctx); 3300 return; 3301 } 3302 3303 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3304 vfio_user_pause_done, quiesce_ctx); 3305 if (ret < 0) { 3306 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3307 endpoint_id(endpoint), ret); 3308 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3309 fail_ctrlr(vu_ctrlr); 3310 free(quiesce_ctx); 3311 } 3312 } 3313 3314 static void 3315 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3316 { 3317 struct ctrlr_quiesce_ctx *quiesce_ctx; 3318 3319 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3320 3321 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3322 if (!quiesce_ctx) { 3323 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3324 assert(false); 3325 return; 3326 } 3327 3328 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3329 quiesce_ctx->status = 0; 3330 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3331 3332 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3333 vfio_user_quiesce_pg, quiesce_ctx); 3334 } 3335 3336 static int 3337 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3338 { 3339 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3340 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3341 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3342 3343 if (!vu_ctrlr) { 3344 return 0; 3345 } 3346 3347 /* NVMf library will destruct controller when no 3348 * connected queue pairs. 3349 */ 3350 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3351 return 0; 3352 } 3353 3354 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3355 3356 /* There is no race condition here as device quiesce callback 3357 * and nvmf_prop_set_cc() are running in the same thread context. 
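 *
 * Only quiesce a controller that is actually live: if it isn't enabled,
 * isn't ready, or has already completed shutdown, there is nothing to
 * pause and we can report the quiesce as done right away.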
3358 */ 3359 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3360 return 0; 3361 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3362 return 0; 3363 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3364 return 0; 3365 } 3366 3367 switch (vu_ctrlr->state) { 3368 case VFIO_USER_CTRLR_PAUSED: 3369 case VFIO_USER_CTRLR_MIGRATING: 3370 return 0; 3371 case VFIO_USER_CTRLR_RUNNING: 3372 ctrlr_quiesce(vu_ctrlr); 3373 break; 3374 case VFIO_USER_CTRLR_RESUMING: 3375 vu_ctrlr->queued_quiesce = true; 3376 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3377 vu_ctrlr->state); 3378 break; 3379 default: 3380 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3381 break; 3382 } 3383 3384 errno = EBUSY; 3385 return -1; 3386 } 3387 3388 static void 3389 vfio_user_ctrlr_dump_migr_data(const char *name, 3390 struct vfio_user_nvme_migr_state *migr_data, 3391 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3392 { 3393 struct spdk_nvmf_registers *regs; 3394 struct nvme_migr_sq_state *sq; 3395 struct nvme_migr_cq_state *cq; 3396 uint32_t *doorbell_base; 3397 uint32_t i; 3398 3399 SPDK_NOTICELOG("Dump %s\n", name); 3400 3401 regs = &migr_data->nvmf_data.regs; 3402 doorbell_base = (uint32_t *)&migr_data->doorbells; 3403 3404 SPDK_NOTICELOG("Registers\n"); 3405 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3406 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3407 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3408 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3409 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3410 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3411 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3412 3413 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3414 3415 if (sdbl != NULL) { 3416 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3417 migr_data->ctrlr_header.shadow_doorbell_buffer); 3418 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3419 migr_data->ctrlr_header.eventidx_buffer); 3420 } 3421 3422 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3423 sq = &migr_data->qps[i].sq; 3424 cq = &migr_data->qps[i].cq; 3425 3426 if (sq->size) { 3427 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3428 if (i > 0 && sdbl != NULL) { 3429 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3430 sq->sqid, 3431 sdbl->shadow_doorbells[queue_index(i, false)], 3432 sdbl->eventidxs[queue_index(i, false)]); 3433 } 3434 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3435 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3436 } 3437 3438 if (cq->size) { 3439 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3440 if (i > 0 && sdbl != NULL) { 3441 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3442 cq->cqid, 3443 sdbl->shadow_doorbells[queue_index(i, true)], 3444 sdbl->eventidxs[queue_index(i, true)]); 3445 } 3446 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3447 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3448 } 3449 } 3450 3451 SPDK_NOTICELOG("%s Dump Done\n", name); 3452 } 3453 3454 /* Read region 9 content and restore it to migration data structures */ 3455 static int 3456 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3457 struct vfio_user_nvme_migr_state *migr_state) 3458 { 3459 void *data_ptr = endpoint->migr_data; 3460 3461 /* Load vfio_user_nvme_migr_header first */ 3462 
memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header));
3463 /* TODO: version check */
3464 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) {
3465 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic);
3466 return -EINVAL;
3467 }
3468 
3469 /* Load nvmf controller data */
3470 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset;
3471 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len);
3472 
3473 /* Load queue pairs */
3474 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset;
3475 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len);
3476 
3477 /* Load doorbells */
3478 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX];
3479 memcpy(&migr_state->doorbells, data_ptr,
3480 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]);
3481 
3482 /* Load CFG */
3483 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX];
3484 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]);
3485 
3486 return 0;
3487 }
3488 
3489 
3490 static void
3491 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
3492 {
3493 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr;
3494 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
3495 struct nvmf_vfio_user_sq *sq;
3496 struct nvmf_vfio_user_cq *cq;
3497 uint64_t data_offset;
3498 void *data_ptr;
3499 uint32_t *doorbell_base;
3500 uint32_t i = 0;
3501 uint16_t sqid, cqid;
3502 struct vfio_user_nvme_migr_state migr_state = {
3503 .nvmf_data = {
3504 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused),
3505 .regs_size = sizeof(struct spdk_nvmf_registers),
3506 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat)
3507 }
3508 };
3509 
3510 /* Save all data to vfio_user_nvme_migr_state first, then copy it
3511 * to the device migration region at the end.
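 *
 * The layout written out below is: migration header, NVMf controller
 * data, queue pair states, BAR0 doorbells and finally the PCI config
 * space, with the header itself copied last so that the offsets it
 * records are complete.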
3512 */ 3513 3514 /* save magic number */ 3515 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3516 3517 /* save controller data */ 3518 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3519 3520 /* save connected queue pairs */ 3521 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3522 /* save sq */ 3523 sqid = sq->qid; 3524 migr_state.qps[sqid].sq.sqid = sq->qid; 3525 migr_state.qps[sqid].sq.cqid = sq->cqid; 3526 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3527 migr_state.qps[sqid].sq.size = sq->size; 3528 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3529 3530 /* save cq, for shared cq case, cq may be saved multiple times */ 3531 cqid = sq->cqid; 3532 cq = vu_ctrlr->cqs[cqid]; 3533 migr_state.qps[cqid].cq.cqid = cqid; 3534 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3535 migr_state.qps[cqid].cq.ien = cq->ien; 3536 migr_state.qps[cqid].cq.iv = cq->iv; 3537 migr_state.qps[cqid].cq.size = cq->size; 3538 migr_state.qps[cqid].cq.phase = cq->phase; 3539 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3540 i++; 3541 } 3542 3543 assert(i > 0); 3544 migr_state.ctrlr_header.num_io_queues = i - 1; 3545 3546 /* Save doorbells */ 3547 doorbell_base = (uint32_t *)&migr_state.doorbells; 3548 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3549 3550 /* Save PCI configuration space */ 3551 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3552 3553 /* Save all data to device migration region */ 3554 data_ptr = endpoint->migr_data; 3555 3556 /* Copy nvmf controller data */ 3557 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3558 data_ptr += data_offset; 3559 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3560 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3561 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3562 3563 /* Copy queue pairs */ 3564 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3565 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3566 migr_state.ctrlr_header.qp_offset = data_offset; 3567 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3568 struct nvme_migr_cq_state)); 3569 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3570 3571 /* Copy doorbells */ 3572 data_offset += migr_state.ctrlr_header.qp_len; 3573 data_ptr += migr_state.ctrlr_header.qp_len; 3574 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3575 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3576 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3577 3578 /* Copy CFG */ 3579 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3580 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3581 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3582 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3583 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3584 3585 /* copy shadow doorbells */ 3586 if (vu_ctrlr->sdbl != NULL) { 3587 migr_state.ctrlr_header.sdbl = true; 3588 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3589 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3590 } 3591 3592 /* Copy nvme migration header finally */ 3593 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3594 3595 if 
(SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3596 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3597 } 3598 } 3599 3600 /* 3601 * If we are about to close the connection, we need to unregister the interrupt, 3602 * as the library will subsequently close the file descriptor we registered. 3603 */ 3604 static int 3605 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3606 { 3607 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3608 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3609 3610 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3611 3612 if (type == VFU_RESET_LOST_CONN) { 3613 if (ctrlr != NULL) { 3614 spdk_interrupt_unregister(&ctrlr->intr); 3615 ctrlr->intr_fd = -1; 3616 } 3617 return 0; 3618 } 3619 3620 /* FIXME: LOST_CONN case ? */ 3621 if (ctrlr->sdbl != NULL) { 3622 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 3623 free_sdbl(vfu_ctx, ctrlr->sdbl); 3624 ctrlr->sdbl = NULL; 3625 } 3626 3627 /* FIXME: much more needed here. */ 3628 3629 return 0; 3630 } 3631 3632 static int 3633 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3634 struct vfio_user_nvme_migr_state *migr_state) 3635 { 3636 uint32_t i, qsize = 0; 3637 uint16_t sqid, cqid; 3638 struct vfio_user_nvme_migr_qp migr_qp; 3639 void *addr; 3640 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3641 int ret; 3642 3643 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3644 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3645 } 3646 3647 /* restore submission queues */ 3648 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3649 migr_qp = migr_state->qps[i]; 3650 3651 qsize = migr_qp.sq.size; 3652 if (qsize) { 3653 struct nvmf_vfio_user_sq *sq; 3654 3655 sqid = migr_qp.sq.sqid; 3656 if (sqid != i) { 3657 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3658 return -EINVAL; 3659 } 3660 3661 /* allocate sq if necessary */ 3662 if (vu_ctrlr->sqs[sqid] == NULL) { 3663 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3664 if (ret) { 3665 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3666 return -EFAULT; 3667 } 3668 } 3669 3670 sq = vu_ctrlr->sqs[sqid]; 3671 sq->size = qsize; 3672 3673 ret = alloc_sq_reqs(vu_ctrlr, sq); 3674 if (ret) { 3675 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3676 return -EFAULT; 3677 } 3678 3679 /* restore sq */ 3680 sq->sq_state = VFIO_USER_SQ_CREATED; 3681 sq->cqid = migr_qp.sq.cqid; 3682 *sq_headp(sq) = migr_qp.sq.head; 3683 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3684 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 3685 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3686 sq->mapping.prp1, sq->mapping.len, 3687 sq->mapping.sg, &sq->mapping.iov, 3688 PROT_READ); 3689 if (addr == NULL) { 3690 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3691 sqid, sq->mapping.prp1, sq->size); 3692 return -EFAULT; 3693 } 3694 cqs_ref[sq->cqid]++; 3695 } 3696 } 3697 3698 /* restore completion queues */ 3699 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3700 migr_qp = migr_state->qps[i]; 3701 3702 qsize = migr_qp.cq.size; 3703 if (qsize) { 3704 struct nvmf_vfio_user_cq *cq; 3705 3706 /* restore cq */ 3707 cqid = migr_qp.sq.cqid; 3708 assert(cqid == i); 3709 3710 /* allocate cq if necessary */ 3711 if (vu_ctrlr->cqs[cqid] == NULL) { 3712 ret = init_cq(vu_ctrlr, cqid); 3713 if (ret) { 3714 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3715 return -EFAULT; 3716 } 3717 } 3718 3719 
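/*
 * cqs_ref[] was accumulated while restoring the SQs above, so a
 * shared CQ ends up with one reference per SQ that points at it.
 */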
cq = vu_ctrlr->cqs[cqid]; 3720 3721 cq->size = qsize; 3722 3723 cq->cq_state = VFIO_USER_CQ_CREATED; 3724 cq->cq_ref = cqs_ref[cqid]; 3725 *cq_tailp(cq) = migr_qp.cq.tail; 3726 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3727 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 3728 cq->ien = migr_qp.cq.ien; 3729 cq->iv = migr_qp.cq.iv; 3730 cq->phase = migr_qp.cq.phase; 3731 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3732 cq->mapping.prp1, cq->mapping.len, 3733 cq->mapping.sg, &cq->mapping.iov, 3734 PROT_READ | PROT_WRITE); 3735 if (addr == NULL) { 3736 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3737 cqid, cq->mapping.prp1, cq->size); 3738 return -EFAULT; 3739 } 3740 } 3741 } 3742 3743 return 0; 3744 } 3745 3746 static int 3747 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3748 { 3749 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3750 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3751 uint32_t *doorbell_base; 3752 struct spdk_nvme_cmd cmd; 3753 uint16_t i; 3754 int rc = 0; 3755 struct vfio_user_nvme_migr_state migr_state = { 3756 .nvmf_data = { 3757 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3758 .regs_size = sizeof(struct spdk_nvmf_registers), 3759 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3760 } 3761 }; 3762 3763 assert(endpoint->migr_data != NULL); 3764 assert(ctrlr != NULL); 3765 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3766 if (rc) { 3767 return rc; 3768 } 3769 3770 /* restore shadow doorbells */ 3771 if (migr_state.ctrlr_header.sdbl) { 3772 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3773 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3774 migr_state.ctrlr_header.shadow_doorbell_buffer, 3775 migr_state.ctrlr_header.eventidx_buffer, 3776 memory_page_size(vu_ctrlr)); 3777 if (sdbl == NULL) { 3778 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3779 ctrlr_id(vu_ctrlr)); 3780 return -1; 3781 } 3782 3783 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3784 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3785 3786 SWAP(vu_ctrlr->sdbl, sdbl); 3787 } 3788 3789 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3790 if (rc) { 3791 return rc; 3792 } 3793 3794 /* restore PCI configuration space */ 3795 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3796 3797 doorbell_base = (uint32_t *)&migr_state.doorbells; 3798 /* restore doorbells from saved registers */ 3799 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3800 3801 /* restore nvmf controller data */ 3802 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3803 if (rc) { 3804 return rc; 3805 } 3806 3807 /* resubmit pending AERs */ 3808 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) { 3809 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3810 migr_state.nvmf_data.aer_cids[i]); 3811 memset(&cmd, 0, sizeof(cmd)); 3812 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3813 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3814 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3815 if (spdk_unlikely(rc)) { 3816 break; 3817 } 3818 } 3819 3820 return rc; 3821 } 3822 3823 static void 3824 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3825 { 3826 uint32_t i; 3827 struct nvmf_vfio_user_sq *sq; 3828 3829 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
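 *
 * Its doorbell pointers are therefore wired straight to BAR0 here;
 * vfio_user_ctrlr_switch_doorbells() below (as its name suggests) only
 * repoints the I/O queue doorbells at the shadow buffer, if one was
 * restored.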
 */
3830
3831 	if (vu_ctrlr->sqs[0] != NULL) {
3832 		vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells +
3833 					      queue_index(0, false);
3834 	}
3835
3836 	if (vu_ctrlr->cqs[0] != NULL) {
3837 		vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells +
3838 					      queue_index(0, true);
3839 	}
3840
3841 	vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL);
3842
3843 	for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
3844 		sq = vu_ctrlr->sqs[i];
3845 		if (!sq || !sq->size) {
3846 			continue;
3847 		}
3848
3849 		if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
3850 			/* ADMIN queue pair is always in the poll group, just enable it */
3851 			sq->sq_state = VFIO_USER_SQ_ACTIVE;
3852 		} else {
3853 			spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair);
3854 		}
3855 	}
3856 }
3857
3858 /*
3859  * We are in stop-and-copy state, but still potentially have some current dirty
3860  * sgls: while we're quiesced and thus should have no active requests, we still
3861  * have potentially dirty maps of the shadow doorbells and the CQs (SQs are
3862  * mapped read only).
3863  *
3864  * Since we won't be calling vfu_sgl_put() for them, we need to explicitly
3865  * mark them dirty now.
3866  */
3867 static void
3868 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
3869 {
3870 	struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
3871
3872 	assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING);
3873
3874 	for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) {
3875 		struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i];
3876
3877 		if (cq == NULL || q_addr(&cq->mapping) == NULL) {
3878 			continue;
3879 		}
3880
3881 		vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1);
3882 	}
3883
3884 	if (vu_ctrlr->sdbl != NULL) {
3885 		dma_sg_t *sg;
3886 		size_t i;
3887
3888 		for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT;
3889 		     ++i) {
3890
3891 			if (!vu_ctrlr->sdbl->iovs[i].iov_len) {
3892 				continue;
3893 			}
3894
3895 			sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i);
3896
3897 			vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1);
3898 		}
3899 	}
3900 }
3901
3902 static int
3903 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
3904 {
3905 	struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
3906 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3907 	struct nvmf_vfio_user_sq *sq;
3908 	int ret = 0;
3909
3910 	SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint),
3911 		      vu_ctrlr->state, state);
3912
3913 	switch (state) {
3914 	case VFU_MIGR_STATE_STOP_AND_COPY:
3915 		vu_ctrlr->in_source_vm = true;
3916 		vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
3917 		vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr);
3918 		vfio_user_migr_ctrlr_save_data(vu_ctrlr);
3919 		break;
3920 	case VFU_MIGR_STATE_STOP:
3921 		vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
3922 		/* The controller associated with the source VM is dead now; we will
3923 		 * resume the subsystem after destroying the controller data structure,
3924 		 * so that the subsystem can be reused for a new client.
3925 		 */
3926 		if (vu_ctrlr->in_source_vm) {
3927 			endpoint->need_resume = true;
3928 		}
3929 		break;
3930 	case VFU_MIGR_STATE_PRE_COPY:
3931 		assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED);
3932 		break;
3933 	case VFU_MIGR_STATE_RESUME:
3934 		/*
3935 		 * The destination ADMIN queue pair is connected when the VM starts, but
3936 		 * it isn't enabled in the destination VM yet, so the poll group does
3937 		 * nothing with the ADMIN queue pair for now.
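		 *
		 * A rough sketch (an assumption based on the handlers in this file,
		 * not something libvfio-user guarantees) of the destination-side
		 * ordering this case relies on; the array is illustrative only:
		 *
		 *   vfu_migr_state_t dst_order[] = {
		 *           VFU_MIGR_STATE_RESUME,    // here: drop the admin SQ resources
		 *           VFU_MIGR_STATE_RUNNING,   // later: restore from the migration region
		 *   };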
3938 */ 3939 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3940 break; 3941 } 3942 3943 assert(!vu_ctrlr->in_source_vm); 3944 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3945 3946 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3947 assert(sq != NULL); 3948 assert(sq->qpair.qid == 0); 3949 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3950 3951 /* Free ADMIN SQ resources first, SQ resources will be 3952 * allocated based on queue size from source VM. 3953 */ 3954 free_sq_reqs(sq); 3955 sq->size = 0; 3956 break; 3957 case VFU_MIGR_STATE_RUNNING: 3958 3959 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3960 break; 3961 } 3962 3963 if (!vu_ctrlr->in_source_vm) { 3964 /* Restore destination VM from BAR9 */ 3965 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3966 if (ret) { 3967 break; 3968 } 3969 3970 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3971 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3972 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3973 /* FIXME where do we resume nvmf? */ 3974 } else { 3975 /* Rollback source VM */ 3976 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3977 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3978 vfio_user_endpoint_resume_done, endpoint); 3979 if (ret < 0) { 3980 /* TODO: fail controller with CFS bit set */ 3981 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3982 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3983 } 3984 } 3985 vu_ctrlr->migr_data_prepared = false; 3986 vu_ctrlr->in_source_vm = false; 3987 break; 3988 3989 default: 3990 return -EINVAL; 3991 } 3992 3993 return ret; 3994 } 3995 3996 static uint64_t 3997 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3998 { 3999 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 4000 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 4001 uint64_t pending_bytes; 4002 4003 if (ctrlr->migr_data_prepared) { 4004 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 4005 pending_bytes = 0; 4006 } else { 4007 pending_bytes = vfio_user_migr_data_len(); 4008 } 4009 4010 SPDK_DEBUGLOG(nvmf_vfio, 4011 "%s current state %u, pending bytes 0x%"PRIx64"\n", 4012 endpoint_id(endpoint), ctrlr->state, pending_bytes); 4013 4014 return pending_bytes; 4015 } 4016 4017 static int 4018 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 4019 { 4020 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 4021 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 4022 4023 /* 4024 * When transitioning to pre-copy state we set pending_bytes to 0, 4025 * so the vfio-user client shouldn't attempt to read any migration 4026 * data. This is not yet guaranteed by libvfio-user. 
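	 *
	 * For context, a minimal sketch of the source-side save loop this
	 * callback participates in; the helpers are hypothetical stand-ins for
	 * whatever the vfio-user client actually does, named after the
	 * vfu_migration_callbacks_t fields:
	 *
	 *   uint64_t off, sz;
	 *
	 *   while (client_get_pending_bytes(ctx) > 0) {
	 *           client_prepare_data(ctx, &off, &sz);  // save_data() runs in here
	 *           client_consume_region(ctx, off, sz);  // e.g. via the sparse mmap
	 *   }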
4027 */ 4028 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 4029 assert(size != NULL); 4030 *offset = 0; 4031 *size = 0; 4032 return 0; 4033 } 4034 4035 if (ctrlr->in_source_vm) { /* migration source */ 4036 assert(size != NULL); 4037 *size = vfio_user_migr_data_len(); 4038 vfio_user_migr_ctrlr_save_data(ctrlr); 4039 } else { /* migration destination */ 4040 assert(size == NULL); 4041 assert(!ctrlr->migr_data_prepared); 4042 } 4043 *offset = 0; 4044 ctrlr->migr_data_prepared = true; 4045 4046 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 4047 4048 return 0; 4049 } 4050 4051 static ssize_t 4052 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4053 void *buf __attribute__((unused)), 4054 uint64_t count __attribute__((unused)), 4055 uint64_t offset __attribute__((unused))) 4056 { 4057 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 4058 endpoint_id(vfu_get_private(vfu_ctx))); 4059 errno = ENOTSUP; 4060 return -1; 4061 } 4062 4063 static ssize_t 4064 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4065 void *buf __attribute__((unused)), 4066 uint64_t count __attribute__((unused)), 4067 uint64_t offset __attribute__((unused))) 4068 { 4069 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 4070 endpoint_id(vfu_get_private(vfu_ctx))); 4071 errno = ENOTSUP; 4072 return -1; 4073 } 4074 4075 static int 4076 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4077 uint64_t count) 4078 { 4079 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 4080 4081 if (count != vfio_user_migr_data_len()) { 4082 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 4083 endpoint_id(vfu_get_private(vfu_ctx)), count); 4084 errno = EINVAL; 4085 return -1; 4086 } 4087 4088 return 0; 4089 } 4090 4091 static int 4092 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 4093 struct nvmf_vfio_user_endpoint *endpoint) 4094 { 4095 int ret; 4096 ssize_t cap_offset; 4097 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 4098 struct iovec migr_sparse_mmap = {}; 4099 4100 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 4101 struct pxcap pxcap = { 4102 .hdr.id = PCI_CAP_ID_EXP, 4103 .pxcaps.ver = 0x2, 4104 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 4105 .pxdcap2.ctds = 0x1 4106 }; 4107 4108 struct msixcap msixcap = { 4109 .hdr.id = PCI_CAP_ID_MSIX, 4110 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 4111 .mtab = {.tbir = 0x4, .to = 0x0}, 4112 .mpba = {.pbir = 0x5, .pbao = 0x0} 4113 }; 4114 4115 struct iovec sparse_mmap[] = { 4116 { 4117 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 4118 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 4119 }, 4120 }; 4121 4122 const vfu_migration_callbacks_t migr_callbacks = { 4123 .version = VFIO_USER_MIGR_CALLBACK_VERS, 4124 .transition = &vfio_user_migration_device_state_transition, 4125 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 4126 .prepare_data = &vfio_user_migration_prepare_data, 4127 .read_data = &vfio_user_migration_read_data, 4128 .data_written = &vfio_user_migration_data_written, 4129 .write_data = &vfio_user_migration_write_data 4130 }; 4131 4132 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 4133 if (ret < 0) { 4134 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 4135 return ret; 4136 } 4137 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 4138 /* 4139 * 0x02, controller uses the NVM Express programming interface 4140 * 
0x08, non-volatile memory controller
4141 	 * 0x01, mass storage controller
4142 	 */
4143 	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);
4144
4145 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
4146 	if (cap_offset < 0) {
4147 		SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx);
4148 		return cap_offset;
4149 	}
4150
4151 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
4152 	if (cap_offset < 0) {
4153 		SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx);
4154 		return cap_offset;
4155 	}
4156
4157 	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
4158 	if (cap_offset < 0) {
4159 		SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx);
4160 		return cap_offset;
4161 	}
4162
4163 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
4164 			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
4165 	if (ret < 0) {
4166 		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
4167 		return ret;
4168 	}
4169
4170 	if (vu_transport->transport_opts.disable_mappable_bar0) {
4171 		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
4172 				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
4173 				       NULL, 0, -1, 0);
4174 	} else {
4175 		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
4176 				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
4177 				       sparse_mmap, 1, endpoint->devmem_fd, 0);
4178 	}
4179
4180 	if (ret < 0) {
4181 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
4182 		return ret;
4183 	}
4184
4185 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE,
4186 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
4187 	if (ret < 0) {
4188 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
4189 		return ret;
4190 	}
4191
4192 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE,
4193 			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
4194 	if (ret < 0) {
4195 		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
4196 		return ret;
4197 	}
4198
4199 	ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
4200 	if (ret < 0) {
4201 		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
4202 		return ret;
4203 	}
4204
4205 	ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset);
4206 	if (ret < 0) {
4207 		SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx);
4208 		return ret;
4209 	}
4210
4211 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
4212 	if (ret < 0) {
4213 		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
4214 		return ret;
4215 	}
4216
4217 	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
4218 	if (ret < 0) {
4219 		SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
4220 		return ret;
4221 	}
4222
4223 	vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb);
4224
4225 	migr_sparse_mmap.iov_base = (void *)4096;
4226 	migr_sparse_mmap.iov_len = vfio_user_migr_data_len();
4227 	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX,
4228 			       vfu_get_migr_register_area_size() + vfio_user_migr_data_len(),
4229 			       NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap,
4230 			       1, endpoint->migr_fd, 0);
4231 	if (ret < 0) {
4232 		SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx);
4233 		return ret;
4234 	}
4235
4236 	ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks,
4237 			vfu_get_migr_register_area_size());
4238 	if (ret < 0) {
4239 		SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx);
4240 		return ret;
4241 } 4242 4243 ret = vfu_realize_ctx(vfu_ctx); 4244 if (ret < 0) { 4245 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4246 return ret; 4247 } 4248 4249 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4250 assert(endpoint->pci_config_space != NULL); 4251 init_pci_config_space(endpoint->pci_config_space); 4252 4253 assert(cap_offset != 0); 4254 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4255 4256 return 0; 4257 } 4258 4259 static int nvmf_vfio_user_accept(void *ctx); 4260 4261 static void 4262 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4263 { 4264 /* Nothing for us to do here. */ 4265 } 4266 4267 /* 4268 * Register an "accept" poller: this is polling for incoming vfio-user socket 4269 * connections (on the listening socket). 4270 * 4271 * We need to do this on first listening, and also after destroying a 4272 * controller, so we can accept another connection. 4273 */ 4274 static int 4275 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4276 { 4277 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4278 4279 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4280 4281 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4282 endpoint, poll_rate_us); 4283 4284 if (!endpoint->accept_poller) { 4285 return -1; 4286 } 4287 4288 endpoint->accept_thread = spdk_get_thread(); 4289 endpoint->need_relisten = false; 4290 4291 if (!spdk_interrupt_mode_is_enabled()) { 4292 return 0; 4293 } 4294 4295 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4296 assert(endpoint->accept_intr_fd != -1); 4297 4298 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4299 nvmf_vfio_user_accept, endpoint); 4300 4301 assert(endpoint->accept_intr != NULL); 4302 4303 spdk_poller_register_interrupt(endpoint->accept_poller, 4304 set_intr_mode_noop, NULL); 4305 return 0; 4306 } 4307 4308 static void 4309 _vfio_user_relisten(void *ctx) 4310 { 4311 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4312 4313 vfio_user_register_accept_poller(endpoint); 4314 } 4315 4316 static void 4317 _free_ctrlr(void *ctx) 4318 { 4319 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4320 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4321 4322 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 4323 4324 spdk_interrupt_unregister(&ctrlr->intr); 4325 ctrlr->intr_fd = -1; 4326 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4327 4328 free(ctrlr); 4329 4330 if (endpoint->need_async_destroy) { 4331 nvmf_vfio_user_destroy_endpoint(endpoint); 4332 } else if (endpoint->need_relisten) { 4333 spdk_thread_send_msg(endpoint->accept_thread, 4334 _vfio_user_relisten, endpoint); 4335 } 4336 } 4337 4338 static void 4339 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4340 { 4341 struct spdk_thread *thread; 4342 int i; 4343 4344 assert(ctrlr != NULL); 4345 thread = ctrlr->thread ? 
ctrlr->thread : spdk_get_thread(); 4346 4347 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4348 4349 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4350 free_qp(ctrlr, i); 4351 } 4352 4353 spdk_thread_exec_msg(thread, _free_ctrlr, ctrlr); 4354 } 4355 4356 static int 4357 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4358 struct nvmf_vfio_user_endpoint *endpoint) 4359 { 4360 struct nvmf_vfio_user_ctrlr *ctrlr; 4361 int err = 0; 4362 4363 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4364 4365 /* First, construct a vfio-user CUSTOM transport controller */ 4366 ctrlr = calloc(1, sizeof(*ctrlr)); 4367 if (ctrlr == NULL) { 4368 err = -ENOMEM; 4369 goto out; 4370 } 4371 /* 4372 * We can only support one connection for now, but generate a unique cntlid in case vfio-user 4373 * transport is used together with RDMA or TCP transports in the same target 4374 */ 4375 ctrlr->cntlid = nvmf_subsystem_gen_cntlid(endpoint->subsystem); 4376 ctrlr->intr_fd = -1; 4377 ctrlr->transport = transport; 4378 ctrlr->endpoint = endpoint; 4379 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4380 TAILQ_INIT(&ctrlr->connected_sqs); 4381 4382 ctrlr->adaptive_irqs_enabled = 4383 !transport->transport_opts.disable_adaptive_irq; 4384 4385 /* Then, construct an admin queue pair */ 4386 err = init_sq(ctrlr, &transport->transport, 0); 4387 if (err != 0) { 4388 free(ctrlr); 4389 goto out; 4390 } 4391 4392 err = init_cq(ctrlr, 0); 4393 if (err != 0) { 4394 free(ctrlr); 4395 goto out; 4396 } 4397 4398 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4399 4400 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4401 if (err != 0) { 4402 free(ctrlr); 4403 goto out; 4404 } 4405 endpoint->ctrlr = ctrlr; 4406 4407 /* Notify the generic layer about the new admin queue pair */ 4408 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4409 4410 out: 4411 if (err != 0) { 4412 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4413 endpoint_id(endpoint), strerror(-err)); 4414 } 4415 4416 return err; 4417 } 4418 4419 static int 4420 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4421 const struct spdk_nvme_transport_id *trid, 4422 struct spdk_nvmf_listen_opts *listen_opts) 4423 { 4424 struct nvmf_vfio_user_transport *vu_transport; 4425 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4426 char path[PATH_MAX] = {}; 4427 char uuid[PATH_MAX] = {}; 4428 int ret; 4429 4430 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4431 transport); 4432 4433 pthread_mutex_lock(&vu_transport->lock); 4434 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4435 /* Only compare traddr */ 4436 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4437 pthread_mutex_unlock(&vu_transport->lock); 4438 return -EEXIST; 4439 } 4440 } 4441 pthread_mutex_unlock(&vu_transport->lock); 4442 4443 endpoint = calloc(1, sizeof(*endpoint)); 4444 if (!endpoint) { 4445 return -ENOMEM; 4446 } 4447 4448 pthread_mutex_init(&endpoint->lock, NULL); 4449 endpoint->devmem_fd = -1; 4450 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4451 endpoint->transport = vu_transport; 4452 4453 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4454 if (ret < 0 || ret >= PATH_MAX) { 4455 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4456 ret = -1; 4457 goto out; 4458 } 4459 4460 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4461 if (ret 
== -1) { 4462 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4463 endpoint_id(endpoint), path, spdk_strerror(errno)); 4464 goto out; 4465 } 4466 unlink(path); 4467 4468 endpoint->devmem_fd = ret; 4469 ret = ftruncate(endpoint->devmem_fd, 4470 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4471 if (ret != 0) { 4472 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4473 spdk_strerror(errno)); 4474 goto out; 4475 } 4476 4477 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4478 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4479 if (endpoint->bar0_doorbells == MAP_FAILED) { 4480 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4481 endpoint->bar0_doorbells = NULL; 4482 ret = -1; 4483 goto out; 4484 } 4485 4486 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4487 if (ret < 0 || ret >= PATH_MAX) { 4488 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 4489 spdk_strerror(errno)); 4490 ret = -1; 4491 goto out; 4492 } 4493 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4494 if (ret == -1) { 4495 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4496 endpoint_id(endpoint), path, spdk_strerror(errno)); 4497 goto out; 4498 } 4499 unlink(path); 4500 4501 endpoint->migr_fd = ret; 4502 ret = ftruncate(endpoint->migr_fd, 4503 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4504 if (ret != 0) { 4505 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4506 spdk_strerror(errno)); 4507 goto out; 4508 } 4509 4510 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4511 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4512 if (endpoint->migr_data == MAP_FAILED) { 4513 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4514 endpoint->migr_data = NULL; 4515 ret = -1; 4516 goto out; 4517 } 4518 4519 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 4520 if (ret < 0 || ret >= PATH_MAX) { 4521 SPDK_ERRLOG("%s: error to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 4522 ret = -1; 4523 goto out; 4524 } 4525 4526 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 4527 endpoint, VFU_DEV_TYPE_PCI); 4528 if (endpoint->vfu_ctx == NULL) { 4529 SPDK_ERRLOG("%s: error creating libmuser context: %m\n", 4530 endpoint_id(endpoint)); 4531 ret = -1; 4532 goto out; 4533 } 4534 4535 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, 4536 vfio_user_get_log_level()); 4537 if (ret < 0) { 4538 goto out; 4539 } 4540 4541 4542 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 4543 if (ret < 0) { 4544 goto out; 4545 } 4546 4547 ret = vfio_user_register_accept_poller(endpoint); 4548 4549 if (ret != 0) { 4550 goto out; 4551 } 4552 4553 pthread_mutex_lock(&vu_transport->lock); 4554 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 4555 pthread_mutex_unlock(&vu_transport->lock); 4556 4557 out: 4558 if (ret != 0) { 4559 nvmf_vfio_user_destroy_endpoint(endpoint); 4560 } 4561 4562 return ret; 4563 } 4564 4565 static void 4566 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 4567 const struct spdk_nvme_transport_id *trid) 4568 { 4569 struct nvmf_vfio_user_transport *vu_transport; 4570 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4571 4572 assert(trid != 
NULL);
4573 	assert(trid->traddr != NULL);
4574
4575 	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
4576
4577 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
4578 					transport);
4579
4580 	pthread_mutex_lock(&vu_transport->lock);
4581 	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
4582 		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
4583 			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
4584 			/* Defer freeing endpoint resources until the controller
4585 			 * is freed. There are two cases when we get here:
4586 			 * 1. the nvmf target is killed while a VM is connected
4587 			 * 2. the listener is removed via an RPC call
4588 			 * The nvmf library will disconnect all queue pairs.
4589 			 */
4590 			if (endpoint->ctrlr) {
4591 				assert(!endpoint->need_async_destroy);
4592 				endpoint->need_async_destroy = true;
4593 				pthread_mutex_unlock(&vu_transport->lock);
4594 				return;
4595 			}
4596
4597 			nvmf_vfio_user_destroy_endpoint(endpoint);
4598 			pthread_mutex_unlock(&vu_transport->lock);
4599 			return;
4600 		}
4601 	}
4602 	pthread_mutex_unlock(&vu_transport->lock);
4603
4604 	SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr);
4605 }
4606
4607 static void
4608 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport,
4609 			  struct spdk_nvmf_subsystem *subsystem,
4610 			  struct spdk_nvmf_ctrlr_data *cdata)
4611 {
4612 	struct nvmf_vfio_user_transport *vu_transport;
4613
4614 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);
4615
4616 	cdata->vid = SPDK_PCI_VID_NUTANIX;
4617 	cdata->ssvid = SPDK_PCI_VID_NUTANIX;
4618 	cdata->ieee[0] = 0x8d;
4619 	cdata->ieee[1] = 0x6b;
4620 	cdata->ieee[2] = 0x50;
4621 	memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls));
4622 	cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED;
4623 	cdata->oncs.compare = !vu_transport->transport_opts.disable_compare;
4624 	/* libvfio-user can only support 1 connection for now */
4625 	cdata->oncs.reservations = 0;
4626 	cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells;
4627 	cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare;
4628 }
4629
4630 static int
4631 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport,
4632 				const struct spdk_nvmf_subsystem *subsystem,
4633 				const struct spdk_nvme_transport_id *trid)
4634 {
4635 	struct nvmf_vfio_user_transport *vu_transport;
4636 	struct nvmf_vfio_user_endpoint *endpoint;
4637
4638 	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);
4639
4640 	pthread_mutex_lock(&vu_transport->lock);
4641 	TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
4642 		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
4643 			break;
4644 		}
4645 	}
4646 	pthread_mutex_unlock(&vu_transport->lock);
4647
4648 	if (endpoint == NULL) {
4649 		return -ENOENT;
4650 	}
4651
4652 	/* Drop const - we will later need to pause/unpause. */
4653 	endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem;
4654
4655 	return 0;
4656 }
4657
4658 /*
4659  * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US
4660  * frequency.
4661  *
4662  * For this endpoint (which at the libvfio-user level corresponds to a socket),
4663  * if we don't currently have a controller set up, peek to see if the socket is
4664  * able to accept a new connection.
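 *
 * A minimal sketch of the non-blocking attach pattern used below; the
 * return-value and errno handling is the assumption being illustrated:
 *
 *   if (vfu_attach_ctx(vfu_ctx) == 0) {
 *           // connection accepted: create the controller, stop this poller
 *   } else if (errno == EAGAIN || errno == EWOULDBLOCK) {
 *           // nothing pending yet: stay registered and try again later
 *   } else {
 *           // some other error: treated as a busy poll below
 *   }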
4665 */ 4666 static int 4667 nvmf_vfio_user_accept(void *ctx) 4668 { 4669 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4670 struct nvmf_vfio_user_transport *vu_transport; 4671 int err; 4672 4673 vu_transport = endpoint->transport; 4674 4675 if (endpoint->ctrlr != NULL) { 4676 return SPDK_POLLER_IDLE; 4677 } 4678 4679 /* While we're here, the controller is already destroyed, 4680 * subsystem may still be in RESUMING state, we will wait 4681 * until the subsystem is in RUNNING state. 4682 */ 4683 if (endpoint->need_resume) { 4684 return SPDK_POLLER_IDLE; 4685 } 4686 4687 err = vfu_attach_ctx(endpoint->vfu_ctx); 4688 if (err == 0) { 4689 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4690 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4691 if (err == 0) { 4692 /* 4693 * Unregister ourselves: now we've accepted a 4694 * connection, there is nothing for us to poll for, and 4695 * we will poll the connection via vfu_run_ctx() 4696 * instead. 4697 */ 4698 spdk_interrupt_unregister(&endpoint->accept_intr); 4699 spdk_poller_unregister(&endpoint->accept_poller); 4700 } 4701 return SPDK_POLLER_BUSY; 4702 } 4703 4704 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4705 return SPDK_POLLER_IDLE; 4706 } 4707 4708 return SPDK_POLLER_BUSY; 4709 } 4710 4711 static void 4712 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4713 struct spdk_nvme_transport_id *trid, 4714 struct spdk_nvmf_discovery_log_page_entry *entry) 4715 { } 4716 4717 static int vfio_user_poll_group_intr(void *ctx); 4718 4719 static void 4720 vfio_user_poll_group_add_intr(struct nvmf_vfio_user_poll_group *vu_group, 4721 struct spdk_nvmf_poll_group *group) 4722 { 4723 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK); 4724 assert(vu_group->intr_fd != -1); 4725 4726 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd, 4727 vfio_user_poll_group_intr, vu_group); 4728 assert(vu_group->intr != NULL); 4729 } 4730 4731 static struct spdk_nvmf_transport_poll_group * 4732 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4733 struct spdk_nvmf_poll_group *group) 4734 { 4735 struct nvmf_vfio_user_transport *vu_transport; 4736 struct nvmf_vfio_user_poll_group *vu_group; 4737 4738 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4739 transport); 4740 4741 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4742 4743 vu_group = calloc(1, sizeof(*vu_group)); 4744 if (vu_group == NULL) { 4745 SPDK_ERRLOG("Error allocating poll group: %m"); 4746 return NULL; 4747 } 4748 4749 if (in_interrupt_mode(vu_transport)) { 4750 vfio_user_poll_group_add_intr(vu_group, group); 4751 } 4752 4753 TAILQ_INIT(&vu_group->sqs); 4754 4755 pthread_mutex_lock(&vu_transport->pg_lock); 4756 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4757 if (vu_transport->next_pg == NULL) { 4758 vu_transport->next_pg = vu_group; 4759 } 4760 pthread_mutex_unlock(&vu_transport->pg_lock); 4761 4762 return &vu_group->group; 4763 } 4764 4765 static struct spdk_nvmf_transport_poll_group * 4766 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4767 { 4768 struct nvmf_vfio_user_transport *vu_transport; 4769 struct nvmf_vfio_user_poll_group **vu_group; 4770 struct nvmf_vfio_user_sq *sq; 4771 struct nvmf_vfio_user_cq *cq; 4772 4773 struct spdk_nvmf_transport_poll_group *result = NULL; 4774 4775 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4776 cq = sq->ctrlr->cqs[sq->cqid]; 4777 assert(cq != NULL); 4778 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct 
nvmf_vfio_user_transport, transport); 4779 4780 pthread_mutex_lock(&vu_transport->pg_lock); 4781 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4782 goto out; 4783 } 4784 4785 if (!nvmf_qpair_is_admin_queue(qpair)) { 4786 /* 4787 * If this is shared IO CQ case, just return the used CQ's poll 4788 * group, so I/O completions don't have to use 4789 * spdk_thread_send_msg(). 4790 */ 4791 if (cq->group != NULL) { 4792 result = cq->group; 4793 goto out; 4794 } 4795 4796 /* 4797 * If we're in interrupt mode, align all qpairs for a controller 4798 * on the same poll group by default, unless requested. This can 4799 * be lower in performance than running on a single poll group, 4800 * so we disable spreading by default. 4801 */ 4802 if (in_interrupt_mode(vu_transport) && 4803 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) { 4804 result = sq->ctrlr->sqs[0]->group; 4805 goto out; 4806 } 4807 4808 } 4809 4810 vu_group = &vu_transport->next_pg; 4811 assert(*vu_group != NULL); 4812 4813 result = &(*vu_group)->group; 4814 *vu_group = TAILQ_NEXT(*vu_group, link); 4815 if (*vu_group == NULL) { 4816 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4817 } 4818 4819 out: 4820 if (cq->group == NULL) { 4821 cq->group = result; 4822 } 4823 4824 pthread_mutex_unlock(&vu_transport->pg_lock); 4825 return result; 4826 } 4827 4828 static void 4829 vfio_user_poll_group_del_intr(struct nvmf_vfio_user_poll_group *vu_group) 4830 { 4831 assert(vu_group->intr_fd != -1); 4832 4833 spdk_interrupt_unregister(&vu_group->intr); 4834 4835 close(vu_group->intr_fd); 4836 vu_group->intr_fd = -1; 4837 } 4838 4839 /* called when process exits */ 4840 static void 4841 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4842 { 4843 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4844 struct nvmf_vfio_user_transport *vu_transport; 4845 4846 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4847 4848 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4849 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4850 transport); 4851 4852 if (in_interrupt_mode(vu_transport)) { 4853 vfio_user_poll_group_del_intr(vu_group); 4854 } 4855 4856 pthread_mutex_lock(&vu_transport->pg_lock); 4857 next_tgroup = TAILQ_NEXT(vu_group, link); 4858 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4859 if (next_tgroup == NULL) { 4860 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4861 } 4862 if (vu_transport->next_pg == vu_group) { 4863 vu_transport->next_pg = next_tgroup; 4864 } 4865 pthread_mutex_unlock(&vu_transport->pg_lock); 4866 4867 free(vu_group); 4868 } 4869 4870 static void 4871 _vfio_user_qpair_disconnect(void *ctx) 4872 { 4873 struct nvmf_vfio_user_sq *sq = ctx; 4874 4875 spdk_nvmf_qpair_disconnect(&sq->qpair); 4876 } 4877 4878 /* The function is used when socket connection is destroyed */ 4879 static int 4880 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4881 { 4882 struct nvmf_vfio_user_sq *sq; 4883 struct nvmf_vfio_user_endpoint *endpoint; 4884 4885 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4886 4887 endpoint = ctrlr->endpoint; 4888 assert(endpoint != NULL); 4889 4890 pthread_mutex_lock(&endpoint->lock); 4891 endpoint->need_relisten = true; 4892 ctrlr->disconnect = true; 4893 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4894 endpoint->ctrlr = NULL; 4895 free_ctrlr(ctrlr); 4896 pthread_mutex_unlock(&endpoint->lock); 4897 return 0; 4898 } 4899 4900 TAILQ_FOREACH(sq, 
&ctrlr->connected_sqs, tailq) { 4901 /* add another round thread poll to avoid recursive endpoint lock */ 4902 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4903 } 4904 pthread_mutex_unlock(&endpoint->lock); 4905 4906 return 0; 4907 } 4908 4909 /* 4910 * Poll for and process any incoming vfio-user messages. 4911 */ 4912 static int 4913 vfio_user_poll_vfu_ctx(void *ctx) 4914 { 4915 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4916 int ret; 4917 4918 assert(ctrlr != NULL); 4919 4920 /* This will call access_bar0_fn() if there are any writes 4921 * to the portion of the BAR that is not mmap'd */ 4922 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4923 if (spdk_unlikely(ret == -1)) { 4924 if (errno == EBUSY) { 4925 return SPDK_POLLER_IDLE; 4926 } 4927 4928 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4929 4930 /* 4931 * We lost the client; the reset callback will already have 4932 * unregistered the interrupt. 4933 */ 4934 if (errno == ENOTCONN) { 4935 vfio_user_destroy_ctrlr(ctrlr); 4936 return SPDK_POLLER_BUSY; 4937 } 4938 4939 /* 4940 * We might not have got a reset callback in this case, so 4941 * explicitly unregister the interrupt here. 4942 */ 4943 spdk_interrupt_unregister(&ctrlr->intr); 4944 ctrlr->intr_fd = -1; 4945 fail_ctrlr(ctrlr); 4946 } 4947 4948 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4949 } 4950 4951 struct vfio_user_post_cpl_ctx { 4952 struct nvmf_vfio_user_ctrlr *ctrlr; 4953 struct nvmf_vfio_user_cq *cq; 4954 struct spdk_nvme_cpl cpl; 4955 }; 4956 4957 static void 4958 _post_completion_msg(void *ctx) 4959 { 4960 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4961 4962 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4963 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4964 free(cpl_ctx); 4965 } 4966 4967 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4968 4969 static int 4970 vfio_user_poll_group_process(void *ctx) 4971 { 4972 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4973 int ret = 0; 4974 4975 SPDK_DEBUGLOG(vfio_user_db, "pg:%p got intr\n", vu_group); 4976 4977 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4978 4979 /* 4980 * Re-arm the event indexes. NB: this also could rearm other 4981 * controller's SQs. 4982 */ 4983 ret |= vfio_user_poll_group_rearm(vu_group); 4984 4985 vu_group->stats.pg_process_count++; 4986 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4987 } 4988 4989 static int 4990 vfio_user_poll_group_intr(void *ctx) 4991 { 4992 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4993 eventfd_t val; 4994 4995 eventfd_read(vu_group->intr_fd, &val); 4996 4997 vu_group->stats.intr++; 4998 4999 return vfio_user_poll_group_process(ctx); 5000 } 5001 5002 /* 5003 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 5004 * the SQs assigned to our own poll group. Other poll groups are handled via 5005 * vfio_user_poll_group_intr(). 5006 */ 5007 static int 5008 vfio_user_ctrlr_intr(void *ctx) 5009 { 5010 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 5011 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 5012 struct nvmf_vfio_user_poll_group *vu_group; 5013 int ret = SPDK_POLLER_IDLE; 5014 5015 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 5016 5017 SPDK_DEBUGLOG(vfio_user_db, "ctrlr pg:%p got intr\n", vu_ctrlr_group); 5018 5019 vu_ctrlr_group->stats.ctrlr_intr++; 5020 5021 /* 5022 * Poll vfio-user for this controller. 
We need to do this before polling 5023 * any SQs, as this is where doorbell writes may be handled. 5024 */ 5025 ret = vfio_user_poll_vfu_ctx(vu_ctrlr); 5026 5027 /* 5028 * `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, 5029 * just return for this case. 5030 */ 5031 if (vu_ctrlr->sqs[0] == NULL) { 5032 return ret; 5033 } 5034 5035 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) { 5036 /* 5037 * We may have just written to a doorbell owned by another 5038 * reactor: we need to prod them to make sure its SQs are polled 5039 * *after* the doorbell value is updated. 5040 */ 5041 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) { 5042 if (vu_group != vu_ctrlr_group) { 5043 SPDK_DEBUGLOG(vfio_user_db, "prodding pg:%p\n", vu_group); 5044 eventfd_write(vu_group->intr_fd, 1); 5045 } 5046 } 5047 } 5048 5049 ret |= vfio_user_poll_group_process(vu_ctrlr_group); 5050 5051 return ret; 5052 } 5053 5054 static void 5055 vfio_user_ctrlr_set_intr_mode(struct spdk_poller *poller, void *ctx, 5056 bool interrupt_mode) 5057 { 5058 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 5059 assert(ctrlr != NULL); 5060 assert(ctrlr->endpoint != NULL); 5061 5062 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 5063 ctrlr_id(ctrlr), interrupt_mode); 5064 5065 /* 5066 * interrupt_mode needs to persist across controller resets, so store 5067 * it in the endpoint instead. 5068 */ 5069 ctrlr->endpoint->interrupt_mode = interrupt_mode; 5070 5071 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 5072 } 5073 5074 /* 5075 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 5076 * set up and we can start operating on this controller. 5077 */ 5078 static void 5079 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 5080 struct spdk_nvmf_ctrlr *ctrlr) 5081 { 5082 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 5083 5084 vu_ctrlr->ctrlr = ctrlr; 5085 vu_ctrlr->cntlid = ctrlr->cntlid; 5086 vu_ctrlr->thread = spdk_get_thread(); 5087 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 5088 5089 if (!in_interrupt_mode(endpoint->transport)) { 5090 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5091 vu_ctrlr, 1000); 5092 return; 5093 } 5094 5095 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5096 vu_ctrlr, 0); 5097 5098 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 5099 assert(vu_ctrlr->intr_fd != -1); 5100 5101 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 5102 vfio_user_ctrlr_intr, vu_ctrlr); 5103 5104 assert(vu_ctrlr->intr != NULL); 5105 5106 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 5107 vfio_user_ctrlr_set_intr_mode, 5108 vu_ctrlr); 5109 } 5110 5111 static int 5112 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 5113 { 5114 struct nvmf_vfio_user_poll_group *vu_group; 5115 struct nvmf_vfio_user_sq *sq = cb_arg; 5116 struct nvmf_vfio_user_cq *admin_cq; 5117 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5118 struct nvmf_vfio_user_endpoint *endpoint; 5119 5120 assert(sq != NULL); 5121 assert(req != NULL); 5122 5123 vu_ctrlr = sq->ctrlr; 5124 assert(vu_ctrlr != NULL); 5125 endpoint = vu_ctrlr->endpoint; 5126 assert(endpoint != NULL); 5127 5128 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 5129 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 5130 endpoint->ctrlr = NULL; 5131 free_ctrlr(vu_ctrlr); 5132 return -1; 5133 } 5134 5135 vu_group = 
SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 5136 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 5137 5138 admin_cq = vu_ctrlr->cqs[0]; 5139 assert(admin_cq != NULL); 5140 assert(admin_cq->group != NULL); 5141 assert(admin_cq->group->group->thread != NULL); 5142 5143 pthread_mutex_lock(&endpoint->lock); 5144 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 5145 assert(admin_cq->group->group->thread == spdk_get_thread()); 5146 /* 5147 * The admin queue is special as SQ0 and CQ0 are created 5148 * together. 5149 */ 5150 admin_cq->cq_ref = 1; 5151 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 5152 } else { 5153 /* For I/O queues this command was generated in response to an 5154 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 5155 * been completed. Complete it now. 5156 */ 5157 if (sq->post_create_io_sq_completion) { 5158 if (admin_cq->group->group->thread != spdk_get_thread()) { 5159 struct vfio_user_post_cpl_ctx *cpl_ctx; 5160 5161 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 5162 if (!cpl_ctx) { 5163 return -ENOMEM; 5164 } 5165 cpl_ctx->ctrlr = vu_ctrlr; 5166 cpl_ctx->cq = admin_cq; 5167 cpl_ctx->cpl.sqid = 0; 5168 cpl_ctx->cpl.cdw0 = 0; 5169 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 5170 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 5171 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5172 5173 spdk_thread_send_msg(admin_cq->group->group->thread, 5174 _post_completion_msg, 5175 cpl_ctx); 5176 } else { 5177 post_completion(vu_ctrlr, admin_cq, 0, 0, 5178 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 5179 } 5180 sq->post_create_io_sq_completion = false; 5181 } else if (in_interrupt_mode(endpoint->transport)) { 5182 /* 5183 * If we're live migrating a guest, there is a window 5184 * where the I/O queues haven't been set up but the 5185 * device is in running state, during which the guest 5186 * might write to a doorbell. This doorbell write will 5187 * go unnoticed, so let's poll the whole controller to 5188 * pick that up. 5189 */ 5190 ctrlr_kick(vu_ctrlr); 5191 } 5192 sq->sq_state = VFIO_USER_SQ_ACTIVE; 5193 } 5194 5195 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 5196 pthread_mutex_unlock(&endpoint->lock); 5197 5198 free(req->req.iov[0].iov_base); 5199 req->req.iov[0].iov_base = NULL; 5200 req->req.iovcnt = 0; 5201 5202 return 0; 5203 } 5204 5205 static void 5206 _nvmf_vfio_user_poll_group_add(void *req) 5207 { 5208 spdk_nvmf_request_exec(req); 5209 } 5210 5211 /* 5212 * Add the given qpair to the given poll group. New qpairs are added via 5213 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 5214 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 5215 * nvmf_transport_poll_group_add(). 
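 *
 * In sketch form, the fabrics CONNECT that the function below fabricates on
 * behalf of the guest (a TCP or RDMA host would send this explicitly; the
 * field values mirror the code that follows):
 *
 *   cmd.connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
 *   cmd.connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
 *   cmd.connect_cmd.qid    = admin ? 0 : qpair->qid;
 *   cmd.connect_cmd.sqsize = sq->size - 1;        // 0's based
 *   data.cntlid            = ctrlr->cntlid;       // bind to the existing controller
 *   // data.subnqn is filled with the subsystem NQN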
5216 */ 5217 static int 5218 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 5219 struct spdk_nvmf_qpair *qpair) 5220 { 5221 struct nvmf_vfio_user_sq *sq; 5222 struct nvmf_vfio_user_req *vu_req; 5223 struct nvmf_vfio_user_ctrlr *ctrlr; 5224 struct spdk_nvmf_request *req; 5225 struct spdk_nvmf_fabric_connect_data *data; 5226 bool admin; 5227 5228 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5229 sq->group = group; 5230 ctrlr = sq->ctrlr; 5231 5232 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 5233 ctrlr_id(ctrlr), sq->qpair.qid, 5234 sq, qpair, group); 5235 5236 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 5237 5238 vu_req = get_nvmf_vfio_user_req(sq); 5239 if (vu_req == NULL) { 5240 return -1; 5241 } 5242 5243 req = &vu_req->req; 5244 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 5245 req->cmd->connect_cmd.cid = 0; 5246 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 5247 req->cmd->connect_cmd.recfmt = 0; 5248 req->cmd->connect_cmd.sqsize = sq->size - 1; 5249 req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid; 5250 5251 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 5252 5253 data = calloc(1, req->length); 5254 if (data == NULL) { 5255 nvmf_vfio_user_req_free(req); 5256 return -ENOMEM; 5257 } 5258 5259 SPDK_IOV_ONE(req->iov, &req->iovcnt, data, req->length); 5260 5261 data->cntlid = ctrlr->cntlid; 5262 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5263 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5264 5265 vu_req->cb_fn = handle_queue_connect_rsp; 5266 vu_req->cb_arg = sq; 5267 5268 SPDK_DEBUGLOG(nvmf_vfio, 5269 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5270 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5271 5272 /* 5273 * By the time transport's poll_group_add() callback is executed, the 5274 * qpair isn't in the ACTIVE state yet, so spdk_nvmf_request_exec() 5275 * would fail. The state changes to ACTIVE immediately after the 5276 * callback finishes, so delay spdk_nvmf_request_exec() by sending a 5277 * message. 
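	 *
	 * The deferral itself is the standard SPDK thread-message pattern
	 * (sketch; fn and arg are placeholders): fn(arg) runs on this same
	 * thread, but only after the current callback has returned and the
	 * qpair has become ACTIVE:
	 *
	 *   spdk_thread_send_msg(spdk_get_thread(), fn, arg);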
5278 */ 5279 spdk_thread_send_msg(spdk_get_thread(), _nvmf_vfio_user_poll_group_add, req); 5280 return 0; 5281 } 5282 5283 static int 5284 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5285 struct spdk_nvmf_qpair *qpair) 5286 { 5287 struct nvmf_vfio_user_sq *sq; 5288 struct nvmf_vfio_user_poll_group *vu_group; 5289 5290 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5291 5292 SPDK_DEBUGLOG(nvmf_vfio, 5293 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5294 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5295 5296 5297 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5298 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5299 5300 return 0; 5301 } 5302 5303 static void 5304 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5305 { 5306 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5307 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5308 vu_req->iovcnt = 0; 5309 vu_req->req.iovcnt = 0; 5310 vu_req->req.length = 0; 5311 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5312 5313 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5314 } 5315 5316 static int 5317 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5318 { 5319 struct nvmf_vfio_user_sq *sq; 5320 struct nvmf_vfio_user_req *vu_req; 5321 5322 assert(req != NULL); 5323 5324 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5325 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5326 5327 _nvmf_vfio_user_req_free(sq, vu_req); 5328 5329 return 0; 5330 } 5331 5332 static int 5333 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5334 { 5335 struct nvmf_vfio_user_sq *sq; 5336 struct nvmf_vfio_user_req *vu_req; 5337 5338 assert(req != NULL); 5339 5340 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5341 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5342 5343 if (vu_req->cb_fn != NULL) { 5344 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5345 fail_ctrlr(sq->ctrlr); 5346 } 5347 } 5348 5349 _nvmf_vfio_user_req_free(sq, vu_req); 5350 5351 return 0; 5352 } 5353 5354 static void 5355 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5356 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5357 { 5358 struct nvmf_vfio_user_sq *sq; 5359 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5360 struct nvmf_vfio_user_endpoint *endpoint; 5361 struct vfio_user_delete_sq_ctx *del_ctx; 5362 5363 assert(qpair != NULL); 5364 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5365 vu_ctrlr = sq->ctrlr; 5366 endpoint = vu_ctrlr->endpoint; 5367 del_ctx = sq->delete_ctx; 5368 sq->delete_ctx = NULL; 5369 5370 pthread_mutex_lock(&endpoint->lock); 5371 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5372 delete_sq_done(vu_ctrlr, sq); 5373 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5374 endpoint->ctrlr = NULL; 5375 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5376 /* The controller will be freed, we can resume the subsystem 5377 * now so that the endpoint can be ready to accept another 5378 * new connection. 5379 */ 5380 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5381 vfio_user_endpoint_resume_done, endpoint); 5382 } 5383 free_ctrlr(vu_ctrlr); 5384 } 5385 pthread_mutex_unlock(&endpoint->lock); 5386 5387 if (del_ctx) { 5388 vfio_user_qpair_delete_cb(del_ctx); 5389 } 5390 5391 if (cb_fn) { 5392 cb_fn(cb_arg); 5393 } 5394 } 5395 5396 /** 5397 * Returns a preallocated request, or NULL if there isn't one available. 
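 *
 * Typical usage in this file (sketch; handle_cmd_req() below does this):
 *
 *   struct nvmf_vfio_user_req *vu_req = get_nvmf_vfio_user_req(sq);
 *
 *   if (vu_req == NULL) {
 *           // request pool exhausted: complete the command with an
 *           // internal device error instead of executing it
 *   }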
5398 */ 5399 static struct nvmf_vfio_user_req * 5400 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5401 { 5402 struct nvmf_vfio_user_req *req; 5403 5404 if (sq == NULL) { 5405 return NULL; 5406 } 5407 5408 req = TAILQ_FIRST(&sq->free_reqs); 5409 if (req == NULL) { 5410 return NULL; 5411 } 5412 5413 TAILQ_REMOVE(&sq->free_reqs, req, link); 5414 5415 return req; 5416 } 5417 5418 static int 5419 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5420 { 5421 uint16_t nr; 5422 uint32_t nlb, nsid; 5423 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5424 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5425 struct spdk_nvmf_ns *ns; 5426 5427 nsid = cmd->nsid; 5428 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5429 if (ns == NULL || ns->bdev == NULL) { 5430 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5431 return -EINVAL; 5432 } 5433 5434 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5435 nr = cmd->cdw10_bits.dsm.nr + 1; 5436 return nr * sizeof(struct spdk_nvme_dsm_range); 5437 } 5438 5439 if (cmd->opc == SPDK_NVME_OPC_COPY) { 5440 nr = (cmd->cdw12 & 0x000000ffu) + 1; 5441 return nr * sizeof(struct spdk_nvme_scc_source_range); 5442 } 5443 5444 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5445 return nlb * spdk_bdev_get_block_size(ns->bdev); 5446 } 5447 5448 static int 5449 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5450 { 5451 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5452 uint32_t len = 0, numdw = 0; 5453 uint8_t fid; 5454 int iovcnt; 5455 5456 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5457 5458 if (req->xfer == SPDK_NVME_DATA_NONE) { 5459 return 0; 5460 } 5461 5462 switch (cmd->opc) { 5463 case SPDK_NVME_OPC_IDENTIFY: 5464 len = 4096; 5465 break; 5466 case SPDK_NVME_OPC_GET_LOG_PAGE: 5467 numdw = ((((uint32_t)cmd->cdw11_bits.get_log_page.numdu << 16) | 5468 cmd->cdw10_bits.get_log_page.numdl) + 1); 5469 if (numdw > UINT32_MAX / 4) { 5470 return -EINVAL; 5471 } 5472 len = numdw * 4; 5473 break; 5474 case SPDK_NVME_OPC_GET_FEATURES: 5475 case SPDK_NVME_OPC_SET_FEATURES: 5476 fid = cmd->cdw10_bits.set_features.fid; 5477 switch (fid) { 5478 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5479 len = 4096; 5480 break; 5481 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5482 len = 256; 5483 break; 5484 case SPDK_NVME_FEAT_TIMESTAMP: 5485 len = 8; 5486 break; 5487 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5488 len = 512; 5489 break; 5490 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5491 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5492 len = 16; 5493 } else { 5494 len = 8; 5495 } 5496 break; 5497 default: 5498 return 0; 5499 } 5500 break; 5501 case SPDK_NVME_OPC_FABRIC: 5502 return -ENOTSUP; 5503 default: 5504 return 0; 5505 } 5506 5507 /* ADMIN command will not use SGL */ 5508 if (cmd->psdt != 0) { 5509 return -EINVAL; 5510 } 5511 5512 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5513 if (iovcnt < 0) { 5514 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5515 ctrlr_id(ctrlr), cmd->opc); 5516 return -1; 5517 } 5518 req->length = len; 5519 req->iovcnt = iovcnt; 5520 5521 return 0; 5522 } 5523 5524 /* 5525 * Map an I/O command's buffers. 5526 * 5527 * Returns 0 on success and -errno on failure. 
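 *
 * For reference, the PRP accounting this mapping relies on (a sketch;
 * "page_size" is the controller memory page size, 4KiB with the default
 * CC.MPS, and prp1/len come from the command being mapped):
 *
 *   size_t first = page_size - (prp1 & (page_size - 1)); // bytes PRP1 covers
 *   size_t pages = (len <= first)
 *                  ? 1
 *                  : 1 + (len - first + page_size - 1) / page_size;
 *   // pages == 2: PRP2 points at data; pages > 2: PRP2 points at a PRP list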
5528 */ 5529 static int 5530 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5531 { 5532 int len, iovcnt; 5533 struct spdk_nvme_cmd *cmd; 5534 5535 assert(ctrlr != NULL); 5536 assert(req != NULL); 5537 5538 cmd = &req->cmd->nvme_cmd; 5539 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5540 5541 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5542 return 0; 5543 } 5544 5545 len = get_nvmf_io_req_length(req); 5546 if (len < 0) { 5547 return -EINVAL; 5548 } 5549 req->length = len; 5550 5551 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5552 if (iovcnt < 0) { 5553 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5554 return -EFAULT; 5555 } 5556 req->iovcnt = iovcnt; 5557 5558 return 0; 5559 } 5560 5561 static int 5562 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5563 struct nvmf_vfio_user_sq *sq) 5564 { 5565 int err; 5566 struct nvmf_vfio_user_req *vu_req; 5567 struct spdk_nvmf_request *req; 5568 5569 assert(ctrlr != NULL); 5570 assert(cmd != NULL); 5571 5572 vu_req = get_nvmf_vfio_user_req(sq); 5573 if (spdk_unlikely(vu_req == NULL)) { 5574 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5575 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5576 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5577 5578 } 5579 req = &vu_req->req; 5580 5581 assert(req->qpair != NULL); 5582 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5583 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5584 5585 vu_req->cb_fn = handle_cmd_rsp; 5586 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5587 req->cmd->nvme_cmd = *cmd; 5588 5589 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5590 err = map_admin_cmd_req(ctrlr, req); 5591 } else { 5592 switch (cmd->opc) { 5593 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5594 case SPDK_NVME_OPC_RESERVATION_REPORT: 5595 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5596 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5597 case SPDK_NVME_OPC_FABRIC: 5598 err = -ENOTSUP; 5599 break; 5600 default: 5601 err = map_io_cmd_req(ctrlr, req); 5602 break; 5603 } 5604 } 5605 5606 if (spdk_unlikely(err < 0)) { 5607 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5608 ctrlr_id(ctrlr), cmd->opc); 5609 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5610 req->rsp->nvme_cpl.status.sc = err == -ENOTSUP ? 5611 SPDK_NVME_SC_INVALID_OPCODE : 5612 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5613 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5614 _nvmf_vfio_user_req_free(sq, vu_req); 5615 return err; 5616 } 5617 5618 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5619 spdk_nvmf_request_exec(req); 5620 5621 return 0; 5622 } 5623 5624 /* 5625 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5626 * here: if the host isn't up to date, and is apparently not actively processing 5627 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5628 */ 5629 static void 5630 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5631 struct nvmf_vfio_user_sq *sq) 5632 { 5633 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5634 uint32_t cq_head; 5635 uint32_t cq_tail; 5636 5637 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5638 return; 5639 } 5640 5641 cq_tail = *cq_tailp(cq); 5642 5643 /* Already sent? 
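	 *
	 * Sketch of the decision below: skip if an IRQ was already fired for
	 * this tail value; otherwise fire one only when completions are
	 * outstanding and the host's head has not advanced since the last poll:
	 *
	 *   bool fire = cq_tail != cq->last_trigger_irq_tail &&
	 *               cq_head != cq_tail &&
	 *               cq_head == cq->last_head;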
 */
5644 	if (cq_tail == cq->last_trigger_irq_tail) {
5645 		return;
5646 	}
5647
5648 	spdk_ivdt_dcache(cq_dbl_headp(cq));
5649 	cq_head = *cq_dbl_headp(cq);
5650
5651 	if (cq_head != cq_tail && cq_head == cq->last_head) {
5652 		int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
5653 		if (err != 0) {
5654 			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
5655 				    ctrlr_id(ctrlr));
5656 		} else {
5657 			cq->last_trigger_irq_tail = cq_tail;
5658 		}
5659 	}
5660
5661 	cq->last_head = cq_head;
5662 }
5663
5664 /* Returns the number of commands processed, or a negative value on error. */
5665 static int
5666 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq)
5667 {
5668 	struct nvmf_vfio_user_ctrlr *ctrlr;
5669 	uint32_t new_tail;
5670 	int count = 0;
5671
5672 	assert(sq != NULL);
5673
5674 	ctrlr = sq->ctrlr;
5675
5676 	/*
5677 	 * A quiesced, or migrating, controller should never process new
5678 	 * commands.
5679 	 */
5680 	if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
5681 		return SPDK_POLLER_IDLE;
5682 	}
5683
5684 	if (ctrlr->adaptive_irqs_enabled) {
5685 		handle_suppressed_irq(ctrlr, sq);
5686 	}
5687
5688 	/* On aarch64 platforms, doorbell updates from the guest VM may not be
5689 	 * visible on the SPDK target side. This is caused by a memory type
5690 	 * mismatch: the guest maps the doorbells as device memory, while the
5691 	 * SPDK target treats them as normal memory, which is a problem on ARM
5692 	 * platforms.
5693 	 * Refer to "https://developer.arm.com/documentation/102376/0100/
5694 	 * Memory-aliasing-and-mismatched-memory-types". A plain spdk_mb() is
5695 	 * not enough to fix this; invalidating the cache line with "dc civac"
5696 	 * does.
5697 	 */
5698 	spdk_ivdt_dcache(sq_dbl_tailp(sq));
5699
5700 	/* Load-Acquire. */
5701 	new_tail = *sq_dbl_tailp(sq);
5702
5703 	new_tail = new_tail & 0xffffu;
5704 	if (spdk_unlikely(new_tail >= sq->size)) {
5705 		SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid,
5706 			      new_tail);
5707 		spdk_nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE);
5708
5709 		return -1;
5710 	}
5711
5712 	if (*sq_headp(sq) == new_tail) {
5713 		return 0;
5714 	}
5715
5716 	SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n",
5717 		      ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail);
5718 	if (ctrlr->sdbl != NULL) {
5719 		SPDK_DEBUGLOG(nvmf_vfio,
5720 			      "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n",
5721 			      ctrlr_id(ctrlr), sq->qid,
5722 			      ctrlr->bar0_doorbells[queue_index(sq->qid, false)],
5723 			      ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)],
5724 			      ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]);
5725 	}
5726
5727 	/*
5728 	 * Ensure that changes to the queue are visible to us.
5729 	 * The host driver should write the queue first, do a wmb(), and then
5730 	 * update the SQ tail doorbell (their Store-Release).
5731 	 */
5732 	spdk_rmb();
5733
5734 	count = handle_sq_tdbl_write(ctrlr, new_tail, sq);
5735 	if (spdk_unlikely(count < 0)) {
5736 		fail_ctrlr(ctrlr);
5737 	}
5738
5739 	return count;
5740 }
5741
5742 /*
5743  * vfio-user transport poll handler. Note that the library context is polled in
5744  * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
5745  * active SQs.
5746  *
5747  * Returns the number of commands processed, or a negative value on error.
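 *
 * The per-poll counters updated here (polls, poll_reqs, poll_reqs_squared)
 * are enough to derive the mean and variance of requests handled per poll
 * later on; a sketch, assuming polls > 0:
 *
 *   double mean     = (double)poll_reqs / polls;
 *   double variance = (double)poll_reqs_squared / polls - mean * mean;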
/*
 * vfio-user transport poll handler. Note that the library context is polled in
 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
 * active SQs.
 *
 * Returns the number of commands processed, or a negative value on error.
 */
static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_sq *sq, *tmp;
	int count = 0;

	assert(group != NULL);

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n");

	TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) {
		int ret;

		if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
			continue;
		}

		ret = nvmf_vfio_user_sq_poll(sq);

		if (spdk_unlikely(ret < 0)) {
			return ret;
		}

		count += ret;
	}

	vu_group->stats.polls++;
	vu_group->stats.poll_reqs += count;
	vu_group->stats.poll_reqs_squared += count * count;
	if (count == 0) {
		vu_group->stats.polls_spurious++;
	}

	return count;
}

static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
				    struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvme_transport_id *trid)
{
	return 0;
}

static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
				     struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static void
nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_request *req_to_abort = NULL;
	struct spdk_nvmf_request *temp_req = NULL;
	uint16_t cid;

	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;

	TAILQ_FOREACH(temp_req, &qpair->outstanding, link) {
		struct nvmf_vfio_user_req *vu_req;

		vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req);

		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
			req_to_abort = temp_req;
			break;
		}
	}

	if (req_to_abort == NULL) {
		spdk_nvmf_request_complete(req);
		return;
	}

	req->req_to_abort = req_to_abort;
	nvmf_ctrlr_abort_request(req);
}
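/*
 * Note on nvmf_vfio_user_qpair_abort_request() above: for the NVMe Abort
 * command, Command Dword 10 carries the SQ identifier in bits 15:00 and the
 * command identifier to abort in bits 31:16; cdw10_bits.abort.cid is the
 * latter. Only requests still in the EXECUTING state can be matched here.
 */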
static void
nvmf_vfio_user_poll_group_dump_stat(struct spdk_nvmf_transport_poll_group *group,
				    struct spdk_json_write_ctx *w)
{
	struct nvmf_vfio_user_poll_group *vu_group = SPDK_CONTAINEROF(group,
			struct nvmf_vfio_user_poll_group, group);
	uint64_t polls_denom;

	spdk_json_write_named_uint64(w, "ctrlr_intr", vu_group->stats.ctrlr_intr);
	spdk_json_write_named_uint64(w, "ctrlr_kicks", vu_group->stats.ctrlr_kicks);
	spdk_json_write_named_uint64(w, "won", vu_group->stats.won);
	spdk_json_write_named_uint64(w, "lost", vu_group->stats.lost);
	spdk_json_write_named_uint64(w, "lost_count", vu_group->stats.lost_count);
	spdk_json_write_named_uint64(w, "rearms", vu_group->stats.rearms);
	spdk_json_write_named_uint64(w, "pg_process_count", vu_group->stats.pg_process_count);
	spdk_json_write_named_uint64(w, "intr", vu_group->stats.intr);
	spdk_json_write_named_uint64(w, "polls", vu_group->stats.polls);
	spdk_json_write_named_uint64(w, "polls_spurious", vu_group->stats.polls_spurious);
	spdk_json_write_named_uint64(w, "poll_reqs", vu_group->stats.poll_reqs);
	/* With N = polls, S1 = poll_reqs and S2 = poll_reqs_squared, the sample
	 * variance of requests handled per poll is (N*S2 - S1*S1) / (N*(N-1));
	 * the value reported below is its square root, i.e. the standard
	 * deviation.
	 */
	polls_denom = vu_group->stats.polls * (vu_group->stats.polls - 1);
	if (polls_denom) {
		uint64_t n = vu_group->stats.polls * vu_group->stats.poll_reqs_squared - vu_group->stats.poll_reqs *
			     vu_group->stats.poll_reqs;
		spdk_json_write_named_double(w, "poll_reqs_variance", sqrt(n / polls_denom));
	}

	spdk_json_write_named_uint64(w, "cqh_admin_writes", vu_group->stats.cqh_admin_writes);
	spdk_json_write_named_uint64(w, "cqh_io_writes", vu_group->stats.cqh_io_writes);
}

static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
	opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
	opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
	opts->in_capsule_data_size = 0;
	opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
	opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
	opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
	opts->num_shared_buffers = 0;
	opts->buf_cache_size = 0;
	opts->association_timeout = 0;
	opts->transport_specific = NULL;
}

const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
	.name = "VFIOUSER",
	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
	.opts_init = nvmf_vfio_user_opts_init,
	.create = nvmf_vfio_user_create,
	.destroy = nvmf_vfio_user_destroy,

	.listen = nvmf_vfio_user_listen,
	.stop_listen = nvmf_vfio_user_stop_listen,
	.cdata_init = nvmf_vfio_user_cdata_init,
	.listen_associate = nvmf_vfio_user_listen_associate,

	.listener_discover = nvmf_vfio_user_discover,

	.poll_group_create = nvmf_vfio_user_poll_group_create,
	.get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group,
	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
	.poll_group_add = nvmf_vfio_user_poll_group_add,
	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
	.poll_group_poll = nvmf_vfio_user_poll_group_poll,

	.req_free = nvmf_vfio_user_req_free,
	.req_complete = nvmf_vfio_user_req_complete,

	.qpair_fini = nvmf_vfio_user_close_qpair,
	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,

	.poll_group_dump_stat = nvmf_vfio_user_poll_group_dump_stat,
};

SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
SPDK_LOG_REGISTER_COMPONENT(vfio_user_db)
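/*
 * Illustrative usage sketch (NQN and paths are placeholders): the transport
 * registered above is typically created and exposed through the SPDK RPC
 * interface, along the lines of:
 *
 *   rpc.py nvmf_create_transport -t VFIOUSER
 *   rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a
 *   rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 \
 *          -t VFIOUSER -a /var/run -s 0
 *
 * where -a names the directory used for the vfio-user socket. See SPDK's
 * vfio-user documentation for the authoritative steps.
 */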