1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2020 Intel Corporation. 3 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved. 4 * Copyright (c) 2022, 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 /* 8 * NVMe over vfio-user transport 9 */ 10 11 #include <sys/param.h> 12 13 #include <vfio-user/libvfio-user.h> 14 #include <vfio-user/pci_defs.h> 15 16 #include "spdk/barrier.h" 17 #include "spdk/stdinc.h" 18 #include "spdk/assert.h" 19 #include "spdk/thread.h" 20 #include "spdk/nvmf_transport.h" 21 #include "spdk/sock.h" 22 #include "spdk/string.h" 23 #include "spdk/util.h" 24 #include "spdk/log.h" 25 26 #include "transport.h" 27 28 #include "nvmf_internal.h" 29 30 #define SWAP(x, y) \ 31 do \ 32 { \ 33 typeof(x) _tmp = x; \ 34 x = y; \ 35 y = _tmp; \ 36 } while (0) 37 38 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 39 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 40 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 41 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 42 43 #define NVME_DOORBELLS_OFFSET 0x1000 44 #define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2 45 #define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3 46 #define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX 47 48 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 512 49 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 50 51 /* NVMe spec 1.4, section 5.21.1.7 */ 52 SPDK_STATIC_ASSERT(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR >= 2 && 53 NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR <= SPDK_NVME_MAX_IO_QUEUES, 54 "bad number of queues"); 55 56 /* 57 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 58 * available on PCI-X 2.0 and PCI Express buses 59 */ 60 #define NVME_REG_CFG_SIZE 0x1000 61 62 /* 63 * Doorbells must be page aligned so that they can memory mapped. 64 * 65 * TODO does the NVMe spec also require this? Document it. 66 */ 67 #define NVMF_VFIO_USER_DOORBELLS_SIZE \ 68 SPDK_ALIGN_CEIL( \ 69 (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2 * SPDK_NVME_DOORBELL_REGISTER_SIZE), \ 70 0x1000) 71 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 72 73 /* 74 * TODO check the PCI spec whether BAR4 and BAR5 really have to be at least one 75 * page and a multiple of page size (maybe QEMU also needs this?). Document all 76 * this. 77 */ 78 79 /* 80 * MSI-X Pending Bit Array Size 81 * 82 * TODO according to the PCI spec we need one bit per vector, document the 83 * relevant section. 84 * 85 * If the first argument to SPDK_ALIGN_CEIL is 0 then the result is 0, so we 86 * would end up with a 0-size BAR5. 
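 *
 * Worked example with the current constants: NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR
 * is 512, so NVME_IRQ_MSIX_NUM is 512 vectors, the PBA needs 512 / CHAR_BIT = 64
 * bytes, and SPDK_ALIGN_CEIL(64, 0x1000) rounds BAR5 up to a single 4 KiB page.
 * The MAX(CHAR_BIT, ...) guard below keeps NVME_IRQ_MSIX_NUM at CHAR_BIT or
 * more, so the quotient (and hence BAR5) can never be zero.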
87 */ 88 #define NVME_IRQ_MSIX_NUM MAX(CHAR_BIT, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) 89 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / CHAR_BIT), 0x1000) 90 SPDK_STATIC_ASSERT(NVME_BAR5_SIZE > 0, "Incorrect size"); 91 92 /* MSI-X Table Size */ 93 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 94 SPDK_STATIC_ASSERT(NVME_BAR4_SIZE > 0, "Incorrect size"); 95 96 struct nvmf_vfio_user_req; 97 98 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 99 100 /* 1 more for PRP2 list itself */ 101 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 102 103 enum nvmf_vfio_user_req_state { 104 VFIO_USER_REQUEST_STATE_FREE = 0, 105 VFIO_USER_REQUEST_STATE_EXECUTING, 106 }; 107 108 /* 109 * Support for live migration in NVMf/vfio-user: live migration is implemented 110 * by stopping the NVMf subsystem when the device is instructed to enter the 111 * stop-and-copy state and then trivially, and most importantly safely, 112 * collecting migration state and providing it to the vfio-user client. We 113 * don't provide any migration state at the pre-copy state as that's too 114 * complicated to do, we might support this in the future. 115 */ 116 117 118 /* NVMe device state representation */ 119 struct nvme_migr_sq_state { 120 uint16_t sqid; 121 uint16_t cqid; 122 uint32_t head; 123 uint32_t size; 124 uint32_t reserved; 125 uint64_t dma_addr; 126 }; 127 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 128 129 struct nvme_migr_cq_state { 130 uint16_t cqid; 131 uint16_t phase; 132 uint32_t tail; 133 uint32_t size; 134 uint32_t iv; 135 uint32_t ien; 136 uint32_t reserved; 137 uint64_t dma_addr; 138 }; 139 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 140 141 #define VFIO_USER_MIGR_CALLBACK_VERS 1 142 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 143 144 /* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned. 145 * 146 * NVMe device migration region is defined as below: 147 * ------------------------------------------------------------------------- 148 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs | 149 * ------------------------------------------------------------------------- 150 * 151 * Keep vfio_user_nvme_migr_header as a fixed 0x1000 length, all new added fields 152 * can use the reserved space at the end of the data structure. 153 */ 154 struct vfio_user_nvme_migr_header { 155 /* Magic value to validate migration data */ 156 uint32_t magic; 157 /* Version to check the data is same from source to destination */ 158 uint32_t version; 159 160 /* The library uses this field to know how many fields in this 161 * structure are valid, starting at the beginning of this data 162 * structure. New added fields in future use `unused` memory 163 * spaces. 164 */ 165 uint32_t opts_size; 166 uint32_t reserved0; 167 168 /* BARs information */ 169 uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS]; 170 uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS]; 171 172 /* Queue pair start offset, starting at the beginning of this 173 * data structure. 174 */ 175 uint64_t qp_offset; 176 uint64_t qp_len; 177 178 /* Controller data structure */ 179 uint32_t num_io_queues; 180 uint32_t reserved1; 181 182 /* NVMf controller data offset and length if exist, starting at 183 * the beginning of this data structure. 
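	 * For example, in the layout sketched above the controller data would sit
	 * immediately after this fixed 0x1000-byte header, giving nvmf_data_offset =
	 * 0x1000 and qp_offset = nvmf_data_offset + nvmf_data_len (illustrative
	 * values only; the real offsets are whatever the migration source recorded).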
184	 */
185	uint64_t nvmf_data_offset;
186	uint64_t nvmf_data_len;
187
188	/*
189	 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA
190	 * address.
191	 */
192	uint32_t sdbl;
193
194	/* Shadow doorbell DMA addresses. */
195	uint64_t shadow_doorbell_buffer;
196	uint64_t eventidx_buffer;
197
198	/* Reserved memory space for newly added fields; this
199	 * field is always at the end of this data structure.
200	 */
201	uint8_t unused[3856];
202 };
203 SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");
204
205 struct vfio_user_nvme_migr_qp {
206	struct nvme_migr_sq_state sq;
207	struct nvme_migr_cq_state cq;
208 };
209
210 /* NVMe state definition used to load/restore from/to NVMe migration BAR region */
211 struct vfio_user_nvme_migr_state {
212	struct vfio_user_nvme_migr_header ctrlr_header;
213	struct spdk_nvmf_ctrlr_migr_data nvmf_data;
214	struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
215	uint8_t doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE];
216	uint8_t cfg[NVME_REG_CFG_SIZE];
217 };
218
219 struct nvmf_vfio_user_req {
220	struct spdk_nvmf_request req;
221	struct spdk_nvme_cpl rsp;
222	struct spdk_nvme_cmd cmd;
223
224	enum nvmf_vfio_user_req_state state;
225	nvmf_vfio_user_req_cb_fn cb_fn;
226	void *cb_arg;
227
228	/* old CC before prop_set_cc fabric command */
229	union spdk_nvme_cc_register cc;
230
231	TAILQ_ENTRY(nvmf_vfio_user_req) link;
232
233	struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS];
234	uint8_t iovcnt;
235
236	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
237	uint8_t sg[];
238 };
239
240 /*
241  * Mapping of an NVMe queue.
242  *
243  * This holds the information tracking a local process mapping of an NVMe queue
244  * shared by the client.
245  */
246 struct nvme_q_mapping {
247	/* iov of local process mapping. */
248	struct iovec iov;
249	/* Stored sg, needed for unmap. */
250	dma_sg_t *sg;
251	/* Client PRP of queue. */
252	uint64_t prp1;
253 };
254
255 enum nvmf_vfio_user_sq_state {
256	VFIO_USER_SQ_UNUSED = 0,
257	VFIO_USER_SQ_CREATED,
258	VFIO_USER_SQ_DELETED,
259	VFIO_USER_SQ_ACTIVE,
260	VFIO_USER_SQ_INACTIVE
261 };
262
263 enum nvmf_vfio_user_cq_state {
264	VFIO_USER_CQ_UNUSED = 0,
265	VFIO_USER_CQ_CREATED,
266	VFIO_USER_CQ_DELETED,
267 };
268
269 enum nvmf_vfio_user_ctrlr_state {
270	VFIO_USER_CTRLR_CREATING = 0,
271	VFIO_USER_CTRLR_RUNNING,
272	/* Quiesce requested by libvfio-user */
273	VFIO_USER_CTRLR_PAUSING,
274	/* NVMf subsystem is paused; it's safe to do PCI reset, memory register,
275	 * memory unregister, and vfio migration state transition in this state.
276	 */
277	VFIO_USER_CTRLR_PAUSED,
278	/*
279	 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI
280	 * reset, memory register and unregister, controller in destination VM has
281	 * been restored). NVMf subsystem resume has been requested.
282	 */
283	VFIO_USER_CTRLR_RESUMING,
284	/*
285	 * Implies that the NVMf subsystem is paused. Both the source VM and
286	 * destination VM controllers are in this state during live migration.
287	 */
288	VFIO_USER_CTRLR_MIGRATING
289 };
290
291 struct nvmf_vfio_user_sq {
292	struct spdk_nvmf_qpair qpair;
293	struct spdk_nvmf_transport_poll_group *group;
294	struct nvmf_vfio_user_ctrlr *ctrlr;
295
296	uint32_t qid;
297	/* Number of entries in queue. */
298	uint32_t size;
299	struct nvme_q_mapping mapping;
300	enum nvmf_vfio_user_sq_state sq_state;
301
302	uint32_t head;
303	volatile uint32_t *dbl_tailp;
304
305	/* Whether a shadow doorbell eventidx needs setting. */
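	/* This is set when a queue is switched onto shadow doorbells (see
	 * vfio_user_ctrlr_switch_doorbells()) and cleared once set_sq_eventidx()
	 * observes an empty queue, i.e. once a wakeup on the next submission is
	 * guaranteed.
	 */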
306	bool need_rearm;
307
308	/* multiple SQs can be mapped to the same CQ */
309	uint16_t cqid;
310
311	/* handle_queue_connect_rsp() is used both for the CREATE IO SQ response
312	 * and for the SQ re-connect response in the destination VM. In the former
313	 * case we post an NVMe completion to the VM; we do not set this flag when
314	 * re-connecting SQs in the destination VM.
315	 */
316	bool post_create_io_sq_completion;
317	/* Copy of the Create IO SQ command; this field is used together with
318	 * the `post_create_io_sq_completion` flag.
319	 */
320	struct spdk_nvme_cmd create_io_sq_cmd;
321
322	struct vfio_user_delete_sq_ctx *delete_ctx;
323
324	/* Currently unallocated reqs. */
325	TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs;
326	/* Poll group entry */
327	TAILQ_ENTRY(nvmf_vfio_user_sq) link;
328	/* Connected SQ entry */
329	TAILQ_ENTRY(nvmf_vfio_user_sq) tailq;
330 };
331
332 struct nvmf_vfio_user_cq {
333	struct spdk_nvmf_transport_poll_group *group;
334	int cq_ref;
335
336	uint32_t qid;
337	/* Number of entries in queue. */
338	uint32_t size;
339	struct nvme_q_mapping mapping;
340	enum nvmf_vfio_user_cq_state cq_state;
341
342	uint32_t tail;
343	volatile uint32_t *dbl_headp;
344
345	bool phase;
346
347	uint16_t iv;
348	bool ien;
349
350	uint32_t last_head;
351	uint32_t last_trigger_irq_tail;
352 };
353
354 struct nvmf_vfio_user_poll_group {
355	struct spdk_nvmf_transport_poll_group group;
356	TAILQ_ENTRY(nvmf_vfio_user_poll_group) link;
357	TAILQ_HEAD(, nvmf_vfio_user_sq) sqs;
358	struct spdk_interrupt *intr;
359	int intr_fd;
360	struct {
361
362		/*
363		 * ctrlr_intr and ctrlr_kicks will be zero for all other poll
364		 * groups. However, they can be zero even for the poll group
365		 * the controller belongs to if no vfio-user message has been
366		 * received or the controller hasn't been kicked yet.
367		 */
368
369		/*
370		 * Number of times vfio_user_ctrlr_intr() has run:
371		 * vfio-user file descriptor has been ready or explicitly
372		 * kicked (see below).
373		 */
374		uint64_t ctrlr_intr;
375
376		/*
377		 * Kicks to the controller by ctrlr_kick().
378		 * ctrlr_intr - ctrlr_kicks is the number of times the
379		 * vfio-user poll file descriptor has been ready.
380		 */
381		uint64_t ctrlr_kicks;
382
383		/*
384		 * How many times we won the race arming an SQ.
385		 */
386		uint64_t won;
387
388		/*
389		 * How many times we lost the race arming an SQ.
390		 */
391		uint64_t lost;
392
393		/*
394		 * How many requests we processed in total each time we lost
395		 * the rearm race.
396		 */
397		uint64_t lost_count;
398
399		/*
400		 * Number of times we attempted to rearm all the SQs in the
401		 * poll group.
402		 */
403		uint64_t rearms;
404
405		uint64_t pg_process_count;
406		uint64_t intr;
407		uint64_t polls;
408		uint64_t polls_spurious;
409		uint64_t poll_reqs;
410		uint64_t poll_reqs_squared;
411		uint64_t cqh_admin_writes;
412		uint64_t cqh_io_writes;
413	} stats;
414 };
415
416 struct nvmf_vfio_user_shadow_doorbells {
417	volatile uint32_t *shadow_doorbells;
418	volatile uint32_t *eventidxs;
419	dma_sg_t *sgs;
420	struct iovec *iovs;
421 };
422
423 struct nvmf_vfio_user_ctrlr {
424	struct nvmf_vfio_user_endpoint *endpoint;
425	struct nvmf_vfio_user_transport *transport;
426
427	/* Connected SQs list */
428	TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs;
429	enum nvmf_vfio_user_ctrlr_state state;
430
431	/*
432	 * Tells whether live migration data have been prepared. This is used
433	 * by the get_pending_bytes callback to tell whether or not the
434	 * previous iteration finished.
435 */ 436 bool migr_data_prepared; 437 438 /* Controller is in source VM when doing live migration */ 439 bool in_source_vm; 440 441 struct spdk_thread *thread; 442 struct spdk_poller *vfu_ctx_poller; 443 struct spdk_interrupt *intr; 444 int intr_fd; 445 446 bool queued_quiesce; 447 448 bool reset_shn; 449 bool disconnect; 450 451 uint16_t cntlid; 452 struct spdk_nvmf_ctrlr *ctrlr; 453 454 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 455 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 456 457 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 458 459 volatile uint32_t *bar0_doorbells; 460 struct nvmf_vfio_user_shadow_doorbells *sdbl; 461 /* 462 * Shadow doorbells PRPs to provide during the stop-and-copy state. 463 */ 464 uint64_t shadow_doorbell_buffer; 465 uint64_t eventidx_buffer; 466 467 bool adaptive_irqs_enabled; 468 }; 469 470 /* Endpoint in vfio-user is associated with a socket file, which 471 * is the representative of a PCI endpoint. 472 */ 473 struct nvmf_vfio_user_endpoint { 474 struct nvmf_vfio_user_transport *transport; 475 vfu_ctx_t *vfu_ctx; 476 struct spdk_poller *accept_poller; 477 struct spdk_thread *accept_thread; 478 bool interrupt_mode; 479 struct msixcap *msix; 480 vfu_pci_config_space_t *pci_config_space; 481 int devmem_fd; 482 int accept_intr_fd; 483 struct spdk_interrupt *accept_intr; 484 485 volatile uint32_t *bar0_doorbells; 486 487 int migr_fd; 488 void *migr_data; 489 490 struct spdk_nvme_transport_id trid; 491 struct spdk_nvmf_subsystem *subsystem; 492 493 /* Controller is associated with an active socket connection, 494 * the lifecycle of the controller is same as the VM. 495 * Currently we only support one active connection, as the NVMe 496 * specification defines, we may support multiple controllers in 497 * future, so that it can support e.g: RESERVATION. 498 */ 499 struct nvmf_vfio_user_ctrlr *ctrlr; 500 pthread_mutex_t lock; 501 502 bool need_async_destroy; 503 /* The subsystem is in PAUSED state and need to be resumed, TRUE 504 * only when migration is done successfully and the controller is 505 * in source VM. 506 */ 507 bool need_resume; 508 /* Start the accept poller again after destroying the controller */ 509 bool need_relisten; 510 511 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 512 }; 513 514 struct nvmf_vfio_user_transport_opts { 515 bool disable_mappable_bar0; 516 bool disable_adaptive_irq; 517 bool disable_shadow_doorbells; 518 bool disable_compare; 519 bool enable_intr_mode_sq_spreading; 520 }; 521 522 struct nvmf_vfio_user_transport { 523 struct spdk_nvmf_transport transport; 524 struct nvmf_vfio_user_transport_opts transport_opts; 525 bool intr_mode_supported; 526 pthread_mutex_t lock; 527 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 528 529 pthread_mutex_t pg_lock; 530 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 531 struct nvmf_vfio_user_poll_group *next_pg; 532 }; 533 534 /* 535 * function prototypes 536 */ 537 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 538 539 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 540 541 /* 542 * Local process virtual address of a queue. 
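 * (That is, mapping->iov.iov_base, or NULL while the queue is not currently
 * mapped; unmap_q() resets it to NULL.)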
543 */ 544 static inline void * 545 q_addr(struct nvme_q_mapping *mapping) 546 { 547 return mapping->iov.iov_base; 548 } 549 550 static inline int 551 queue_index(uint16_t qid, bool is_cq) 552 { 553 return (qid * 2) + is_cq; 554 } 555 556 static inline volatile uint32_t * 557 sq_headp(struct nvmf_vfio_user_sq *sq) 558 { 559 assert(sq != NULL); 560 return &sq->head; 561 } 562 563 static inline volatile uint32_t * 564 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 565 { 566 assert(sq != NULL); 567 return sq->dbl_tailp; 568 } 569 570 static inline volatile uint32_t * 571 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 572 { 573 assert(cq != NULL); 574 return cq->dbl_headp; 575 } 576 577 static inline volatile uint32_t * 578 cq_tailp(struct nvmf_vfio_user_cq *cq) 579 { 580 assert(cq != NULL); 581 return &cq->tail; 582 } 583 584 static inline void 585 sq_head_advance(struct nvmf_vfio_user_sq *sq) 586 { 587 assert(sq != NULL); 588 589 assert(*sq_headp(sq) < sq->size); 590 (*sq_headp(sq))++; 591 592 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 593 *sq_headp(sq) = 0; 594 } 595 } 596 597 static inline void 598 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 599 { 600 assert(cq != NULL); 601 602 assert(*cq_tailp(cq) < cq->size); 603 (*cq_tailp(cq))++; 604 605 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 606 *cq_tailp(cq) = 0; 607 cq->phase = !cq->phase; 608 } 609 } 610 611 static bool 612 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 613 { 614 assert(vu_ctrlr != NULL); 615 616 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 617 return false; 618 } 619 620 if (is_cq) { 621 if (vu_ctrlr->cqs[qid] == NULL) { 622 return false; 623 } 624 625 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 626 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 627 } 628 629 if (vu_ctrlr->sqs[qid] == NULL) { 630 return false; 631 } 632 633 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 634 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 635 } 636 637 static char * 638 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 639 { 640 return endpoint->trid.traddr; 641 } 642 643 static char * 644 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 645 { 646 if (!ctrlr || !ctrlr->endpoint) { 647 return "Null Ctrlr"; 648 } 649 650 return endpoint_id(ctrlr->endpoint); 651 } 652 653 /* Return the poll group for the admin queue of the controller. 
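 * In other words, the poll group that the admin SQ (vu_ctrlr->sqs[0]) was
 * assigned to; callers must only use this once the admin queue exists.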
*/ 654 static inline struct nvmf_vfio_user_poll_group * 655 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 656 { 657 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 658 struct nvmf_vfio_user_poll_group, 659 group); 660 } 661 662 static inline struct spdk_thread * 663 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 664 { 665 return vu_pg->group.group->thread; 666 } 667 668 static dma_sg_t * 669 index_to_sg_t(void *arr, size_t i) 670 { 671 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 672 } 673 674 static inline size_t 675 vfio_user_migr_data_len(void) 676 { 677 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 678 } 679 680 static inline bool 681 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 682 { 683 return spdk_interrupt_mode_is_enabled() && 684 vu_transport->intr_mode_supported; 685 } 686 687 static int vfio_user_ctrlr_intr(void *ctx); 688 689 static void 690 vfio_user_msg_ctrlr_intr(void *ctx) 691 { 692 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 693 struct nvmf_vfio_user_poll_group *vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 694 695 vu_ctrlr_group->stats.ctrlr_kicks++; 696 697 vfio_user_ctrlr_intr(ctx); 698 } 699 700 /* 701 * Kick (force a wakeup) of all poll groups for this controller. 702 * vfio_user_ctrlr_intr() itself arranges for kicking other poll groups if 703 * needed. 704 */ 705 static void 706 ctrlr_kick(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 707 { 708 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 709 710 SPDK_DEBUGLOG(vfio_user_db, "%s: kicked\n", ctrlr_id(vu_ctrlr)); 711 712 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 713 714 spdk_thread_send_msg(poll_group_to_thread(vu_ctrlr_group), 715 vfio_user_msg_ctrlr_intr, vu_ctrlr); 716 } 717 718 /* 719 * Make the given DMA address and length available (locally mapped) via iov. 
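 *
 * A minimal usage sketch (error handling elided; names follow map_q() below):
 *
 *	struct iovec iov;
 *	void *va = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len,
 *			   mapping->sg, &iov, PROT_READ | PROT_WRITE);
 *	if (va == NULL) {
 *		return -EFAULT;
 *	}
 *
 * On success, iov describes the local mapping and the stored sg must later be
 * released with vfu_sgl_put(); on failure, NULL is returned.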
720 */ 721 static void * 722 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 723 struct iovec *iov, int prot) 724 { 725 int ret; 726 727 assert(ctx != NULL); 728 assert(sg != NULL); 729 assert(iov != NULL); 730 731 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 732 if (ret < 0) { 733 if (ret == -1) { 734 SPDK_ERRLOG("failed to translate IOVA [%lu, %lu) (prot=%d) to local VA: %m\n", 735 addr, addr + len, prot); 736 } else { 737 SPDK_ERRLOG("failed to translate IOVA [%lu, %lu) (prot=%d) to local VA: %d segments needed\n", 738 addr, addr + len, prot, -(ret + 1)); 739 } 740 return NULL; 741 } 742 743 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 744 if (ret != 0) { 745 SPDK_ERRLOG("failed to get IOVA for IOVA [%ld, %ld): %m\n", 746 addr, addr + len); 747 return NULL; 748 } 749 750 assert(iov->iov_base != NULL); 751 return iov->iov_base; 752 } 753 754 static int 755 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 756 uint32_t max_iovcnt, uint32_t len, size_t mps, 757 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 758 { 759 uint64_t prp1, prp2; 760 void *vva; 761 uint32_t i; 762 uint32_t residue_len, nents; 763 uint64_t *prp_list; 764 uint32_t iovcnt; 765 766 assert(max_iovcnt > 0); 767 768 prp1 = cmd->dptr.prp.prp1; 769 prp2 = cmd->dptr.prp.prp2; 770 771 /* PRP1 may started with unaligned page address */ 772 residue_len = mps - (prp1 % mps); 773 residue_len = spdk_min(len, residue_len); 774 775 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 776 if (spdk_unlikely(vva == NULL)) { 777 SPDK_ERRLOG("GPA to VVA failed\n"); 778 return -EINVAL; 779 } 780 len -= residue_len; 781 if (len && max_iovcnt < 2) { 782 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 783 return -ERANGE; 784 } 785 iovs[0].iov_base = vva; 786 iovs[0].iov_len = residue_len; 787 788 if (len) { 789 if (spdk_unlikely(prp2 == 0)) { 790 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 791 return -EINVAL; 792 } 793 794 if (len <= mps) { 795 /* 2 PRP used */ 796 iovcnt = 2; 797 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 798 if (spdk_unlikely(vva == NULL)) { 799 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 800 prp2, len); 801 return -EINVAL; 802 } 803 iovs[1].iov_base = vva; 804 iovs[1].iov_len = len; 805 } else { 806 /* PRP list used */ 807 nents = (len + mps - 1) / mps; 808 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 809 SPDK_ERRLOG("Too many page entries\n"); 810 return -ERANGE; 811 } 812 813 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 814 if (spdk_unlikely(vva == NULL)) { 815 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 816 prp2, nents); 817 return -EINVAL; 818 } 819 prp_list = vva; 820 i = 0; 821 while (len != 0) { 822 residue_len = spdk_min(len, mps); 823 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 824 if (spdk_unlikely(vva == NULL)) { 825 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 826 prp_list[i], residue_len); 827 return -EINVAL; 828 } 829 iovs[i + 1].iov_base = vva; 830 iovs[i + 1].iov_len = residue_len; 831 len -= residue_len; 832 i++; 833 } 834 iovcnt = i + 1; 835 } 836 } else { 837 /* 1 PRP used */ 838 iovcnt = 1; 839 } 840 841 assert(iovcnt <= max_iovcnt); 842 return iovcnt; 843 } 844 845 static int 846 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 847 struct iovec *iovs, uint32_t max_iovcnt, 848 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int 
prot)) 849 { 850 uint32_t i; 851 void *vva; 852 853 if (spdk_unlikely(max_iovcnt < num_sgls)) { 854 return -ERANGE; 855 } 856 857 for (i = 0; i < num_sgls; i++) { 858 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 859 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 860 return -EINVAL; 861 } 862 vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 863 if (spdk_unlikely(vva == NULL)) { 864 SPDK_ERRLOG("GPA to VVA failed\n"); 865 return -EINVAL; 866 } 867 iovs[i].iov_base = vva; 868 iovs[i].iov_len = sgls[i].unkeyed.length; 869 } 870 871 return num_sgls; 872 } 873 874 static int 875 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 876 uint32_t len, size_t mps, 877 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 878 { 879 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 880 uint32_t num_sgls, seg_len; 881 void *vva; 882 int ret; 883 uint32_t total_iovcnt = 0; 884 885 /* SGL cases */ 886 sgl = &cmd->dptr.sgl1; 887 888 /* only one SGL segment */ 889 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 890 assert(max_iovcnt > 0); 891 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 892 if (spdk_unlikely(vva == NULL)) { 893 SPDK_ERRLOG("GPA to VVA failed\n"); 894 return -EINVAL; 895 } 896 iovs[0].iov_base = vva; 897 iovs[0].iov_len = sgl->unkeyed.length; 898 assert(sgl->unkeyed.length == len); 899 900 return 1; 901 } 902 903 for (;;) { 904 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 905 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 906 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 907 return -EINVAL; 908 } 909 910 seg_len = sgl->unkeyed.length; 911 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 912 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 913 return -EINVAL; 914 } 915 916 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 917 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 918 if (spdk_unlikely(vva == NULL)) { 919 SPDK_ERRLOG("GPA to VVA failed\n"); 920 return -EINVAL; 921 } 922 923 /* sgl point to the first segment */ 924 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 925 last_sgl = &sgl[num_sgls - 1]; 926 927 /* we are done */ 928 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 929 /* map whole sgl list */ 930 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 931 max_iovcnt - total_iovcnt, gpa_to_vva); 932 if (spdk_unlikely(ret < 0)) { 933 return ret; 934 } 935 total_iovcnt += ret; 936 937 return total_iovcnt; 938 } 939 940 if (num_sgls > 1) { 941 /* map whole sgl exclude last_sgl */ 942 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 943 max_iovcnt - total_iovcnt, gpa_to_vva); 944 if (spdk_unlikely(ret < 0)) { 945 return ret; 946 } 947 total_iovcnt += ret; 948 } 949 950 /* move to next level's segments */ 951 sgl = last_sgl; 952 } 953 954 return 0; 955 } 956 957 static int 958 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 959 uint32_t len, size_t mps, 960 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 961 { 962 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 963 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 964 } 965 966 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 967 } 968 969 /* 970 * For each queue, update the 
location of its doorbell to the correct location: 971 * either our own BAR0, or the guest's configured shadow doorbell area. 972 * 973 * The Admin queue (qid: 0) does not ever use shadow doorbells. 974 */ 975 static void 976 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 977 { 978 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 979 ctrlr->bar0_doorbells; 980 981 assert(doorbells != NULL); 982 983 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 984 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 985 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 986 987 if (sq != NULL) { 988 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 989 990 ctrlr->sqs[i]->need_rearm = shadow; 991 } 992 993 if (cq != NULL) { 994 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 995 } 996 } 997 } 998 999 static void 1000 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1001 { 1002 assert(vfu_ctx != NULL); 1003 assert(sdbl != NULL); 1004 1005 /* 1006 * An allocation error would result in only one of the two being 1007 * non-NULL. If that is the case, no memory should have been mapped. 1008 */ 1009 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 1010 return; 1011 } 1012 1013 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 1014 struct iovec *iov; 1015 dma_sg_t *sg; 1016 1017 if (!sdbl->iovs[i].iov_len) { 1018 continue; 1019 } 1020 1021 sg = index_to_sg_t(sdbl->sgs, i); 1022 iov = sdbl->iovs + i; 1023 1024 vfu_sgl_put(vfu_ctx, sg, iov, 1); 1025 } 1026 } 1027 1028 static void 1029 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1030 { 1031 if (sdbl == NULL) { 1032 return; 1033 } 1034 1035 unmap_sdbl(vfu_ctx, sdbl); 1036 1037 /* 1038 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 1039 * not allocated, so don't free() them. 1040 */ 1041 free(sdbl->sgs); 1042 free(sdbl->iovs); 1043 free(sdbl); 1044 } 1045 1046 static struct nvmf_vfio_user_shadow_doorbells * 1047 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 1048 { 1049 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 1050 dma_sg_t *sg2 = NULL; 1051 void *p; 1052 1053 assert(vfu_ctx != NULL); 1054 1055 sdbl = calloc(1, sizeof(*sdbl)); 1056 if (sdbl == NULL) { 1057 goto err; 1058 } 1059 1060 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 1061 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 1062 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 1063 goto err; 1064 } 1065 1066 /* Map shadow doorbell buffer (PRP1). */ 1067 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, 1068 PROT_READ | PROT_WRITE); 1069 1070 if (p == NULL) { 1071 goto err; 1072 } 1073 1074 /* 1075 * Map eventidx buffer (PRP2). 1076 * Should only be written to by the controller. 1077 */ 1078 1079 sg2 = index_to_sg_t(sdbl->sgs, 1); 1080 1081 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, 1082 PROT_READ | PROT_WRITE); 1083 1084 if (p == NULL) { 1085 goto err; 1086 } 1087 1088 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1089 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1090 1091 return sdbl; 1092 1093 err: 1094 free_sdbl(vfu_ctx, sdbl); 1095 return NULL; 1096 } 1097 1098 /* 1099 * Copy doorbells from one buffer to the other, during switches betweeen BAR0 1100 * doorbells and shadow doorbells. 
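 *
 * Doorbell slots are laid out as queue_index(qid, is_cq) = qid * 2 + is_cq,
 * e.g. for qid 1 the SQ tail lives in slot 2 and the CQ head in slot 3, which
 * is why both slots are copied for each queue pair below.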
1101 */ 1102 static void 1103 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1104 const volatile uint32_t *from, volatile uint32_t *to) 1105 { 1106 assert(ctrlr != NULL); 1107 assert(from != NULL); 1108 assert(to != NULL); 1109 1110 SPDK_DEBUGLOG(vfio_user_db, 1111 "%s: migrating shadow doorbells from %p to %p\n", 1112 ctrlr_id(ctrlr), from, to); 1113 1114 /* Can't use memcpy because it doesn't respect volatile semantics. */ 1115 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1116 if (ctrlr->sqs[i] != NULL) { 1117 to[queue_index(i, false)] = from[queue_index(i, false)]; 1118 } 1119 1120 if (ctrlr->cqs[i] != NULL) { 1121 to[queue_index(i, true)] = from[queue_index(i, true)]; 1122 } 1123 } 1124 } 1125 1126 static void 1127 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1128 { 1129 const struct spdk_nvmf_registers *regs; 1130 1131 assert(vu_ctrlr != NULL); 1132 assert(vu_ctrlr->ctrlr != NULL); 1133 1134 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1135 if (regs->csts.bits.cfs == 0) { 1136 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1137 } 1138 1139 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1140 } 1141 1142 static inline bool 1143 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1144 { 1145 assert(vu_ctrlr != NULL); 1146 assert(vu_ctrlr->endpoint != NULL); 1147 1148 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1149 1150 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1151 } 1152 1153 static void 1154 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1155 { 1156 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1157 1158 spdk_interrupt_unregister(&endpoint->accept_intr); 1159 spdk_poller_unregister(&endpoint->accept_poller); 1160 1161 if (endpoint->bar0_doorbells) { 1162 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1163 } 1164 1165 if (endpoint->devmem_fd > 0) { 1166 close(endpoint->devmem_fd); 1167 } 1168 1169 if (endpoint->migr_data) { 1170 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1171 } 1172 1173 if (endpoint->migr_fd > 0) { 1174 close(endpoint->migr_fd); 1175 } 1176 1177 if (endpoint->vfu_ctx) { 1178 vfu_destroy_ctx(endpoint->vfu_ctx); 1179 } 1180 1181 pthread_mutex_destroy(&endpoint->lock); 1182 free(endpoint); 1183 } 1184 1185 /* called when process exits */ 1186 static int 1187 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1188 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1189 { 1190 struct nvmf_vfio_user_transport *vu_transport; 1191 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1192 1193 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1194 1195 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1196 transport); 1197 1198 pthread_mutex_destroy(&vu_transport->lock); 1199 pthread_mutex_destroy(&vu_transport->pg_lock); 1200 1201 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1202 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1203 nvmf_vfio_user_destroy_endpoint(endpoint); 1204 } 1205 1206 free(vu_transport); 1207 1208 if (cb_fn) { 1209 cb_fn(cb_arg); 1210 } 1211 1212 return 0; 1213 } 1214 1215 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1216 { 1217 "disable_mappable_bar0", 1218 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1219 spdk_json_decode_bool, true 1220 }, 1221 { 1222 "disable_adaptive_irq", 1223 offsetof(struct 
nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1224 spdk_json_decode_bool, true 1225 }, 1226 { 1227 "disable_shadow_doorbells", 1228 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1229 spdk_json_decode_bool, true 1230 }, 1231 { 1232 "disable_compare", 1233 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1234 spdk_json_decode_bool, true 1235 }, 1236 { 1237 "enable_intr_mode_sq_spreading", 1238 offsetof(struct nvmf_vfio_user_transport, transport_opts.enable_intr_mode_sq_spreading), 1239 spdk_json_decode_bool, true 1240 }, 1241 }; 1242 1243 static struct spdk_nvmf_transport * 1244 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1245 { 1246 struct nvmf_vfio_user_transport *vu_transport; 1247 int err; 1248 1249 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1250 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1251 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1252 return NULL; 1253 } 1254 1255 vu_transport = calloc(1, sizeof(*vu_transport)); 1256 if (vu_transport == NULL) { 1257 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1258 return NULL; 1259 } 1260 1261 err = pthread_mutex_init(&vu_transport->lock, NULL); 1262 if (err != 0) { 1263 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1264 goto err; 1265 } 1266 TAILQ_INIT(&vu_transport->endpoints); 1267 1268 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1269 if (err != 0) { 1270 pthread_mutex_destroy(&vu_transport->lock); 1271 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1272 goto err; 1273 } 1274 TAILQ_INIT(&vu_transport->poll_groups); 1275 1276 if (opts->transport_specific != NULL && 1277 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1278 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1279 vu_transport)) { 1280 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1281 goto cleanup; 1282 } 1283 1284 /* 1285 * To support interrupt mode, the transport must be configured with 1286 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1287 * when a client writes new doorbell values to BAR0, via the 1288 * libvfio-user socket fd. 1289 */ 1290 vu_transport->intr_mode_supported = 1291 vu_transport->transport_opts.disable_mappable_bar0; 1292 1293 /* 1294 * If BAR0 is mappable, it doesn't make sense to support shadow 1295 * doorbells, so explicitly turn it off. 1296 */ 1297 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1298 vu_transport->transport_opts.disable_shadow_doorbells = true; 1299 } 1300 1301 if (spdk_interrupt_mode_is_enabled()) { 1302 if (!vu_transport->intr_mode_supported) { 1303 SPDK_ERRLOG("interrupt mode not supported\n"); 1304 goto cleanup; 1305 } 1306 1307 /* 1308 * If we are in interrupt mode, we cannot support adaptive IRQs, 1309 * as there is no guarantee the SQ poller will run subsequently 1310 * to send pending IRQs. 
1311 */ 1312 vu_transport->transport_opts.disable_adaptive_irq = true; 1313 } 1314 1315 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1316 vu_transport->transport_opts.disable_mappable_bar0); 1317 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1318 vu_transport->transport_opts.disable_adaptive_irq); 1319 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1320 vu_transport->transport_opts.disable_shadow_doorbells); 1321 1322 return &vu_transport->transport; 1323 1324 cleanup: 1325 pthread_mutex_destroy(&vu_transport->lock); 1326 pthread_mutex_destroy(&vu_transport->pg_lock); 1327 err: 1328 free(vu_transport); 1329 return NULL; 1330 } 1331 1332 static uint32_t 1333 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1334 { 1335 assert(vu_ctrlr != NULL); 1336 assert(vu_ctrlr->ctrlr != NULL); 1337 1338 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1339 } 1340 1341 static uint32_t 1342 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1343 { 1344 assert(vu_ctrlr != NULL); 1345 assert(vu_ctrlr->ctrlr != NULL); 1346 1347 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1348 } 1349 1350 static uintptr_t 1351 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1352 { 1353 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1354 return 1ul << memory_page_shift; 1355 } 1356 1357 static uintptr_t 1358 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1359 { 1360 return ~(memory_page_size(ctrlr) - 1); 1361 } 1362 1363 static int 1364 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1365 uint32_t q_size, bool is_cq, bool unmap) 1366 { 1367 uint64_t len; 1368 void *ret; 1369 1370 assert(q_size); 1371 assert(q_addr(mapping) == NULL); 1372 1373 if (is_cq) { 1374 len = q_size * sizeof(struct spdk_nvme_cpl); 1375 } else { 1376 len = q_size * sizeof(struct spdk_nvme_cmd); 1377 } 1378 1379 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1380 mapping->sg, &mapping->iov, 1381 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1382 if (ret == NULL) { 1383 return -EFAULT; 1384 } 1385 1386 if (unmap) { 1387 memset(q_addr(mapping), 0, len); 1388 } 1389 1390 return 0; 1391 } 1392 1393 static inline void 1394 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1395 { 1396 if (q_addr(mapping) != NULL) { 1397 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1398 &mapping->iov, 1); 1399 mapping->iov.iov_base = NULL; 1400 } 1401 } 1402 1403 static int 1404 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1405 { 1406 struct nvmf_vfio_user_sq *sq; 1407 const struct spdk_nvmf_registers *regs; 1408 int ret; 1409 1410 assert(ctrlr != NULL); 1411 1412 sq = ctrlr->sqs[0]; 1413 1414 assert(sq != NULL); 1415 assert(q_addr(&sq->mapping) == NULL); 1416 /* XXX ctrlr->asq == 0 is a valid memory address */ 1417 1418 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1419 sq->qid = 0; 1420 sq->size = regs->aqa.bits.asqs + 1; 1421 sq->mapping.prp1 = regs->asq; 1422 *sq_headp(sq) = 0; 1423 sq->cqid = 0; 1424 1425 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1426 if (ret) { 1427 return ret; 1428 } 1429 1430 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1431 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1432 1433 *sq_dbl_tailp(sq) = 0; 1434 1435 return 0; 1436 } 1437 1438 /* 1439 * Updates eventidx to set an SQ into interrupt or polling mode. 
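 * (Per the NVMe shadow-doorbell contract, the host only writes the real BAR0
 * doorbell when its new tail passes the eventidx published here, so setting
 * the eventidx to the current tail requests a wakeup on the next submission,
 * while NVMF_VFIO_USER_EVENTIDX_POLL effectively disables BAR0 writes for a
 * polled queue.)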
1440 * 1441 * Returns false if the current SQ tail does not match the SQ head, as 1442 * this means that the host has submitted more items to the queue while we were 1443 * not looking - or during the event index update. In that case, we must retry, 1444 * or otherwise make sure we are going to wake up again. 1445 */ 1446 static bool 1447 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1448 { 1449 struct nvmf_vfio_user_ctrlr *ctrlr; 1450 volatile uint32_t *sq_tail_eidx; 1451 uint32_t old_tail, new_tail; 1452 1453 assert(sq != NULL); 1454 assert(sq->ctrlr != NULL); 1455 assert(sq->ctrlr->sdbl != NULL); 1456 assert(sq->need_rearm); 1457 assert(sq->qid != 0); 1458 1459 ctrlr = sq->ctrlr; 1460 1461 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1462 ctrlr_id(ctrlr), sq->qid); 1463 1464 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1465 1466 assert(ctrlr->endpoint != NULL); 1467 1468 if (!ctrlr->endpoint->interrupt_mode) { 1469 /* No synchronisation necessary. */ 1470 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1471 return true; 1472 } 1473 1474 old_tail = *sq_dbl_tailp(sq); 1475 *sq_tail_eidx = old_tail; 1476 1477 /* 1478 * Ensure that the event index is updated before re-reading the tail 1479 * doorbell. If it's not, then the host might race us and update the 1480 * tail after the second read but before the event index is written, so 1481 * it won't write to BAR0 and we'll miss the update. 1482 * 1483 * The driver should provide similar ordering with an mb(). 1484 */ 1485 spdk_mb(); 1486 1487 /* 1488 * Check if the host has updated the tail doorbell after we've read it 1489 * for the first time, but before the event index was written. If that's 1490 * the case, then we've lost the race and we need to update the event 1491 * index again (after polling the queue, since the host won't write to 1492 * BAR0). 1493 */ 1494 new_tail = *sq_dbl_tailp(sq); 1495 1496 /* 1497 * We might poll the queue straight after this function returns if the 1498 * tail has been updated, so we need to ensure that any changes to the 1499 * queue will be visible to us if the doorbell has been updated. 1500 * 1501 * The driver should provide similar ordering with a wmb() to ensure 1502 * that the queue is written before it updates the tail doorbell. 1503 */ 1504 spdk_rmb(); 1505 1506 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1507 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1508 new_tail, *sq_headp(sq)); 1509 1510 if (new_tail == *sq_headp(sq)) { 1511 sq->need_rearm = false; 1512 return true; 1513 } 1514 1515 /* 1516 * We've lost the race: the tail was updated since we last polled, 1517 * including if it happened within this routine. 1518 * 1519 * The caller should retry after polling (think of this as a cmpxchg 1520 * loop); if we go to sleep while the SQ is not empty, then we won't 1521 * process the remaining events. 1522 */ 1523 return false; 1524 } 1525 1526 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1527 1528 /* 1529 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1530 * processed some SQ entries. 
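 * (This is the retry loop around set_sq_eventidx(): each failed attempt polls
 * the SQ, and after NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS we give up and
 * kick the controller instead.)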
1531 */ 1532 static int 1533 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1534 struct nvmf_vfio_user_sq *sq, 1535 struct nvmf_vfio_user_poll_group *vu_group) 1536 { 1537 int count = 0; 1538 size_t i; 1539 1540 assert(sq->need_rearm); 1541 1542 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1543 int ret; 1544 1545 if (set_sq_eventidx(sq)) { 1546 /* We won the race and set eventidx; done. */ 1547 vu_group->stats.won++; 1548 return count; 1549 } 1550 1551 ret = nvmf_vfio_user_sq_poll(sq); 1552 1553 count += (ret < 0) ? 1 : ret; 1554 1555 /* 1556 * set_sq_eventidx() hit the race, so we expected 1557 * to process at least one command from this queue. 1558 * If there were no new commands waiting for us, then 1559 * we must have hit an unexpected race condition. 1560 */ 1561 if (ret == 0) { 1562 SPDK_ERRLOG("%s: unexpected race condition detected " 1563 "while updating the shadow doorbell buffer\n", 1564 ctrlr_id(ctrlr)); 1565 1566 fail_ctrlr(ctrlr); 1567 return count; 1568 } 1569 } 1570 1571 SPDK_DEBUGLOG(vfio_user_db, 1572 "%s: set_sq_eventidx() lost the race %zu times\n", 1573 ctrlr_id(ctrlr), i); 1574 1575 vu_group->stats.lost++; 1576 vu_group->stats.lost_count += count; 1577 1578 /* 1579 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1580 * we raced with the producer too many times; force ourselves to wake up 1581 * instead. We'll process all queues at that point. 1582 */ 1583 ctrlr_kick(ctrlr); 1584 1585 return count; 1586 } 1587 1588 /* 1589 * We're in interrupt mode, and potentially about to go to sleep. We need to 1590 * make sure any further I/O submissions are guaranteed to wake us up: for 1591 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1592 * every SQ that needs re-arming. 1593 * 1594 * Returns non-zero if we processed something. 1595 */ 1596 static int 1597 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1598 { 1599 struct nvmf_vfio_user_sq *sq; 1600 int count = 0; 1601 1602 vu_group->stats.rearms++; 1603 1604 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1605 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1606 continue; 1607 } 1608 1609 if (sq->need_rearm) { 1610 count += vfio_user_sq_rearm(sq->ctrlr, sq, vu_group); 1611 } 1612 } 1613 1614 return count; 1615 } 1616 1617 static int 1618 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1619 { 1620 struct nvmf_vfio_user_cq *cq; 1621 const struct spdk_nvmf_registers *regs; 1622 int ret; 1623 1624 assert(ctrlr != NULL); 1625 1626 cq = ctrlr->cqs[0]; 1627 1628 assert(cq != NULL); 1629 1630 assert(q_addr(&cq->mapping) == NULL); 1631 1632 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1633 assert(regs != NULL); 1634 cq->qid = 0; 1635 cq->size = regs->aqa.bits.acqs + 1; 1636 cq->mapping.prp1 = regs->acq; 1637 *cq_tailp(cq) = 0; 1638 cq->ien = true; 1639 cq->phase = true; 1640 1641 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1642 if (ret) { 1643 return ret; 1644 } 1645 1646 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
*/ 1647 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1648 1649 *cq_dbl_headp(cq) = 0; 1650 1651 return 0; 1652 } 1653 1654 static void * 1655 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 1656 { 1657 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1658 struct spdk_nvmf_qpair *qpair; 1659 struct nvmf_vfio_user_req *vu_req; 1660 struct nvmf_vfio_user_sq *sq; 1661 void *ret; 1662 1663 assert(req != NULL); 1664 qpair = req->qpair; 1665 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1666 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1667 1668 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1669 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1670 index_to_sg_t(vu_req->sg, vu_req->iovcnt), 1671 &vu_req->iov[vu_req->iovcnt], prot); 1672 if (spdk_likely(ret != NULL)) { 1673 vu_req->iovcnt++; 1674 } 1675 return ret; 1676 } 1677 1678 static int 1679 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1680 struct iovec *iov, uint32_t length) 1681 { 1682 /* Map PRP list to from Guest physical memory to 1683 * virtual memory address. 1684 */ 1685 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1686 length, 4096, _map_one); 1687 } 1688 1689 static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1690 struct nvmf_vfio_user_sq *sq); 1691 1692 static uint32_t 1693 cq_free_slots(struct nvmf_vfio_user_cq *cq) 1694 { 1695 uint32_t free_slots; 1696 1697 assert(cq != NULL); 1698 1699 if (cq->tail == cq->last_head) { 1700 free_slots = cq->size; 1701 } else if (cq->tail > cq->last_head) { 1702 free_slots = cq->size - (cq->tail - cq->last_head); 1703 } else { 1704 free_slots = cq->last_head - cq->tail; 1705 } 1706 assert(free_slots > 0); 1707 1708 return free_slots - 1; 1709 } 1710 1711 /* 1712 * Since reading the head doorbell is relatively expensive, we use the cached 1713 * value, so we only have to read it for real if it appears that we are full. 1714 */ 1715 static inline bool 1716 cq_is_full(struct nvmf_vfio_user_cq *cq) 1717 { 1718 uint32_t free_cq_slots; 1719 1720 assert(cq != NULL); 1721 1722 free_cq_slots = cq_free_slots(cq); 1723 1724 if (spdk_unlikely(free_cq_slots == 0)) { 1725 cq->last_head = *cq_dbl_headp(cq); 1726 free_cq_slots = cq_free_slots(cq); 1727 } 1728 1729 return free_cq_slots == 0; 1730 } 1731 1732 /* 1733 * Posts a CQE in the completion queue. 1734 * 1735 * @ctrlr: the vfio-user controller 1736 * @cq: the completion queue 1737 * @cdw0: cdw0 as reported by NVMf 1738 * @sqid: submission queue ID 1739 * @cid: command identifier in NVMe command 1740 * @sc: the NVMe CQE status code 1741 * @sct: the NVMe CQE status code type 1742 */ 1743 static int 1744 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1745 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1746 { 1747 struct spdk_nvme_status cpl_status = { 0 }; 1748 struct spdk_nvme_cpl *cpl; 1749 int err; 1750 1751 assert(ctrlr != NULL); 1752 1753 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1754 return 0; 1755 } 1756 1757 if (cq->qid == 0) { 1758 assert(spdk_get_thread() == cq->group->group->thread); 1759 } 1760 1761 /* 1762 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 1763 * control: if there is no space in the CQ, we should wait until there is. 
1764 * 1765 * In practice, we just fail the controller instead: as it happens, all host 1766 * implementations we care about right-size the CQ: this is required anyway for 1767 * NVMEoF support (see 3.3.2.8). 1768 */ 1769 if (cq_is_full(cq)) { 1770 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1771 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1772 *cq_dbl_headp(cq)); 1773 return -1; 1774 } 1775 1776 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1777 1778 assert(ctrlr->sqs[sqid] != NULL); 1779 SPDK_DEBUGLOG(nvmf_vfio, 1780 "%s: request complete sqid:%d cid=%d status=%#x " 1781 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1782 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1783 1784 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1785 cpl->sqid = sqid; 1786 cpl->cid = cid; 1787 cpl->cdw0 = cdw0; 1788 1789 /* 1790 * This is a bitfield: instead of setting the individual bits we need 1791 * directly in cpl->status, which would cause a read-modify-write cycle, 1792 * we'll avoid reading from the CPL altogether by filling in a local 1793 * cpl_status variable, then writing the whole thing. 1794 */ 1795 cpl_status.sct = sct; 1796 cpl_status.sc = sc; 1797 cpl_status.p = cq->phase; 1798 cpl->status = cpl_status; 1799 1800 /* Ensure the Completion Queue Entry is visible. */ 1801 spdk_wmb(); 1802 cq_tail_advance(cq); 1803 1804 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1805 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1806 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1807 if (err != 0) { 1808 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1809 ctrlr_id(ctrlr)); 1810 return err; 1811 } 1812 } 1813 1814 return 0; 1815 } 1816 1817 static void 1818 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1819 { 1820 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1821 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1822 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1823 free(vu_req); 1824 } 1825 } 1826 1827 static void 1828 delete_cq_done(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq) 1829 { 1830 assert(cq->cq_ref == 0); 1831 unmap_q(ctrlr, &cq->mapping); 1832 cq->size = 0; 1833 cq->cq_state = VFIO_USER_CQ_DELETED; 1834 cq->group = NULL; 1835 } 1836 1837 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1838 * and the controller is being shut down/reset or vfio-user client disconnects, 1839 * then the CQ is also deleted. 1840 */ 1841 static void 1842 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1843 { 1844 struct nvmf_vfio_user_cq *cq; 1845 uint16_t cqid; 1846 1847 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1848 sq->qid, sq); 1849 1850 /* Free SQ resources */ 1851 unmap_q(vu_ctrlr, &sq->mapping); 1852 1853 free_sq_reqs(sq); 1854 1855 sq->size = 0; 1856 1857 sq->sq_state = VFIO_USER_SQ_DELETED; 1858 1859 /* Controller RESET and SHUTDOWN are special cases, 1860 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1861 * will disconnect IO queue pairs. 
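 * In that path, delete_sq_done() drops the CQ reference itself (see just
 * below) rather than waiting for a Delete IO CQ command that may never come.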
1862 */ 1863 if (vu_ctrlr->reset_shn || vu_ctrlr->disconnect) { 1864 cqid = sq->cqid; 1865 cq = vu_ctrlr->cqs[cqid]; 1866 1867 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1868 cq->qid, cq); 1869 1870 assert(cq->cq_ref > 0); 1871 if (--cq->cq_ref == 0) { 1872 delete_cq_done(vu_ctrlr, cq); 1873 } 1874 } 1875 } 1876 1877 static void 1878 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1879 { 1880 struct nvmf_vfio_user_sq *sq; 1881 struct nvmf_vfio_user_cq *cq; 1882 1883 if (ctrlr == NULL) { 1884 return; 1885 } 1886 1887 sq = ctrlr->sqs[qid]; 1888 if (sq) { 1889 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free sqid:%u\n", ctrlr_id(ctrlr), qid); 1890 unmap_q(ctrlr, &sq->mapping); 1891 1892 free_sq_reqs(sq); 1893 1894 free(sq->mapping.sg); 1895 free(sq); 1896 ctrlr->sqs[qid] = NULL; 1897 } 1898 1899 cq = ctrlr->cqs[qid]; 1900 if (cq) { 1901 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1902 unmap_q(ctrlr, &cq->mapping); 1903 free(cq->mapping.sg); 1904 free(cq); 1905 ctrlr->cqs[qid] = NULL; 1906 } 1907 } 1908 1909 static int 1910 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1911 const uint16_t id) 1912 { 1913 struct nvmf_vfio_user_sq *sq; 1914 1915 assert(ctrlr != NULL); 1916 assert(transport != NULL); 1917 assert(ctrlr->sqs[id] == NULL); 1918 1919 sq = calloc(1, sizeof(*sq)); 1920 if (sq == NULL) { 1921 return -ENOMEM; 1922 } 1923 sq->mapping.sg = calloc(1, dma_sg_size()); 1924 if (sq->mapping.sg == NULL) { 1925 free(sq); 1926 return -ENOMEM; 1927 } 1928 1929 sq->qid = id; 1930 sq->qpair.qid = id; 1931 sq->qpair.transport = transport; 1932 sq->ctrlr = ctrlr; 1933 ctrlr->sqs[id] = sq; 1934 1935 TAILQ_INIT(&sq->free_reqs); 1936 1937 return 0; 1938 } 1939 1940 static int 1941 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1942 { 1943 struct nvmf_vfio_user_cq *cq; 1944 1945 assert(vu_ctrlr != NULL); 1946 assert(vu_ctrlr->cqs[id] == NULL); 1947 1948 cq = calloc(1, sizeof(*cq)); 1949 if (cq == NULL) { 1950 return -ENOMEM; 1951 } 1952 cq->mapping.sg = calloc(1, dma_sg_size()); 1953 if (cq->mapping.sg == NULL) { 1954 free(cq); 1955 return -ENOMEM; 1956 } 1957 1958 cq->qid = id; 1959 vu_ctrlr->cqs[id] = cq; 1960 1961 return 0; 1962 } 1963 1964 static int 1965 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1966 { 1967 struct nvmf_vfio_user_req *vu_req, *tmp; 1968 size_t req_size; 1969 uint32_t i; 1970 1971 req_size = sizeof(struct nvmf_vfio_user_req) + 1972 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1973 1974 for (i = 0; i < sq->size; i++) { 1975 struct spdk_nvmf_request *req; 1976 1977 vu_req = calloc(1, req_size); 1978 if (vu_req == NULL) { 1979 goto err; 1980 } 1981 1982 req = &vu_req->req; 1983 req->qpair = &sq->qpair; 1984 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1985 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1986 req->stripped_data = NULL; 1987 1988 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1989 } 1990 1991 return 0; 1992 1993 err: 1994 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1995 free(vu_req); 1996 } 1997 return -ENOMEM; 1998 } 1999 2000 static volatile uint32_t * 2001 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 2002 { 2003 return ctrlr->sdbl != NULL ? 
2004 ctrlr->sdbl->shadow_doorbells : 2005 ctrlr->bar0_doorbells; 2006 } 2007 2008 static uint16_t 2009 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 2010 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2011 { 2012 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2013 struct nvmf_vfio_user_sq *sq; 2014 uint32_t qsize; 2015 uint16_t cqid; 2016 uint16_t qid; 2017 int err; 2018 2019 qid = cmd->cdw10_bits.create_io_q.qid; 2020 cqid = cmd->cdw11_bits.create_io_sq.cqid; 2021 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2022 2023 if (ctrlr->sqs[qid] == NULL) { 2024 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 2025 if (err != 0) { 2026 *sct = SPDK_NVME_SCT_GENERIC; 2027 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2028 } 2029 } 2030 2031 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2032 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 2033 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2034 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2035 } 2036 2037 /* CQ must be created before SQ. */ 2038 if (!io_q_exists(ctrlr, cqid, true)) { 2039 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 2040 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2041 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 2042 } 2043 2044 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 2045 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 2046 *sct = SPDK_NVME_SCT_GENERIC; 2047 return SPDK_NVME_SC_INVALID_FIELD; 2048 } 2049 2050 sq = ctrlr->sqs[qid]; 2051 sq->size = qsize; 2052 2053 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 2054 qid, cqid); 2055 2056 sq->mapping.prp1 = cmd->dptr.prp.prp1; 2057 2058 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 2059 if (err) { 2060 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2061 *sct = SPDK_NVME_SCT_GENERIC; 2062 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2063 } 2064 2065 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 2066 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2067 q_addr(&sq->mapping)); 2068 2069 err = alloc_sq_reqs(ctrlr, sq); 2070 if (err < 0) { 2071 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 2072 *sct = SPDK_NVME_SCT_GENERIC; 2073 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2074 } 2075 2076 sq->cqid = cqid; 2077 ctrlr->cqs[sq->cqid]->cq_ref++; 2078 sq->sq_state = VFIO_USER_SQ_CREATED; 2079 *sq_headp(sq) = 0; 2080 2081 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 2082 2083 /* 2084 * We should always reset the doorbells. 2085 * 2086 * The Specification prohibits the controller from writing to the shadow 2087 * doorbell buffer, however older versions of the Linux NVMe driver 2088 * don't reset the shadow doorbell buffer after a Queue-Level or 2089 * Controller-Level reset, which means that we're left with garbage 2090 * doorbell values. 2091 */ 2092 *sq_dbl_tailp(sq) = 0; 2093 2094 if (ctrlr->sdbl != NULL) { 2095 sq->need_rearm = true; 2096 2097 if (!set_sq_eventidx(sq)) { 2098 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 2099 "sqid:%hu was initialized\n", 2100 ctrlr_id(ctrlr), qid); 2101 fail_ctrlr(ctrlr); 2102 *sct = SPDK_NVME_SCT_GENERIC; 2103 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2104 } 2105 } 2106 2107 /* 2108 * Create our new I/O qpair. This asynchronously invokes, on a suitable 2109 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 2110 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 2111 * connect command. 
This command is then eventually completed via 2112 * handle_queue_connect_rsp(). 2113 */ 2114 sq->create_io_sq_cmd = *cmd; 2115 sq->post_create_io_sq_completion = true; 2116 2117 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 2118 &sq->qpair); 2119 2120 *sct = SPDK_NVME_SCT_GENERIC; 2121 return SPDK_NVME_SC_SUCCESS; 2122 } 2123 2124 static uint16_t 2125 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 2126 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2127 { 2128 struct nvmf_vfio_user_cq *cq; 2129 uint32_t qsize; 2130 uint16_t qid; 2131 int err; 2132 2133 qid = cmd->cdw10_bits.create_io_q.qid; 2134 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2135 2136 if (ctrlr->cqs[qid] == NULL) { 2137 err = init_cq(ctrlr, qid); 2138 if (err != 0) { 2139 *sct = SPDK_NVME_SCT_GENERIC; 2140 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2141 } 2142 } 2143 2144 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2145 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2146 *sct = SPDK_NVME_SCT_GENERIC; 2147 return SPDK_NVME_SC_INVALID_FIELD; 2148 } 2149 2150 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 2151 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2152 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2153 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2154 } 2155 2156 cq = ctrlr->cqs[qid]; 2157 cq->size = qsize; 2158 2159 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2160 2161 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2162 2163 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2164 if (err) { 2165 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2166 *sct = SPDK_NVME_SCT_GENERIC; 2167 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2168 } 2169 2170 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2171 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2172 q_addr(&cq->mapping)); 2173 2174 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2175 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2176 cq->phase = true; 2177 cq->cq_state = VFIO_USER_CQ_CREATED; 2178 2179 *cq_tailp(cq) = 0; 2180 2181 /* 2182 * We should always reset the doorbells. 2183 * 2184 * The Specification prohibits the controller from writing to the shadow 2185 * doorbell buffer, however older versions of the Linux NVMe driver 2186 * don't reset the shadow doorbell buffer after a Queue-Level or 2187 * Controller-Level reset, which means that we're left with garbage 2188 * doorbell values. 2189 */ 2190 *cq_dbl_headp(cq) = 0; 2191 2192 *sct = SPDK_NVME_SCT_GENERIC; 2193 return SPDK_NVME_SC_SUCCESS; 2194 } 2195 2196 /* 2197 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2198 * on error. 
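 * Common validation (queue identifier, prior existence and queue size) is done
 * here; the type-specific setup is delegated to handle_create_io_cq() or
 * handle_create_io_sq(). Note that a successful SQ creation does not post a
 * completion from this path: it is deferred until the generated fabrics
 * connect command completes.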
2199 */ 2200 static int 2201 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2202 struct spdk_nvme_cmd *cmd, const bool is_cq) 2203 { 2204 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2205 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2206 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2207 uint32_t qsize; 2208 uint16_t qid; 2209 2210 assert(ctrlr != NULL); 2211 assert(cmd != NULL); 2212 2213 qid = cmd->cdw10_bits.create_io_q.qid; 2214 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2215 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2216 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2217 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2218 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2219 goto out; 2220 } 2221 2222 if (io_q_exists(ctrlr, qid, is_cq)) { 2223 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2224 is_cq ? 'c' : 's', qid); 2225 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2226 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2227 goto out; 2228 } 2229 2230 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2231 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2232 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2233 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2234 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2235 goto out; 2236 } 2237 2238 if (is_cq) { 2239 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2240 } else { 2241 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2242 2243 if (sct == SPDK_NVME_SCT_GENERIC && 2244 sc == SPDK_NVME_SC_SUCCESS) { 2245 /* Completion posted asynchronously. */ 2246 return 0; 2247 } 2248 } 2249 2250 out: 2251 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2252 } 2253 2254 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2255 * queue pair, so save the command id and controller in a context. 2256 */ 2257 struct vfio_user_delete_sq_ctx { 2258 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2259 uint16_t cid; 2260 }; 2261 2262 static void 2263 vfio_user_qpair_delete_cb(void *cb_arg) 2264 { 2265 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2266 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2267 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2268 2269 assert(admin_cq != NULL); 2270 assert(admin_cq->group != NULL); 2271 assert(admin_cq->group->group->thread != NULL); 2272 if (admin_cq->group->group->thread != spdk_get_thread()) { 2273 spdk_thread_send_msg(admin_cq->group->group->thread, 2274 vfio_user_qpair_delete_cb, 2275 cb_arg); 2276 } else { 2277 post_completion(vu_ctrlr, admin_cq, 0, 0, 2278 ctx->cid, 2279 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2280 free(ctx); 2281 } 2282 } 2283 2284 /* 2285 * Deletes a completion or submission I/O queue. 2286 */ 2287 static int 2288 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2289 struct spdk_nvme_cmd *cmd, const bool is_cq) 2290 { 2291 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2292 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2293 struct nvmf_vfio_user_sq *sq; 2294 struct nvmf_vfio_user_cq *cq; 2295 2296 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2297 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2298 cmd->cdw10_bits.delete_io_q.qid); 2299 2300 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2301 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2302 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2303 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2304 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2305 goto out; 2306 } 2307 2308 if (is_cq) { 2309 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2310 if (cq->cq_ref) { 2311 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2312 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2313 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2314 goto out; 2315 } 2316 delete_cq_done(ctrlr, cq); 2317 } else { 2318 /* 2319 * Deletion of the CQ is only deferred to delete_sq_done() on 2320 * VM reboot or CC.EN change, so we have to delete it in all 2321 * other cases. 2322 */ 2323 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2324 sq->delete_ctx = calloc(1, sizeof(*sq->delete_ctx)); 2325 if (!sq->delete_ctx) { 2326 sct = SPDK_NVME_SCT_GENERIC; 2327 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2328 goto out; 2329 } 2330 sq->delete_ctx->vu_ctrlr = ctrlr; 2331 sq->delete_ctx->cid = cmd->cid; 2332 sq->sq_state = VFIO_USER_SQ_DELETED; 2333 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2334 ctrlr->cqs[sq->cqid]->cq_ref--; 2335 2336 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 2337 return 0; 2338 } 2339 2340 out: 2341 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2342 } 2343 2344 /* 2345 * Configures Shadow Doorbells. 2346 */ 2347 static int 2348 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2349 { 2350 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2351 uint32_t dstrd; 2352 uintptr_t page_size, page_mask; 2353 uint64_t prp1, prp2; 2354 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2355 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2356 2357 assert(ctrlr != NULL); 2358 assert(ctrlr->endpoint != NULL); 2359 assert(cmd != NULL); 2360 2361 dstrd = doorbell_stride(ctrlr); 2362 page_size = memory_page_size(ctrlr); 2363 page_mask = memory_page_mask(ctrlr); 2364 2365 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2366 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2367 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2368 ctrlr_id(ctrlr)); 2369 2370 goto out; 2371 } 2372 2373 /* Verify guest physical addresses passed as PRPs. */ 2374 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2375 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2376 ctrlr_id(ctrlr)); 2377 2378 goto out; 2379 } 2380 2381 prp1 = cmd->dptr.prp.prp1; 2382 prp2 = cmd->dptr.prp.prp2; 2383 2384 SPDK_DEBUGLOG(nvmf_vfio, 2385 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2386 ctrlr_id(ctrlr), prp1, prp2); 2387 2388 if (prp1 == prp2 2389 || prp1 != (prp1 & page_mask) 2390 || prp2 != (prp2 & page_mask)) { 2391 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2392 ctrlr_id(ctrlr)); 2393 2394 goto out; 2395 } 2396 2397 /* Map guest physical addresses to our virtual address space. 
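 * PRP1 is the shadow doorbell buffer (written by the host) and PRP2 is the
 * EventIdx buffer (written by us); both were checked above to be distinct,
 * page-aligned guest physical addresses.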
*/ 2398 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2399 if (sdbl == NULL) { 2400 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2401 ctrlr_id(ctrlr)); 2402 2403 goto out; 2404 } 2405 2406 ctrlr->shadow_doorbell_buffer = prp1; 2407 ctrlr->eventidx_buffer = prp2; 2408 2409 SPDK_DEBUGLOG(nvmf_vfio, 2410 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2411 ctrlr_id(ctrlr), 2412 sdbl->iovs[0].iov_base, 2413 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2414 sdbl->iovs[1].iov_base, 2415 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2416 2417 2418 /* 2419 * Set all possible CQ head doorbells to polling mode now, such that we 2420 * don't have to worry about it later if the host creates more queues. 2421 * 2422 * We only ever want interrupts for writes to the SQ tail doorbells 2423 * (which are initialised in set_ctrlr_intr_mode() below). 2424 */ 2425 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2426 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2427 } 2428 2429 /* Update controller. */ 2430 SWAP(ctrlr->sdbl, sdbl); 2431 2432 /* 2433 * Copy doorbells from either the previous shadow doorbell buffer or the 2434 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2435 * 2436 * This needs to account for older versions of the Linux NVMe driver, 2437 * which don't clear out the buffer after a controller reset. 2438 */ 2439 copy_doorbells(ctrlr, sdbl != NULL ? 2440 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2441 ctrlr->sdbl->shadow_doorbells); 2442 2443 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2444 2445 ctrlr_kick(ctrlr); 2446 2447 sc = SPDK_NVME_SC_SUCCESS; 2448 2449 out: 2450 /* 2451 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2452 * more than once (pointless, but not prohibited by the spec), or 2453 * in case of an error. 2454 * 2455 * If this is the first time Doorbell Buffer Config was processed, 2456 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2457 * free_sdbl() becomes a noop. 2458 */ 2459 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2460 2461 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2462 } 2463 2464 /* Returns 0 on success and -errno on error. */ 2465 static int 2466 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2467 { 2468 assert(ctrlr != NULL); 2469 assert(cmd != NULL); 2470 2471 if (cmd->fuse != 0) { 2472 /* Fused admin commands are not supported. 
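 * Any admin command with a non-zero FUSE field is therefore completed with a
 * generic Invalid Field in Command status.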
*/ 2473 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2474 SPDK_NVME_SC_INVALID_FIELD, 2475 SPDK_NVME_SCT_GENERIC); 2476 } 2477 2478 switch (cmd->opc) { 2479 case SPDK_NVME_OPC_CREATE_IO_CQ: 2480 case SPDK_NVME_OPC_CREATE_IO_SQ: 2481 return handle_create_io_q(ctrlr, cmd, 2482 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2483 case SPDK_NVME_OPC_DELETE_IO_SQ: 2484 case SPDK_NVME_OPC_DELETE_IO_CQ: 2485 return handle_del_io_q(ctrlr, cmd, 2486 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2487 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2488 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2489 return handle_doorbell_buffer_config(ctrlr, cmd); 2490 } 2491 /* FALLTHROUGH */ 2492 default: 2493 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2494 } 2495 }
2496 2497 static int 2498 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2499 { 2500 struct nvmf_vfio_user_sq *sq = cb_arg; 2501 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2502 uint16_t sqid, cqid; 2503 2504 assert(sq != NULL); 2505 assert(vu_req != NULL); 2506 assert(vu_ctrlr != NULL); 2507 2508 if (spdk_likely(vu_req->iovcnt)) { 2509 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2510 index_to_sg_t(vu_req->sg, 0), 2511 vu_req->iov, vu_req->iovcnt); 2512 } 2513 sqid = sq->qid; 2514 cqid = sq->cqid; 2515 2516 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2517 vu_req->req.rsp->nvme_cpl.cdw0, 2518 sqid, 2519 vu_req->req.cmd->nvme_cmd.cid, 2520 vu_req->req.rsp->nvme_cpl.status.sc, 2521 vu_req->req.rsp->nvme_cpl.status.sct); 2522 }
2523 2524 static int 2525 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2526 struct spdk_nvme_cmd *cmd) 2527 { 2528 assert(sq != NULL); 2529 if (spdk_unlikely(nvmf_qpair_is_admin_queue(&sq->qpair))) { 2530 return consume_admin_cmd(ctrlr, cmd); 2531 } 2532 2533 return handle_cmd_req(ctrlr, cmd, sq); 2534 }
2535 2536 /* Returns the number of commands processed, or a negative value on error. */ 2537 static int 2538 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2539 struct nvmf_vfio_user_sq *sq) 2540 { 2541 struct spdk_nvme_cmd *queue; 2542 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 2543 int count = 0; 2544 uint32_t free_cq_slots; 2545 2546 assert(ctrlr != NULL); 2547 assert(sq != NULL); 2548 2549 if (ctrlr->sdbl != NULL && sq->qid != 0) { 2550 /* 2551 * Submission queue index has moved past the event index, so it 2552 * needs to be re-armed before we go to sleep. 2553 */ 2554 sq->need_rearm = true; 2555 } 2556 2557 free_cq_slots = cq_free_slots(cq); 2558 queue = q_addr(&sq->mapping); 2559 while (*sq_headp(sq) != new_tail) { 2560 int err; 2561 struct spdk_nvme_cmd *cmd; 2562 2563 /* 2564 * The Linux host NVMe driver can submit more commands than there are 2565 * free CQ slots available, so only process commands that have a free CQ slot. 2566 */ 2567 if (free_cq_slots-- == 0) { 2568 cq->last_head = *cq_dbl_headp(cq); 2569 2570 free_cq_slots = cq_free_slots(cq); 2571 if (free_cq_slots > 0) { 2572 continue; 2573 } 2574 2575 /* 2576 * If there are still no free CQ slots, kick the interrupt FD so that we 2577 * loop again and process the remaining SQ commands. 2578 * In polling mode, the remaining SQ commands are processed during the 2579 * next polling iteration. 2580 * The SQ head is advanced only for consumed commands.
2581 */ 2582 if (in_interrupt_mode(ctrlr->transport)) { 2583 eventfd_write(ctrlr->intr_fd, 1); 2584 } 2585 break; 2586 } 2587 2588 cmd = &queue[*sq_headp(sq)]; 2589 count++; 2590 2591 /* 2592 * SQHD must contain the new head pointer, so we must increase 2593 * it before we generate a completion. 2594 */ 2595 sq_head_advance(sq); 2596 2597 err = consume_cmd(ctrlr, sq, cmd); 2598 if (spdk_unlikely(err != 0)) { 2599 return err; 2600 } 2601 } 2602 2603 return count; 2604 } 2605 2606 /* Checks whether endpoint is connected from the same process */ 2607 static bool 2608 is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint) 2609 { 2610 struct ucred ucred; 2611 socklen_t ucredlen = sizeof(ucred); 2612 2613 if (endpoint == NULL) { 2614 return false; 2615 } 2616 2617 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred, 2618 &ucredlen) < 0) { 2619 SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno)); 2620 return false; 2621 } 2622 2623 return ucred.pid == getpid(); 2624 } 2625 2626 static void 2627 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2628 { 2629 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2630 struct nvmf_vfio_user_ctrlr *ctrlr; 2631 struct nvmf_vfio_user_sq *sq; 2632 struct nvmf_vfio_user_cq *cq; 2633 void *map_start, *map_end; 2634 int ret; 2635 2636 /* 2637 * We're not interested in any DMA regions that aren't mappable (we don't 2638 * support clients that don't share their memory). 2639 */ 2640 if (!info->vaddr) { 2641 return; 2642 } 2643 2644 map_start = info->mapping.iov_base; 2645 map_end = info->mapping.iov_base + info->mapping.iov_len; 2646 2647 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2648 (info->mapping.iov_len & MASK_2MB)) { 2649 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2650 info->vaddr, map_start, map_end); 2651 return; 2652 } 2653 2654 assert(endpoint != NULL); 2655 if (endpoint->ctrlr == NULL) { 2656 return; 2657 } 2658 ctrlr = endpoint->ctrlr; 2659 2660 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2661 map_start, map_end); 2662 2663 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2664 * check the protection bits before registering. When vfio client and server are run in same process 2665 * there is no need to register the same memory again. 
2666 */ 2667 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2668 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2669 if (ret) { 2670 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2671 map_start, map_end, ret); 2672 } 2673 } 2674 2675 pthread_mutex_lock(&endpoint->lock); 2676 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2677 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2678 continue; 2679 } 2680 2681 cq = ctrlr->cqs[sq->cqid]; 2682 2683 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2684 if (cq->size && q_addr(&cq->mapping) == NULL) { 2685 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2686 if (ret) { 2687 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2688 cq->qid, cq->mapping.prp1, 2689 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2690 continue; 2691 } 2692 } 2693 2694 if (sq->size) { 2695 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2696 if (ret) { 2697 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2698 sq->qid, sq->mapping.prp1, 2699 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2700 continue; 2701 } 2702 } 2703 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2704 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2705 } 2706 pthread_mutex_unlock(&endpoint->lock); 2707 } 2708 2709 static void 2710 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2711 { 2712 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2713 struct nvmf_vfio_user_sq *sq; 2714 struct nvmf_vfio_user_cq *cq; 2715 void *map_start, *map_end; 2716 int ret = 0; 2717 2718 if (!info->vaddr) { 2719 return; 2720 } 2721 2722 map_start = info->mapping.iov_base; 2723 map_end = info->mapping.iov_base + info->mapping.iov_len; 2724 2725 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2726 (info->mapping.iov_len & MASK_2MB)) { 2727 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2728 info->vaddr, map_start, map_end); 2729 return; 2730 } 2731 2732 assert(endpoint != NULL); 2733 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2734 map_start, map_end); 2735 2736 if (endpoint->ctrlr != NULL) { 2737 struct nvmf_vfio_user_ctrlr *ctrlr; 2738 ctrlr = endpoint->ctrlr; 2739 2740 pthread_mutex_lock(&endpoint->lock); 2741 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2742 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2743 unmap_q(ctrlr, &sq->mapping); 2744 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2745 } 2746 2747 cq = ctrlr->cqs[sq->cqid]; 2748 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2749 unmap_q(ctrlr, &cq->mapping); 2750 } 2751 } 2752 2753 if (ctrlr->sdbl != NULL) { 2754 size_t i; 2755 2756 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2757 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2758 2759 if (iov_base >= map_start && iov_base < map_end) { 2760 copy_doorbells(ctrlr, 2761 ctrlr->sdbl->shadow_doorbells, 2762 ctrlr->bar0_doorbells); 2763 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2764 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2765 ctrlr->sdbl = NULL; 2766 break; 2767 } 2768 } 2769 } 2770 2771 pthread_mutex_unlock(&endpoint->lock); 2772 } 2773 2774 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2775 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2776 if 
(ret) { 2777 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2778 map_start, map_end, ret); 2779 } 2780 } 2781 } 2782 2783 /* Used to initiate a controller-level reset or a controller shutdown. */ 2784 static void 2785 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2786 { 2787 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2788 ctrlr_id(vu_ctrlr)); 2789 2790 /* Unmap Admin queue. */ 2791 2792 assert(vu_ctrlr->sqs[0] != NULL); 2793 assert(vu_ctrlr->cqs[0] != NULL); 2794 2795 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2796 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2797 2798 vu_ctrlr->sqs[0]->size = 0; 2799 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2800 2801 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2802 2803 vu_ctrlr->cqs[0]->size = 0; 2804 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2805 2806 /* 2807 * For PCIe controller reset or shutdown, we will drop all AER 2808 * responses. 2809 */ 2810 spdk_nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2811 2812 /* Free the shadow doorbell buffer. */ 2813 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 2814 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2815 vu_ctrlr->sdbl = NULL; 2816 } 2817 2818 /* Used to re-enable the controller after a controller-level reset. */ 2819 static int 2820 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2821 { 2822 int err; 2823 2824 assert(vu_ctrlr != NULL); 2825 2826 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2827 ctrlr_id(vu_ctrlr)); 2828 2829 err = acq_setup(vu_ctrlr); 2830 if (err != 0) { 2831 return err; 2832 } 2833 2834 err = asq_setup(vu_ctrlr); 2835 if (err != 0) { 2836 return err; 2837 } 2838 2839 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2840 2841 return 0; 2842 } 2843 2844 static int 2845 nvmf_vfio_user_prop_req_rsp_set(struct nvmf_vfio_user_req *req, 2846 struct nvmf_vfio_user_sq *sq) 2847 { 2848 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2849 union spdk_nvme_cc_register cc, diff; 2850 2851 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2852 assert(sq->ctrlr != NULL); 2853 vu_ctrlr = sq->ctrlr; 2854 2855 if (req->req.cmd->prop_set_cmd.ofst != offsetof(struct spdk_nvme_registers, cc)) { 2856 return 0; 2857 } 2858 2859 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2860 diff.raw = cc.raw ^ req->cc.raw; 2861 2862 if (diff.bits.en) { 2863 if (cc.bits.en) { 2864 int ret = enable_ctrlr(vu_ctrlr); 2865 if (ret) { 2866 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2867 return ret; 2868 } 2869 vu_ctrlr->reset_shn = false; 2870 } else { 2871 vu_ctrlr->reset_shn = true; 2872 } 2873 } 2874 2875 if (diff.bits.shn) { 2876 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2877 vu_ctrlr->reset_shn = true; 2878 } 2879 } 2880 2881 if (vu_ctrlr->reset_shn) { 2882 disable_ctrlr(vu_ctrlr); 2883 } 2884 return 0; 2885 } 2886 2887 static int 2888 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2889 { 2890 struct nvmf_vfio_user_sq *sq = cb_arg; 2891 2892 assert(sq != NULL); 2893 assert(req != NULL); 2894 2895 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2896 assert(sq->ctrlr != NULL); 2897 assert(req != NULL); 2898 2899 memcpy(req->req.iov[0].iov_base, 2900 &req->req.rsp->prop_get_rsp.value.u64, 2901 req->req.length); 2902 return 0; 2903 } 2904 2905 return nvmf_vfio_user_prop_req_rsp_set(req, sq); 2906 } 2907 2908 /* 2909 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2910 * doorbell is written via 
access_bar0_fn(). 2911 * 2912 * DSTRD is set to fixed value 0 for NVMf. 2913 * 2914 */ 2915 static int 2916 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2917 const size_t count, loff_t pos, const bool is_write) 2918 { 2919 struct nvmf_vfio_user_poll_group *group; 2920 2921 assert(ctrlr != NULL); 2922 assert(buf != NULL); 2923 2924 if (spdk_unlikely(!is_write)) { 2925 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2926 ctrlr_id(ctrlr), pos); 2927 errno = EPERM; 2928 return -1; 2929 } 2930 2931 if (spdk_unlikely(count != sizeof(uint32_t))) { 2932 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2933 ctrlr_id(ctrlr), count); 2934 errno = EINVAL; 2935 return -1; 2936 } 2937 2938 pos -= NVME_DOORBELLS_OFFSET; 2939 2940 /* pos must be dword aligned */ 2941 if (spdk_unlikely((pos & 0x3) != 0)) { 2942 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2943 errno = EINVAL; 2944 return -1; 2945 } 2946 2947 /* convert byte offset to array index */ 2948 pos >>= 2; 2949 2950 if (spdk_unlikely(pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2)) { 2951 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2952 errno = EINVAL; 2953 return -1; 2954 } 2955 2956 ctrlr->bar0_doorbells[pos] = *buf; 2957 spdk_wmb(); 2958 2959 group = ctrlr_to_poll_group(ctrlr); 2960 if (pos == 1) { 2961 group->stats.cqh_admin_writes++; 2962 } else if (pos & 1) { 2963 group->stats.cqh_io_writes++; 2964 } 2965 2966 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2967 ctrlr_id(ctrlr), (pos & 1) ? "cqid" : "sqid", 2968 pos / 2, *buf); 2969 2970 2971 return 0; 2972 } 2973 2974 static size_t 2975 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2976 char *buf, size_t count, loff_t pos, 2977 bool is_write) 2978 { 2979 struct nvmf_vfio_user_req *req; 2980 const struct spdk_nvmf_registers *regs; 2981 2982 if ((count != 4) && (count != 8)) { 2983 errno = EINVAL; 2984 return -1; 2985 } 2986 2987 /* Construct a Fabric Property Get/Set command and send it */ 2988 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2989 if (req == NULL) { 2990 errno = ENOBUFS; 2991 return -1; 2992 } 2993 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2994 req->cc.raw = regs->cc.raw; 2995 2996 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2997 req->cb_arg = vu_ctrlr->sqs[0]; 2998 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2999 req->req.cmd->prop_set_cmd.cid = 0; 3000 if (count == 4) { 3001 req->req.cmd->prop_set_cmd.attrib.size = 0; 3002 } else { 3003 req->req.cmd->prop_set_cmd.attrib.size = 1; 3004 } 3005 req->req.cmd->prop_set_cmd.ofst = pos; 3006 if (is_write) { 3007 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 3008 if (req->req.cmd->prop_set_cmd.attrib.size) { 3009 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 3010 } else { 3011 req->req.cmd->prop_set_cmd.value.u32.high = 0; 3012 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 3013 } 3014 } else { 3015 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 3016 } 3017 req->req.length = count; 3018 SPDK_IOV_ONE(req->req.iov, &req->req.iovcnt, buf, req->req.length); 3019 3020 spdk_nvmf_request_exec_fabrics(&req->req); 3021 3022 return count; 3023 } 3024 3025 static ssize_t 3026 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 3027 bool is_write) 3028 { 3029 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3030 struct nvmf_vfio_user_ctrlr *ctrlr; 3031 int ret; 3032 3033 ctrlr = 
endpoint->ctrlr; 3034 if (spdk_unlikely(endpoint->need_async_destroy || !ctrlr)) { 3035 errno = EIO; 3036 return -1; 3037 } 3038 3039 if (pos >= NVME_DOORBELLS_OFFSET) { 3040 /* 3041 * The fact that the doorbells can be memory mapped doesn't mean 3042 * that the client (VFIO in QEMU) is obliged to memory map them, 3043 * it might still elect to access them via regular read/write; 3044 * we might also have had disable_mappable_bar0 set. 3045 */ 3046 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 3047 pos, is_write); 3048 if (ret == 0) { 3049 return count; 3050 } 3051 return ret; 3052 } 3053 3054 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 3055 } 3056 3057 static ssize_t 3058 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 3059 bool is_write) 3060 { 3061 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3062 3063 if (is_write) { 3064 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 3065 endpoint_id(endpoint), offset, offset + count); 3066 errno = EINVAL; 3067 return -1; 3068 } 3069 3070 if (offset + count > NVME_REG_CFG_SIZE) { 3071 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 3072 endpoint_id(endpoint), offset, count, 3073 NVME_REG_CFG_SIZE); 3074 errno = ERANGE; 3075 return -1; 3076 } 3077 3078 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 3079 3080 return count; 3081 } 3082 3083 static void 3084 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 3085 { 3086 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3087 3088 if (level >= LOG_DEBUG) { 3089 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3090 } else if (level >= LOG_INFO) { 3091 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3092 } else if (level >= LOG_NOTICE) { 3093 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 3094 } else if (level >= LOG_WARNING) { 3095 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 3096 } else { 3097 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 3098 } 3099 } 3100 3101 static int 3102 vfio_user_get_log_level(void) 3103 { 3104 int level; 3105 3106 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3107 return LOG_DEBUG; 3108 } 3109 3110 level = spdk_log_to_syslog_level(spdk_log_get_level()); 3111 if (level < 0) { 3112 return LOG_ERR; 3113 } 3114 3115 return level; 3116 } 3117 3118 static void 3119 init_pci_config_space(vfu_pci_config_space_t *p) 3120 { 3121 /* MLBAR */ 3122 p->hdr.bars[0].raw = 0x0; 3123 /* MUBAR */ 3124 p->hdr.bars[1].raw = 0x0; 3125 3126 /* vendor specific, let's set them to zero for now */ 3127 p->hdr.bars[3].raw = 0x0; 3128 p->hdr.bars[4].raw = 0x0; 3129 p->hdr.bars[5].raw = 0x0; 3130 3131 /* enable INTx */ 3132 p->hdr.intr.ipin = 0x1; 3133 } 3134 3135 struct ctrlr_quiesce_ctx { 3136 struct nvmf_vfio_user_endpoint *endpoint; 3137 struct nvmf_vfio_user_poll_group *group; 3138 int status; 3139 }; 3140 3141 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 3142 3143 static void 3144 _vfio_user_endpoint_resume_done_msg(void *ctx) 3145 { 3146 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3147 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3148 3149 endpoint->need_resume = false; 3150 3151 if (!vu_ctrlr) { 3152 return; 3153 } 3154 3155 if (!vu_ctrlr->queued_quiesce) { 3156 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3157 3158 /* 3159 * We might have ignored new SQ entries while we were quiesced: 3160 * kick ourselves so we'll 
definitely check again while in 3161 * VFIO_USER_CTRLR_RUNNING state. 3162 */ 3163 if (in_interrupt_mode(endpoint->transport)) { 3164 ctrlr_kick(vu_ctrlr); 3165 } 3166 return; 3167 } 3168 3169
3170 /* 3171 * Basically, once we call `vfu_device_quiesced` the device is 3172 * unquiesced from libvfio-user's perspective, so from the moment 3173 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device 3174 * again. However, because resuming the NVMf subsystem is an asynchronous 3175 * operation, this quiesce might come _before_ the NVMf subsystem has 3176 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 3177 * need to check whether a quiesce was requested. 3178 */ 3179 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 3180 ctrlr_id(vu_ctrlr)); 3181 ctrlr_quiesce(vu_ctrlr); 3182 }
3183 3184 static void 3185 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 3186 void *cb_arg, int status) 3187 { 3188 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 3189 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3190 3191 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status); 3192 3193 if (!vu_ctrlr) { 3194 return; 3195 } 3196 3197 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 3198 }
3199 3200 static void 3201 vfio_user_quiesce_done(void *ctx) 3202 { 3203 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3204 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3205 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3206 int ret; 3207 3208 if (!vu_ctrlr) { 3209 free(quiesce_ctx); 3210 return; 3211 } 3212 3213 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 3214 3215 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 3216 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3217 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 3218 vu_ctrlr->queued_quiesce = false; 3219 free(quiesce_ctx); 3220 3221 /* `vfu_device_quiesced` can change the migration state, 3222 * so we need to re-check `vu_ctrlr->state`. 3223 */ 3224 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3225 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 3226 return; 3227 } 3228 3229 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 3230 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3231 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3232 vfio_user_endpoint_resume_done, endpoint); 3233 if (ret < 0) { 3234 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3235 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3236 } 3237 }
3238 3239 static void 3240 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3241 void *ctx, int status) 3242 { 3243 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3244 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3245 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3246 3247 if (!vu_ctrlr) { 3248 free(quiesce_ctx); 3249 return; 3250 } 3251 3252 quiesce_ctx->status = status; 3253 3254 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3255 ctrlr_id(vu_ctrlr), status); 3256 3257 spdk_thread_send_msg(vu_ctrlr->thread, 3258 vfio_user_quiesce_done, ctx); 3259 }
3260 3261 /* 3262 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3263 * we've already set ctrlr->state, so we won't process new entries, but we need 3264 * to ensure that this PG is quiesced.
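 * We do this by bouncing a message across each poll group's thread in turn
 * (vfio_user_quiesce_pg()); by the time the message runs on a given thread,
 * any SQ poll that was in progress there has completed.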
This only works because there's no 3265 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3266 * 3267 * Once we've walked all PGs, we need to pause any submitted I/O via 3268 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3269 */ 3270 static void 3271 vfio_user_quiesce_pg(void *ctx) 3272 { 3273 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3274 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3275 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3276 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3277 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3278 int ret; 3279 3280 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3281 3282 if (!vu_ctrlr) { 3283 free(quiesce_ctx); 3284 return; 3285 } 3286 3287 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3288 if (quiesce_ctx->group != NULL) { 3289 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3290 vfio_user_quiesce_pg, quiesce_ctx); 3291 return; 3292 } 3293 3294 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3295 vfio_user_pause_done, quiesce_ctx); 3296 if (ret < 0) { 3297 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3298 endpoint_id(endpoint), ret); 3299 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3300 fail_ctrlr(vu_ctrlr); 3301 free(quiesce_ctx); 3302 } 3303 } 3304 3305 static void 3306 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3307 { 3308 struct ctrlr_quiesce_ctx *quiesce_ctx; 3309 3310 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3311 3312 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3313 if (!quiesce_ctx) { 3314 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3315 assert(false); 3316 return; 3317 } 3318 3319 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3320 quiesce_ctx->status = 0; 3321 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3322 3323 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3324 vfio_user_quiesce_pg, quiesce_ctx); 3325 } 3326 3327 static int 3328 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3329 { 3330 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3331 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3332 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3333 3334 if (!vu_ctrlr) { 3335 return 0; 3336 } 3337 3338 /* NVMf library will destruct controller when no 3339 * connected queue pairs. 3340 */ 3341 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3342 return 0; 3343 } 3344 3345 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3346 3347 /* There is no race condition here as device quiesce callback 3348 * and nvmf_prop_set_cc() are running in the same thread context. 
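 * That makes it safe to read CC.EN, CSTS.RDY and CSTS.SHST below without any
 * additional synchronization.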
3349 */ 3350 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3351 return 0; 3352 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3353 return 0; 3354 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3355 return 0; 3356 } 3357 3358 switch (vu_ctrlr->state) { 3359 case VFIO_USER_CTRLR_PAUSED: 3360 case VFIO_USER_CTRLR_MIGRATING: 3361 return 0; 3362 case VFIO_USER_CTRLR_RUNNING: 3363 ctrlr_quiesce(vu_ctrlr); 3364 break; 3365 case VFIO_USER_CTRLR_RESUMING: 3366 vu_ctrlr->queued_quiesce = true; 3367 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3368 vu_ctrlr->state); 3369 break; 3370 default: 3371 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3372 break; 3373 } 3374 3375 errno = EBUSY; 3376 return -1; 3377 } 3378 3379 static void 3380 vfio_user_ctrlr_dump_migr_data(const char *name, 3381 struct vfio_user_nvme_migr_state *migr_data, 3382 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3383 { 3384 struct spdk_nvmf_registers *regs; 3385 struct nvme_migr_sq_state *sq; 3386 struct nvme_migr_cq_state *cq; 3387 uint32_t *doorbell_base; 3388 uint32_t i; 3389 3390 SPDK_NOTICELOG("Dump %s\n", name); 3391 3392 regs = &migr_data->nvmf_data.regs; 3393 doorbell_base = (uint32_t *)&migr_data->doorbells; 3394 3395 SPDK_NOTICELOG("Registers\n"); 3396 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3397 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3398 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3399 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3400 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3401 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3402 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3403 3404 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3405 3406 if (sdbl != NULL) { 3407 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3408 migr_data->ctrlr_header.shadow_doorbell_buffer); 3409 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3410 migr_data->ctrlr_header.eventidx_buffer); 3411 } 3412 3413 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3414 sq = &migr_data->qps[i].sq; 3415 cq = &migr_data->qps[i].cq; 3416 3417 if (sq->size) { 3418 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3419 if (i > 0 && sdbl != NULL) { 3420 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3421 sq->sqid, 3422 sdbl->shadow_doorbells[queue_index(i, false)], 3423 sdbl->eventidxs[queue_index(i, false)]); 3424 } 3425 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3426 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3427 } 3428 3429 if (cq->size) { 3430 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3431 if (i > 0 && sdbl != NULL) { 3432 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3433 cq->cqid, 3434 sdbl->shadow_doorbells[queue_index(i, true)], 3435 sdbl->eventidxs[queue_index(i, true)]); 3436 } 3437 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3438 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3439 } 3440 } 3441 3442 SPDK_NOTICELOG("%s Dump Done\n", name); 3443 } 3444 3445 /* Read region 9 content and restore it to migration data structures */ 3446 static int 3447 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3448 struct vfio_user_nvme_migr_state *migr_state) 3449 { 3450 void *data_ptr = endpoint->migr_data; 3451 3452 /* Load vfio_user_nvme_migr_header first */ 3453 
memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3454 /* TODO: version check */ 3455 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3456 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3457 return -EINVAL; 3458 } 3459 3460 /* Load nvmf controller data */ 3461 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3462 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3463 3464 /* Load queue pairs */ 3465 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3466 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3467 3468 /* Load doorbells */ 3469 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3470 memcpy(&migr_state->doorbells, data_ptr, 3471 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3472 3473 /* Load CFG */ 3474 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3475 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3476 3477 return 0; 3478 } 3479 3480 3481 static void 3482 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3483 { 3484 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3485 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3486 struct nvmf_vfio_user_sq *sq; 3487 struct nvmf_vfio_user_cq *cq; 3488 uint64_t data_offset; 3489 void *data_ptr; 3490 uint32_t *doorbell_base; 3491 uint32_t i = 0; 3492 uint16_t sqid, cqid; 3493 struct vfio_user_nvme_migr_state migr_state = { 3494 .nvmf_data = { 3495 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3496 .regs_size = sizeof(struct spdk_nvmf_registers), 3497 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3498 } 3499 }; 3500 3501 /* Save all data to vfio_user_nvme_migr_state first, then we will 3502 * copy it to device migration region at last. 
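 * The header itself is copied into the region last, since the offsets and
 * lengths it records are only known once the other sections have been laid
 * out.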
3503 */ 3504 3505 /* save magic number */ 3506 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3507 3508 /* save controller data */ 3509 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3510 3511 /* save connected queue pairs */ 3512 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3513 /* save sq */ 3514 sqid = sq->qid; 3515 migr_state.qps[sqid].sq.sqid = sq->qid; 3516 migr_state.qps[sqid].sq.cqid = sq->cqid; 3517 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3518 migr_state.qps[sqid].sq.size = sq->size; 3519 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3520 3521 /* save cq, for shared cq case, cq may be saved multiple times */ 3522 cqid = sq->cqid; 3523 cq = vu_ctrlr->cqs[cqid]; 3524 migr_state.qps[cqid].cq.cqid = cqid; 3525 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3526 migr_state.qps[cqid].cq.ien = cq->ien; 3527 migr_state.qps[cqid].cq.iv = cq->iv; 3528 migr_state.qps[cqid].cq.size = cq->size; 3529 migr_state.qps[cqid].cq.phase = cq->phase; 3530 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3531 i++; 3532 } 3533 3534 assert(i > 0); 3535 migr_state.ctrlr_header.num_io_queues = i - 1; 3536 3537 /* Save doorbells */ 3538 doorbell_base = (uint32_t *)&migr_state.doorbells; 3539 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3540 3541 /* Save PCI configuration space */ 3542 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3543 3544 /* Save all data to device migration region */ 3545 data_ptr = endpoint->migr_data; 3546 3547 /* Copy nvmf controller data */ 3548 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3549 data_ptr += data_offset; 3550 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3551 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3552 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3553 3554 /* Copy queue pairs */ 3555 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3556 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3557 migr_state.ctrlr_header.qp_offset = data_offset; 3558 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3559 struct nvme_migr_cq_state)); 3560 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3561 3562 /* Copy doorbells */ 3563 data_offset += migr_state.ctrlr_header.qp_len; 3564 data_ptr += migr_state.ctrlr_header.qp_len; 3565 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3566 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3567 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3568 3569 /* Copy CFG */ 3570 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3571 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3572 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3573 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3574 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3575 3576 /* copy shadow doorbells */ 3577 if (vu_ctrlr->sdbl != NULL) { 3578 migr_state.ctrlr_header.sdbl = true; 3579 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3580 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3581 } 3582 3583 /* Copy nvme migration header finally */ 3584 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3585 3586 if 
(SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3587 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3588 } 3589 } 3590 3591 /* 3592 * If we are about to close the connection, we need to unregister the interrupt, 3593 * as the library will subsequently close the file descriptor we registered. 3594 */ 3595 static int 3596 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3597 { 3598 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3599 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3600 3601 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3602 3603 if (type == VFU_RESET_LOST_CONN) { 3604 if (ctrlr != NULL) { 3605 spdk_interrupt_unregister(&ctrlr->intr); 3606 ctrlr->intr_fd = -1; 3607 } 3608 return 0; 3609 } 3610 3611 /* FIXME: LOST_CONN case ? */ 3612 if (ctrlr->sdbl != NULL) { 3613 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 3614 free_sdbl(vfu_ctx, ctrlr->sdbl); 3615 ctrlr->sdbl = NULL; 3616 } 3617 3618 /* FIXME: much more needed here. */ 3619 3620 return 0; 3621 } 3622 3623 static int 3624 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3625 struct vfio_user_nvme_migr_state *migr_state) 3626 { 3627 uint32_t i, qsize = 0; 3628 uint16_t sqid, cqid; 3629 struct vfio_user_nvme_migr_qp migr_qp; 3630 void *addr; 3631 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3632 int ret; 3633 3634 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3635 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3636 } 3637 3638 /* restore submission queues */ 3639 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3640 migr_qp = migr_state->qps[i]; 3641 3642 qsize = migr_qp.sq.size; 3643 if (qsize) { 3644 struct nvmf_vfio_user_sq *sq; 3645 3646 sqid = migr_qp.sq.sqid; 3647 if (sqid != i) { 3648 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3649 return -EINVAL; 3650 } 3651 3652 /* allocate sq if necessary */ 3653 if (vu_ctrlr->sqs[sqid] == NULL) { 3654 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3655 if (ret) { 3656 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3657 return -EFAULT; 3658 } 3659 } 3660 3661 sq = vu_ctrlr->sqs[sqid]; 3662 sq->size = qsize; 3663 3664 ret = alloc_sq_reqs(vu_ctrlr, sq); 3665 if (ret) { 3666 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3667 return -EFAULT; 3668 } 3669 3670 /* restore sq */ 3671 sq->sq_state = VFIO_USER_SQ_CREATED; 3672 sq->cqid = migr_qp.sq.cqid; 3673 *sq_headp(sq) = migr_qp.sq.head; 3674 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3675 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3676 sq->mapping.prp1, sq->size * 64, 3677 sq->mapping.sg, &sq->mapping.iov, 3678 PROT_READ); 3679 if (addr == NULL) { 3680 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3681 sqid, sq->mapping.prp1, sq->size); 3682 return -EFAULT; 3683 } 3684 cqs_ref[sq->cqid]++; 3685 } 3686 } 3687 3688 /* restore completion queues */ 3689 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3690 migr_qp = migr_state->qps[i]; 3691 3692 qsize = migr_qp.cq.size; 3693 if (qsize) { 3694 struct nvmf_vfio_user_cq *cq; 3695 3696 /* restore cq */ 3697 cqid = migr_qp.sq.cqid; 3698 assert(cqid == i); 3699 3700 /* allocate cq if necessary */ 3701 if (vu_ctrlr->cqs[cqid] == NULL) { 3702 ret = init_cq(vu_ctrlr, cqid); 3703 if (ret) { 3704 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3705 return -EFAULT; 3706 } 3707 } 3708 3709 cq = vu_ctrlr->cqs[cqid]; 3710 3711 cq->size = qsize; 3712 3713 
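/* Restore the remaining CQ state saved by vfio_user_migr_ctrlr_save_data()
 * and re-map the queue from guest memory; the 16-byte entry size passed to
 * map_one() below matches sizeof(struct spdk_nvme_cpl).
 */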
cq->cq_state = VFIO_USER_CQ_CREATED; 3714 cq->cq_ref = cqs_ref[cqid]; 3715 *cq_tailp(cq) = migr_qp.cq.tail; 3716 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3717 cq->ien = migr_qp.cq.ien; 3718 cq->iv = migr_qp.cq.iv; 3719 cq->phase = migr_qp.cq.phase; 3720 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3721 cq->mapping.prp1, cq->size * 16, 3722 cq->mapping.sg, &cq->mapping.iov, 3723 PROT_READ | PROT_WRITE); 3724 if (addr == NULL) { 3725 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3726 cqid, cq->mapping.prp1, cq->size); 3727 return -EFAULT; 3728 } 3729 } 3730 } 3731 3732 return 0; 3733 } 3734 3735 static int 3736 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3737 { 3738 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3739 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3740 uint32_t *doorbell_base; 3741 struct spdk_nvme_cmd cmd; 3742 uint16_t i; 3743 int rc = 0; 3744 struct vfio_user_nvme_migr_state migr_state = { 3745 .nvmf_data = { 3746 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3747 .regs_size = sizeof(struct spdk_nvmf_registers), 3748 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3749 } 3750 }; 3751 3752 assert(endpoint->migr_data != NULL); 3753 assert(ctrlr != NULL); 3754 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3755 if (rc) { 3756 return rc; 3757 } 3758 3759 /* restore shadow doorbells */ 3760 if (migr_state.ctrlr_header.sdbl) { 3761 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3762 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3763 migr_state.ctrlr_header.shadow_doorbell_buffer, 3764 migr_state.ctrlr_header.eventidx_buffer, 3765 memory_page_size(vu_ctrlr)); 3766 if (sdbl == NULL) { 3767 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3768 ctrlr_id(vu_ctrlr)); 3769 return -1; 3770 } 3771 3772 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3773 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3774 3775 SWAP(vu_ctrlr->sdbl, sdbl); 3776 } 3777 3778 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3779 if (rc) { 3780 return rc; 3781 } 3782 3783 /* restore PCI configuration space */ 3784 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3785 3786 doorbell_base = (uint32_t *)&migr_state.doorbells; 3787 /* restore doorbells from saved registers */ 3788 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3789 3790 /* restore nvmf controller data */ 3791 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3792 if (rc) { 3793 return rc; 3794 } 3795 3796 /* resubmit pending AERs */ 3797 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) { 3798 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3799 migr_state.nvmf_data.aer_cids[i]); 3800 memset(&cmd, 0, sizeof(cmd)); 3801 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3802 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3803 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3804 if (spdk_unlikely(rc)) { 3805 break; 3806 } 3807 } 3808 3809 return rc; 3810 } 3811 3812 static void 3813 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3814 { 3815 uint32_t i; 3816 struct nvmf_vfio_user_sq *sq; 3817 3818 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
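 * Its doorbell pointers are therefore wired directly to BAR0 below, while
 * vfio_user_ctrlr_switch_doorbells() takes care of the I/O queues when a
 * shadow doorbell buffer has been restored.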
*/ 3819 3820 if (vu_ctrlr->sqs[0] != NULL) { 3821 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3822 queue_index(0, false); 3823 } 3824 3825 if (vu_ctrlr->cqs[0] != NULL) { 3826 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3827 queue_index(0, true); 3828 } 3829 3830 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3831 3832 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3833 sq = vu_ctrlr->sqs[i]; 3834 if (!sq || !sq->size) { 3835 continue; 3836 } 3837 3838 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3839 /* ADMIN queue pair is always in the poll group, just enable it */ 3840 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3841 } else { 3842 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3843 } 3844 } 3845 } 3846 3847 /* 3848 * We are in stop-and-copy state, but still potentially have some current dirty 3849 * sgls: while we're quiesced and thus should have no active requests, we still 3850 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3851 * mapped read only). 3852 * 3853 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3854 * mark them dirty now. 3855 */ 3856 static void 3857 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3858 { 3859 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3860 3861 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3862 3863 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3864 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3865 3866 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3867 continue; 3868 } 3869 3870 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3871 } 3872 3873 if (vu_ctrlr->sdbl != NULL) { 3874 dma_sg_t *sg; 3875 size_t i; 3876 3877 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3878 ++i) { 3879 3880 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3881 continue; 3882 } 3883 3884 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3885 3886 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3887 } 3888 } 3889 } 3890 3891 static int 3892 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3893 { 3894 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3895 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3896 struct nvmf_vfio_user_sq *sq; 3897 int ret = 0; 3898 3899 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3900 vu_ctrlr->state, state); 3901 3902 switch (state) { 3903 case VFU_MIGR_STATE_STOP_AND_COPY: 3904 vu_ctrlr->in_source_vm = true; 3905 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3906 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3907 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3908 break; 3909 case VFU_MIGR_STATE_STOP: 3910 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3911 /* The controller associates with source VM is dead now, we will resume 3912 * the subsystem after destroying the controller data structure, then the 3913 * subsystem can be re-used for another new client. 3914 */ 3915 if (vu_ctrlr->in_source_vm) { 3916 endpoint->need_resume = true; 3917 } 3918 break; 3919 case VFU_MIGR_STATE_PRE_COPY: 3920 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3921 break; 3922 case VFU_MIGR_STATE_RESUME: 3923 /* 3924 * Destination ADMIN queue pair is connected when starting the VM, 3925 * but the ADMIN queue pair isn't enabled in destination VM, the poll 3926 * group will do nothing to ADMIN queue pair for now. 
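		 * It is enabled later, after the migration data has been restored,
		 * when vfio_user_migr_ctrlr_enable_sqs() runs on the transition to
		 * VFU_MIGR_STATE_RUNNING.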
3927 */ 3928 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3929 break; 3930 } 3931 3932 assert(!vu_ctrlr->in_source_vm); 3933 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3934 3935 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3936 assert(sq != NULL); 3937 assert(sq->qpair.qid == 0); 3938 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3939 3940 /* Free ADMIN SQ resources first, SQ resources will be 3941 * allocated based on queue size from source VM. 3942 */ 3943 free_sq_reqs(sq); 3944 sq->size = 0; 3945 break; 3946 case VFU_MIGR_STATE_RUNNING: 3947 3948 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3949 break; 3950 } 3951 3952 if (!vu_ctrlr->in_source_vm) { 3953 /* Restore destination VM from BAR9 */ 3954 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3955 if (ret) { 3956 break; 3957 } 3958 3959 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3960 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3961 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3962 /* FIXME where do we resume nvmf? */ 3963 } else { 3964 /* Rollback source VM */ 3965 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3966 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3967 vfio_user_endpoint_resume_done, endpoint); 3968 if (ret < 0) { 3969 /* TODO: fail controller with CFS bit set */ 3970 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3971 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3972 } 3973 } 3974 vu_ctrlr->migr_data_prepared = false; 3975 vu_ctrlr->in_source_vm = false; 3976 break; 3977 3978 default: 3979 return -EINVAL; 3980 } 3981 3982 return ret; 3983 } 3984 3985 static uint64_t 3986 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3987 { 3988 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3989 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3990 uint64_t pending_bytes; 3991 3992 if (ctrlr->migr_data_prepared) { 3993 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3994 pending_bytes = 0; 3995 } else { 3996 pending_bytes = vfio_user_migr_data_len(); 3997 } 3998 3999 SPDK_DEBUGLOG(nvmf_vfio, 4000 "%s current state %u, pending bytes 0x%"PRIx64"\n", 4001 endpoint_id(endpoint), ctrlr->state, pending_bytes); 4002 4003 return pending_bytes; 4004 } 4005 4006 static int 4007 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 4008 { 4009 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 4010 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 4011 4012 /* 4013 * When transitioning to pre-copy state we set pending_bytes to 0, 4014 * so the vfio-user client shouldn't attempt to read any migration 4015 * data. This is not yet guaranteed by libvfio-user. 
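	 * Until that is guaranteed, treat a controller that is not MIGRATING
	 * as having an empty migration data window.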
4016 */ 4017 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 4018 assert(size != NULL); 4019 *offset = 0; 4020 *size = 0; 4021 return 0; 4022 } 4023 4024 if (ctrlr->in_source_vm) { /* migration source */ 4025 assert(size != NULL); 4026 *size = vfio_user_migr_data_len(); 4027 vfio_user_migr_ctrlr_save_data(ctrlr); 4028 } else { /* migration destination */ 4029 assert(size == NULL); 4030 assert(!ctrlr->migr_data_prepared); 4031 } 4032 *offset = 0; 4033 ctrlr->migr_data_prepared = true; 4034 4035 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 4036 4037 return 0; 4038 } 4039 4040 static ssize_t 4041 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4042 void *buf __attribute__((unused)), 4043 uint64_t count __attribute__((unused)), 4044 uint64_t offset __attribute__((unused))) 4045 { 4046 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 4047 endpoint_id(vfu_get_private(vfu_ctx))); 4048 errno = ENOTSUP; 4049 return -1; 4050 } 4051 4052 static ssize_t 4053 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4054 void *buf __attribute__((unused)), 4055 uint64_t count __attribute__((unused)), 4056 uint64_t offset __attribute__((unused))) 4057 { 4058 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 4059 endpoint_id(vfu_get_private(vfu_ctx))); 4060 errno = ENOTSUP; 4061 return -1; 4062 } 4063 4064 static int 4065 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4066 uint64_t count) 4067 { 4068 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 4069 4070 if (count != vfio_user_migr_data_len()) { 4071 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 4072 endpoint_id(vfu_get_private(vfu_ctx)), count); 4073 errno = EINVAL; 4074 return -1; 4075 } 4076 4077 return 0; 4078 } 4079 4080 static int 4081 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 4082 struct nvmf_vfio_user_endpoint *endpoint) 4083 { 4084 int ret; 4085 ssize_t cap_offset; 4086 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 4087 struct iovec migr_sparse_mmap = {}; 4088 4089 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 4090 struct pxcap pxcap = { 4091 .hdr.id = PCI_CAP_ID_EXP, 4092 .pxcaps.ver = 0x2, 4093 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 4094 .pxdcap2.ctds = 0x1 4095 }; 4096 4097 struct msixcap msixcap = { 4098 .hdr.id = PCI_CAP_ID_MSIX, 4099 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 4100 .mtab = {.tbir = 0x4, .to = 0x0}, 4101 .mpba = {.pbir = 0x5, .pbao = 0x0} 4102 }; 4103 4104 struct iovec sparse_mmap[] = { 4105 { 4106 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 4107 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 4108 }, 4109 }; 4110 4111 const vfu_migration_callbacks_t migr_callbacks = { 4112 .version = VFIO_USER_MIGR_CALLBACK_VERS, 4113 .transition = &vfio_user_migration_device_state_transition, 4114 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 4115 .prepare_data = &vfio_user_migration_prepare_data, 4116 .read_data = &vfio_user_migration_read_data, 4117 .data_written = &vfio_user_migration_data_written, 4118 .write_data = &vfio_user_migration_write_data 4119 }; 4120 4121 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 4122 if (ret < 0) { 4123 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 4124 return ret; 4125 } 4126 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 4127 /* 4128 * 0x02, controller uses the NVM Express programming interface 4129 * 
0x08, non-volatile memory controller
	 * 0x01, mass storage controller
	 */
	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx);
		return cap_offset;
	}

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx);
		return cap_offset;
	}

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx);
		return cap_offset;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
		return ret;
	}

	if (vu_transport->transport_opts.disable_mappable_bar0) {
		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
				       NULL, 0, -1, 0);
	} else {
		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
				       sparse_mmap, 1, endpoint->devmem_fd, 0);
	}

	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE,
			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE,
			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
		return ret;
	}

	vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb);

	migr_sparse_mmap.iov_base = (void *)4096;
	migr_sparse_mmap.iov_len = vfio_user_migr_data_len();
	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX,
			       vfu_get_migr_register_area_size() + vfio_user_migr_data_len(),
			       NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap,
			       1, endpoint->migr_fd, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks,
			vfu_get_migr_register_area_size());
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx);
		return ret;
4230 } 4231 4232 ret = vfu_realize_ctx(vfu_ctx); 4233 if (ret < 0) { 4234 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4235 return ret; 4236 } 4237 4238 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4239 assert(endpoint->pci_config_space != NULL); 4240 init_pci_config_space(endpoint->pci_config_space); 4241 4242 assert(cap_offset != 0); 4243 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4244 4245 return 0; 4246 } 4247 4248 static int nvmf_vfio_user_accept(void *ctx); 4249 4250 static void 4251 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4252 { 4253 /* Nothing for us to do here. */ 4254 } 4255 4256 /* 4257 * Register an "accept" poller: this is polling for incoming vfio-user socket 4258 * connections (on the listening socket). 4259 * 4260 * We need to do this on first listening, and also after destroying a 4261 * controller, so we can accept another connection. 4262 */ 4263 static int 4264 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4265 { 4266 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4267 4268 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4269 4270 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4271 endpoint, poll_rate_us); 4272 4273 if (!endpoint->accept_poller) { 4274 return -1; 4275 } 4276 4277 endpoint->accept_thread = spdk_get_thread(); 4278 endpoint->need_relisten = false; 4279 4280 if (!spdk_interrupt_mode_is_enabled()) { 4281 return 0; 4282 } 4283 4284 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4285 assert(endpoint->accept_intr_fd != -1); 4286 4287 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4288 nvmf_vfio_user_accept, endpoint); 4289 4290 assert(endpoint->accept_intr != NULL); 4291 4292 spdk_poller_register_interrupt(endpoint->accept_poller, 4293 set_intr_mode_noop, NULL); 4294 return 0; 4295 } 4296 4297 static void 4298 _vfio_user_relisten(void *ctx) 4299 { 4300 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4301 4302 vfio_user_register_accept_poller(endpoint); 4303 } 4304 4305 static void 4306 _free_ctrlr(void *ctx) 4307 { 4308 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4309 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4310 4311 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 4312 4313 spdk_interrupt_unregister(&ctrlr->intr); 4314 ctrlr->intr_fd = -1; 4315 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4316 4317 free(ctrlr); 4318 4319 if (endpoint->need_async_destroy) { 4320 nvmf_vfio_user_destroy_endpoint(endpoint); 4321 } else if (endpoint->need_relisten) { 4322 spdk_thread_send_msg(endpoint->accept_thread, 4323 _vfio_user_relisten, endpoint); 4324 } 4325 } 4326 4327 static void 4328 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4329 { 4330 int i; 4331 assert(ctrlr != NULL); 4332 4333 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4334 4335 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4336 free_qp(ctrlr, i); 4337 } 4338 4339 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 4340 } 4341 4342 static int 4343 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4344 struct nvmf_vfio_user_endpoint *endpoint) 4345 { 4346 struct nvmf_vfio_user_ctrlr *ctrlr; 4347 int err = 0; 4348 4349 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4350 4351 /* First, construct a vfio-user CUSTOM transport controller */ 4352 ctrlr = calloc(1, sizeof(*ctrlr)); 4353 
if (ctrlr == NULL) {
		err = -ENOMEM;
		goto out;
	}
	/* We can only support one connection for now */
	ctrlr->cntlid = 0x1;
	ctrlr->intr_fd = -1;
	ctrlr->transport = transport;
	ctrlr->endpoint = endpoint;
	ctrlr->bar0_doorbells = endpoint->bar0_doorbells;
	TAILQ_INIT(&ctrlr->connected_sqs);

	ctrlr->adaptive_irqs_enabled =
		!transport->transport_opts.disable_adaptive_irq;

	/* Then, construct an admin queue pair */
	err = init_sq(ctrlr, &transport->transport, 0);
	if (err != 0) {
		free(ctrlr);
		goto out;
	}

	err = init_cq(ctrlr, 0);
	if (err != 0) {
		free(ctrlr);
		goto out;
	}

	ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;

	err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]);
	if (err != 0) {
		free(ctrlr);
		goto out;
	}
	endpoint->ctrlr = ctrlr;

	/* Notify the generic layer about the new admin queue pair */
	spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair);

out:
	if (err != 0) {
		SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n",
			    endpoint_id(endpoint), strerror(-err));
	}

	return err;
}

static int
nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport,
		      const struct spdk_nvme_transport_id *trid,
		      struct spdk_nvmf_listen_opts *listen_opts)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;
	char path[PATH_MAX] = {};
	char uuid[PATH_MAX] = {};
	int ret;

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
		/* Only compare traddr */
		if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
			pthread_mutex_unlock(&vu_transport->lock);
			return -EEXIST;
		}
	}
	pthread_mutex_unlock(&vu_transport->lock);

	endpoint = calloc(1, sizeof(*endpoint));
	if (!endpoint) {
		return -ENOMEM;
	}

	pthread_mutex_init(&endpoint->lock, NULL);
	endpoint->devmem_fd = -1;
	memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));
	endpoint->transport = vu_transport;

	ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint));
	if (ret < 0 || ret >= PATH_MAX) {
		SPDK_ERRLOG("%s: failed to format bar0 file path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno));
		ret = -1;
		goto out;
	}

	ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	if (ret == -1) {
		SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n",
			    endpoint_id(endpoint), path, spdk_strerror(errno));
		goto out;
	}
	unlink(path);

	endpoint->devmem_fd = ret;
	ret = ftruncate(endpoint->devmem_fd,
			NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE);
	if (ret != 0) {
		SPDK_ERRLOG("%s: failed to ftruncate file %s: %s.\n", endpoint_id(endpoint), path,
			    spdk_strerror(errno));
		goto out;
	}

	endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
					PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET);
	if (endpoint->bar0_doorbells == MAP_FAILED) {
		SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
		endpoint->bar0_doorbells = NULL;
		ret = -1;
		goto out;
	}

	ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint));
	if (ret < 0 || ret >= PATH_MAX) {
		SPDK_ERRLOG("%s: failed to format migration file path: %s.\n", endpoint_id(endpoint),
			    spdk_strerror(errno));
		ret = -1;
		goto out;
	}
	ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	if (ret == -1) {
		SPDK_ERRLOG("%s: failed to open migration file %s: %s.\n",
			    endpoint_id(endpoint), path, spdk_strerror(errno));
		goto out;
	}
	unlink(path);

	endpoint->migr_fd = ret;
	ret = ftruncate(endpoint->migr_fd,
			vfu_get_migr_register_area_size() + vfio_user_migr_data_len());
	if (ret != 0) {
		SPDK_ERRLOG("%s: failed to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path,
			    spdk_strerror(errno));
		goto out;
	}

	endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(),
				   PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size());
	if (endpoint->migr_data == MAP_FAILED) {
		SPDK_ERRLOG("%s: failed to mmap migration file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
		endpoint->migr_data = NULL;
		ret = -1;
		goto out;
	}

	ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
	if (ret < 0 || ret >= PATH_MAX) {
		SPDK_ERRLOG("%s: failed to format ctrlr socket path: %s\n", endpoint_id(endpoint), spdk_strerror(errno));
		ret = -1;
		goto out;
	}

	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
					   endpoint, VFU_DEV_TYPE_PCI);
	if (endpoint->vfu_ctx == NULL) {
		SPDK_ERRLOG("%s: failed to create libvfio-user context: %m\n",
			    endpoint_id(endpoint));
		ret = -1;
		goto out;
	}

	ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log,
			    vfio_user_get_log_level());
	if (ret < 0) {
		goto out;
	}

	ret = vfio_user_dev_info_fill(vu_transport, endpoint);
	if (ret < 0) {
		goto out;
	}

	ret = vfio_user_register_accept_poller(endpoint);
	if (ret != 0) {
		goto out;
	}

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
	pthread_mutex_unlock(&vu_transport->lock);

out:
	if (ret != 0) {
		nvmf_vfio_user_destroy_endpoint(endpoint);
	}

	return ret;
}

static void
nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
			   const struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;

	assert(trid != NULL);
	assert(trid->traddr != NULL);

	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
			/* Defer freeing the endpoint resources until the
			 * controller is freed. There are two cases when we get
			 * here:
			 * 1. kill nvmf target while VM is connected
			 * 2. remove listener via RPC call
			 * The nvmf library will disconnect all queue pairs.
4572 */ 4573 if (endpoint->ctrlr) { 4574 assert(!endpoint->need_async_destroy); 4575 endpoint->need_async_destroy = true; 4576 pthread_mutex_unlock(&vu_transport->lock); 4577 return; 4578 } 4579 4580 nvmf_vfio_user_destroy_endpoint(endpoint); 4581 pthread_mutex_unlock(&vu_transport->lock); 4582 return; 4583 } 4584 } 4585 pthread_mutex_unlock(&vu_transport->lock); 4586 4587 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4588 } 4589 4590 static void 4591 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4592 struct spdk_nvmf_subsystem *subsystem, 4593 struct spdk_nvmf_ctrlr_data *cdata) 4594 { 4595 struct nvmf_vfio_user_transport *vu_transport; 4596 4597 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4598 4599 cdata->vid = SPDK_PCI_VID_NUTANIX; 4600 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4601 cdata->ieee[0] = 0x8d; 4602 cdata->ieee[1] = 0x6b; 4603 cdata->ieee[2] = 0x50; 4604 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4605 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4606 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4607 /* libvfio-user can only support 1 connection for now */ 4608 cdata->oncs.reservations = 0; 4609 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4610 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4611 } 4612 4613 static int 4614 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4615 const struct spdk_nvmf_subsystem *subsystem, 4616 const struct spdk_nvme_transport_id *trid) 4617 { 4618 struct nvmf_vfio_user_transport *vu_transport; 4619 struct nvmf_vfio_user_endpoint *endpoint; 4620 4621 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4622 4623 pthread_mutex_lock(&vu_transport->lock); 4624 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4625 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4626 break; 4627 } 4628 } 4629 pthread_mutex_unlock(&vu_transport->lock); 4630 4631 if (endpoint == NULL) { 4632 return -ENOENT; 4633 } 4634 4635 /* Drop const - we will later need to pause/unpause. */ 4636 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4637 4638 return 0; 4639 } 4640 4641 /* 4642 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4643 * frequency. 4644 * 4645 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4646 * if we don't currently have a controller set up, peek to see if the socket is 4647 * able to accept a new connection. 4648 */ 4649 static int 4650 nvmf_vfio_user_accept(void *ctx) 4651 { 4652 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4653 struct nvmf_vfio_user_transport *vu_transport; 4654 int err; 4655 4656 vu_transport = endpoint->transport; 4657 4658 if (endpoint->ctrlr != NULL) { 4659 return SPDK_POLLER_IDLE; 4660 } 4661 4662 /* While we're here, the controller is already destroyed, 4663 * subsystem may still be in RESUMING state, we will wait 4664 * until the subsystem is in RUNNING state. 
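	 * need_resume is expected to be cleared by the resume callback
	 * (vfio_user_endpoint_resume_done()), after which we can accept a new
	 * connection.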
4665 */ 4666 if (endpoint->need_resume) { 4667 return SPDK_POLLER_IDLE; 4668 } 4669 4670 err = vfu_attach_ctx(endpoint->vfu_ctx); 4671 if (err == 0) { 4672 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4673 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4674 if (err == 0) { 4675 /* 4676 * Unregister ourselves: now we've accepted a 4677 * connection, there is nothing for us to poll for, and 4678 * we will poll the connection via vfu_run_ctx() 4679 * instead. 4680 */ 4681 spdk_interrupt_unregister(&endpoint->accept_intr); 4682 spdk_poller_unregister(&endpoint->accept_poller); 4683 } 4684 return SPDK_POLLER_BUSY; 4685 } 4686 4687 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4688 return SPDK_POLLER_IDLE; 4689 } 4690 4691 return SPDK_POLLER_BUSY; 4692 } 4693 4694 static void 4695 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4696 struct spdk_nvme_transport_id *trid, 4697 struct spdk_nvmf_discovery_log_page_entry *entry) 4698 { } 4699 4700 static int vfio_user_poll_group_intr(void *ctx); 4701 4702 static void 4703 vfio_user_poll_group_add_intr(struct nvmf_vfio_user_poll_group *vu_group, 4704 struct spdk_nvmf_poll_group *group) 4705 { 4706 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK); 4707 assert(vu_group->intr_fd != -1); 4708 4709 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd, 4710 vfio_user_poll_group_intr, vu_group); 4711 assert(vu_group->intr != NULL); 4712 4713 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4714 vu_group); 4715 } 4716 4717 static struct spdk_nvmf_transport_poll_group * 4718 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4719 struct spdk_nvmf_poll_group *group) 4720 { 4721 struct nvmf_vfio_user_transport *vu_transport; 4722 struct nvmf_vfio_user_poll_group *vu_group; 4723 4724 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4725 transport); 4726 4727 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4728 4729 vu_group = calloc(1, sizeof(*vu_group)); 4730 if (vu_group == NULL) { 4731 SPDK_ERRLOG("Error allocating poll group: %m"); 4732 return NULL; 4733 } 4734 4735 if (in_interrupt_mode(vu_transport)) { 4736 vfio_user_poll_group_add_intr(vu_group, group); 4737 } 4738 4739 TAILQ_INIT(&vu_group->sqs); 4740 4741 pthread_mutex_lock(&vu_transport->pg_lock); 4742 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4743 if (vu_transport->next_pg == NULL) { 4744 vu_transport->next_pg = vu_group; 4745 } 4746 pthread_mutex_unlock(&vu_transport->pg_lock); 4747 4748 return &vu_group->group; 4749 } 4750 4751 static struct spdk_nvmf_transport_poll_group * 4752 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4753 { 4754 struct nvmf_vfio_user_transport *vu_transport; 4755 struct nvmf_vfio_user_poll_group **vu_group; 4756 struct nvmf_vfio_user_sq *sq; 4757 struct nvmf_vfio_user_cq *cq; 4758 4759 struct spdk_nvmf_transport_poll_group *result = NULL; 4760 4761 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4762 cq = sq->ctrlr->cqs[sq->cqid]; 4763 assert(cq != NULL); 4764 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4765 4766 pthread_mutex_lock(&vu_transport->pg_lock); 4767 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4768 goto out; 4769 } 4770 4771 if (!nvmf_qpair_is_admin_queue(qpair)) { 4772 /* 4773 * If this is shared IO CQ case, just return the used CQ's poll 4774 * group, so I/O completions don't have to use 4775 * spdk_thread_send_msg(). 
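		 * (i.e. place this SQ on the poll group that already services
		 * the CQ it will post completions to).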
4776 */ 4777 if (cq->group != NULL) { 4778 result = cq->group; 4779 goto out; 4780 } 4781 4782 /* 4783 * If we're in interrupt mode, align all qpairs for a controller 4784 * on the same poll group by default, unless requested. This can 4785 * be lower in performance than running on a single poll group, 4786 * so we disable spreading by default. 4787 */ 4788 if (in_interrupt_mode(vu_transport) && 4789 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) { 4790 result = sq->ctrlr->sqs[0]->group; 4791 goto out; 4792 } 4793 4794 } 4795 4796 vu_group = &vu_transport->next_pg; 4797 assert(*vu_group != NULL); 4798 4799 result = &(*vu_group)->group; 4800 *vu_group = TAILQ_NEXT(*vu_group, link); 4801 if (*vu_group == NULL) { 4802 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4803 } 4804 4805 out: 4806 if (cq->group == NULL) { 4807 cq->group = result; 4808 } 4809 4810 pthread_mutex_unlock(&vu_transport->pg_lock); 4811 return result; 4812 } 4813 4814 static void 4815 vfio_user_poll_group_del_intr(struct nvmf_vfio_user_poll_group *vu_group) 4816 { 4817 assert(vu_group->intr_fd != -1); 4818 4819 spdk_interrupt_unregister(&vu_group->intr); 4820 4821 close(vu_group->intr_fd); 4822 vu_group->intr_fd = -1; 4823 } 4824 4825 /* called when process exits */ 4826 static void 4827 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4828 { 4829 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4830 struct nvmf_vfio_user_transport *vu_transport; 4831 4832 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4833 4834 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4835 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4836 transport); 4837 4838 if (in_interrupt_mode(vu_transport)) { 4839 vfio_user_poll_group_del_intr(vu_group); 4840 } 4841 4842 pthread_mutex_lock(&vu_transport->pg_lock); 4843 next_tgroup = TAILQ_NEXT(vu_group, link); 4844 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4845 if (next_tgroup == NULL) { 4846 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4847 } 4848 if (vu_transport->next_pg == vu_group) { 4849 vu_transport->next_pg = next_tgroup; 4850 } 4851 pthread_mutex_unlock(&vu_transport->pg_lock); 4852 4853 free(vu_group); 4854 } 4855 4856 static void 4857 _vfio_user_qpair_disconnect(void *ctx) 4858 { 4859 struct nvmf_vfio_user_sq *sq = ctx; 4860 4861 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4862 } 4863 4864 /* The function is used when socket connection is destroyed */ 4865 static int 4866 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4867 { 4868 struct nvmf_vfio_user_sq *sq; 4869 struct nvmf_vfio_user_endpoint *endpoint; 4870 4871 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4872 4873 endpoint = ctrlr->endpoint; 4874 assert(endpoint != NULL); 4875 4876 pthread_mutex_lock(&endpoint->lock); 4877 endpoint->need_relisten = true; 4878 ctrlr->disconnect = true; 4879 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4880 endpoint->ctrlr = NULL; 4881 free_ctrlr(ctrlr); 4882 pthread_mutex_unlock(&endpoint->lock); 4883 return 0; 4884 } 4885 4886 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4887 /* add another round thread poll to avoid recursive endpoint lock */ 4888 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4889 } 4890 pthread_mutex_unlock(&endpoint->lock); 4891 4892 return 0; 4893 } 4894 4895 /* 4896 * Poll for and process any incoming vfio-user messages. 
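 * (accesses to the non-mmap'd parts of BAR0, DMA map/unmap notifications,
 * and other vfio-user protocol traffic handled by vfu_run_ctx()).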
4897 */ 4898 static int 4899 vfio_user_poll_vfu_ctx(void *ctx) 4900 { 4901 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4902 int ret; 4903 4904 assert(ctrlr != NULL); 4905 4906 /* This will call access_bar0_fn() if there are any writes 4907 * to the portion of the BAR that is not mmap'd */ 4908 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4909 if (spdk_unlikely(ret == -1)) { 4910 if (errno == EBUSY) { 4911 return SPDK_POLLER_IDLE; 4912 } 4913 4914 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4915 4916 /* 4917 * We lost the client; the reset callback will already have 4918 * unregistered the interrupt. 4919 */ 4920 if (errno == ENOTCONN) { 4921 vfio_user_destroy_ctrlr(ctrlr); 4922 return SPDK_POLLER_BUSY; 4923 } 4924 4925 /* 4926 * We might not have got a reset callback in this case, so 4927 * explicitly unregister the interrupt here. 4928 */ 4929 spdk_interrupt_unregister(&ctrlr->intr); 4930 ctrlr->intr_fd = -1; 4931 fail_ctrlr(ctrlr); 4932 } 4933 4934 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4935 } 4936 4937 struct vfio_user_post_cpl_ctx { 4938 struct nvmf_vfio_user_ctrlr *ctrlr; 4939 struct nvmf_vfio_user_cq *cq; 4940 struct spdk_nvme_cpl cpl; 4941 }; 4942 4943 static void 4944 _post_completion_msg(void *ctx) 4945 { 4946 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4947 4948 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4949 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4950 free(cpl_ctx); 4951 } 4952 4953 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4954 4955 static int 4956 vfio_user_poll_group_process(void *ctx) 4957 { 4958 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4959 int ret = 0; 4960 4961 SPDK_DEBUGLOG(vfio_user_db, "pg:%p got intr\n", vu_group); 4962 4963 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4964 4965 /* 4966 * Re-arm the event indexes. NB: this also could rearm other 4967 * controller's SQs. 4968 */ 4969 ret |= vfio_user_poll_group_rearm(vu_group); 4970 4971 vu_group->stats.pg_process_count++; 4972 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4973 } 4974 4975 static int 4976 vfio_user_poll_group_intr(void *ctx) 4977 { 4978 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4979 eventfd_t val; 4980 4981 eventfd_read(vu_group->intr_fd, &val); 4982 4983 vu_group->stats.intr++; 4984 4985 return vfio_user_poll_group_process(ctx); 4986 } 4987 4988 /* 4989 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4990 * the SQs assigned to our own poll group. Other poll groups are handled via 4991 * vfio_user_poll_group_intr(). 4992 */ 4993 static int 4994 vfio_user_ctrlr_intr(void *ctx) 4995 { 4996 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 4997 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 4998 struct nvmf_vfio_user_poll_group *vu_group; 4999 int ret = SPDK_POLLER_IDLE; 5000 5001 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 5002 5003 SPDK_DEBUGLOG(vfio_user_db, "ctrlr pg:%p got intr\n", vu_ctrlr_group); 5004 5005 vu_ctrlr_group->stats.ctrlr_intr++; 5006 5007 /* 5008 * Poll vfio-user for this controller. We need to do this before polling 5009 * any SQs, as this is where doorbell writes may be handled. 5010 */ 5011 ret = vfio_user_poll_vfu_ctx(vu_ctrlr); 5012 5013 /* 5014 * `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, 5015 * just return for this case. 
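	 * (this can happen if the controller was torn down while handling the
	 * message, e.g. after losing the client connection).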
5016 */ 5017 if (vu_ctrlr->sqs[0] == NULL) { 5018 return ret; 5019 } 5020 5021 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) { 5022 /* 5023 * We may have just written to a doorbell owned by another 5024 * reactor: we need to prod them to make sure its SQs are polled 5025 * *after* the doorbell value is updated. 5026 */ 5027 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) { 5028 if (vu_group != vu_ctrlr_group) { 5029 SPDK_DEBUGLOG(vfio_user_db, "prodding pg:%p\n", vu_group); 5030 eventfd_write(vu_group->intr_fd, 1); 5031 } 5032 } 5033 } 5034 5035 ret |= vfio_user_poll_group_process(vu_ctrlr_group); 5036 5037 return ret; 5038 } 5039 5040 static void 5041 vfio_user_ctrlr_set_intr_mode(struct spdk_poller *poller, void *ctx, 5042 bool interrupt_mode) 5043 { 5044 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 5045 assert(ctrlr != NULL); 5046 assert(ctrlr->endpoint != NULL); 5047 5048 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 5049 ctrlr_id(ctrlr), interrupt_mode); 5050 5051 /* 5052 * interrupt_mode needs to persist across controller resets, so store 5053 * it in the endpoint instead. 5054 */ 5055 ctrlr->endpoint->interrupt_mode = interrupt_mode; 5056 5057 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 5058 } 5059 5060 /* 5061 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 5062 * set up and we can start operating on this controller. 5063 */ 5064 static void 5065 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 5066 struct spdk_nvmf_ctrlr *ctrlr) 5067 { 5068 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 5069 5070 vu_ctrlr->ctrlr = ctrlr; 5071 vu_ctrlr->cntlid = ctrlr->cntlid; 5072 vu_ctrlr->thread = spdk_get_thread(); 5073 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 5074 5075 if (!in_interrupt_mode(endpoint->transport)) { 5076 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5077 vu_ctrlr, 1000); 5078 return; 5079 } 5080 5081 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5082 vu_ctrlr, 0); 5083 5084 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 5085 assert(vu_ctrlr->intr_fd != -1); 5086 5087 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 5088 vfio_user_ctrlr_intr, vu_ctrlr); 5089 5090 assert(vu_ctrlr->intr != NULL); 5091 5092 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 5093 vfio_user_ctrlr_set_intr_mode, 5094 vu_ctrlr); 5095 } 5096 5097 static int 5098 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 5099 { 5100 struct nvmf_vfio_user_poll_group *vu_group; 5101 struct nvmf_vfio_user_sq *sq = cb_arg; 5102 struct nvmf_vfio_user_cq *admin_cq; 5103 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5104 struct nvmf_vfio_user_endpoint *endpoint; 5105 5106 assert(sq != NULL); 5107 assert(req != NULL); 5108 5109 vu_ctrlr = sq->ctrlr; 5110 assert(vu_ctrlr != NULL); 5111 endpoint = vu_ctrlr->endpoint; 5112 assert(endpoint != NULL); 5113 5114 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 5115 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 5116 endpoint->ctrlr = NULL; 5117 free_ctrlr(vu_ctrlr); 5118 return -1; 5119 } 5120 5121 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 5122 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 5123 5124 admin_cq = vu_ctrlr->cqs[0]; 5125 assert(admin_cq != NULL); 5126 assert(admin_cq->group != NULL); 5127 assert(admin_cq->group->group->thread != NULL); 5128 5129 
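	/*
	 * Take the endpoint lock: connected_sqs and the controller state
	 * updated below are also touched from other contexts, e.g. during
	 * disconnect handling.
	 */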
pthread_mutex_lock(&endpoint->lock); 5130 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 5131 assert(admin_cq->group->group->thread == spdk_get_thread()); 5132 /* 5133 * The admin queue is special as SQ0 and CQ0 are created 5134 * together. 5135 */ 5136 admin_cq->cq_ref = 1; 5137 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 5138 } else { 5139 /* For I/O queues this command was generated in response to an 5140 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 5141 * been completed. Complete it now. 5142 */ 5143 if (sq->post_create_io_sq_completion) { 5144 if (admin_cq->group->group->thread != spdk_get_thread()) { 5145 struct vfio_user_post_cpl_ctx *cpl_ctx; 5146 5147 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 5148 if (!cpl_ctx) { 5149 return -ENOMEM; 5150 } 5151 cpl_ctx->ctrlr = vu_ctrlr; 5152 cpl_ctx->cq = admin_cq; 5153 cpl_ctx->cpl.sqid = 0; 5154 cpl_ctx->cpl.cdw0 = 0; 5155 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 5156 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 5157 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5158 5159 spdk_thread_send_msg(admin_cq->group->group->thread, 5160 _post_completion_msg, 5161 cpl_ctx); 5162 } else { 5163 post_completion(vu_ctrlr, admin_cq, 0, 0, 5164 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 5165 } 5166 sq->post_create_io_sq_completion = false; 5167 } else if (in_interrupt_mode(endpoint->transport)) { 5168 /* 5169 * If we're live migrating a guest, there is a window 5170 * where the I/O queues haven't been set up but the 5171 * device is in running state, during which the guest 5172 * might write to a doorbell. This doorbell write will 5173 * go unnoticed, so let's poll the whole controller to 5174 * pick that up. 5175 */ 5176 ctrlr_kick(vu_ctrlr); 5177 } 5178 sq->sq_state = VFIO_USER_SQ_ACTIVE; 5179 } 5180 5181 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 5182 pthread_mutex_unlock(&endpoint->lock); 5183 5184 free(req->req.iov[0].iov_base); 5185 req->req.iov[0].iov_base = NULL; 5186 req->req.iovcnt = 0; 5187 5188 return 0; 5189 } 5190 5191 /* 5192 * Add the given qpair to the given poll group. New qpairs are added via 5193 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 5194 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 5195 * nvmf_transport_poll_group_add(). 5196 */ 5197 static int 5198 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 5199 struct spdk_nvmf_qpair *qpair) 5200 { 5201 struct nvmf_vfio_user_sq *sq; 5202 struct nvmf_vfio_user_req *vu_req; 5203 struct nvmf_vfio_user_ctrlr *ctrlr; 5204 struct spdk_nvmf_request *req; 5205 struct spdk_nvmf_fabric_connect_data *data; 5206 bool admin; 5207 5208 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5209 sq->group = group; 5210 ctrlr = sq->ctrlr; 5211 5212 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 5213 ctrlr_id(ctrlr), sq->qpair.qid, 5214 sq, qpair, group); 5215 5216 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 5217 5218 vu_req = get_nvmf_vfio_user_req(sq); 5219 if (vu_req == NULL) { 5220 return -1; 5221 } 5222 5223 req = &vu_req->req; 5224 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 5225 req->cmd->connect_cmd.cid = 0; 5226 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 5227 req->cmd->connect_cmd.recfmt = 0; 5228 req->cmd->connect_cmd.sqsize = sq->size - 1; 5229 req->cmd->connect_cmd.qid = admin ? 
0 : qpair->qid; 5230 5231 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 5232 5233 data = calloc(1, req->length); 5234 if (data == NULL) { 5235 nvmf_vfio_user_req_free(req); 5236 return -ENOMEM; 5237 } 5238 5239 SPDK_IOV_ONE(req->iov, &req->iovcnt, data, req->length); 5240 5241 data->cntlid = ctrlr->cntlid; 5242 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5243 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5244 5245 vu_req->cb_fn = handle_queue_connect_rsp; 5246 vu_req->cb_arg = sq; 5247 5248 SPDK_DEBUGLOG(nvmf_vfio, 5249 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5250 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5251 5252 spdk_nvmf_request_exec_fabrics(req); 5253 return 0; 5254 } 5255 5256 static int 5257 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5258 struct spdk_nvmf_qpair *qpair) 5259 { 5260 struct nvmf_vfio_user_sq *sq; 5261 struct nvmf_vfio_user_poll_group *vu_group; 5262 5263 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5264 5265 SPDK_DEBUGLOG(nvmf_vfio, 5266 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5267 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5268 5269 5270 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5271 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5272 5273 return 0; 5274 } 5275 5276 static void 5277 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5278 { 5279 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5280 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5281 vu_req->iovcnt = 0; 5282 vu_req->req.iovcnt = 0; 5283 vu_req->req.length = 0; 5284 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5285 5286 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5287 } 5288 5289 static int 5290 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5291 { 5292 struct nvmf_vfio_user_sq *sq; 5293 struct nvmf_vfio_user_req *vu_req; 5294 5295 assert(req != NULL); 5296 5297 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5298 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5299 5300 _nvmf_vfio_user_req_free(sq, vu_req); 5301 5302 return 0; 5303 } 5304 5305 static int 5306 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5307 { 5308 struct nvmf_vfio_user_sq *sq; 5309 struct nvmf_vfio_user_req *vu_req; 5310 5311 assert(req != NULL); 5312 5313 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5314 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5315 5316 if (vu_req->cb_fn != NULL) { 5317 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5318 fail_ctrlr(sq->ctrlr); 5319 } 5320 } 5321 5322 _nvmf_vfio_user_req_free(sq, vu_req); 5323 5324 return 0; 5325 } 5326 5327 static void 5328 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5329 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5330 { 5331 struct nvmf_vfio_user_sq *sq; 5332 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5333 struct nvmf_vfio_user_endpoint *endpoint; 5334 struct vfio_user_delete_sq_ctx *del_ctx; 5335 5336 assert(qpair != NULL); 5337 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5338 vu_ctrlr = sq->ctrlr; 5339 endpoint = vu_ctrlr->endpoint; 5340 del_ctx = sq->delete_ctx; 5341 sq->delete_ctx = NULL; 5342 5343 pthread_mutex_lock(&endpoint->lock); 5344 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5345 delete_sq_done(vu_ctrlr, sq); 5346 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5347 endpoint->ctrlr = NULL; 5348 if (vu_ctrlr->in_source_vm && 
endpoint->need_resume) { 5349 /* The controller will be freed, we can resume the subsystem 5350 * now so that the endpoint can be ready to accept another 5351 * new connection. 5352 */ 5353 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5354 vfio_user_endpoint_resume_done, endpoint); 5355 } 5356 free_ctrlr(vu_ctrlr); 5357 } 5358 pthread_mutex_unlock(&endpoint->lock); 5359 5360 if (del_ctx) { 5361 vfio_user_qpair_delete_cb(del_ctx); 5362 } 5363 5364 if (cb_fn) { 5365 cb_fn(cb_arg); 5366 } 5367 } 5368 5369 /** 5370 * Returns a preallocated request, or NULL if there isn't one available. 5371 */ 5372 static struct nvmf_vfio_user_req * 5373 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5374 { 5375 struct nvmf_vfio_user_req *req; 5376 5377 if (sq == NULL) { 5378 return NULL; 5379 } 5380 5381 req = TAILQ_FIRST(&sq->free_reqs); 5382 if (req == NULL) { 5383 return NULL; 5384 } 5385 5386 TAILQ_REMOVE(&sq->free_reqs, req, link); 5387 5388 return req; 5389 } 5390 5391 static int 5392 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5393 { 5394 uint16_t nr; 5395 uint32_t nlb, nsid; 5396 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5397 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5398 struct spdk_nvmf_ns *ns; 5399 5400 nsid = cmd->nsid; 5401 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5402 if (ns == NULL || ns->bdev == NULL) { 5403 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5404 return -EINVAL; 5405 } 5406 5407 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5408 nr = cmd->cdw10_bits.dsm.nr + 1; 5409 return nr * sizeof(struct spdk_nvme_dsm_range); 5410 } 5411 5412 if (cmd->opc == SPDK_NVME_OPC_COPY) { 5413 nr = (cmd->cdw12 & 0x000000ffu) + 1; 5414 return nr * sizeof(struct spdk_nvme_scc_source_range); 5415 } 5416 5417 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5418 return nlb * spdk_bdev_get_block_size(ns->bdev); 5419 } 5420 5421 static int 5422 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5423 { 5424 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5425 uint32_t len = 0, numdw = 0; 5426 uint8_t fid; 5427 int iovcnt; 5428 5429 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5430 5431 if (req->xfer == SPDK_NVME_DATA_NONE) { 5432 return 0; 5433 } 5434 5435 switch (cmd->opc) { 5436 case SPDK_NVME_OPC_IDENTIFY: 5437 len = 4096; 5438 break; 5439 case SPDK_NVME_OPC_GET_LOG_PAGE: 5440 numdw = ((((uint32_t)cmd->cdw11_bits.get_log_page.numdu << 16) | 5441 cmd->cdw10_bits.get_log_page.numdl) + 1); 5442 if (numdw > UINT32_MAX / 4) { 5443 return -EINVAL; 5444 } 5445 len = numdw * 4; 5446 break; 5447 case SPDK_NVME_OPC_GET_FEATURES: 5448 case SPDK_NVME_OPC_SET_FEATURES: 5449 fid = cmd->cdw10_bits.set_features.fid; 5450 switch (fid) { 5451 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5452 len = 4096; 5453 break; 5454 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5455 len = 256; 5456 break; 5457 case SPDK_NVME_FEAT_TIMESTAMP: 5458 len = 8; 5459 break; 5460 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5461 len = 512; 5462 break; 5463 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5464 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5465 len = 16; 5466 } else { 5467 len = 8; 5468 } 5469 break; 5470 default: 5471 return 0; 5472 } 5473 break; 5474 default: 5475 return 0; 5476 } 5477 5478 /* ADMIN command will not use SGL */ 5479 if (cmd->psdt != 0) { 5480 return -EINVAL; 5481 } 5482 5483 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5484 if (iovcnt < 0) { 5485 SPDK_ERRLOG("%s: map Admin Opc %x 
failed\n", 5486 ctrlr_id(ctrlr), cmd->opc); 5487 return -1; 5488 } 5489 req->length = len; 5490 req->iovcnt = iovcnt; 5491 5492 return 0; 5493 } 5494 5495 /* 5496 * Map an I/O command's buffers. 5497 * 5498 * Returns 0 on success and -errno on failure. 5499 */ 5500 static int 5501 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5502 { 5503 int len, iovcnt; 5504 struct spdk_nvme_cmd *cmd; 5505 5506 assert(ctrlr != NULL); 5507 assert(req != NULL); 5508 5509 cmd = &req->cmd->nvme_cmd; 5510 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5511 5512 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5513 return 0; 5514 } 5515 5516 len = get_nvmf_io_req_length(req); 5517 if (len < 0) { 5518 return -EINVAL; 5519 } 5520 req->length = len; 5521 5522 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5523 if (iovcnt < 0) { 5524 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5525 return -EFAULT; 5526 } 5527 req->iovcnt = iovcnt; 5528 5529 return 0; 5530 } 5531 5532 static int 5533 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5534 struct nvmf_vfio_user_sq *sq) 5535 { 5536 int err; 5537 struct nvmf_vfio_user_req *vu_req; 5538 struct spdk_nvmf_request *req; 5539 5540 assert(ctrlr != NULL); 5541 assert(cmd != NULL); 5542 5543 vu_req = get_nvmf_vfio_user_req(sq); 5544 if (spdk_unlikely(vu_req == NULL)) { 5545 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5546 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5547 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5548 5549 } 5550 req = &vu_req->req; 5551 5552 assert(req->qpair != NULL); 5553 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5554 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5555 5556 vu_req->cb_fn = handle_cmd_rsp; 5557 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5558 req->cmd->nvme_cmd = *cmd; 5559 5560 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5561 err = map_admin_cmd_req(ctrlr, req); 5562 } else { 5563 switch (cmd->opc) { 5564 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5565 case SPDK_NVME_OPC_RESERVATION_REPORT: 5566 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5567 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5568 err = -ENOTSUP; 5569 break; 5570 default: 5571 err = map_io_cmd_req(ctrlr, req); 5572 break; 5573 } 5574 } 5575 5576 if (spdk_unlikely(err < 0)) { 5577 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5578 ctrlr_id(ctrlr), cmd->opc); 5579 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5580 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5581 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5582 _nvmf_vfio_user_req_free(sq, vu_req); 5583 return err; 5584 } 5585 5586 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5587 spdk_nvmf_request_exec(req); 5588 5589 return 0; 5590 } 5591 5592 /* 5593 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5594 * here: if the host isn't up to date, and is apparently not actively processing 5595 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 
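 * cq->last_head is refreshed each time this check runs, so "isn't changing"
 * means unchanged since the previous poll.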
5596 */ 5597 static void 5598 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5599 struct nvmf_vfio_user_sq *sq) 5600 { 5601 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5602 uint32_t cq_head; 5603 uint32_t cq_tail; 5604 5605 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5606 return; 5607 } 5608 5609 cq_tail = *cq_tailp(cq); 5610 5611 /* Already sent? */ 5612 if (cq_tail == cq->last_trigger_irq_tail) { 5613 return; 5614 } 5615 5616 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5617 cq_head = *cq_dbl_headp(cq); 5618 5619 if (cq_head != cq_tail && cq_head == cq->last_head) { 5620 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5621 if (err != 0) { 5622 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5623 ctrlr_id(ctrlr)); 5624 } else { 5625 cq->last_trigger_irq_tail = cq_tail; 5626 } 5627 } 5628 5629 cq->last_head = cq_head; 5630 } 5631 5632 /* Returns the number of commands processed, or a negative value on error. */ 5633 static int 5634 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5635 { 5636 struct nvmf_vfio_user_ctrlr *ctrlr; 5637 uint32_t new_tail; 5638 int count = 0; 5639 5640 assert(sq != NULL); 5641 5642 ctrlr = sq->ctrlr; 5643 5644 /* 5645 * A quiesced, or migrating, controller should never process new 5646 * commands. 5647 */ 5648 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 5649 return SPDK_POLLER_IDLE; 5650 } 5651 5652 if (ctrlr->adaptive_irqs_enabled) { 5653 handle_suppressed_irq(ctrlr, sq); 5654 } 5655 5656 /* On aarch64 platforms, doorbells update from guest VM may not be seen 5657 * on SPDK target side. This is because there is memory type mismatch 5658 * situation here. That is on guest VM side, the doorbells are treated as 5659 * device memory while on SPDK target side, it is treated as normal 5660 * memory. And this situation cause problem on ARM platform. 5661 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5662 * Memory-aliasing-and-mismatched-memory-types". Only using spdk_mb() 5663 * cannot fix this. Use "dc civac" to invalidate cache may solve 5664 * this. 5665 */ 5666 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5667 5668 /* Load-Acquire. */ 5669 new_tail = *sq_dbl_tailp(sq); 5670 5671 new_tail = new_tail & 0xffffu; 5672 if (spdk_unlikely(new_tail >= sq->size)) { 5673 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5674 new_tail); 5675 spdk_nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE); 5676 5677 return -1; 5678 } 5679 5680 if (*sq_headp(sq) == new_tail) { 5681 return 0; 5682 } 5683 5684 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5685 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5686 if (ctrlr->sdbl != NULL) { 5687 SPDK_DEBUGLOG(nvmf_vfio, 5688 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5689 ctrlr_id(ctrlr), sq->qid, 5690 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5691 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5692 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5693 } 5694 5695 /* 5696 * Ensure that changes to the queue are visible to us. 5697 * The host driver should write the queue first, do a wmb(), and then 5698 * update the SQ tail doorbell (their Store-Release). 5699 */ 5700 spdk_rmb(); 5701 5702 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5703 if (spdk_unlikely(count < 0)) { 5704 fail_ctrlr(ctrlr); 5705 } 5706 5707 return count; 5708 } 5709 5710 /* 5711 * vfio-user transport poll handler. 
/*
 * vfio-user transport poll handler. Note that the library context is polled in
 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
 * active SQs.
 *
 * Returns the number of commands processed, or a negative value on error.
 */
static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_sq *sq, *tmp;
	int count = 0;

	assert(group != NULL);

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n");

	TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) {
		int ret;

		if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
			continue;
		}

		ret = nvmf_vfio_user_sq_poll(sq);

		if (spdk_unlikely(ret < 0)) {
			return ret;
		}

		count += ret;
	}

	vu_group->stats.polls++;
	vu_group->stats.poll_reqs += count;
	vu_group->stats.poll_reqs_squared += count * count;
	if (count == 0) {
		vu_group->stats.polls_spurious++;
	}

	return count;
}

static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
				    struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvme_transport_id *trid)
{
	return 0;
}

static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
				     struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static void
nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_request *req_to_abort = NULL;
	struct spdk_nvmf_request *temp_req = NULL;
	uint16_t cid;

	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;

	TAILQ_FOREACH(temp_req, &qpair->outstanding, link) {
		struct nvmf_vfio_user_req *vu_req;

		vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req);

		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
			req_to_abort = temp_req;
			break;
		}
	}

	if (req_to_abort == NULL) {
		spdk_nvmf_request_complete(req);
		return;
	}

	req->req_to_abort = req_to_abort;
	nvmf_ctrlr_abort_request(req);
}

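/*
 * Note on the "poll_reqs_variance" statistic emitted below (explanatory
 * comment only): it is derived from the running sums kept by
 * nvmf_vfio_user_poll_group_poll() using the usual sample-variance identity
 *
 *	Var(x) = (n * sum(x^2) - (sum(x))^2) / (n * (n - 1))
 *
 * where n is the number of polls and x the number of requests handled per
 * poll; the JSON field actually reports sqrt(Var), i.e. the standard
 * deviation, with the quotient truncated by integer division.
 */
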
static void
nvmf_vfio_user_poll_group_dump_stat(struct spdk_nvmf_transport_poll_group *group,
				    struct spdk_json_write_ctx *w)
{
	struct nvmf_vfio_user_poll_group *vu_group = SPDK_CONTAINEROF(group,
			struct nvmf_vfio_user_poll_group, group);
	uint64_t polls_denom;

	spdk_json_write_named_uint64(w, "ctrlr_intr", vu_group->stats.ctrlr_intr);
	spdk_json_write_named_uint64(w, "ctrlr_kicks", vu_group->stats.ctrlr_kicks);
	spdk_json_write_named_uint64(w, "won", vu_group->stats.won);
	spdk_json_write_named_uint64(w, "lost", vu_group->stats.lost);
	spdk_json_write_named_uint64(w, "lost_count", vu_group->stats.lost_count);
	spdk_json_write_named_uint64(w, "rearms", vu_group->stats.rearms);
	spdk_json_write_named_uint64(w, "pg_process_count", vu_group->stats.pg_process_count);
	spdk_json_write_named_uint64(w, "intr", vu_group->stats.intr);
	spdk_json_write_named_uint64(w, "polls", vu_group->stats.polls);
	spdk_json_write_named_uint64(w, "polls_spurious", vu_group->stats.polls_spurious);
	spdk_json_write_named_uint64(w, "poll_reqs", vu_group->stats.poll_reqs);
	polls_denom = vu_group->stats.polls * (vu_group->stats.polls - 1);
	if (polls_denom) {
		uint64_t n = vu_group->stats.polls * vu_group->stats.poll_reqs_squared - vu_group->stats.poll_reqs *
			     vu_group->stats.poll_reqs;
		spdk_json_write_named_double(w, "poll_reqs_variance", sqrt(n / polls_denom));
	}

	spdk_json_write_named_uint64(w, "cqh_admin_writes", vu_group->stats.cqh_admin_writes);
	spdk_json_write_named_uint64(w, "cqh_io_writes", vu_group->stats.cqh_io_writes);
}

static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
	opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
	opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
	opts->in_capsule_data_size = 0;
	opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
	opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
	opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
	opts->num_shared_buffers = 0;
	opts->buf_cache_size = 0;
	opts->association_timeout = 0;
	opts->transport_specific = NULL;
}

const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
	.name = "VFIOUSER",
	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
	.opts_init = nvmf_vfio_user_opts_init,
	.create = nvmf_vfio_user_create,
	.destroy = nvmf_vfio_user_destroy,

	.listen = nvmf_vfio_user_listen,
	.stop_listen = nvmf_vfio_user_stop_listen,
	.cdata_init = nvmf_vfio_user_cdata_init,
	.listen_associate = nvmf_vfio_user_listen_associate,

	.listener_discover = nvmf_vfio_user_discover,

	.poll_group_create = nvmf_vfio_user_poll_group_create,
	.get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group,
	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
	.poll_group_add = nvmf_vfio_user_poll_group_add,
	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
	.poll_group_poll = nvmf_vfio_user_poll_group_poll,

	.req_free = nvmf_vfio_user_req_free,
	.req_complete = nvmf_vfio_user_req_complete,

	.qpair_fini = nvmf_vfio_user_close_qpair,
	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,

	.poll_group_dump_stat = nvmf_vfio_user_poll_group_dump_stat,
};

SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
SPDK_LOG_REGISTER_COMPONENT(vfio_user_db)
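
/*
 * Illustrative usage (the paths, NQN and bdev name below are examples only):
 * once an SPDK target is running, the transport registered above is typically
 * exercised over JSON-RPC along these lines:
 *
 *	scripts/rpc.py nvmf_create_transport -t VFIOUSER
 *	scripts/rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a -s SPDK0
 *	scripts/rpc.py nvmf_subsystem_add_ns nqn.2019-07.io.spdk:cnode0 Malloc0
 *	scripts/rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 \
 *		-t VFIOUSER -a /var/run -s 0
 *
 * where the listener address (-a) names a directory in which the vfio-user
 * socket and PCI region files are created for the client to connect to.
 */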