/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2020 Intel Corporation.
 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved.
 * Copyright (c) 2022, 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

/*
 * NVMe over vfio-user transport
 */

#include <sys/param.h>

#include <vfio-user/libvfio-user.h>
#include <vfio-user/pci_defs.h>

#include "spdk/barrier.h"
#include "spdk/stdinc.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf_transport.h"
#include "spdk/sock.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"

#include "transport.h"

#include "nvmf_internal.h"

#define SWAP(x, y) \
	do \
	{ \
		typeof(x) _tmp = x; \
		x = y; \
		y = _tmp; \
	} while (0)

#define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
#define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
#define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
#define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE

#define NVME_DOORBELLS_OFFSET 0x1000
#define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2
#define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3
#define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX

#define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 512
#define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4)

/* NVMe spec 1.4, section 5.21.1.7 */
SPDK_STATIC_ASSERT(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR >= 2 &&
		   NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR <= SPDK_NVME_MAX_IO_QUEUES,
		   "bad number of queues");

/*
 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space
 * available on PCI-X 2.0 and PCI Express buses
 */
#define NVME_REG_CFG_SIZE 0x1000

/*
 * Doorbells must be page aligned so that they can be memory mapped.
 *
 * TODO does the NVMe spec also require this? Document it.
 */
#define NVMF_VFIO_USER_DOORBELLS_SIZE \
	SPDK_ALIGN_CEIL( \
		(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2 * SPDK_NVME_DOORBELL_REGISTER_SIZE), \
		0x1000)
#define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE)

/*
 * TODO check the PCI spec whether BAR4 and BAR5 really have to be at least one
 * page and a multiple of page size (maybe QEMU also needs this?). Document all
 * this.
 */

/*
 * MSI-X Pending Bit Array Size
 *
 * TODO according to the PCI spec we need one bit per vector, document the
 * relevant section.
 *
 * If the first argument to SPDK_ALIGN_CEIL is 0 then the result is 0, so we
 * would end up with a 0-size BAR5.
 */
#define NVME_IRQ_MSIX_NUM MAX(CHAR_BIT, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR)
#define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / CHAR_BIT), 0x1000)
SPDK_STATIC_ASSERT(NVME_BAR5_SIZE > 0, "Incorrect size");

/* MSI-X Table Size */
#define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000)
SPDK_STATIC_ASSERT(NVME_BAR4_SIZE > 0, "Incorrect size");
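
/*
 * Illustrative compile-time checks (added, not part of the original code):
 * the comments above expect these regions to be page aligned, and since
 * SPDK_ALIGN_CEIL rounds up to 0x1000 the computed sizes should satisfy
 * that. These asserts merely restate that expectation.
 */
SPDK_STATIC_ASSERT((NVMF_VFIO_USER_DOORBELLS_SIZE & 0xFFF) == 0, "doorbell region not page aligned");
SPDK_STATIC_ASSERT((NVME_REG_BAR0_SIZE & 0xFFF) == 0, "BAR0 size not page aligned");
SPDK_STATIC_ASSERT((NVME_BAR4_SIZE & 0xFFF) == 0, "BAR4 size not page aligned");
SPDK_STATIC_ASSERT((NVME_BAR5_SIZE & 0xFFF) == 0, "BAR5 size not page aligned");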

struct nvmf_vfio_user_req;

typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);

/* 1 more for PRP2 list itself */
#define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1)

enum nvmf_vfio_user_req_state {
	VFIO_USER_REQUEST_STATE_FREE = 0,
	VFIO_USER_REQUEST_STATE_EXECUTING,
};

/*
 * Support for live migration in NVMf/vfio-user: live migration is implemented
 * by stopping the NVMf subsystem when the device is instructed to enter the
 * stop-and-copy state and then trivially, and most importantly safely,
 * collecting migration state and providing it to the vfio-user client. We
 * don't provide any migration state at the pre-copy state as that's too
 * complicated to do; we might support this in the future.
 */


/* NVMe device state representation */
struct nvme_migr_sq_state {
	uint16_t sqid;
	uint16_t cqid;
	uint32_t head;
	uint32_t size;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size");

struct nvme_migr_cq_state {
	uint16_t cqid;
	uint16_t phase;
	uint32_t tail;
	uint32_t size;
	uint32_t iv;
	uint32_t ien;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size");

#define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23

/* The device state is in the VFIO MIGRATION BAR(9) region; keep the device state page aligned.
 *
 * The NVMe device migration region is defined as below:
 * -------------------------------------------------------------------------
 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs |
 * -------------------------------------------------------------------------
 *
 * Keep vfio_user_nvme_migr_header at a fixed length of 0x1000; any newly added
 * fields can use the reserved space at the end of the data structure.
 */
struct vfio_user_nvme_migr_header {
	/* Magic value to validate migration data */
	uint32_t magic;
	/* Version to check that the data is the same from source to destination */
	uint32_t version;

	/* The library uses this field to know how many fields in this
	 * structure are valid, starting at the beginning of this data
	 * structure. Newly added fields in the future should use the
	 * `unused` memory space.
	 */
	uint32_t opts_size;
	uint32_t reserved0;

	/* BARs information */
	uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS];
	uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS];

	/* Queue pair start offset, starting at the beginning of this
	 * data structure.
	 */
	uint64_t qp_offset;
	uint64_t qp_len;

	/* Controller data structure */
	uint32_t num_io_queues;
	uint32_t reserved1;

	/* NVMf controller data offset and length, if present, starting at
	 * the beginning of this data structure.
	 */
	uint64_t nvmf_data_offset;
	uint64_t nvmf_data_len;

	/*
	 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA
	 * address.
	 */
	uint32_t sdbl;

	/* Shadow doorbell DMA addresses. */
	uint64_t shadow_doorbell_buffer;
	uint64_t eventidx_buffer;

	/* Reserved memory space for newly added fields; this
	 * field is always at the end of this data structure.
	 */
	uint8_t unused[3856];
};
SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");
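
/*
 * Illustrative sketch (added; not used by this file): a destination-side
 * consumer of the migration blob would typically sanity-check the fixed-size
 * header before trusting any of the offsets it contains, e.g.:
 *
 *	static bool
 *	migr_header_looks_valid(const struct vfio_user_nvme_migr_header *hdr)
 *	{
 *		return hdr->magic == VFIO_USER_NVME_MIGR_MAGIC &&
 *		       hdr->opts_size <= sizeof(*hdr);
 *	}
 *
 * The helper name is hypothetical; the real restore path performs its own
 * validation.
 */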

struct vfio_user_nvme_migr_qp {
	struct nvme_migr_sq_state sq;
	struct nvme_migr_cq_state cq;
};

/* NVMe state definition used to load/restore from/to NVMe migration BAR region */
struct vfio_user_nvme_migr_state {
	struct vfio_user_nvme_migr_header ctrlr_header;
	struct spdk_nvmf_ctrlr_migr_data nvmf_data;
	struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	uint8_t doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE];
	uint8_t cfg[NVME_REG_CFG_SIZE];
};

struct nvmf_vfio_user_req {
	struct spdk_nvmf_request req;
	struct spdk_nvme_cpl rsp;
	struct spdk_nvme_cmd cmd;

	enum nvmf_vfio_user_req_state state;
	nvmf_vfio_user_req_cb_fn cb_fn;
	void *cb_arg;

	/* old CC before prop_set_cc fabric command */
	union spdk_nvme_cc_register cc;

	TAILQ_ENTRY(nvmf_vfio_user_req) link;

	struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS];
	uint8_t iovcnt;

	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
	uint8_t sg[];
};

/*
 * Mapping of an NVMe queue.
 *
 * This holds the information tracking a local process mapping of an NVMe queue
 * shared by the client.
 */
struct nvme_q_mapping {
	/* iov of local process mapping. */
	struct iovec iov;
	/* Stored sg, needed for unmap. */
	dma_sg_t *sg;
	/* Client PRP of queue. */
	uint64_t prp1;
};

enum nvmf_vfio_user_sq_state {
	VFIO_USER_SQ_UNUSED = 0,
	VFIO_USER_SQ_CREATED,
	VFIO_USER_SQ_DELETED,
	VFIO_USER_SQ_ACTIVE,
	VFIO_USER_SQ_INACTIVE
};

enum nvmf_vfio_user_cq_state {
	VFIO_USER_CQ_UNUSED = 0,
	VFIO_USER_CQ_CREATED,
	VFIO_USER_CQ_DELETED,
};

enum nvmf_vfio_user_ctrlr_state {
	VFIO_USER_CTRLR_CREATING = 0,
	VFIO_USER_CTRLR_RUNNING,
	/* Quiesce requested by libvfio-user */
	VFIO_USER_CTRLR_PAUSING,
	/* The NVMf subsystem is paused; it's safe to do a PCI reset, memory
	 * register, memory unregister, and vfio migration state transitions
	 * in this state.
	 */
	VFIO_USER_CTRLR_PAUSED,
	/*
	 * Implies that the NVMf subsystem is paused. The device will be
	 * unquiesced (PCI reset, memory register and unregister, controller
	 * in destination VM has been restored). NVMf subsystem resume has
	 * been requested.
	 */
	VFIO_USER_CTRLR_RESUMING,
	/*
	 * Implies that the NVMf subsystem is paused. Both the controller in the
	 * source VM and the one in the destination VM are in this state while
	 * doing live migration.
	 */
	VFIO_USER_CTRLR_MIGRATING
};

struct nvmf_vfio_user_sq {
	struct spdk_nvmf_qpair qpair;
	struct spdk_nvmf_transport_poll_group *group;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_sq_state sq_state;

	uint32_t head;
	volatile uint32_t *dbl_tailp;

	/* Whether a shadow doorbell eventidx needs setting. */
	bool need_rearm;

	/* multiple SQs can be mapped to the same CQ */
	uint16_t cqid;

	/* handle_queue_connect_rsp() is used both for the CREATE IO SQ response
	 * and for the SQ re-connect response in the destination VM. In the former
	 * case we will post an NVMe completion to the VM; we will not set this
	 * flag when re-connecting SQs in the destination VM.
	 */
	bool post_create_io_sq_completion;
	/* Copy of Create IO SQ command, this field is used together with
	 * `post_create_io_sq_completion` flag.
	 */
	struct spdk_nvme_cmd create_io_sq_cmd;

	struct vfio_user_delete_sq_ctx *delete_ctx;

	/* Currently unallocated reqs. */
	TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs;
	/* Poll group entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) link;
	/* Connected SQ entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) tailq;
};

struct nvmf_vfio_user_cq {
	struct spdk_nvmf_transport_poll_group *group;
	int cq_ref;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_cq_state cq_state;

	uint32_t tail;
	volatile uint32_t *dbl_headp;

	bool phase;

	uint16_t iv;
	bool ien;

	uint32_t last_head;
	uint32_t last_trigger_irq_tail;
};
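
/*
 * Illustrative note (added): dbl_tailp and dbl_headp above point into a flat
 * array of 32-bit doorbells laid out as the NVMe register map prescribes:
 * SQ tail doorbells at even indexes, CQ head doorbells at odd ones, i.e.
 * index 2 * qid + is_cq (see queue_index() below). In BAR0 terms, with the
 * default stride (CAP.DSTRD = 0) the byte offset of a doorbell is:
 *
 *	offset = NVME_DOORBELLS_OFFSET + (2 * qid + is_cq) * 4;
 *
 * e.g. the CQ head doorbell of qid 3 lives at 0x1000 + 7 * 4 = 0x101c. The
 * same indexing is reused for the shadow doorbell and eventidx buffers.
 */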

struct nvmf_vfio_user_poll_group {
	struct spdk_nvmf_transport_poll_group group;
	TAILQ_ENTRY(nvmf_vfio_user_poll_group) link;
	TAILQ_HEAD(, nvmf_vfio_user_sq) sqs;
	struct spdk_interrupt *intr;
	int intr_fd;
	struct {

		/*
		 * ctrlr_intr and ctrlr_kicks will be zero for all other poll
		 * groups. However, they can be zero even for the poll group
		 * the controller belongs to, if no vfio-user message has been
		 * received or the controller hasn't been kicked yet.
		 */

		/*
		 * Number of times vfio_user_ctrlr_intr() has run:
		 * vfio-user file descriptor has been ready or explicitly
		 * kicked (see below).
		 */
		uint64_t ctrlr_intr;

		/*
		 * Kicks to the controller by ctrlr_kick().
		 * ctrlr_intr - ctrlr_kicks is the number of times the
		 * vfio-user poll file descriptor has been ready.
		 */
		uint64_t ctrlr_kicks;

		/*
		 * How many times we won the race arming an SQ.
		 */
		uint64_t won;

		/*
		 * How many times we lost the race arming an SQ.
		 */
		uint64_t lost;

		/*
		 * How many requests we processed in total each time we lost
		 * the rearm race.
		 */
		uint64_t lost_count;

		/*
		 * Number of times we attempted to rearm all the SQs in the
		 * poll group.
		 */
		uint64_t rearms;

		uint64_t pg_process_count;
		uint64_t intr;
		uint64_t polls;
		uint64_t polls_spurious;
		uint64_t poll_reqs;
		uint64_t poll_reqs_squared;
		uint64_t cqh_admin_writes;
		uint64_t cqh_io_writes;
	} stats;
};

struct nvmf_vfio_user_shadow_doorbells {
	volatile uint32_t *shadow_doorbells;
	volatile uint32_t *eventidxs;
	dma_sg_t *sgs;
	struct iovec *iovs;
};
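
/*
 * Illustrative sketch (added, unused): both buffers above come from the guest
 * via the NVMe Doorbell Buffer Config command; PRP1 carries the shadow
 * doorbells the host writes, PRP2 the eventidx values the controller writes.
 * Each is indexed exactly like the BAR0 doorbell array, so a hypothetical
 * accessor could look like the following. The real code indexes these arrays
 * directly via queue_index().
 */
static inline volatile uint32_t *
sdbl_entry(struct nvmf_vfio_user_shadow_doorbells *sdbl,
	   uint16_t qid, bool is_cq, bool eventidx)
{
	/* Pick the eventidx array or the shadow doorbell array. */
	volatile uint32_t *base = eventidx ? sdbl->eventidxs : sdbl->shadow_doorbells;

	/* SQ tail doorbells live at even indexes, CQ head doorbells at odd ones. */
	return base + (qid * 2 + is_cq);
}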

struct nvmf_vfio_user_ctrlr {
	struct nvmf_vfio_user_endpoint *endpoint;
	struct nvmf_vfio_user_transport *transport;

	/* Connected SQs list */
	TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs;
	enum nvmf_vfio_user_ctrlr_state state;

	/*
	 * Tells whether live migration data has been prepared. This is used
	 * by the get_pending_bytes callback to tell whether or not the
	 * previous iteration finished.
	 */
	bool migr_data_prepared;

	/* Controller is in source VM when doing live migration */
	bool in_source_vm;

	struct spdk_thread *thread;
	struct spdk_poller *vfu_ctx_poller;
	struct spdk_interrupt *intr;
	int intr_fd;

	bool queued_quiesce;

	bool reset_shn;
	bool disconnect;

	uint16_t cntlid;
	struct spdk_nvmf_ctrlr *ctrlr;

	struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];

	TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link;

	volatile uint32_t *bar0_doorbells;
	struct nvmf_vfio_user_shadow_doorbells *sdbl;
	/*
	 * Shadow doorbells PRPs to provide during the stop-and-copy state.
	 */
	uint64_t shadow_doorbell_buffer;
	uint64_t eventidx_buffer;

	bool adaptive_irqs_enabled;
};

/* An endpoint in vfio-user is associated with a socket file, which
 * represents a PCI endpoint.
 */
struct nvmf_vfio_user_endpoint {
	struct nvmf_vfio_user_transport *transport;
	vfu_ctx_t *vfu_ctx;
	struct spdk_poller *accept_poller;
	struct spdk_thread *accept_thread;
	bool interrupt_mode;
	struct msixcap *msix;
	vfu_pci_config_space_t *pci_config_space;
	int devmem_fd;
	int accept_intr_fd;
	struct spdk_interrupt *accept_intr;

	volatile uint32_t *bar0_doorbells;

	int migr_fd;
	void *migr_data;

	struct spdk_nvme_transport_id trid;
	struct spdk_nvmf_subsystem *subsystem;

	/* The controller is associated with an active socket connection; the
	 * lifecycle of the controller is the same as that of the VM.
	 * Currently we only support one active connection. As the NVMe
	 * specification allows, we may support multiple controllers in the
	 * future, e.g. to support RESERVATION.
	 */
	struct nvmf_vfio_user_ctrlr *ctrlr;
	pthread_mutex_t lock;

	bool need_async_destroy;
	/* The subsystem is in PAUSED state and needs to be resumed; TRUE
	 * only when migration has finished successfully and the controller
	 * is in the source VM.
	 */
	bool need_resume;
	/* Start the accept poller again after destroying the controller */
	bool need_relisten;

	TAILQ_ENTRY(nvmf_vfio_user_endpoint) link;
};

struct nvmf_vfio_user_transport_opts {
	bool disable_mappable_bar0;
	bool disable_adaptive_irq;
	bool disable_shadow_doorbells;
	bool disable_compare;
	bool enable_intr_mode_sq_spreading;
};

struct nvmf_vfio_user_transport {
	struct spdk_nvmf_transport transport;
	struct nvmf_vfio_user_transport_opts transport_opts;
	bool intr_mode_supported;
	pthread_mutex_t lock;
	TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints;

	pthread_mutex_t pg_lock;
	TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups;
	struct nvmf_vfio_user_poll_group *next_pg;
};

/*
 * function prototypes
 */
static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);

static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq);

/*
 * Local process virtual address of a queue.
542 */ 543 static inline void * 544 q_addr(struct nvme_q_mapping *mapping) 545 { 546 return mapping->iov.iov_base; 547 } 548 549 static inline int 550 queue_index(uint16_t qid, bool is_cq) 551 { 552 return (qid * 2) + is_cq; 553 } 554 555 static inline volatile uint32_t * 556 sq_headp(struct nvmf_vfio_user_sq *sq) 557 { 558 assert(sq != NULL); 559 return &sq->head; 560 } 561 562 static inline volatile uint32_t * 563 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 564 { 565 assert(sq != NULL); 566 return sq->dbl_tailp; 567 } 568 569 static inline volatile uint32_t * 570 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 571 { 572 assert(cq != NULL); 573 return cq->dbl_headp; 574 } 575 576 static inline volatile uint32_t * 577 cq_tailp(struct nvmf_vfio_user_cq *cq) 578 { 579 assert(cq != NULL); 580 return &cq->tail; 581 } 582 583 static inline void 584 sq_head_advance(struct nvmf_vfio_user_sq *sq) 585 { 586 assert(sq != NULL); 587 588 assert(*sq_headp(sq) < sq->size); 589 (*sq_headp(sq))++; 590 591 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 592 *sq_headp(sq) = 0; 593 } 594 } 595 596 static inline void 597 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 598 { 599 assert(cq != NULL); 600 601 assert(*cq_tailp(cq) < cq->size); 602 (*cq_tailp(cq))++; 603 604 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 605 *cq_tailp(cq) = 0; 606 cq->phase = !cq->phase; 607 } 608 } 609 610 static bool 611 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 612 { 613 assert(vu_ctrlr != NULL); 614 615 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 616 return false; 617 } 618 619 if (is_cq) { 620 if (vu_ctrlr->cqs[qid] == NULL) { 621 return false; 622 } 623 624 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 625 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 626 } 627 628 if (vu_ctrlr->sqs[qid] == NULL) { 629 return false; 630 } 631 632 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 633 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 634 } 635 636 static char * 637 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 638 { 639 return endpoint->trid.traddr; 640 } 641 642 static char * 643 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 644 { 645 if (!ctrlr || !ctrlr->endpoint) { 646 return "Null Ctrlr"; 647 } 648 649 return endpoint_id(ctrlr->endpoint); 650 } 651 652 /* Return the poll group for the admin queue of the controller. 
*/ 653 static inline struct nvmf_vfio_user_poll_group * 654 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 655 { 656 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 657 struct nvmf_vfio_user_poll_group, 658 group); 659 } 660 661 static inline struct spdk_thread * 662 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 663 { 664 return vu_pg->group.group->thread; 665 } 666 667 static dma_sg_t * 668 index_to_sg_t(void *arr, size_t i) 669 { 670 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 671 } 672 673 static inline size_t 674 vfio_user_migr_data_len(void) 675 { 676 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 677 } 678 679 static inline bool 680 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 681 { 682 return spdk_interrupt_mode_is_enabled() && 683 vu_transport->intr_mode_supported; 684 } 685 686 static int vfio_user_ctrlr_intr(void *ctx); 687 688 static void 689 vfio_user_msg_ctrlr_intr(void *ctx) 690 { 691 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 692 struct nvmf_vfio_user_poll_group *vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 693 694 vu_ctrlr_group->stats.ctrlr_kicks++; 695 696 vfio_user_ctrlr_intr(ctx); 697 } 698 699 /* 700 * Kick (force a wakeup) of all poll groups for this controller. 701 * vfio_user_ctrlr_intr() itself arranges for kicking other poll groups if 702 * needed. 703 */ 704 static void 705 ctrlr_kick(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 706 { 707 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 708 709 SPDK_DEBUGLOG(vfio_user_db, "%s: kicked\n", ctrlr_id(vu_ctrlr)); 710 711 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 712 713 spdk_thread_send_msg(poll_group_to_thread(vu_ctrlr_group), 714 vfio_user_msg_ctrlr_intr, vu_ctrlr); 715 } 716 717 /* 718 * Make the given DMA address and length available (locally mapped) via iov. 
 */
static void *
map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg,
	struct iovec *iov, int prot)
{
	int ret;

	assert(ctx != NULL);
	assert(sg != NULL);
	assert(iov != NULL);

	ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot);
	if (ret < 0) {
		if (ret == -1) {
			SPDK_ERRLOG("failed to translate IOVA [%lu, %lu) (prot=%d) to local VA: %m\n",
				    addr, addr + len, prot);
		} else {
			SPDK_ERRLOG("failed to translate IOVA [%lu, %lu) (prot=%d) to local VA: %d segments needed\n",
				    addr, addr + len, prot, -(ret + 1));
		}
		return NULL;
	}

	ret = vfu_sgl_get(ctx, sg, iov, 1, 0);
	if (ret != 0) {
		SPDK_ERRLOG("failed to get iovec for IOVA [%ld, %ld): %m\n",
			    addr, addr + len);
		return NULL;
	}

	assert(iov->iov_base != NULL);
	return iov->iov_base;
}

static int
nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
		  uint32_t max_iovcnt, uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint64_t prp1, prp2;
	void *vva;
	uint32_t i;
	uint32_t residue_len, nents;
	uint64_t *prp_list;
	uint32_t iovcnt;

	assert(max_iovcnt > 0);

	prp1 = cmd->dptr.prp.prp1;
	prp2 = cmd->dptr.prp.prp2;

	/* PRP1 may start at an unaligned page address */
	residue_len = mps - (prp1 % mps);
	residue_len = spdk_min(len, residue_len);

	vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE);
	if (spdk_unlikely(vva == NULL)) {
		SPDK_ERRLOG("GPA to VVA failed\n");
		return -EINVAL;
	}
	len -= residue_len;
	if (len && max_iovcnt < 2) {
		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
		return -ERANGE;
	}
	iovs[0].iov_base = vva;
	iovs[0].iov_len = residue_len;

	if (len) {
		if (spdk_unlikely(prp2 == 0)) {
			SPDK_ERRLOG("no PRP2, %d remaining\n", len);
			return -EINVAL;
		}

		if (len <= mps) {
			/* 2 PRP used */
			iovcnt = 2;
			vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n",
					    prp2, len);
				return -EINVAL;
			}
			iovs[1].iov_base = vva;
			iovs[1].iov_len = len;
		} else {
			/* PRP list used */
			nents = (len + mps - 1) / mps;
			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
				SPDK_ERRLOG("Too many page entries\n");
				return -ERANGE;
			}

			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
					    prp2, nents);
				return -EINVAL;
			}
			prp_list = vva;
			i = 0;
			while (len != 0) {
				residue_len = spdk_min(len, mps);
				vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE);
				if (spdk_unlikely(vva == NULL)) {
					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
						    prp_list[i], residue_len);
					return -EINVAL;
				}
				iovs[i + 1].iov_base = vva;
				iovs[i + 1].iov_len = residue_len;
				len -= residue_len;
				i++;
			}
			iovcnt = i + 1;
		}
	} else {
		/* 1 PRP used */
		iovcnt = 1;
	}

	assert(iovcnt <= max_iovcnt);
	return iovcnt;
}
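
/*
 * Worked example (added for clarity, not part of the original code): with
 * mps = 4096, len = 16384 and a PRP1 whose offset within its page is 0x200
 * (512), the first iov covers residue_len = 4096 - 512 = 3584 bytes. The
 * remaining 12800 bytes need nents = (12800 + 4095) / 4096 = 4 PRP list
 * entries, so PRP2 points to a PRP list and the function returns iovcnt = 5
 * (assuming max_iovcnt is large enough).
 */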

static int
nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
		       struct iovec *iovs, uint32_t max_iovcnt,
		       void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	uint32_t i;
	void *vva;

	if (spdk_unlikely(max_iovcnt < num_sgls)) {
		return -ERANGE;
	}

	for (i = 0; i < num_sgls; i++) {
		if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type);
			return -EINVAL;
		}
		vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[i].iov_base = vva;
		iovs[i].iov_len = sgls[i].unkeyed.length;
	}

	return num_sgls;
}

static int
nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
		  uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	struct spdk_nvme_sgl_descriptor *sgl, *last_sgl;
	uint32_t num_sgls, seg_len;
	void *vva;
	int ret;
	uint32_t total_iovcnt = 0;

	/* SGL cases */
	sgl = &cmd->dptr.sgl1;

	/* only one SGL segment */
	if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
		assert(max_iovcnt > 0);
		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}
		iovs[0].iov_base = vva;
		iovs[0].iov_len = sgl->unkeyed.length;
		assert(sgl->unkeyed.length == len);

		return 1;
	}

	for (;;) {
		if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) &&
				  (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) {
			SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type);
			return -EINVAL;
		}

		seg_len = sgl->unkeyed.length;
		if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) {
			SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len);
			return -EINVAL;
		}

		num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor);
		vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ);
		if (spdk_unlikely(vva == NULL)) {
			SPDK_ERRLOG("GPA to VVA failed\n");
			return -EINVAL;
		}

		/* sgl points to the first segment */
		sgl = (struct spdk_nvme_sgl_descriptor *)vva;
		last_sgl = &sgl[num_sgls - 1];

		/* we are done */
		if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
			/* map the whole sgl list */
			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt],
						     max_iovcnt - total_iovcnt, gpa_to_vva);
			if (spdk_unlikely(ret < 0)) {
				return ret;
			}
			total_iovcnt += ret;

			return total_iovcnt;
		}

		if (num_sgls > 1) {
			/* map the whole sgl excluding last_sgl */
			ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt],
						     max_iovcnt - total_iovcnt, gpa_to_vva);
			if (spdk_unlikely(ret < 0)) {
				return ret;
			}
			total_iovcnt += ret;
		}

		/* move to the next level's segments */
		sgl = last_sgl;
	}

	return 0;
}

static int
nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt,
	     uint32_t len, size_t mps,
	     void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot))
{
	if (cmd->psdt == SPDK_NVME_PSDT_PRP) {
		return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
	}

	return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva);
}

/*
 * For each queue, update the
location of its doorbell to the correct location: 970 * either our own BAR0, or the guest's configured shadow doorbell area. 971 * 972 * The Admin queue (qid: 0) does not ever use shadow doorbells. 973 */ 974 static void 975 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 976 { 977 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 978 ctrlr->bar0_doorbells; 979 980 assert(doorbells != NULL); 981 982 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 983 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 984 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 985 986 if (sq != NULL) { 987 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 988 989 ctrlr->sqs[i]->need_rearm = shadow; 990 } 991 992 if (cq != NULL) { 993 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 994 } 995 } 996 } 997 998 static void 999 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1000 { 1001 assert(vfu_ctx != NULL); 1002 assert(sdbl != NULL); 1003 1004 /* 1005 * An allocation error would result in only one of the two being 1006 * non-NULL. If that is the case, no memory should have been mapped. 1007 */ 1008 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 1009 return; 1010 } 1011 1012 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 1013 struct iovec *iov; 1014 dma_sg_t *sg; 1015 1016 if (!sdbl->iovs[i].iov_len) { 1017 continue; 1018 } 1019 1020 sg = index_to_sg_t(sdbl->sgs, i); 1021 iov = sdbl->iovs + i; 1022 1023 vfu_sgl_put(vfu_ctx, sg, iov, 1); 1024 } 1025 } 1026 1027 static void 1028 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1029 { 1030 if (sdbl == NULL) { 1031 return; 1032 } 1033 1034 unmap_sdbl(vfu_ctx, sdbl); 1035 1036 /* 1037 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 1038 * not allocated, so don't free() them. 1039 */ 1040 free(sdbl->sgs); 1041 free(sdbl->iovs); 1042 free(sdbl); 1043 } 1044 1045 static struct nvmf_vfio_user_shadow_doorbells * 1046 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 1047 { 1048 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 1049 dma_sg_t *sg2 = NULL; 1050 void *p; 1051 1052 assert(vfu_ctx != NULL); 1053 1054 sdbl = calloc(1, sizeof(*sdbl)); 1055 if (sdbl == NULL) { 1056 goto err; 1057 } 1058 1059 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 1060 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 1061 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 1062 goto err; 1063 } 1064 1065 /* Map shadow doorbell buffer (PRP1). */ 1066 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, 1067 PROT_READ | PROT_WRITE); 1068 1069 if (p == NULL) { 1070 goto err; 1071 } 1072 1073 /* 1074 * Map eventidx buffer (PRP2). 1075 * Should only be written to by the controller. 1076 */ 1077 1078 sg2 = index_to_sg_t(sdbl->sgs, 1); 1079 1080 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, 1081 PROT_READ | PROT_WRITE); 1082 1083 if (p == NULL) { 1084 goto err; 1085 } 1086 1087 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1088 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1089 1090 return sdbl; 1091 1092 err: 1093 free_sdbl(vfu_ctx, sdbl); 1094 return NULL; 1095 } 1096 1097 /* 1098 * Copy doorbells from one buffer to the other, during switches betweeen BAR0 1099 * doorbells and shadow doorbells. 
1100 */ 1101 static void 1102 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1103 const volatile uint32_t *from, volatile uint32_t *to) 1104 { 1105 assert(ctrlr != NULL); 1106 assert(from != NULL); 1107 assert(to != NULL); 1108 1109 SPDK_DEBUGLOG(vfio_user_db, 1110 "%s: migrating shadow doorbells from %p to %p\n", 1111 ctrlr_id(ctrlr), from, to); 1112 1113 /* Can't use memcpy because it doesn't respect volatile semantics. */ 1114 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1115 if (ctrlr->sqs[i] != NULL) { 1116 to[queue_index(i, false)] = from[queue_index(i, false)]; 1117 } 1118 1119 if (ctrlr->cqs[i] != NULL) { 1120 to[queue_index(i, true)] = from[queue_index(i, true)]; 1121 } 1122 } 1123 } 1124 1125 static void 1126 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1127 { 1128 const struct spdk_nvmf_registers *regs; 1129 1130 assert(vu_ctrlr != NULL); 1131 assert(vu_ctrlr->ctrlr != NULL); 1132 1133 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1134 if (regs->csts.bits.cfs == 0) { 1135 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1136 } 1137 1138 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1139 } 1140 1141 static inline bool 1142 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1143 { 1144 assert(vu_ctrlr != NULL); 1145 assert(vu_ctrlr->endpoint != NULL); 1146 1147 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1148 1149 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1150 } 1151 1152 static void 1153 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1154 { 1155 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1156 1157 spdk_interrupt_unregister(&endpoint->accept_intr); 1158 spdk_poller_unregister(&endpoint->accept_poller); 1159 1160 if (endpoint->bar0_doorbells) { 1161 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1162 } 1163 1164 if (endpoint->devmem_fd > 0) { 1165 close(endpoint->devmem_fd); 1166 } 1167 1168 if (endpoint->migr_data) { 1169 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1170 } 1171 1172 if (endpoint->migr_fd > 0) { 1173 close(endpoint->migr_fd); 1174 } 1175 1176 if (endpoint->vfu_ctx) { 1177 vfu_destroy_ctx(endpoint->vfu_ctx); 1178 } 1179 1180 pthread_mutex_destroy(&endpoint->lock); 1181 free(endpoint); 1182 } 1183 1184 /* called when process exits */ 1185 static int 1186 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1187 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1188 { 1189 struct nvmf_vfio_user_transport *vu_transport; 1190 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1191 1192 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1193 1194 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1195 transport); 1196 1197 pthread_mutex_destroy(&vu_transport->lock); 1198 pthread_mutex_destroy(&vu_transport->pg_lock); 1199 1200 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1201 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1202 nvmf_vfio_user_destroy_endpoint(endpoint); 1203 } 1204 1205 free(vu_transport); 1206 1207 if (cb_fn) { 1208 cb_fn(cb_arg); 1209 } 1210 1211 return 0; 1212 } 1213 1214 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1215 { 1216 "disable_mappable_bar0", 1217 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1218 spdk_json_decode_bool, true 1219 }, 1220 { 1221 "disable_adaptive_irq", 1222 offsetof(struct 
nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1223 spdk_json_decode_bool, true 1224 }, 1225 { 1226 "disable_shadow_doorbells", 1227 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1228 spdk_json_decode_bool, true 1229 }, 1230 { 1231 "disable_compare", 1232 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1233 spdk_json_decode_bool, true 1234 }, 1235 { 1236 "enable_intr_mode_sq_spreading", 1237 offsetof(struct nvmf_vfio_user_transport, transport_opts.enable_intr_mode_sq_spreading), 1238 spdk_json_decode_bool, true 1239 }, 1240 }; 1241 1242 static struct spdk_nvmf_transport * 1243 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1244 { 1245 struct nvmf_vfio_user_transport *vu_transport; 1246 int err; 1247 1248 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1249 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1250 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1251 return NULL; 1252 } 1253 1254 vu_transport = calloc(1, sizeof(*vu_transport)); 1255 if (vu_transport == NULL) { 1256 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1257 return NULL; 1258 } 1259 1260 err = pthread_mutex_init(&vu_transport->lock, NULL); 1261 if (err != 0) { 1262 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1263 goto err; 1264 } 1265 TAILQ_INIT(&vu_transport->endpoints); 1266 1267 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1268 if (err != 0) { 1269 pthread_mutex_destroy(&vu_transport->lock); 1270 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1271 goto err; 1272 } 1273 TAILQ_INIT(&vu_transport->poll_groups); 1274 1275 if (opts->transport_specific != NULL && 1276 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1277 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1278 vu_transport)) { 1279 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1280 goto cleanup; 1281 } 1282 1283 /* 1284 * To support interrupt mode, the transport must be configured with 1285 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1286 * when a client writes new doorbell values to BAR0, via the 1287 * libvfio-user socket fd. 1288 */ 1289 vu_transport->intr_mode_supported = 1290 vu_transport->transport_opts.disable_mappable_bar0; 1291 1292 /* 1293 * If BAR0 is mappable, it doesn't make sense to support shadow 1294 * doorbells, so explicitly turn it off. 1295 */ 1296 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1297 vu_transport->transport_opts.disable_shadow_doorbells = true; 1298 } 1299 1300 if (spdk_interrupt_mode_is_enabled()) { 1301 if (!vu_transport->intr_mode_supported) { 1302 SPDK_ERRLOG("interrupt mode not supported\n"); 1303 goto cleanup; 1304 } 1305 1306 /* 1307 * If we are in interrupt mode, we cannot support adaptive IRQs, 1308 * as there is no guarantee the SQ poller will run subsequently 1309 * to send pending IRQs. 
1310 */ 1311 vu_transport->transport_opts.disable_adaptive_irq = true; 1312 } 1313 1314 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1315 vu_transport->transport_opts.disable_mappable_bar0); 1316 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1317 vu_transport->transport_opts.disable_adaptive_irq); 1318 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1319 vu_transport->transport_opts.disable_shadow_doorbells); 1320 1321 return &vu_transport->transport; 1322 1323 cleanup: 1324 pthread_mutex_destroy(&vu_transport->lock); 1325 pthread_mutex_destroy(&vu_transport->pg_lock); 1326 err: 1327 free(vu_transport); 1328 return NULL; 1329 } 1330 1331 static uint32_t 1332 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1333 { 1334 assert(vu_ctrlr != NULL); 1335 assert(vu_ctrlr->ctrlr != NULL); 1336 1337 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1338 } 1339 1340 static uint32_t 1341 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1342 { 1343 assert(vu_ctrlr != NULL); 1344 assert(vu_ctrlr->ctrlr != NULL); 1345 1346 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1347 } 1348 1349 static uintptr_t 1350 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1351 { 1352 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1353 return 1ul << memory_page_shift; 1354 } 1355 1356 static uintptr_t 1357 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1358 { 1359 return ~(memory_page_size(ctrlr) - 1); 1360 } 1361 1362 static int 1363 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1364 uint32_t q_size, bool is_cq, bool unmap) 1365 { 1366 uint64_t len; 1367 void *ret; 1368 1369 assert(q_size); 1370 assert(q_addr(mapping) == NULL); 1371 1372 if (is_cq) { 1373 len = q_size * sizeof(struct spdk_nvme_cpl); 1374 } else { 1375 len = q_size * sizeof(struct spdk_nvme_cmd); 1376 } 1377 1378 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1379 mapping->sg, &mapping->iov, 1380 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1381 if (ret == NULL) { 1382 return -EFAULT; 1383 } 1384 1385 if (unmap) { 1386 memset(q_addr(mapping), 0, len); 1387 } 1388 1389 return 0; 1390 } 1391 1392 static inline void 1393 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1394 { 1395 if (q_addr(mapping) != NULL) { 1396 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1397 &mapping->iov, 1); 1398 mapping->iov.iov_base = NULL; 1399 } 1400 } 1401 1402 static int 1403 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1404 { 1405 struct nvmf_vfio_user_sq *sq; 1406 const struct spdk_nvmf_registers *regs; 1407 int ret; 1408 1409 assert(ctrlr != NULL); 1410 1411 sq = ctrlr->sqs[0]; 1412 1413 assert(sq != NULL); 1414 assert(q_addr(&sq->mapping) == NULL); 1415 /* XXX ctrlr->asq == 0 is a valid memory address */ 1416 1417 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1418 sq->qid = 0; 1419 sq->size = regs->aqa.bits.asqs + 1; 1420 sq->mapping.prp1 = regs->asq; 1421 *sq_headp(sq) = 0; 1422 sq->cqid = 0; 1423 1424 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1425 if (ret) { 1426 return ret; 1427 } 1428 1429 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1430 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1431 1432 *sq_dbl_tailp(sq) = 0; 1433 1434 return 0; 1435 } 1436 1437 /* 1438 * Updates eventidx to set an SQ into interrupt or polling mode. 
1439 * 1440 * Returns false if the current SQ tail does not match the SQ head, as 1441 * this means that the host has submitted more items to the queue while we were 1442 * not looking - or during the event index update. In that case, we must retry, 1443 * or otherwise make sure we are going to wake up again. 1444 */ 1445 static bool 1446 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1447 { 1448 struct nvmf_vfio_user_ctrlr *ctrlr; 1449 volatile uint32_t *sq_tail_eidx; 1450 uint32_t old_tail, new_tail; 1451 1452 assert(sq != NULL); 1453 assert(sq->ctrlr != NULL); 1454 assert(sq->ctrlr->sdbl != NULL); 1455 assert(sq->need_rearm); 1456 assert(sq->qid != 0); 1457 1458 ctrlr = sq->ctrlr; 1459 1460 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1461 ctrlr_id(ctrlr), sq->qid); 1462 1463 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1464 1465 assert(ctrlr->endpoint != NULL); 1466 1467 if (!ctrlr->endpoint->interrupt_mode) { 1468 /* No synchronisation necessary. */ 1469 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1470 return true; 1471 } 1472 1473 old_tail = *sq_dbl_tailp(sq); 1474 *sq_tail_eidx = old_tail; 1475 1476 /* 1477 * Ensure that the event index is updated before re-reading the tail 1478 * doorbell. If it's not, then the host might race us and update the 1479 * tail after the second read but before the event index is written, so 1480 * it won't write to BAR0 and we'll miss the update. 1481 * 1482 * The driver should provide similar ordering with an mb(). 1483 */ 1484 spdk_mb(); 1485 1486 /* 1487 * Check if the host has updated the tail doorbell after we've read it 1488 * for the first time, but before the event index was written. If that's 1489 * the case, then we've lost the race and we need to update the event 1490 * index again (after polling the queue, since the host won't write to 1491 * BAR0). 1492 */ 1493 new_tail = *sq_dbl_tailp(sq); 1494 1495 /* 1496 * We might poll the queue straight after this function returns if the 1497 * tail has been updated, so we need to ensure that any changes to the 1498 * queue will be visible to us if the doorbell has been updated. 1499 * 1500 * The driver should provide similar ordering with a wmb() to ensure 1501 * that the queue is written before it updates the tail doorbell. 1502 */ 1503 spdk_rmb(); 1504 1505 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1506 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1507 new_tail, *sq_headp(sq)); 1508 1509 if (new_tail == *sq_headp(sq)) { 1510 sq->need_rearm = false; 1511 return true; 1512 } 1513 1514 /* 1515 * We've lost the race: the tail was updated since we last polled, 1516 * including if it happened within this routine. 1517 * 1518 * The caller should retry after polling (think of this as a cmpxchg 1519 * loop); if we go to sleep while the SQ is not empty, then we won't 1520 * process the remaining events. 1521 */ 1522 return false; 1523 } 1524 1525 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1526 1527 /* 1528 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1529 * processed some SQ entries. 
1530 */ 1531 static int 1532 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1533 struct nvmf_vfio_user_sq *sq, 1534 struct nvmf_vfio_user_poll_group *vu_group) 1535 { 1536 int count = 0; 1537 size_t i; 1538 1539 assert(sq->need_rearm); 1540 1541 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1542 int ret; 1543 1544 if (set_sq_eventidx(sq)) { 1545 /* We won the race and set eventidx; done. */ 1546 vu_group->stats.won++; 1547 return count; 1548 } 1549 1550 ret = nvmf_vfio_user_sq_poll(sq); 1551 1552 count += (ret < 0) ? 1 : ret; 1553 1554 /* 1555 * set_sq_eventidx() hit the race, so we expected 1556 * to process at least one command from this queue. 1557 * If there were no new commands waiting for us, then 1558 * we must have hit an unexpected race condition. 1559 */ 1560 if (ret == 0) { 1561 SPDK_ERRLOG("%s: unexpected race condition detected " 1562 "while updating the shadow doorbell buffer\n", 1563 ctrlr_id(ctrlr)); 1564 1565 fail_ctrlr(ctrlr); 1566 return count; 1567 } 1568 } 1569 1570 SPDK_DEBUGLOG(vfio_user_db, 1571 "%s: set_sq_eventidx() lost the race %zu times\n", 1572 ctrlr_id(ctrlr), i); 1573 1574 vu_group->stats.lost++; 1575 vu_group->stats.lost_count += count; 1576 1577 /* 1578 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1579 * we raced with the producer too many times; force ourselves to wake up 1580 * instead. We'll process all queues at that point. 1581 */ 1582 ctrlr_kick(ctrlr); 1583 1584 return count; 1585 } 1586 1587 /* 1588 * We're in interrupt mode, and potentially about to go to sleep. We need to 1589 * make sure any further I/O submissions are guaranteed to wake us up: for 1590 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1591 * every SQ that needs re-arming. 1592 * 1593 * Returns non-zero if we processed something. 1594 */ 1595 static int 1596 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1597 { 1598 struct nvmf_vfio_user_sq *sq; 1599 int count = 0; 1600 1601 vu_group->stats.rearms++; 1602 1603 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1604 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1605 continue; 1606 } 1607 1608 if (sq->need_rearm) { 1609 count += vfio_user_sq_rearm(sq->ctrlr, sq, vu_group); 1610 } 1611 } 1612 1613 return count; 1614 } 1615 1616 static int 1617 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1618 { 1619 struct nvmf_vfio_user_cq *cq; 1620 const struct spdk_nvmf_registers *regs; 1621 int ret; 1622 1623 assert(ctrlr != NULL); 1624 1625 cq = ctrlr->cqs[0]; 1626 1627 assert(cq != NULL); 1628 1629 assert(q_addr(&cq->mapping) == NULL); 1630 1631 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1632 assert(regs != NULL); 1633 cq->qid = 0; 1634 cq->size = regs->aqa.bits.acqs + 1; 1635 cq->mapping.prp1 = regs->acq; 1636 *cq_tailp(cq) = 0; 1637 cq->ien = true; 1638 cq->phase = true; 1639 1640 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1641 if (ret) { 1642 return ret; 1643 } 1644 1645 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
 */
	cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true);

	*cq_dbl_headp(cq) = 0;

	return 0;
}

static void *
_map_one(void *prv, uint64_t addr, uint64_t len, int prot)
{
	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
	struct spdk_nvmf_qpair *qpair;
	struct nvmf_vfio_user_req *vu_req;
	struct nvmf_vfio_user_sq *sq;
	void *ret;

	assert(req != NULL);
	qpair = req->qpair;
	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);

	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
	ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len,
		      index_to_sg_t(vu_req->sg, vu_req->iovcnt),
		      &vu_req->iov[vu_req->iovcnt], prot);
	if (spdk_likely(ret != NULL)) {
		vu_req->iovcnt++;
	}
	return ret;
}

static int
vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
		  struct iovec *iov, uint32_t length)
{
	/* Map the PRP list from guest physical memory to
	 * local virtual memory addresses.
	 */
	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
			    length, 4096, _map_one);
}

static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
			  struct nvmf_vfio_user_sq *sq);

static uint32_t
cq_free_slots(struct nvmf_vfio_user_cq *cq)
{
	uint32_t free_slots;

	assert(cq != NULL);

	if (cq->tail == cq->last_head) {
		free_slots = cq->size;
	} else if (cq->tail > cq->last_head) {
		free_slots = cq->size - (cq->tail - cq->last_head);
	} else {
		free_slots = cq->last_head - cq->tail;
	}
	assert(free_slots > 0);

	return free_slots - 1;
}

/*
 * Since reading the head doorbell is relatively expensive, we use the cached
 * value, so we only have to read it for real if it appears that we are full.
 */
static inline bool
cq_is_full(struct nvmf_vfio_user_cq *cq)
{
	uint32_t free_cq_slots;

	assert(cq != NULL);

	free_cq_slots = cq_free_slots(cq);

	if (spdk_unlikely(free_cq_slots == 0)) {
		cq->last_head = *cq_dbl_headp(cq);
		free_cq_slots = cq_free_slots(cq);
	}

	return free_cq_slots == 0;
}
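
/*
 * Worked example (added for clarity): for a CQ with size = 8, tail = 6 and
 * last_head = 2, cq_free_slots() computes 8 - (6 - 2) - 1 = 3; one slot is
 * always kept unused so that a full ring can be distinguished from an empty
 * one. If the cached head makes the queue look full, cq_is_full() re-reads
 * the real head doorbell once before reporting the queue as full.
 */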

/*
 * Posts a CQE in the completion queue.
 *
 * @ctrlr: the vfio-user controller
 * @cq: the completion queue
 * @cdw0: cdw0 as reported by NVMf
 * @sqid: submission queue ID
 * @cid: command identifier in NVMe command
 * @sc: the NVMe CQE status code
 * @sct: the NVMe CQE status code type
 */
static int
post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq,
		uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct)
{
	struct spdk_nvme_status cpl_status = { 0 };
	struct spdk_nvme_cpl *cpl;
	int err;

	assert(ctrlr != NULL);

	if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) {
		return 0;
	}

	if (cq->qid == 0) {
		assert(spdk_get_thread() == cq->group->group->thread);
	}

	/*
	 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow
	 * control: if there is no space in the CQ, we should wait until there is.
	 *
	 * In practice, we just fail the controller instead: as it happens, all host
	 * implementations we care about right-size the CQ: this is required anyway for
	 * NVMEoF support (see 3.3.2.8).
	 */
	if (cq_is_full(cq)) {
		SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n",
			    ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq),
			    *cq_dbl_headp(cq));
		return -1;
	}

	cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq);

	assert(ctrlr->sqs[sqid] != NULL);
	SPDK_DEBUGLOG(nvmf_vfio,
		      "%s: request complete sqid:%d cid=%d status=%#x "
		      "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc,
		      *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq));

	cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]);
	cpl->sqid = sqid;
	cpl->cid = cid;
	cpl->cdw0 = cdw0;

	/*
	 * This is a bitfield: instead of setting the individual bits we need
	 * directly in cpl->status, which would cause a read-modify-write cycle,
	 * we'll avoid reading from the CPL altogether by filling in a local
	 * cpl_status variable, then writing the whole thing.
	 */
	cpl_status.sct = sct;
	cpl_status.sc = sc;
	cpl_status.p = cq->phase;
	cpl->status = cpl_status;

	/* Ensure the Completion Queue Entry is visible. */
	spdk_wmb();
	cq_tail_advance(cq);

	if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) &&
	    cq->ien && ctrlr_interrupt_enabled(ctrlr)) {
		err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
		if (err != 0) {
			SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
				    ctrlr_id(ctrlr));
			return err;
		}
	}

	return 0;
}

static void
free_sq_reqs(struct nvmf_vfio_user_sq *sq)
{
	while (!TAILQ_EMPTY(&sq->free_reqs)) {
		struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs);
		TAILQ_REMOVE(&sq->free_reqs, vu_req, link);
		free(vu_req);
	}
}

static void
delete_cq_done(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq)
{
	assert(cq->cq_ref == 0);
	unmap_q(ctrlr, &cq->mapping);
	cq->size = 0;
	cq->cq_state = VFIO_USER_CQ_DELETED;
	cq->group = NULL;
}

/* Deletes an SQ. If this SQ is the last user of the associated CQ
 * and the controller is being shut down/reset, or the vfio-user client
 * disconnects, then the CQ is also deleted.
 */
static void
delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq)
{
	struct nvmf_vfio_user_cq *cq;
	uint16_t cqid;

	SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr),
		      sq->qid, sq);

	/* Free SQ resources */
	unmap_q(vu_ctrlr, &sq->mapping);

	free_sq_reqs(sq);

	sq->size = 0;

	sq->sq_state = VFIO_USER_SQ_DELETED;

	/* Controller RESET and SHUTDOWN are special cases:
	 * the VM may not send DELETE IO SQ/CQ commands, and the
	 * NVMf library will disconnect the I/O queue pairs.
1861 */ 1862 if (vu_ctrlr->reset_shn || vu_ctrlr->disconnect) { 1863 cqid = sq->cqid; 1864 cq = vu_ctrlr->cqs[cqid]; 1865 1866 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1867 cq->qid, cq); 1868 1869 assert(cq->cq_ref > 0); 1870 if (--cq->cq_ref == 0) { 1871 delete_cq_done(vu_ctrlr, cq); 1872 } 1873 } 1874 } 1875 1876 static void 1877 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1878 { 1879 struct nvmf_vfio_user_sq *sq; 1880 struct nvmf_vfio_user_cq *cq; 1881 1882 if (ctrlr == NULL) { 1883 return; 1884 } 1885 1886 sq = ctrlr->sqs[qid]; 1887 if (sq) { 1888 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free sqid:%u\n", ctrlr_id(ctrlr), qid); 1889 unmap_q(ctrlr, &sq->mapping); 1890 1891 free_sq_reqs(sq); 1892 1893 free(sq->mapping.sg); 1894 free(sq); 1895 ctrlr->sqs[qid] = NULL; 1896 } 1897 1898 cq = ctrlr->cqs[qid]; 1899 if (cq) { 1900 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1901 unmap_q(ctrlr, &cq->mapping); 1902 free(cq->mapping.sg); 1903 free(cq); 1904 ctrlr->cqs[qid] = NULL; 1905 } 1906 } 1907 1908 static int 1909 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1910 const uint16_t id) 1911 { 1912 struct nvmf_vfio_user_sq *sq; 1913 1914 assert(ctrlr != NULL); 1915 assert(transport != NULL); 1916 assert(ctrlr->sqs[id] == NULL); 1917 1918 sq = calloc(1, sizeof(*sq)); 1919 if (sq == NULL) { 1920 return -ENOMEM; 1921 } 1922 sq->mapping.sg = calloc(1, dma_sg_size()); 1923 if (sq->mapping.sg == NULL) { 1924 free(sq); 1925 return -ENOMEM; 1926 } 1927 1928 sq->qid = id; 1929 sq->qpair.qid = id; 1930 sq->qpair.transport = transport; 1931 sq->ctrlr = ctrlr; 1932 ctrlr->sqs[id] = sq; 1933 1934 TAILQ_INIT(&sq->free_reqs); 1935 1936 return 0; 1937 } 1938 1939 static int 1940 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1941 { 1942 struct nvmf_vfio_user_cq *cq; 1943 1944 assert(vu_ctrlr != NULL); 1945 assert(vu_ctrlr->cqs[id] == NULL); 1946 1947 cq = calloc(1, sizeof(*cq)); 1948 if (cq == NULL) { 1949 return -ENOMEM; 1950 } 1951 cq->mapping.sg = calloc(1, dma_sg_size()); 1952 if (cq->mapping.sg == NULL) { 1953 free(cq); 1954 return -ENOMEM; 1955 } 1956 1957 cq->qid = id; 1958 vu_ctrlr->cqs[id] = cq; 1959 1960 return 0; 1961 } 1962 1963 static int 1964 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1965 { 1966 struct nvmf_vfio_user_req *vu_req, *tmp; 1967 size_t req_size; 1968 uint32_t i; 1969 1970 req_size = sizeof(struct nvmf_vfio_user_req) + 1971 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1972 1973 for (i = 0; i < sq->size; i++) { 1974 struct spdk_nvmf_request *req; 1975 1976 vu_req = calloc(1, req_size); 1977 if (vu_req == NULL) { 1978 goto err; 1979 } 1980 1981 req = &vu_req->req; 1982 req->qpair = &sq->qpair; 1983 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1984 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1985 req->stripped_data = NULL; 1986 1987 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1988 } 1989 1990 return 0; 1991 1992 err: 1993 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1994 free(vu_req); 1995 } 1996 return -ENOMEM; 1997 } 1998 1999 static volatile uint32_t * 2000 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 2001 { 2002 return ctrlr->sdbl != NULL ? 
2003 ctrlr->sdbl->shadow_doorbells : 2004 ctrlr->bar0_doorbells; 2005 } 2006 2007 static uint16_t 2008 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 2009 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2010 { 2011 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2012 struct nvmf_vfio_user_sq *sq; 2013 uint32_t qsize; 2014 uint16_t cqid; 2015 uint16_t qid; 2016 int err; 2017 2018 qid = cmd->cdw10_bits.create_io_q.qid; 2019 cqid = cmd->cdw11_bits.create_io_sq.cqid; 2020 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2021 2022 if (ctrlr->sqs[qid] == NULL) { 2023 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 2024 if (err != 0) { 2025 *sct = SPDK_NVME_SCT_GENERIC; 2026 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2027 } 2028 } 2029 2030 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2031 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 2032 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2033 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2034 } 2035 2036 /* CQ must be created before SQ. */ 2037 if (!io_q_exists(ctrlr, cqid, true)) { 2038 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 2039 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2040 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 2041 } 2042 2043 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 2044 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 2045 *sct = SPDK_NVME_SCT_GENERIC; 2046 return SPDK_NVME_SC_INVALID_FIELD; 2047 } 2048 2049 sq = ctrlr->sqs[qid]; 2050 sq->size = qsize; 2051 2052 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 2053 qid, cqid); 2054 2055 sq->mapping.prp1 = cmd->dptr.prp.prp1; 2056 2057 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 2058 if (err) { 2059 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2060 *sct = SPDK_NVME_SCT_GENERIC; 2061 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2062 } 2063 2064 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 2065 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2066 q_addr(&sq->mapping)); 2067 2068 err = alloc_sq_reqs(ctrlr, sq); 2069 if (err < 0) { 2070 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 2071 *sct = SPDK_NVME_SCT_GENERIC; 2072 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2073 } 2074 2075 sq->cqid = cqid; 2076 ctrlr->cqs[sq->cqid]->cq_ref++; 2077 sq->sq_state = VFIO_USER_SQ_CREATED; 2078 *sq_headp(sq) = 0; 2079 2080 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 2081 2082 /* 2083 * We should always reset the doorbells. 2084 * 2085 * The Specification prohibits the controller from writing to the shadow 2086 * doorbell buffer, however older versions of the Linux NVMe driver 2087 * don't reset the shadow doorbell buffer after a Queue-Level or 2088 * Controller-Level reset, which means that we're left with garbage 2089 * doorbell values. 2090 */ 2091 *sq_dbl_tailp(sq) = 0; 2092 2093 if (ctrlr->sdbl != NULL) { 2094 sq->need_rearm = true; 2095 2096 if (!set_sq_eventidx(sq)) { 2097 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 2098 "sqid:%hu was initialized\n", 2099 ctrlr_id(ctrlr), qid); 2100 fail_ctrlr(ctrlr); 2101 *sct = SPDK_NVME_SCT_GENERIC; 2102 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2103 } 2104 } 2105 2106 /* 2107 * Create our new I/O qpair. This asynchronously invokes, on a suitable 2108 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 2109 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 2110 * connect command. 
This command is then eventually completed via 2111 * handle_queue_connect_rsp(). 2112 */ 2113 sq->create_io_sq_cmd = *cmd; 2114 sq->post_create_io_sq_completion = true; 2115 2116 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 2117 &sq->qpair); 2118 2119 *sct = SPDK_NVME_SCT_GENERIC; 2120 return SPDK_NVME_SC_SUCCESS; 2121 } 2122 2123 static uint16_t 2124 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 2125 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2126 { 2127 struct nvmf_vfio_user_cq *cq; 2128 uint32_t qsize; 2129 uint16_t qid; 2130 int err; 2131 2132 qid = cmd->cdw10_bits.create_io_q.qid; 2133 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2134 2135 if (ctrlr->cqs[qid] == NULL) { 2136 err = init_cq(ctrlr, qid); 2137 if (err != 0) { 2138 *sct = SPDK_NVME_SCT_GENERIC; 2139 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2140 } 2141 } 2142 2143 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2144 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2145 *sct = SPDK_NVME_SCT_GENERIC; 2146 return SPDK_NVME_SC_INVALID_FIELD; 2147 } 2148 2149 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 2150 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2151 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2152 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2153 } 2154 2155 cq = ctrlr->cqs[qid]; 2156 cq->size = qsize; 2157 2158 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2159 2160 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2161 2162 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2163 if (err) { 2164 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2165 *sct = SPDK_NVME_SCT_GENERIC; 2166 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2167 } 2168 2169 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2170 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2171 q_addr(&cq->mapping)); 2172 2173 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2174 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2175 cq->phase = true; 2176 cq->cq_state = VFIO_USER_CQ_CREATED; 2177 2178 *cq_tailp(cq) = 0; 2179 2180 /* 2181 * We should always reset the doorbells. 2182 * 2183 * The Specification prohibits the controller from writing to the shadow 2184 * doorbell buffer, however older versions of the Linux NVMe driver 2185 * don't reset the shadow doorbell buffer after a Queue-Level or 2186 * Controller-Level reset, which means that we're left with garbage 2187 * doorbell values. 2188 */ 2189 *cq_dbl_headp(cq) = 0; 2190 2191 *sct = SPDK_NVME_SCT_GENERIC; 2192 return SPDK_NVME_SC_SUCCESS; 2193 } 2194 2195 /* 2196 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2197 * on error. 
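 * Checks common to both queue types are done here: the qid must be a valid
 * I/O queue identifier (qid 0 is the admin queue), the queue must not already
 * exist, and, since QSIZE in CDW10 is a 0's-based value, the smallest
 * accepted queue size is 2 entries.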
2198 */ 2199 static int 2200 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2201 struct spdk_nvme_cmd *cmd, const bool is_cq) 2202 { 2203 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2204 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2205 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2206 uint32_t qsize; 2207 uint16_t qid; 2208 2209 assert(ctrlr != NULL); 2210 assert(cmd != NULL); 2211 2212 qid = cmd->cdw10_bits.create_io_q.qid; 2213 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2214 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2215 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2216 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2217 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2218 goto out; 2219 } 2220 2221 if (io_q_exists(ctrlr, qid, is_cq)) { 2222 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2223 is_cq ? 'c' : 's', qid); 2224 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2225 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2226 goto out; 2227 } 2228 2229 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2230 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2231 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2232 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2233 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2234 goto out; 2235 } 2236 2237 if (is_cq) { 2238 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2239 } else { 2240 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2241 2242 if (sct == SPDK_NVME_SCT_GENERIC && 2243 sc == SPDK_NVME_SC_SUCCESS) { 2244 /* Completion posted asynchronously. */ 2245 return 0; 2246 } 2247 } 2248 2249 out: 2250 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2251 } 2252 2253 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2254 * queue pair, so save the command id and controller in a context. 2255 */ 2256 struct vfio_user_delete_sq_ctx { 2257 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2258 uint16_t cid; 2259 }; 2260 2261 static void 2262 vfio_user_qpair_delete_cb(void *cb_arg) 2263 { 2264 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2265 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2266 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2267 2268 assert(admin_cq != NULL); 2269 assert(admin_cq->group != NULL); 2270 assert(admin_cq->group->group->thread != NULL); 2271 if (admin_cq->group->group->thread != spdk_get_thread()) { 2272 spdk_thread_send_msg(admin_cq->group->group->thread, 2273 vfio_user_qpair_delete_cb, 2274 cb_arg); 2275 } else { 2276 post_completion(vu_ctrlr, admin_cq, 0, 0, 2277 ctx->cid, 2278 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2279 free(ctx); 2280 } 2281 } 2282 2283 /* 2284 * Deletes a completion or submission I/O queue. 2285 */ 2286 static int 2287 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2288 struct spdk_nvme_cmd *cmd, const bool is_cq) 2289 { 2290 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2291 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2292 struct nvmf_vfio_user_sq *sq; 2293 struct nvmf_vfio_user_cq *cq; 2294 2295 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2296 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2297 cmd->cdw10_bits.delete_io_q.qid); 2298 2299 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2300 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2301 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2302 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2303 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2304 goto out; 2305 } 2306 2307 if (is_cq) { 2308 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2309 if (cq->cq_ref) { 2310 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2311 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2312 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2313 goto out; 2314 } 2315 delete_cq_done(ctrlr, cq); 2316 } else { 2317 /* 2318 * Deletion of the CQ is only deferred to delete_sq_done() on 2319 * VM reboot or CC.EN change, so we have to delete it in all 2320 * other cases. 2321 */ 2322 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2323 sq->delete_ctx = calloc(1, sizeof(*sq->delete_ctx)); 2324 if (!sq->delete_ctx) { 2325 sct = SPDK_NVME_SCT_GENERIC; 2326 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2327 goto out; 2328 } 2329 sq->delete_ctx->vu_ctrlr = ctrlr; 2330 sq->delete_ctx->cid = cmd->cid; 2331 sq->sq_state = VFIO_USER_SQ_DELETED; 2332 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2333 ctrlr->cqs[sq->cqid]->cq_ref--; 2334 2335 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 2336 return 0; 2337 } 2338 2339 out: 2340 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2341 } 2342 2343 /* 2344 * Configures Shadow Doorbells. 2345 */ 2346 static int 2347 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2348 { 2349 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2350 uint32_t dstrd; 2351 uintptr_t page_size, page_mask; 2352 uint64_t prp1, prp2; 2353 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2354 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2355 2356 assert(ctrlr != NULL); 2357 assert(ctrlr->endpoint != NULL); 2358 assert(cmd != NULL); 2359 2360 dstrd = doorbell_stride(ctrlr); 2361 page_size = memory_page_size(ctrlr); 2362 page_mask = memory_page_mask(ctrlr); 2363 2364 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2365 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2366 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2367 ctrlr_id(ctrlr)); 2368 2369 goto out; 2370 } 2371 2372 /* Verify guest physical addresses passed as PRPs. */ 2373 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2374 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2375 ctrlr_id(ctrlr)); 2376 2377 goto out; 2378 } 2379 2380 prp1 = cmd->dptr.prp.prp1; 2381 prp2 = cmd->dptr.prp.prp2; 2382 2383 SPDK_DEBUGLOG(nvmf_vfio, 2384 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2385 ctrlr_id(ctrlr), prp1, prp2); 2386 2387 if (prp1 == prp2 2388 || prp1 != (prp1 & page_mask) 2389 || prp2 != (prp2 & page_mask)) { 2390 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2391 ctrlr_id(ctrlr)); 2392 2393 goto out; 2394 } 2395 2396 /* Map guest physical addresses to our virtual address space. 
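 * PRP1 is the shadow doorbell buffer (written by the host) and PRP2 is the
 * EventIdx buffer (written by us). Each is one host page and mirrors the
 * layout of the BAR0 doorbell region: SQ tail and CQ head entries interleaved
 * per queue ID.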
*/ 2397 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2398 if (sdbl == NULL) { 2399 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2400 ctrlr_id(ctrlr)); 2401 2402 goto out; 2403 } 2404 2405 ctrlr->shadow_doorbell_buffer = prp1; 2406 ctrlr->eventidx_buffer = prp2; 2407 2408 SPDK_DEBUGLOG(nvmf_vfio, 2409 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2410 ctrlr_id(ctrlr), 2411 sdbl->iovs[0].iov_base, 2412 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2413 sdbl->iovs[1].iov_base, 2414 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2415 2416 2417 /* 2418 * Set all possible CQ head doorbells to polling mode now, such that we 2419 * don't have to worry about it later if the host creates more queues. 2420 * 2421 * We only ever want interrupts for writes to the SQ tail doorbells 2422 * (which are initialised in set_ctrlr_intr_mode() below). 2423 */ 2424 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2425 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2426 } 2427 2428 /* Update controller. */ 2429 SWAP(ctrlr->sdbl, sdbl); 2430 2431 /* 2432 * Copy doorbells from either the previous shadow doorbell buffer or the 2433 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2434 * 2435 * This needs to account for older versions of the Linux NVMe driver, 2436 * which don't clear out the buffer after a controller reset. 2437 */ 2438 copy_doorbells(ctrlr, sdbl != NULL ? 2439 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2440 ctrlr->sdbl->shadow_doorbells); 2441 2442 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2443 2444 ctrlr_kick(ctrlr); 2445 2446 sc = SPDK_NVME_SC_SUCCESS; 2447 2448 out: 2449 /* 2450 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2451 * more than once (pointless, but not prohibited by the spec), or 2452 * in case of an error. 2453 * 2454 * If this is the first time Doorbell Buffer Config was processed, 2455 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2456 * free_sdbl() becomes a noop. 2457 */ 2458 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2459 2460 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2461 } 2462 2463 /* Returns 0 on success and -errno on error. */ 2464 static int 2465 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2466 { 2467 assert(ctrlr != NULL); 2468 assert(cmd != NULL); 2469 2470 if (cmd->fuse != 0) { 2471 /* Fused admin commands are not supported. 
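 * They are completed below with an Invalid Field in Command status.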
*/
2472 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid,
2473 SPDK_NVME_SC_INVALID_FIELD,
2474 SPDK_NVME_SCT_GENERIC);
2475 }
2476
2477 switch (cmd->opc) {
2478 case SPDK_NVME_OPC_CREATE_IO_CQ:
2479 case SPDK_NVME_OPC_CREATE_IO_SQ:
2480 return handle_create_io_q(ctrlr, cmd,
2481 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ);
2482 case SPDK_NVME_OPC_DELETE_IO_SQ:
2483 case SPDK_NVME_OPC_DELETE_IO_CQ:
2484 return handle_del_io_q(ctrlr, cmd,
2485 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ);
2486 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG:
2487 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) {
2488 return handle_doorbell_buffer_config(ctrlr, cmd);
2489 }
2490 /* FALLTHROUGH */
2491 default:
2492 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]);
2493 }
2494 }
2495
2496 static int
2497 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg)
2498 {
2499 struct nvmf_vfio_user_sq *sq = cb_arg;
2500 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr;
2501 uint16_t sqid, cqid;
2502
2503 assert(sq != NULL);
2504 assert(vu_req != NULL);
2505 assert(vu_ctrlr != NULL);
2506
2507 if (spdk_likely(vu_req->iovcnt)) {
2508 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx,
2509 index_to_sg_t(vu_req->sg, 0),
2510 vu_req->iov, vu_req->iovcnt);
2511 }
2512 sqid = sq->qid;
2513 cqid = sq->cqid;
2514
2515 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid],
2516 vu_req->req.rsp->nvme_cpl.cdw0,
2517 sqid,
2518 vu_req->req.cmd->nvme_cmd.cid,
2519 vu_req->req.rsp->nvme_cpl.status.sc,
2520 vu_req->req.rsp->nvme_cpl.status.sct);
2521 }
2522
2523 static int
2524 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq,
2525 struct spdk_nvme_cmd *cmd)
2526 {
2527 assert(sq != NULL);
2528 if (spdk_unlikely(nvmf_qpair_is_admin_queue(&sq->qpair))) {
2529 return consume_admin_cmd(ctrlr, cmd);
2530 }
2531
2532 return handle_cmd_req(ctrlr, cmd, sq);
2533 }
2534
2535 /* Returns the number of commands processed, or a negative value on error. */
2536 static int
2537 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail,
2538 struct nvmf_vfio_user_sq *sq)
2539 {
2540 struct spdk_nvme_cmd *queue;
2541 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid];
2542 int count = 0;
2543 uint32_t free_cq_slots;
2544
2545 assert(ctrlr != NULL);
2546 assert(sq != NULL);
2547
2548 if (ctrlr->sdbl != NULL && sq->qid != 0) {
2549 /*
2550 * Submission queue index has moved past the event index, so it
2551 * needs to be re-armed before we go to sleep.
2552 */
2553 sq->need_rearm = true;
2554 }
2555
2556 free_cq_slots = cq_free_slots(cq);
2557 queue = q_addr(&sq->mapping);
2558 while (*sq_headp(sq) != new_tail) {
2559 int err;
2560 struct spdk_nvme_cmd *cmd;
2561
2562 /*
2563 * The Linux host NVMe driver can submit more commands than there are
2564 * free CQ slots available, so only process commands for which a CQ slot is available.
2565 */
2566 if (free_cq_slots-- == 0) {
2567 cq->last_head = *cq_dbl_headp(cq);
2568
2569 free_cq_slots = cq_free_slots(cq);
2570 if (free_cq_slots > 0) {
2571 continue;
2572 }
2573
2574 /*
2575 * If there are still no free CQ slots, kick the interrupt FD so that we
2576 * loop again and process the remaining SQ commands.
2577 * In polling mode, the remaining SQ commands will be processed during the
2578 * next polling iteration.
2579 * The SQ head is advanced only for consumed commands.
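 * Breaking out of the loop leaves the remaining SQ entries unconsumed; since
 * the SQ head has only been advanced for commands we actually consumed, those
 * entries are picked up again once the host frees CQ slots by moving the CQ
 * head doorbell.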
2580 */ 2581 if (in_interrupt_mode(ctrlr->transport)) { 2582 eventfd_write(ctrlr->intr_fd, 1); 2583 } 2584 break; 2585 } 2586 2587 cmd = &queue[*sq_headp(sq)]; 2588 count++; 2589 2590 /* 2591 * SQHD must contain the new head pointer, so we must increase 2592 * it before we generate a completion. 2593 */ 2594 sq_head_advance(sq); 2595 2596 err = consume_cmd(ctrlr, sq, cmd); 2597 if (spdk_unlikely(err != 0)) { 2598 return err; 2599 } 2600 } 2601 2602 return count; 2603 } 2604 2605 /* Checks whether endpoint is connected from the same process */ 2606 static bool 2607 is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint) 2608 { 2609 struct ucred ucred; 2610 socklen_t ucredlen = sizeof(ucred); 2611 2612 if (endpoint == NULL) { 2613 return false; 2614 } 2615 2616 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred, 2617 &ucredlen) < 0) { 2618 SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno)); 2619 return false; 2620 } 2621 2622 return ucred.pid == getpid(); 2623 } 2624 2625 static void 2626 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2627 { 2628 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2629 struct nvmf_vfio_user_ctrlr *ctrlr; 2630 struct nvmf_vfio_user_sq *sq; 2631 struct nvmf_vfio_user_cq *cq; 2632 void *map_start, *map_end; 2633 int ret; 2634 2635 /* 2636 * We're not interested in any DMA regions that aren't mappable (we don't 2637 * support clients that don't share their memory). 2638 */ 2639 if (!info->vaddr) { 2640 return; 2641 } 2642 2643 map_start = info->mapping.iov_base; 2644 map_end = info->mapping.iov_base + info->mapping.iov_len; 2645 2646 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2647 (info->mapping.iov_len & MASK_2MB)) { 2648 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2649 info->vaddr, map_start, map_end); 2650 return; 2651 } 2652 2653 assert(endpoint != NULL); 2654 if (endpoint->ctrlr == NULL) { 2655 return; 2656 } 2657 ctrlr = endpoint->ctrlr; 2658 2659 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2660 map_start, map_end); 2661 2662 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2663 * check the protection bits before registering. When vfio client and server are run in same process 2664 * there is no need to register the same memory again. 
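 * Note that spdk_mem_register() requires the region's address and length to
 * be 2 MB aligned; regions that are not were already filtered out by the
 * MASK_2MB check above.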
2665 */ 2666 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2667 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2668 if (ret) { 2669 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2670 map_start, map_end, ret); 2671 } 2672 } 2673 2674 pthread_mutex_lock(&endpoint->lock); 2675 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2676 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2677 continue; 2678 } 2679 2680 cq = ctrlr->cqs[sq->cqid]; 2681 2682 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2683 if (cq->size && q_addr(&cq->mapping) == NULL) { 2684 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2685 if (ret) { 2686 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2687 cq->qid, cq->mapping.prp1, 2688 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2689 continue; 2690 } 2691 } 2692 2693 if (sq->size) { 2694 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2695 if (ret) { 2696 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2697 sq->qid, sq->mapping.prp1, 2698 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2699 continue; 2700 } 2701 } 2702 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2703 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2704 } 2705 pthread_mutex_unlock(&endpoint->lock); 2706 } 2707 2708 static void 2709 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2710 { 2711 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2712 struct nvmf_vfio_user_sq *sq; 2713 struct nvmf_vfio_user_cq *cq; 2714 void *map_start, *map_end; 2715 int ret = 0; 2716 2717 if (!info->vaddr) { 2718 return; 2719 } 2720 2721 map_start = info->mapping.iov_base; 2722 map_end = info->mapping.iov_base + info->mapping.iov_len; 2723 2724 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2725 (info->mapping.iov_len & MASK_2MB)) { 2726 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2727 info->vaddr, map_start, map_end); 2728 return; 2729 } 2730 2731 assert(endpoint != NULL); 2732 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2733 map_start, map_end); 2734 2735 if (endpoint->ctrlr != NULL) { 2736 struct nvmf_vfio_user_ctrlr *ctrlr; 2737 ctrlr = endpoint->ctrlr; 2738 2739 pthread_mutex_lock(&endpoint->lock); 2740 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2741 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2742 unmap_q(ctrlr, &sq->mapping); 2743 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2744 } 2745 2746 cq = ctrlr->cqs[sq->cqid]; 2747 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2748 unmap_q(ctrlr, &cq->mapping); 2749 } 2750 } 2751 2752 if (ctrlr->sdbl != NULL) { 2753 size_t i; 2754 2755 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2756 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2757 2758 if (iov_base >= map_start && iov_base < map_end) { 2759 copy_doorbells(ctrlr, 2760 ctrlr->sdbl->shadow_doorbells, 2761 ctrlr->bar0_doorbells); 2762 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2763 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2764 ctrlr->sdbl = NULL; 2765 break; 2766 } 2767 } 2768 } 2769 2770 pthread_mutex_unlock(&endpoint->lock); 2771 } 2772 2773 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2774 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2775 if 
(ret) { 2776 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2777 map_start, map_end, ret); 2778 } 2779 } 2780 } 2781 2782 /* Used to initiate a controller-level reset or a controller shutdown. */ 2783 static void 2784 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2785 { 2786 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2787 ctrlr_id(vu_ctrlr)); 2788 2789 /* Unmap Admin queue. */ 2790 2791 assert(vu_ctrlr->sqs[0] != NULL); 2792 assert(vu_ctrlr->cqs[0] != NULL); 2793 2794 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2795 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2796 2797 vu_ctrlr->sqs[0]->size = 0; 2798 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2799 2800 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2801 2802 vu_ctrlr->cqs[0]->size = 0; 2803 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2804 2805 /* 2806 * For PCIe controller reset or shutdown, we will drop all AER 2807 * responses. 2808 */ 2809 spdk_nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2810 2811 /* Free the shadow doorbell buffer. */ 2812 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 2813 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2814 vu_ctrlr->sdbl = NULL; 2815 } 2816 2817 /* Used to re-enable the controller after a controller-level reset. */ 2818 static int 2819 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2820 { 2821 int err; 2822 2823 assert(vu_ctrlr != NULL); 2824 2825 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2826 ctrlr_id(vu_ctrlr)); 2827 2828 err = acq_setup(vu_ctrlr); 2829 if (err != 0) { 2830 return err; 2831 } 2832 2833 err = asq_setup(vu_ctrlr); 2834 if (err != 0) { 2835 return err; 2836 } 2837 2838 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2839 2840 return 0; 2841 } 2842 2843 static int 2844 nvmf_vfio_user_prop_req_rsp_set(struct nvmf_vfio_user_req *req, 2845 struct nvmf_vfio_user_sq *sq) 2846 { 2847 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2848 union spdk_nvme_cc_register cc, diff; 2849 2850 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2851 assert(sq->ctrlr != NULL); 2852 vu_ctrlr = sq->ctrlr; 2853 2854 if (req->req.cmd->prop_set_cmd.ofst != offsetof(struct spdk_nvme_registers, cc)) { 2855 return 0; 2856 } 2857 2858 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2859 diff.raw = cc.raw ^ req->cc.raw; 2860 2861 if (diff.bits.en) { 2862 if (cc.bits.en) { 2863 int ret = enable_ctrlr(vu_ctrlr); 2864 if (ret) { 2865 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2866 return ret; 2867 } 2868 vu_ctrlr->reset_shn = false; 2869 } else { 2870 vu_ctrlr->reset_shn = true; 2871 } 2872 } 2873 2874 if (diff.bits.shn) { 2875 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2876 vu_ctrlr->reset_shn = true; 2877 } 2878 } 2879 2880 if (vu_ctrlr->reset_shn) { 2881 disable_ctrlr(vu_ctrlr); 2882 } 2883 return 0; 2884 } 2885 2886 static int 2887 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2888 { 2889 struct nvmf_vfio_user_sq *sq = cb_arg; 2890 2891 assert(sq != NULL); 2892 assert(req != NULL); 2893 2894 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2895 assert(sq->ctrlr != NULL); 2896 assert(req != NULL); 2897 2898 memcpy(req->req.iov[0].iov_base, 2899 &req->req.rsp->prop_get_rsp.value.u64, 2900 req->req.length); 2901 return 0; 2902 } 2903 2904 return nvmf_vfio_user_prop_req_rsp_set(req, sq); 2905 } 2906 2907 /* 2908 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2909 * doorbell is written via 
access_bar0_fn(). 2910 * 2911 * DSTRD is set to fixed value 0 for NVMf. 2912 * 2913 */ 2914 static int 2915 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2916 const size_t count, loff_t pos, const bool is_write) 2917 { 2918 struct nvmf_vfio_user_poll_group *group; 2919 2920 assert(ctrlr != NULL); 2921 assert(buf != NULL); 2922 2923 if (spdk_unlikely(!is_write)) { 2924 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2925 ctrlr_id(ctrlr), pos); 2926 errno = EPERM; 2927 return -1; 2928 } 2929 2930 if (spdk_unlikely(count != sizeof(uint32_t))) { 2931 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2932 ctrlr_id(ctrlr), count); 2933 errno = EINVAL; 2934 return -1; 2935 } 2936 2937 pos -= NVME_DOORBELLS_OFFSET; 2938 2939 /* pos must be dword aligned */ 2940 if (spdk_unlikely((pos & 0x3) != 0)) { 2941 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2942 errno = EINVAL; 2943 return -1; 2944 } 2945 2946 /* convert byte offset to array index */ 2947 pos >>= 2; 2948 2949 if (spdk_unlikely(pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2)) { 2950 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2951 errno = EINVAL; 2952 return -1; 2953 } 2954 2955 ctrlr->bar0_doorbells[pos] = *buf; 2956 spdk_wmb(); 2957 2958 group = ctrlr_to_poll_group(ctrlr); 2959 if (pos == 1) { 2960 group->stats.cqh_admin_writes++; 2961 } else if (pos & 1) { 2962 group->stats.cqh_io_writes++; 2963 } 2964 2965 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2966 ctrlr_id(ctrlr), (pos & 1) ? "cqid" : "sqid", 2967 pos / 2, *buf); 2968 2969 2970 return 0; 2971 } 2972 2973 static size_t 2974 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2975 char *buf, size_t count, loff_t pos, 2976 bool is_write) 2977 { 2978 struct nvmf_vfio_user_req *req; 2979 const struct spdk_nvmf_registers *regs; 2980 2981 if ((count != 4) && (count != 8)) { 2982 errno = EINVAL; 2983 return -1; 2984 } 2985 2986 /* Construct a Fabric Property Get/Set command and send it */ 2987 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2988 if (req == NULL) { 2989 errno = ENOBUFS; 2990 return -1; 2991 } 2992 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2993 req->cc.raw = regs->cc.raw; 2994 2995 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2996 req->cb_arg = vu_ctrlr->sqs[0]; 2997 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2998 req->req.cmd->prop_set_cmd.cid = 0; 2999 if (count == 4) { 3000 req->req.cmd->prop_set_cmd.attrib.size = 0; 3001 } else { 3002 req->req.cmd->prop_set_cmd.attrib.size = 1; 3003 } 3004 req->req.cmd->prop_set_cmd.ofst = pos; 3005 if (is_write) { 3006 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 3007 if (req->req.cmd->prop_set_cmd.attrib.size) { 3008 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 3009 } else { 3010 req->req.cmd->prop_set_cmd.value.u32.high = 0; 3011 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 3012 } 3013 } else { 3014 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 3015 } 3016 req->req.length = count; 3017 spdk_iov_one(req->req.iov, &req->req.iovcnt, buf, req->req.length); 3018 3019 spdk_nvmf_request_exec_fabrics(&req->req); 3020 3021 return count; 3022 } 3023 3024 static ssize_t 3025 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 3026 bool is_write) 3027 { 3028 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3029 struct nvmf_vfio_user_ctrlr *ctrlr; 3030 int ret; 3031 3032 ctrlr = 
endpoint->ctrlr; 3033 if (spdk_unlikely(endpoint->need_async_destroy || !ctrlr)) { 3034 errno = EIO; 3035 return -1; 3036 } 3037 3038 if (pos >= NVME_DOORBELLS_OFFSET) { 3039 /* 3040 * The fact that the doorbells can be memory mapped doesn't mean 3041 * that the client (VFIO in QEMU) is obliged to memory map them, 3042 * it might still elect to access them via regular read/write; 3043 * we might also have had disable_mappable_bar0 set. 3044 */ 3045 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 3046 pos, is_write); 3047 if (ret == 0) { 3048 return count; 3049 } 3050 return ret; 3051 } 3052 3053 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 3054 } 3055 3056 static ssize_t 3057 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 3058 bool is_write) 3059 { 3060 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3061 3062 if (is_write) { 3063 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 3064 endpoint_id(endpoint), offset, offset + count); 3065 errno = EINVAL; 3066 return -1; 3067 } 3068 3069 if (offset + count > NVME_REG_CFG_SIZE) { 3070 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 3071 endpoint_id(endpoint), offset, count, 3072 NVME_REG_CFG_SIZE); 3073 errno = ERANGE; 3074 return -1; 3075 } 3076 3077 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 3078 3079 return count; 3080 } 3081 3082 static void 3083 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 3084 { 3085 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3086 3087 if (level >= LOG_DEBUG) { 3088 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3089 } else if (level >= LOG_INFO) { 3090 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3091 } else if (level >= LOG_NOTICE) { 3092 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 3093 } else if (level >= LOG_WARNING) { 3094 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 3095 } else { 3096 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 3097 } 3098 } 3099 3100 static int 3101 vfio_user_get_log_level(void) 3102 { 3103 int level; 3104 3105 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3106 return LOG_DEBUG; 3107 } 3108 3109 level = spdk_log_to_syslog_level(spdk_log_get_level()); 3110 if (level < 0) { 3111 return LOG_ERR; 3112 } 3113 3114 return level; 3115 } 3116 3117 static void 3118 init_pci_config_space(vfu_pci_config_space_t *p) 3119 { 3120 /* MLBAR */ 3121 p->hdr.bars[0].raw = 0x0; 3122 /* MUBAR */ 3123 p->hdr.bars[1].raw = 0x0; 3124 3125 /* vendor specific, let's set them to zero for now */ 3126 p->hdr.bars[3].raw = 0x0; 3127 p->hdr.bars[4].raw = 0x0; 3128 p->hdr.bars[5].raw = 0x0; 3129 3130 /* enable INTx */ 3131 p->hdr.intr.ipin = 0x1; 3132 } 3133 3134 struct ctrlr_quiesce_ctx { 3135 struct nvmf_vfio_user_endpoint *endpoint; 3136 struct nvmf_vfio_user_poll_group *group; 3137 int status; 3138 }; 3139 3140 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 3141 3142 static void 3143 _vfio_user_endpoint_resume_done_msg(void *ctx) 3144 { 3145 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3146 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3147 3148 endpoint->need_resume = false; 3149 3150 if (!vu_ctrlr) { 3151 return; 3152 } 3153 3154 if (!vu_ctrlr->queued_quiesce) { 3155 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3156 3157 /* 3158 * We might have ignored new SQ entries while we were quiesced: 3159 * kick ourselves so we'll 
definitely check again while in
3160 * VFIO_USER_CTRLR_RUNNING state.
3161 */
3162 if (in_interrupt_mode(endpoint->transport)) {
3163 ctrlr_kick(vu_ctrlr);
3164 }
3165 return;
3166 }
3167
3168
3169 /*
3170 * Basically, once we call `vfu_device_quiesced` the device is
3171 * unquiesced from libvfio-user's perspective, so from the moment
3172 * `vfio_user_quiesce_done` returns, libvfio-user might quiesce the device
3173 * again. However, because resuming the NVMf subsystem is an asynchronous
3174 * operation, this quiesce might come _before_ the NVMf subsystem has
3175 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we
3176 * need to check whether a quiesce was requested.
3177 */
3178 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n",
3179 ctrlr_id(vu_ctrlr));
3180 ctrlr_quiesce(vu_ctrlr);
3181 }
3182
3183 static void
3184 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem,
3185 void *cb_arg, int status)
3186 {
3187 struct nvmf_vfio_user_endpoint *endpoint = cb_arg;
3188 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3189
3190 SPDK_DEBUGLOG(nvmf_vfio, "%s resume done with status %d\n", endpoint_id(endpoint), status);
3191
3192 if (!vu_ctrlr) {
3193 return;
3194 }
3195
3196 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint);
3197 }
3198
3199 static void
3200 vfio_user_quiesce_done(void *ctx)
3201 {
3202 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx;
3203 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint;
3204 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3205 int ret;
3206
3207 if (!vu_ctrlr) {
3208 free(quiesce_ctx);
3209 return;
3210 }
3211
3212 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr));
3213
3214 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING);
3215 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
3216 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status);
3217 vu_ctrlr->queued_quiesce = false;
3218 free(quiesce_ctx);
3219
3220 /* `vfu_device_quiesced` can change the migration state,
3221 * so we need to re-check `vu_ctrlr->state`.
3222 */
3223 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) {
3224 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr));
3225 return;
3226 }
3227
3228 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr));
3229 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
3230 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
3231 vfio_user_endpoint_resume_done, endpoint);
3232 if (ret < 0) {
3233 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
3234 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret);
3235 }
3236 }
3237
3238 static void
3239 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem,
3240 void *ctx, int status)
3241 {
3242 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx;
3243 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint;
3244 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3245
3246 if (!vu_ctrlr) {
3247 free(quiesce_ctx);
3248 return;
3249 }
3250
3251 quiesce_ctx->status = status;
3252
3253 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n",
3254 ctrlr_id(vu_ctrlr), status);
3255
3256 spdk_thread_send_msg(vu_ctrlr->thread,
3257 vfio_user_quiesce_done, ctx);
3258 }
3259
3260 /*
3261 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll();
3262 * we've already set ctrlr->state, so we won't process new entries, but we need
3263 * to ensure that this PG is quiesced.
This only works because there's no 3264 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3265 * 3266 * Once we've walked all PGs, we need to pause any submitted I/O via 3267 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3268 */ 3269 static void 3270 vfio_user_quiesce_pg(void *ctx) 3271 { 3272 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3273 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3274 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3275 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3276 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3277 int ret; 3278 3279 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3280 3281 if (!vu_ctrlr) { 3282 free(quiesce_ctx); 3283 return; 3284 } 3285 3286 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3287 if (quiesce_ctx->group != NULL) { 3288 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3289 vfio_user_quiesce_pg, quiesce_ctx); 3290 return; 3291 } 3292 3293 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3294 vfio_user_pause_done, quiesce_ctx); 3295 if (ret < 0) { 3296 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3297 endpoint_id(endpoint), ret); 3298 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3299 fail_ctrlr(vu_ctrlr); 3300 free(quiesce_ctx); 3301 } 3302 } 3303 3304 static void 3305 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3306 { 3307 struct ctrlr_quiesce_ctx *quiesce_ctx; 3308 3309 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3310 3311 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3312 if (!quiesce_ctx) { 3313 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3314 assert(false); 3315 return; 3316 } 3317 3318 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3319 quiesce_ctx->status = 0; 3320 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3321 3322 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3323 vfio_user_quiesce_pg, quiesce_ctx); 3324 } 3325 3326 static int 3327 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3328 { 3329 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3330 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3331 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3332 3333 if (!vu_ctrlr) { 3334 return 0; 3335 } 3336 3337 /* NVMf library will destruct controller when no 3338 * connected queue pairs. 3339 */ 3340 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3341 return 0; 3342 } 3343 3344 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3345 3346 /* There is no race condition here as device quiesce callback 3347 * and nvmf_prop_set_cc() are running in the same thread context. 
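 * The checks below skip quiescing entirely when the controller is not
 * enabled, not ready, or has already completed shutdown, since there is
 * nothing that needs pausing in those states.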
3348 */ 3349 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3350 return 0; 3351 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3352 return 0; 3353 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3354 return 0; 3355 } 3356 3357 switch (vu_ctrlr->state) { 3358 case VFIO_USER_CTRLR_PAUSED: 3359 case VFIO_USER_CTRLR_MIGRATING: 3360 return 0; 3361 case VFIO_USER_CTRLR_RUNNING: 3362 ctrlr_quiesce(vu_ctrlr); 3363 break; 3364 case VFIO_USER_CTRLR_RESUMING: 3365 vu_ctrlr->queued_quiesce = true; 3366 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3367 vu_ctrlr->state); 3368 break; 3369 default: 3370 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3371 break; 3372 } 3373 3374 errno = EBUSY; 3375 return -1; 3376 } 3377 3378 static void 3379 vfio_user_ctrlr_dump_migr_data(const char *name, 3380 struct vfio_user_nvme_migr_state *migr_data, 3381 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3382 { 3383 struct spdk_nvmf_registers *regs; 3384 struct nvme_migr_sq_state *sq; 3385 struct nvme_migr_cq_state *cq; 3386 uint32_t *doorbell_base; 3387 uint32_t i; 3388 3389 SPDK_NOTICELOG("Dump %s\n", name); 3390 3391 regs = &migr_data->nvmf_data.regs; 3392 doorbell_base = (uint32_t *)&migr_data->doorbells; 3393 3394 SPDK_NOTICELOG("Registers\n"); 3395 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3396 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3397 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3398 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3399 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3400 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3401 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3402 3403 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3404 3405 if (sdbl != NULL) { 3406 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3407 migr_data->ctrlr_header.shadow_doorbell_buffer); 3408 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3409 migr_data->ctrlr_header.eventidx_buffer); 3410 } 3411 3412 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3413 sq = &migr_data->qps[i].sq; 3414 cq = &migr_data->qps[i].cq; 3415 3416 if (sq->size) { 3417 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3418 if (i > 0 && sdbl != NULL) { 3419 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3420 sq->sqid, 3421 sdbl->shadow_doorbells[queue_index(i, false)], 3422 sdbl->eventidxs[queue_index(i, false)]); 3423 } 3424 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3425 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3426 } 3427 3428 if (cq->size) { 3429 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3430 if (i > 0 && sdbl != NULL) { 3431 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3432 cq->cqid, 3433 sdbl->shadow_doorbells[queue_index(i, true)], 3434 sdbl->eventidxs[queue_index(i, true)]); 3435 } 3436 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3437 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3438 } 3439 } 3440 3441 SPDK_NOTICELOG("%s Dump Done\n", name); 3442 } 3443 3444 /* Read region 9 content and restore it to migration data structures */ 3445 static int 3446 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3447 struct vfio_user_nvme_migr_state *migr_state) 3448 { 3449 void *data_ptr = endpoint->migr_data; 3450 3451 /* Load vfio_user_nvme_migr_header first */ 3452 
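/* The header records the offset and length of each section written out by
 * vfio_user_migr_ctrlr_save_data(): the nvmf controller data, the queue pair
 * states, the BAR0 doorbells and the PCI configuration space. The copies
 * below simply follow those recorded offsets.
 */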
memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3453 /* TODO: version check */ 3454 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3455 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3456 return -EINVAL; 3457 } 3458 3459 /* Load nvmf controller data */ 3460 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3461 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3462 3463 /* Load queue pairs */ 3464 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3465 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3466 3467 /* Load doorbells */ 3468 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3469 memcpy(&migr_state->doorbells, data_ptr, 3470 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3471 3472 /* Load CFG */ 3473 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3474 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3475 3476 return 0; 3477 } 3478 3479 3480 static void 3481 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3482 { 3483 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3484 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3485 struct nvmf_vfio_user_sq *sq; 3486 struct nvmf_vfio_user_cq *cq; 3487 uint64_t data_offset; 3488 void *data_ptr; 3489 uint32_t *doorbell_base; 3490 uint32_t i = 0; 3491 uint16_t sqid, cqid; 3492 struct vfio_user_nvme_migr_state migr_state = { 3493 .nvmf_data = { 3494 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3495 .regs_size = sizeof(struct spdk_nvmf_registers), 3496 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3497 } 3498 }; 3499 3500 /* Save all data to vfio_user_nvme_migr_state first, then we will 3501 * copy it to device migration region at last. 
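 * (The header itself is copied out last, after all of the section offsets
 * and lengths that it records are known.)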
3502 */ 3503 3504 /* save magic number */ 3505 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3506 3507 /* save controller data */ 3508 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3509 3510 /* save connected queue pairs */ 3511 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3512 /* save sq */ 3513 sqid = sq->qid; 3514 migr_state.qps[sqid].sq.sqid = sq->qid; 3515 migr_state.qps[sqid].sq.cqid = sq->cqid; 3516 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3517 migr_state.qps[sqid].sq.size = sq->size; 3518 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3519 3520 /* save cq, for shared cq case, cq may be saved multiple times */ 3521 cqid = sq->cqid; 3522 cq = vu_ctrlr->cqs[cqid]; 3523 migr_state.qps[cqid].cq.cqid = cqid; 3524 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3525 migr_state.qps[cqid].cq.ien = cq->ien; 3526 migr_state.qps[cqid].cq.iv = cq->iv; 3527 migr_state.qps[cqid].cq.size = cq->size; 3528 migr_state.qps[cqid].cq.phase = cq->phase; 3529 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3530 i++; 3531 } 3532 3533 assert(i > 0); 3534 migr_state.ctrlr_header.num_io_queues = i - 1; 3535 3536 /* Save doorbells */ 3537 doorbell_base = (uint32_t *)&migr_state.doorbells; 3538 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3539 3540 /* Save PCI configuration space */ 3541 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3542 3543 /* Save all data to device migration region */ 3544 data_ptr = endpoint->migr_data; 3545 3546 /* Copy nvmf controller data */ 3547 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3548 data_ptr += data_offset; 3549 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3550 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3551 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3552 3553 /* Copy queue pairs */ 3554 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3555 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3556 migr_state.ctrlr_header.qp_offset = data_offset; 3557 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3558 struct nvme_migr_cq_state)); 3559 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3560 3561 /* Copy doorbells */ 3562 data_offset += migr_state.ctrlr_header.qp_len; 3563 data_ptr += migr_state.ctrlr_header.qp_len; 3564 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3565 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3566 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3567 3568 /* Copy CFG */ 3569 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3570 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3571 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3572 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3573 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3574 3575 /* copy shadow doorbells */ 3576 if (vu_ctrlr->sdbl != NULL) { 3577 migr_state.ctrlr_header.sdbl = true; 3578 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3579 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3580 } 3581 3582 /* Copy nvme migration header finally */ 3583 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3584 3585 if 
(SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3586 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3587 } 3588 } 3589 3590 /* 3591 * If we are about to close the connection, we need to unregister the interrupt, 3592 * as the library will subsequently close the file descriptor we registered. 3593 */ 3594 static int 3595 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3596 { 3597 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3598 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3599 3600 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3601 3602 if (type == VFU_RESET_LOST_CONN) { 3603 if (ctrlr != NULL) { 3604 spdk_interrupt_unregister(&ctrlr->intr); 3605 ctrlr->intr_fd = -1; 3606 } 3607 return 0; 3608 } 3609 3610 /* FIXME: LOST_CONN case ? */ 3611 if (ctrlr->sdbl != NULL) { 3612 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 3613 free_sdbl(vfu_ctx, ctrlr->sdbl); 3614 ctrlr->sdbl = NULL; 3615 } 3616 3617 /* FIXME: much more needed here. */ 3618 3619 return 0; 3620 } 3621 3622 static int 3623 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3624 struct vfio_user_nvme_migr_state *migr_state) 3625 { 3626 uint32_t i, qsize = 0; 3627 uint16_t sqid, cqid; 3628 struct vfio_user_nvme_migr_qp migr_qp; 3629 void *addr; 3630 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3631 int ret; 3632 3633 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3634 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3635 } 3636 3637 /* restore submission queues */ 3638 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3639 migr_qp = migr_state->qps[i]; 3640 3641 qsize = migr_qp.sq.size; 3642 if (qsize) { 3643 struct nvmf_vfio_user_sq *sq; 3644 3645 sqid = migr_qp.sq.sqid; 3646 if (sqid != i) { 3647 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3648 return -EINVAL; 3649 } 3650 3651 /* allocate sq if necessary */ 3652 if (vu_ctrlr->sqs[sqid] == NULL) { 3653 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3654 if (ret) { 3655 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3656 return -EFAULT; 3657 } 3658 } 3659 3660 sq = vu_ctrlr->sqs[sqid]; 3661 sq->size = qsize; 3662 3663 ret = alloc_sq_reqs(vu_ctrlr, sq); 3664 if (ret) { 3665 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3666 return -EFAULT; 3667 } 3668 3669 /* restore sq */ 3670 sq->sq_state = VFIO_USER_SQ_CREATED; 3671 sq->cqid = migr_qp.sq.cqid; 3672 *sq_headp(sq) = migr_qp.sq.head; 3673 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3674 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3675 sq->mapping.prp1, sq->size * 64, 3676 sq->mapping.sg, &sq->mapping.iov, 3677 PROT_READ); 3678 if (addr == NULL) { 3679 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3680 sqid, sq->mapping.prp1, sq->size); 3681 return -EFAULT; 3682 } 3683 cqs_ref[sq->cqid]++; 3684 } 3685 } 3686 3687 /* restore completion queues */ 3688 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3689 migr_qp = migr_state->qps[i]; 3690 3691 qsize = migr_qp.cq.size; 3692 if (qsize) { 3693 struct nvmf_vfio_user_cq *cq; 3694 3695 /* restore cq */ 3696 cqid = migr_qp.sq.cqid; 3697 assert(cqid == i); 3698 3699 /* allocate cq if necessary */ 3700 if (vu_ctrlr->cqs[cqid] == NULL) { 3701 ret = init_cq(vu_ctrlr, cqid); 3702 if (ret) { 3703 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3704 return -EFAULT; 3705 } 3706 } 3707 3708 cq = vu_ctrlr->cqs[cqid]; 3709 3710 cq->size = qsize; 3711 3712 
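/* Each CQ entry is 16 bytes (sizeof(struct spdk_nvme_cpl)), hence the
 * cq->size * 16 mapping length below; the SQ path above maps sq->size * 64
 * bytes, i.e. sizeof(struct spdk_nvme_cmd) per entry.
 */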
cq->cq_state = VFIO_USER_CQ_CREATED; 3713 cq->cq_ref = cqs_ref[cqid]; 3714 *cq_tailp(cq) = migr_qp.cq.tail; 3715 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3716 cq->ien = migr_qp.cq.ien; 3717 cq->iv = migr_qp.cq.iv; 3718 cq->phase = migr_qp.cq.phase; 3719 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3720 cq->mapping.prp1, cq->size * 16, 3721 cq->mapping.sg, &cq->mapping.iov, 3722 PROT_READ | PROT_WRITE); 3723 if (addr == NULL) { 3724 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3725 cqid, cq->mapping.prp1, cq->size); 3726 return -EFAULT; 3727 } 3728 } 3729 } 3730 3731 return 0; 3732 } 3733 3734 static int 3735 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3736 { 3737 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3738 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3739 uint32_t *doorbell_base; 3740 struct spdk_nvme_cmd cmd; 3741 uint16_t i; 3742 int rc = 0; 3743 struct vfio_user_nvme_migr_state migr_state = { 3744 .nvmf_data = { 3745 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3746 .regs_size = sizeof(struct spdk_nvmf_registers), 3747 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3748 } 3749 }; 3750 3751 assert(endpoint->migr_data != NULL); 3752 assert(ctrlr != NULL); 3753 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3754 if (rc) { 3755 return rc; 3756 } 3757 3758 /* restore shadow doorbells */ 3759 if (migr_state.ctrlr_header.sdbl) { 3760 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3761 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3762 migr_state.ctrlr_header.shadow_doorbell_buffer, 3763 migr_state.ctrlr_header.eventidx_buffer, 3764 memory_page_size(vu_ctrlr)); 3765 if (sdbl == NULL) { 3766 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3767 ctrlr_id(vu_ctrlr)); 3768 return -1; 3769 } 3770 3771 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3772 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3773 3774 SWAP(vu_ctrlr->sdbl, sdbl); 3775 } 3776 3777 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3778 if (rc) { 3779 return rc; 3780 } 3781 3782 /* restore PCI configuration space */ 3783 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3784 3785 doorbell_base = (uint32_t *)&migr_state.doorbells; 3786 /* restore doorbells from saved registers */ 3787 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3788 3789 /* restore nvmf controller data */ 3790 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3791 if (rc) { 3792 return rc; 3793 } 3794 3795 /* resubmit pending AERs */ 3796 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) { 3797 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3798 migr_state.nvmf_data.aer_cids[i]); 3799 memset(&cmd, 0, sizeof(cmd)); 3800 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3801 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3802 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3803 if (spdk_unlikely(rc)) { 3804 break; 3805 } 3806 } 3807 3808 return rc; 3809 } 3810 3811 static void 3812 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3813 { 3814 uint32_t i; 3815 struct nvmf_vfio_user_sq *sq; 3816 3817 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
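 * Its tail and head doorbell pointers are therefore re-attached directly to
 * BAR0 below, while the I/O queues are switched to the shadow doorbell buffer
 * only if one was restored from the migration data.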
*/ 3818 3819 if (vu_ctrlr->sqs[0] != NULL) { 3820 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3821 queue_index(0, false); 3822 } 3823 3824 if (vu_ctrlr->cqs[0] != NULL) { 3825 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3826 queue_index(0, true); 3827 } 3828 3829 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3830 3831 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3832 sq = vu_ctrlr->sqs[i]; 3833 if (!sq || !sq->size) { 3834 continue; 3835 } 3836 3837 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3838 /* ADMIN queue pair is always in the poll group, just enable it */ 3839 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3840 } else { 3841 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3842 } 3843 } 3844 } 3845 3846 /* 3847 * We are in stop-and-copy state, but still potentially have some current dirty 3848 * sgls: while we're quiesced and thus should have no active requests, we still 3849 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3850 * mapped read only). 3851 * 3852 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3853 * mark them dirty now. 3854 */ 3855 static void 3856 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3857 { 3858 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3859 3860 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3861 3862 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3863 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3864 3865 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3866 continue; 3867 } 3868 3869 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3870 } 3871 3872 if (vu_ctrlr->sdbl != NULL) { 3873 dma_sg_t *sg; 3874 size_t i; 3875 3876 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3877 ++i) { 3878 3879 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3880 continue; 3881 } 3882 3883 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3884 3885 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3886 } 3887 } 3888 } 3889 3890 static int 3891 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3892 { 3893 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3894 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3895 struct nvmf_vfio_user_sq *sq; 3896 int ret = 0; 3897 3898 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3899 vu_ctrlr->state, state); 3900 3901 switch (state) { 3902 case VFU_MIGR_STATE_STOP_AND_COPY: 3903 vu_ctrlr->in_source_vm = true; 3904 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3905 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3906 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3907 break; 3908 case VFU_MIGR_STATE_STOP: 3909 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3910 /* The controller associates with source VM is dead now, we will resume 3911 * the subsystem after destroying the controller data structure, then the 3912 * subsystem can be re-used for another new client. 3913 */ 3914 if (vu_ctrlr->in_source_vm) { 3915 endpoint->need_resume = true; 3916 } 3917 break; 3918 case VFU_MIGR_STATE_PRE_COPY: 3919 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3920 break; 3921 case VFU_MIGR_STATE_RESUME: 3922 /* 3923 * Destination ADMIN queue pair is connected when starting the VM, 3924 * but the ADMIN queue pair isn't enabled in destination VM, the poll 3925 * group will do nothing to ADMIN queue pair for now. 
3926 */ 3927 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3928 break; 3929 } 3930 3931 assert(!vu_ctrlr->in_source_vm); 3932 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3933 3934 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3935 assert(sq != NULL); 3936 assert(sq->qpair.qid == 0); 3937 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3938 3939 /* Free ADMIN SQ resources first, SQ resources will be 3940 * allocated based on queue size from source VM. 3941 */ 3942 free_sq_reqs(sq); 3943 sq->size = 0; 3944 break; 3945 case VFU_MIGR_STATE_RUNNING: 3946 3947 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3948 break; 3949 } 3950 3951 if (!vu_ctrlr->in_source_vm) { 3952 /* Restore destination VM from BAR9 */ 3953 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3954 if (ret) { 3955 break; 3956 } 3957 3958 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3959 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3960 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3961 /* FIXME where do we resume nvmf? */ 3962 } else { 3963 /* Rollback source VM */ 3964 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3965 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3966 vfio_user_endpoint_resume_done, endpoint); 3967 if (ret < 0) { 3968 /* TODO: fail controller with CFS bit set */ 3969 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3970 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3971 } 3972 } 3973 vu_ctrlr->migr_data_prepared = false; 3974 vu_ctrlr->in_source_vm = false; 3975 break; 3976 3977 default: 3978 return -EINVAL; 3979 } 3980 3981 return ret; 3982 } 3983 3984 static uint64_t 3985 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3986 { 3987 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3988 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3989 uint64_t pending_bytes; 3990 3991 if (ctrlr->migr_data_prepared) { 3992 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3993 pending_bytes = 0; 3994 } else { 3995 pending_bytes = vfio_user_migr_data_len(); 3996 } 3997 3998 SPDK_DEBUGLOG(nvmf_vfio, 3999 "%s current state %u, pending bytes 0x%"PRIx64"\n", 4000 endpoint_id(endpoint), ctrlr->state, pending_bytes); 4001 4002 return pending_bytes; 4003 } 4004 4005 static int 4006 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 4007 { 4008 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 4009 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 4010 4011 /* 4012 * When transitioning to pre-copy state we set pending_bytes to 0, 4013 * so the vfio-user client shouldn't attempt to read any migration 4014 * data. This is not yet guaranteed by libvfio-user. 
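	 *
	 * For reference, the read-side flow we expect from a well-behaved client
	 * is roughly (a sketch of the v1 protocol, not a definition of it):
	 *
	 *     while ((pending = read(pending_bytes)) != 0) {
	 *         read(data_offset);            // triggers prepare_data()
	 *         read(data_size);
	 *         read data_size bytes from the data area
	 *     }
	 *
	 * so returning *size = 0 outside of VFIO_USER_CTRLR_MIGRATING tells such
	 * a client that there is nothing to fetch yet.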
4015 */ 4016 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 4017 assert(size != NULL); 4018 *offset = 0; 4019 *size = 0; 4020 return 0; 4021 } 4022 4023 if (ctrlr->in_source_vm) { /* migration source */ 4024 assert(size != NULL); 4025 *size = vfio_user_migr_data_len(); 4026 vfio_user_migr_ctrlr_save_data(ctrlr); 4027 } else { /* migration destination */ 4028 assert(size == NULL); 4029 assert(!ctrlr->migr_data_prepared); 4030 } 4031 *offset = 0; 4032 ctrlr->migr_data_prepared = true; 4033 4034 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 4035 4036 return 0; 4037 } 4038 4039 static ssize_t 4040 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4041 void *buf __attribute__((unused)), 4042 uint64_t count __attribute__((unused)), 4043 uint64_t offset __attribute__((unused))) 4044 { 4045 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 4046 endpoint_id(vfu_get_private(vfu_ctx))); 4047 errno = ENOTSUP; 4048 return -1; 4049 } 4050 4051 static ssize_t 4052 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4053 void *buf __attribute__((unused)), 4054 uint64_t count __attribute__((unused)), 4055 uint64_t offset __attribute__((unused))) 4056 { 4057 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 4058 endpoint_id(vfu_get_private(vfu_ctx))); 4059 errno = ENOTSUP; 4060 return -1; 4061 } 4062 4063 static int 4064 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4065 uint64_t count) 4066 { 4067 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 4068 4069 if (count != vfio_user_migr_data_len()) { 4070 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 4071 endpoint_id(vfu_get_private(vfu_ctx)), count); 4072 errno = EINVAL; 4073 return -1; 4074 } 4075 4076 return 0; 4077 } 4078 4079 static int 4080 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 4081 struct nvmf_vfio_user_endpoint *endpoint) 4082 { 4083 int ret; 4084 ssize_t cap_offset; 4085 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 4086 struct iovec migr_sparse_mmap = {}; 4087 4088 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 4089 struct pxcap pxcap = { 4090 .hdr.id = PCI_CAP_ID_EXP, 4091 .pxcaps.ver = 0x2, 4092 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 4093 .pxdcap2.ctds = 0x1 4094 }; 4095 4096 struct msixcap msixcap = { 4097 .hdr.id = PCI_CAP_ID_MSIX, 4098 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 4099 .mtab = {.tbir = 0x4, .to = 0x0}, 4100 .mpba = {.pbir = 0x5, .pbao = 0x0} 4101 }; 4102 4103 struct iovec sparse_mmap[] = { 4104 { 4105 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 4106 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 4107 }, 4108 }; 4109 4110 const vfu_migration_callbacks_t migr_callbacks = { 4111 .version = VFU_MIGR_CALLBACKS_VERS, 4112 .transition = &vfio_user_migration_device_state_transition, 4113 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 4114 .prepare_data = &vfio_user_migration_prepare_data, 4115 .read_data = &vfio_user_migration_read_data, 4116 .data_written = &vfio_user_migration_data_written, 4117 .write_data = &vfio_user_migration_write_data 4118 }; 4119 4120 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 4121 if (ret < 0) { 4122 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 4123 return ret; 4124 } 4125 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 4126 /* 4127 * 0x02, controller uses the NVM Express programming interface 4128 * 0x08, 
non-volatile memory controller 4129 * 0x01, mass storage controller 4130 */ 4131 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 4132 4133 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 4134 if (cap_offset < 0) { 4135 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 4136 return ret; 4137 } 4138 4139 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 4140 if (cap_offset < 0) { 4141 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 4142 return ret; 4143 } 4144 4145 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 4146 if (cap_offset < 0) { 4147 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 4148 return ret; 4149 } 4150 4151 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 4152 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4153 if (ret < 0) { 4154 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 4155 return ret; 4156 } 4157 4158 if (vu_transport->transport_opts.disable_mappable_bar0) { 4159 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4160 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4161 NULL, 0, -1, 0); 4162 } else { 4163 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4164 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4165 sparse_mmap, 1, endpoint->devmem_fd, 0); 4166 } 4167 4168 if (ret < 0) { 4169 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 4170 return ret; 4171 } 4172 4173 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE, 4174 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4175 if (ret < 0) { 4176 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 4177 return ret; 4178 } 4179 4180 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE, 4181 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4182 if (ret < 0) { 4183 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 4184 return ret; 4185 } 4186 4187 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 4188 if (ret < 0) { 4189 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 4190 return ret; 4191 } 4192 4193 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 4194 if (ret < 0) { 4195 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 4196 return ret; 4197 } 4198 4199 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 4200 if (ret < 0) { 4201 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 4202 return ret; 4203 } 4204 4205 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM); 4206 if (ret < 0) { 4207 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 4208 return ret; 4209 } 4210 4211 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 4212 4213 migr_sparse_mmap.iov_base = (void *)4096; 4214 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 4215 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 4216 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 4217 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 4218 1, endpoint->migr_fd, 0); 4219 if (ret < 0) { 4220 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 4221 return ret; 4222 } 4223 4224 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 4225 vfu_get_migr_register_area_size()); 4226 if (ret < 0) { 4227 SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 4228 return ret; 4229 } 
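	/*
	 * Layout recap (illustrative, derived from the setup calls above):
	 *
	 *   BAR0:      [0x0, NVME_DOORBELLS_OFFSET)  registers, trapped via
	 *              access_bar0_fn();
	 *              [NVME_DOORBELLS_OFFSET, + NVMF_VFIO_USER_DOORBELLS_SIZE)
	 *              doorbells, client-mappable via the sparse mmap of
	 *              devmem_fd (unless disable_mappable_bar0 is set).
	 *
	 *   Migration: vfu_get_migr_register_area_size() bytes of registers,
	 *              followed by vfio_user_migr_data_len() bytes of data
	 *              mappable from migr_fd; the hard-coded iov_base of 4096
	 *              above assumes the register area is exactly one 4 KiB page.
	 */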
4230 4231 ret = vfu_realize_ctx(vfu_ctx); 4232 if (ret < 0) { 4233 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4234 return ret; 4235 } 4236 4237 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4238 assert(endpoint->pci_config_space != NULL); 4239 init_pci_config_space(endpoint->pci_config_space); 4240 4241 assert(cap_offset != 0); 4242 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4243 4244 return 0; 4245 } 4246 4247 static int nvmf_vfio_user_accept(void *ctx); 4248 4249 static void 4250 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4251 { 4252 /* Nothing for us to do here. */ 4253 } 4254 4255 /* 4256 * Register an "accept" poller: this is polling for incoming vfio-user socket 4257 * connections (on the listening socket). 4258 * 4259 * We need to do this on first listening, and also after destroying a 4260 * controller, so we can accept another connection. 4261 */ 4262 static int 4263 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4264 { 4265 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4266 4267 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4268 4269 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4270 endpoint, poll_rate_us); 4271 4272 if (!endpoint->accept_poller) { 4273 return -1; 4274 } 4275 4276 endpoint->accept_thread = spdk_get_thread(); 4277 endpoint->need_relisten = false; 4278 4279 if (!spdk_interrupt_mode_is_enabled()) { 4280 return 0; 4281 } 4282 4283 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4284 assert(endpoint->accept_intr_fd != -1); 4285 4286 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4287 nvmf_vfio_user_accept, endpoint); 4288 4289 assert(endpoint->accept_intr != NULL); 4290 4291 spdk_poller_register_interrupt(endpoint->accept_poller, 4292 set_intr_mode_noop, NULL); 4293 return 0; 4294 } 4295 4296 static void 4297 _vfio_user_relisten(void *ctx) 4298 { 4299 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4300 4301 vfio_user_register_accept_poller(endpoint); 4302 } 4303 4304 static void 4305 _free_ctrlr(void *ctx) 4306 { 4307 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4308 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4309 4310 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 4311 4312 spdk_interrupt_unregister(&ctrlr->intr); 4313 ctrlr->intr_fd = -1; 4314 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4315 4316 free(ctrlr); 4317 4318 if (endpoint->need_async_destroy) { 4319 nvmf_vfio_user_destroy_endpoint(endpoint); 4320 } else if (endpoint->need_relisten) { 4321 spdk_thread_send_msg(endpoint->accept_thread, 4322 _vfio_user_relisten, endpoint); 4323 } 4324 } 4325 4326 static void 4327 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4328 { 4329 int i; 4330 assert(ctrlr != NULL); 4331 4332 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4333 4334 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4335 free_qp(ctrlr, i); 4336 } 4337 4338 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 4339 } 4340 4341 static int 4342 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4343 struct nvmf_vfio_user_endpoint *endpoint) 4344 { 4345 struct nvmf_vfio_user_ctrlr *ctrlr; 4346 int err = 0; 4347 4348 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4349 4350 /* First, construct a vfio-user CUSTOM transport controller */ 4351 ctrlr = calloc(1, sizeof(*ctrlr)); 4352 if 
(ctrlr == NULL) { 4353 err = -ENOMEM; 4354 goto out; 4355 } 4356 /* We can only support one connection for now */ 4357 ctrlr->cntlid = 0x1; 4358 ctrlr->intr_fd = -1; 4359 ctrlr->transport = transport; 4360 ctrlr->endpoint = endpoint; 4361 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4362 TAILQ_INIT(&ctrlr->connected_sqs); 4363 4364 ctrlr->adaptive_irqs_enabled = 4365 !transport->transport_opts.disable_adaptive_irq; 4366 4367 /* Then, construct an admin queue pair */ 4368 err = init_sq(ctrlr, &transport->transport, 0); 4369 if (err != 0) { 4370 free(ctrlr); 4371 goto out; 4372 } 4373 4374 err = init_cq(ctrlr, 0); 4375 if (err != 0) { 4376 free(ctrlr); 4377 goto out; 4378 } 4379 4380 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4381 4382 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4383 if (err != 0) { 4384 free(ctrlr); 4385 goto out; 4386 } 4387 endpoint->ctrlr = ctrlr; 4388 4389 /* Notify the generic layer about the new admin queue pair */ 4390 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4391 4392 out: 4393 if (err != 0) { 4394 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4395 endpoint_id(endpoint), strerror(-err)); 4396 } 4397 4398 return err; 4399 } 4400 4401 static int 4402 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4403 const struct spdk_nvme_transport_id *trid, 4404 struct spdk_nvmf_listen_opts *listen_opts) 4405 { 4406 struct nvmf_vfio_user_transport *vu_transport; 4407 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4408 char path[PATH_MAX] = {}; 4409 char uuid[PATH_MAX] = {}; 4410 int ret; 4411 4412 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4413 transport); 4414 4415 pthread_mutex_lock(&vu_transport->lock); 4416 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4417 /* Only compare traddr */ 4418 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4419 pthread_mutex_unlock(&vu_transport->lock); 4420 return -EEXIST; 4421 } 4422 } 4423 pthread_mutex_unlock(&vu_transport->lock); 4424 4425 endpoint = calloc(1, sizeof(*endpoint)); 4426 if (!endpoint) { 4427 return -ENOMEM; 4428 } 4429 4430 pthread_mutex_init(&endpoint->lock, NULL); 4431 endpoint->devmem_fd = -1; 4432 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4433 endpoint->transport = vu_transport; 4434 4435 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4436 if (ret < 0 || ret >= PATH_MAX) { 4437 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4438 ret = -1; 4439 goto out; 4440 } 4441 4442 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4443 if (ret == -1) { 4444 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4445 endpoint_id(endpoint), path, spdk_strerror(errno)); 4446 goto out; 4447 } 4448 unlink(path); 4449 4450 endpoint->devmem_fd = ret; 4451 ret = ftruncate(endpoint->devmem_fd, 4452 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4453 if (ret != 0) { 4454 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4455 spdk_strerror(errno)); 4456 goto out; 4457 } 4458 4459 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4460 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4461 if (endpoint->bar0_doorbells == MAP_FAILED) { 4462 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4463 endpoint->bar0_doorbells = NULL; 4464 ret = -1; 4465 
goto out; 4466 } 4467 4468 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4469 if (ret < 0 || ret >= PATH_MAX) { 4470 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 4471 spdk_strerror(errno)); 4472 ret = -1; 4473 goto out; 4474 } 4475 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4476 if (ret == -1) { 4477 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4478 endpoint_id(endpoint), path, spdk_strerror(errno)); 4479 goto out; 4480 } 4481 unlink(path); 4482 4483 endpoint->migr_fd = ret; 4484 ret = ftruncate(endpoint->migr_fd, 4485 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4486 if (ret != 0) { 4487 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4488 spdk_strerror(errno)); 4489 goto out; 4490 } 4491 4492 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4493 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4494 if (endpoint->migr_data == MAP_FAILED) { 4495 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4496 endpoint->migr_data = NULL; 4497 ret = -1; 4498 goto out; 4499 } 4500 4501 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 4502 if (ret < 0 || ret >= PATH_MAX) { 4503 SPDK_ERRLOG("%s: error to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 4504 ret = -1; 4505 goto out; 4506 } 4507 4508 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 4509 endpoint, VFU_DEV_TYPE_PCI); 4510 if (endpoint->vfu_ctx == NULL) { 4511 SPDK_ERRLOG("%s: error creating libmuser context: %m\n", 4512 endpoint_id(endpoint)); 4513 ret = -1; 4514 goto out; 4515 } 4516 4517 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, 4518 vfio_user_get_log_level()); 4519 if (ret < 0) { 4520 goto out; 4521 } 4522 4523 4524 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 4525 if (ret < 0) { 4526 goto out; 4527 } 4528 4529 ret = vfio_user_register_accept_poller(endpoint); 4530 4531 if (ret != 0) { 4532 goto out; 4533 } 4534 4535 pthread_mutex_lock(&vu_transport->lock); 4536 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 4537 pthread_mutex_unlock(&vu_transport->lock); 4538 4539 out: 4540 if (ret != 0) { 4541 nvmf_vfio_user_destroy_endpoint(endpoint); 4542 } 4543 4544 return ret; 4545 } 4546 4547 static void 4548 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 4549 const struct spdk_nvme_transport_id *trid) 4550 { 4551 struct nvmf_vfio_user_transport *vu_transport; 4552 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4553 4554 assert(trid != NULL); 4555 assert(trid->traddr != NULL); 4556 4557 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 4558 4559 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4560 transport); 4561 4562 pthread_mutex_lock(&vu_transport->lock); 4563 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4564 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 4565 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 4566 /* Defer to free endpoint resources until the controller 4567 * is freed. There are two cases when running here: 4568 * 1. kill nvmf target while VM is connected 4569 * 2. remove listener via RPC call 4570 * nvmf library will disconnect all queue paris. 
4571 */ 4572 if (endpoint->ctrlr) { 4573 assert(!endpoint->need_async_destroy); 4574 endpoint->need_async_destroy = true; 4575 pthread_mutex_unlock(&vu_transport->lock); 4576 return; 4577 } 4578 4579 nvmf_vfio_user_destroy_endpoint(endpoint); 4580 pthread_mutex_unlock(&vu_transport->lock); 4581 return; 4582 } 4583 } 4584 pthread_mutex_unlock(&vu_transport->lock); 4585 4586 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4587 } 4588 4589 static void 4590 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4591 struct spdk_nvmf_subsystem *subsystem, 4592 struct spdk_nvmf_ctrlr_data *cdata) 4593 { 4594 struct nvmf_vfio_user_transport *vu_transport; 4595 4596 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4597 4598 cdata->vid = SPDK_PCI_VID_NUTANIX; 4599 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4600 cdata->ieee[0] = 0x8d; 4601 cdata->ieee[1] = 0x6b; 4602 cdata->ieee[2] = 0x50; 4603 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4604 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4605 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4606 /* libvfio-user can only support 1 connection for now */ 4607 cdata->oncs.reservations = 0; 4608 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4609 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4610 } 4611 4612 static int 4613 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4614 const struct spdk_nvmf_subsystem *subsystem, 4615 const struct spdk_nvme_transport_id *trid) 4616 { 4617 struct nvmf_vfio_user_transport *vu_transport; 4618 struct nvmf_vfio_user_endpoint *endpoint; 4619 4620 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4621 4622 pthread_mutex_lock(&vu_transport->lock); 4623 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4624 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4625 break; 4626 } 4627 } 4628 pthread_mutex_unlock(&vu_transport->lock); 4629 4630 if (endpoint == NULL) { 4631 return -ENOENT; 4632 } 4633 4634 /* Drop const - we will later need to pause/unpause. */ 4635 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4636 4637 return 0; 4638 } 4639 4640 /* 4641 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4642 * frequency. 4643 * 4644 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4645 * if we don't currently have a controller set up, peek to see if the socket is 4646 * able to accept a new connection. 4647 */ 4648 static int 4649 nvmf_vfio_user_accept(void *ctx) 4650 { 4651 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4652 struct nvmf_vfio_user_transport *vu_transport; 4653 int err; 4654 4655 vu_transport = endpoint->transport; 4656 4657 if (endpoint->ctrlr != NULL) { 4658 return SPDK_POLLER_IDLE; 4659 } 4660 4661 /* While we're here, the controller is already destroyed, 4662 * subsystem may still be in RESUMING state, we will wait 4663 * until the subsystem is in RUNNING state. 
4664 */ 4665 if (endpoint->need_resume) { 4666 return SPDK_POLLER_IDLE; 4667 } 4668 4669 err = vfu_attach_ctx(endpoint->vfu_ctx); 4670 if (err == 0) { 4671 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4672 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4673 if (err == 0) { 4674 /* 4675 * Unregister ourselves: now we've accepted a 4676 * connection, there is nothing for us to poll for, and 4677 * we will poll the connection via vfu_run_ctx() 4678 * instead. 4679 */ 4680 spdk_interrupt_unregister(&endpoint->accept_intr); 4681 spdk_poller_unregister(&endpoint->accept_poller); 4682 } 4683 return SPDK_POLLER_BUSY; 4684 } 4685 4686 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4687 return SPDK_POLLER_IDLE; 4688 } 4689 4690 return SPDK_POLLER_BUSY; 4691 } 4692 4693 static void 4694 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4695 struct spdk_nvme_transport_id *trid, 4696 struct spdk_nvmf_discovery_log_page_entry *entry) 4697 { } 4698 4699 static int vfio_user_poll_group_intr(void *ctx); 4700 4701 static void 4702 vfio_user_poll_group_add_intr(struct nvmf_vfio_user_poll_group *vu_group, 4703 struct spdk_nvmf_poll_group *group) 4704 { 4705 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK); 4706 assert(vu_group->intr_fd != -1); 4707 4708 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd, 4709 vfio_user_poll_group_intr, vu_group); 4710 assert(vu_group->intr != NULL); 4711 4712 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4713 vu_group); 4714 } 4715 4716 static struct spdk_nvmf_transport_poll_group * 4717 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4718 struct spdk_nvmf_poll_group *group) 4719 { 4720 struct nvmf_vfio_user_transport *vu_transport; 4721 struct nvmf_vfio_user_poll_group *vu_group; 4722 4723 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4724 transport); 4725 4726 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4727 4728 vu_group = calloc(1, sizeof(*vu_group)); 4729 if (vu_group == NULL) { 4730 SPDK_ERRLOG("Error allocating poll group: %m"); 4731 return NULL; 4732 } 4733 4734 if (in_interrupt_mode(vu_transport)) { 4735 vfio_user_poll_group_add_intr(vu_group, group); 4736 } 4737 4738 TAILQ_INIT(&vu_group->sqs); 4739 4740 pthread_mutex_lock(&vu_transport->pg_lock); 4741 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4742 if (vu_transport->next_pg == NULL) { 4743 vu_transport->next_pg = vu_group; 4744 } 4745 pthread_mutex_unlock(&vu_transport->pg_lock); 4746 4747 return &vu_group->group; 4748 } 4749 4750 static struct spdk_nvmf_transport_poll_group * 4751 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4752 { 4753 struct nvmf_vfio_user_transport *vu_transport; 4754 struct nvmf_vfio_user_poll_group **vu_group; 4755 struct nvmf_vfio_user_sq *sq; 4756 struct nvmf_vfio_user_cq *cq; 4757 4758 struct spdk_nvmf_transport_poll_group *result = NULL; 4759 4760 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4761 cq = sq->ctrlr->cqs[sq->cqid]; 4762 assert(cq != NULL); 4763 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4764 4765 pthread_mutex_lock(&vu_transport->pg_lock); 4766 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4767 goto out; 4768 } 4769 4770 if (!nvmf_qpair_is_admin_queue(qpair)) { 4771 /* 4772 * If this is shared IO CQ case, just return the used CQ's poll 4773 * group, so I/O completions don't have to use 4774 * spdk_thread_send_msg(). 
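		 *
		 * Example: if the guest creates SQ1 and SQ2 that both complete into
		 * CQ1, whichever connects first fixes CQ1's poll group (see the
		 * cq->group assignment at "out" below), and the other SQ then lands
		 * on that same group here, so post_completion() can always write CQ1
		 * from its owning thread.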
4775 */ 4776 if (cq->group != NULL) { 4777 result = cq->group; 4778 goto out; 4779 } 4780 4781 /* 4782 * If we're in interrupt mode, align all qpairs for a controller 4783 * on the same poll group by default, unless requested. This can 4784 * be lower in performance than running on a single poll group, 4785 * so we disable spreading by default. 4786 */ 4787 if (in_interrupt_mode(vu_transport) && 4788 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) { 4789 result = sq->ctrlr->sqs[0]->group; 4790 goto out; 4791 } 4792 4793 } 4794 4795 vu_group = &vu_transport->next_pg; 4796 assert(*vu_group != NULL); 4797 4798 result = &(*vu_group)->group; 4799 *vu_group = TAILQ_NEXT(*vu_group, link); 4800 if (*vu_group == NULL) { 4801 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4802 } 4803 4804 out: 4805 if (cq->group == NULL) { 4806 cq->group = result; 4807 } 4808 4809 pthread_mutex_unlock(&vu_transport->pg_lock); 4810 return result; 4811 } 4812 4813 static void 4814 vfio_user_poll_group_del_intr(struct nvmf_vfio_user_poll_group *vu_group) 4815 { 4816 assert(vu_group->intr_fd != -1); 4817 4818 spdk_interrupt_unregister(&vu_group->intr); 4819 4820 close(vu_group->intr_fd); 4821 vu_group->intr_fd = -1; 4822 } 4823 4824 /* called when process exits */ 4825 static void 4826 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4827 { 4828 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4829 struct nvmf_vfio_user_transport *vu_transport; 4830 4831 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4832 4833 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4834 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4835 transport); 4836 4837 if (in_interrupt_mode(vu_transport)) { 4838 vfio_user_poll_group_del_intr(vu_group); 4839 } 4840 4841 pthread_mutex_lock(&vu_transport->pg_lock); 4842 next_tgroup = TAILQ_NEXT(vu_group, link); 4843 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4844 if (next_tgroup == NULL) { 4845 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4846 } 4847 if (vu_transport->next_pg == vu_group) { 4848 vu_transport->next_pg = next_tgroup; 4849 } 4850 pthread_mutex_unlock(&vu_transport->pg_lock); 4851 4852 free(vu_group); 4853 } 4854 4855 static void 4856 _vfio_user_qpair_disconnect(void *ctx) 4857 { 4858 struct nvmf_vfio_user_sq *sq = ctx; 4859 4860 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4861 } 4862 4863 /* The function is used when socket connection is destroyed */ 4864 static int 4865 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4866 { 4867 struct nvmf_vfio_user_sq *sq; 4868 struct nvmf_vfio_user_endpoint *endpoint; 4869 4870 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4871 4872 endpoint = ctrlr->endpoint; 4873 assert(endpoint != NULL); 4874 4875 pthread_mutex_lock(&endpoint->lock); 4876 endpoint->need_relisten = true; 4877 ctrlr->disconnect = true; 4878 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4879 endpoint->ctrlr = NULL; 4880 free_ctrlr(ctrlr); 4881 pthread_mutex_unlock(&endpoint->lock); 4882 return 0; 4883 } 4884 4885 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4886 /* add another round thread poll to avoid recursive endpoint lock */ 4887 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4888 } 4889 pthread_mutex_unlock(&endpoint->lock); 4890 4891 return 0; 4892 } 4893 4894 /* 4895 * Poll for and process any incoming vfio-user messages. 
4896 */ 4897 static int 4898 vfio_user_poll_vfu_ctx(void *ctx) 4899 { 4900 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4901 int ret; 4902 4903 assert(ctrlr != NULL); 4904 4905 /* This will call access_bar0_fn() if there are any writes 4906 * to the portion of the BAR that is not mmap'd */ 4907 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4908 if (spdk_unlikely(ret == -1)) { 4909 if (errno == EBUSY) { 4910 return SPDK_POLLER_IDLE; 4911 } 4912 4913 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4914 4915 /* 4916 * We lost the client; the reset callback will already have 4917 * unregistered the interrupt. 4918 */ 4919 if (errno == ENOTCONN) { 4920 vfio_user_destroy_ctrlr(ctrlr); 4921 return SPDK_POLLER_BUSY; 4922 } 4923 4924 /* 4925 * We might not have got a reset callback in this case, so 4926 * explicitly unregister the interrupt here. 4927 */ 4928 spdk_interrupt_unregister(&ctrlr->intr); 4929 ctrlr->intr_fd = -1; 4930 fail_ctrlr(ctrlr); 4931 } 4932 4933 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4934 } 4935 4936 struct vfio_user_post_cpl_ctx { 4937 struct nvmf_vfio_user_ctrlr *ctrlr; 4938 struct nvmf_vfio_user_cq *cq; 4939 struct spdk_nvme_cpl cpl; 4940 }; 4941 4942 static void 4943 _post_completion_msg(void *ctx) 4944 { 4945 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4946 4947 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4948 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4949 free(cpl_ctx); 4950 } 4951 4952 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4953 4954 static int 4955 vfio_user_poll_group_process(void *ctx) 4956 { 4957 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4958 int ret = 0; 4959 4960 SPDK_DEBUGLOG(vfio_user_db, "pg:%p got intr\n", vu_group); 4961 4962 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4963 4964 /* 4965 * Re-arm the event indexes. NB: this also could rearm other 4966 * controller's SQs. 4967 */ 4968 ret |= vfio_user_poll_group_rearm(vu_group); 4969 4970 vu_group->stats.pg_process_count++; 4971 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4972 } 4973 4974 static int 4975 vfio_user_poll_group_intr(void *ctx) 4976 { 4977 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4978 eventfd_t val; 4979 4980 eventfd_read(vu_group->intr_fd, &val); 4981 4982 vu_group->stats.intr++; 4983 4984 return vfio_user_poll_group_process(ctx); 4985 } 4986 4987 /* 4988 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4989 * the SQs assigned to our own poll group. Other poll groups are handled via 4990 * vfio_user_poll_group_intr(). 4991 */ 4992 static int 4993 vfio_user_ctrlr_intr(void *ctx) 4994 { 4995 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 4996 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 4997 struct nvmf_vfio_user_poll_group *vu_group; 4998 int ret = SPDK_POLLER_IDLE; 4999 5000 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 5001 5002 SPDK_DEBUGLOG(vfio_user_db, "ctrlr pg:%p got intr\n", vu_ctrlr_group); 5003 5004 vu_ctrlr_group->stats.ctrlr_intr++; 5005 5006 /* 5007 * Poll vfio-user for this controller. We need to do this before polling 5008 * any SQs, as this is where doorbell writes may be handled. 5009 */ 5010 ret = vfio_user_poll_vfu_ctx(vu_ctrlr); 5011 5012 /* 5013 * `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, 5014 * just return for this case. 
5015 */ 5016 if (vu_ctrlr->sqs[0] == NULL) { 5017 return ret; 5018 } 5019 5020 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) { 5021 /* 5022 * We may have just written to a doorbell owned by another 5023 * reactor: we need to prod them to make sure its SQs are polled 5024 * *after* the doorbell value is updated. 5025 */ 5026 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) { 5027 if (vu_group != vu_ctrlr_group) { 5028 SPDK_DEBUGLOG(vfio_user_db, "prodding pg:%p\n", vu_group); 5029 eventfd_write(vu_group->intr_fd, 1); 5030 } 5031 } 5032 } 5033 5034 ret |= vfio_user_poll_group_process(vu_ctrlr_group); 5035 5036 return ret; 5037 } 5038 5039 static void 5040 vfio_user_ctrlr_set_intr_mode(struct spdk_poller *poller, void *ctx, 5041 bool interrupt_mode) 5042 { 5043 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 5044 assert(ctrlr != NULL); 5045 assert(ctrlr->endpoint != NULL); 5046 5047 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 5048 ctrlr_id(ctrlr), interrupt_mode); 5049 5050 /* 5051 * interrupt_mode needs to persist across controller resets, so store 5052 * it in the endpoint instead. 5053 */ 5054 ctrlr->endpoint->interrupt_mode = interrupt_mode; 5055 5056 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 5057 } 5058 5059 /* 5060 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 5061 * set up and we can start operating on this controller. 5062 */ 5063 static void 5064 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 5065 struct spdk_nvmf_ctrlr *ctrlr) 5066 { 5067 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 5068 5069 vu_ctrlr->ctrlr = ctrlr; 5070 vu_ctrlr->cntlid = ctrlr->cntlid; 5071 vu_ctrlr->thread = spdk_get_thread(); 5072 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 5073 5074 if (!in_interrupt_mode(endpoint->transport)) { 5075 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5076 vu_ctrlr, 1000); 5077 return; 5078 } 5079 5080 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5081 vu_ctrlr, 0); 5082 5083 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 5084 assert(vu_ctrlr->intr_fd != -1); 5085 5086 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 5087 vfio_user_ctrlr_intr, vu_ctrlr); 5088 5089 assert(vu_ctrlr->intr != NULL); 5090 5091 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 5092 vfio_user_ctrlr_set_intr_mode, 5093 vu_ctrlr); 5094 } 5095 5096 static int 5097 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 5098 { 5099 struct nvmf_vfio_user_poll_group *vu_group; 5100 struct nvmf_vfio_user_sq *sq = cb_arg; 5101 struct nvmf_vfio_user_cq *admin_cq; 5102 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5103 struct nvmf_vfio_user_endpoint *endpoint; 5104 5105 assert(sq != NULL); 5106 assert(req != NULL); 5107 5108 vu_ctrlr = sq->ctrlr; 5109 assert(vu_ctrlr != NULL); 5110 endpoint = vu_ctrlr->endpoint; 5111 assert(endpoint != NULL); 5112 5113 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 5114 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 5115 endpoint->ctrlr = NULL; 5116 free_ctrlr(vu_ctrlr); 5117 return -1; 5118 } 5119 5120 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 5121 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 5122 5123 admin_cq = vu_ctrlr->cqs[0]; 5124 assert(admin_cq != NULL); 5125 assert(admin_cq->group != NULL); 5126 assert(admin_cq->group->group->thread != NULL); 5127 5128 
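	/*
	 * Completion routing sketch (illustrative): post_completion() touches the
	 * admin CQ ring, doorbell and IRQ state, which should only be done from
	 * the thread that owns the admin CQ's poll group. Hence, below:
	 *
	 *     if (admin CQ thread == this thread)
	 *             post_completion(...);                        // direct
	 *     else
	 *             spdk_thread_send_msg(admin CQ thread,
	 *                                  _post_completion_msg, cpl_ctx);
	 */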
pthread_mutex_lock(&endpoint->lock); 5129 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 5130 assert(admin_cq->group->group->thread == spdk_get_thread()); 5131 /* 5132 * The admin queue is special as SQ0 and CQ0 are created 5133 * together. 5134 */ 5135 admin_cq->cq_ref = 1; 5136 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 5137 } else { 5138 /* For I/O queues this command was generated in response to an 5139 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 5140 * been completed. Complete it now. 5141 */ 5142 if (sq->post_create_io_sq_completion) { 5143 if (admin_cq->group->group->thread != spdk_get_thread()) { 5144 struct vfio_user_post_cpl_ctx *cpl_ctx; 5145 5146 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 5147 if (!cpl_ctx) { 5148 return -ENOMEM; 5149 } 5150 cpl_ctx->ctrlr = vu_ctrlr; 5151 cpl_ctx->cq = admin_cq; 5152 cpl_ctx->cpl.sqid = 0; 5153 cpl_ctx->cpl.cdw0 = 0; 5154 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 5155 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 5156 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5157 5158 spdk_thread_send_msg(admin_cq->group->group->thread, 5159 _post_completion_msg, 5160 cpl_ctx); 5161 } else { 5162 post_completion(vu_ctrlr, admin_cq, 0, 0, 5163 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 5164 } 5165 sq->post_create_io_sq_completion = false; 5166 } else if (in_interrupt_mode(endpoint->transport)) { 5167 /* 5168 * If we're live migrating a guest, there is a window 5169 * where the I/O queues haven't been set up but the 5170 * device is in running state, during which the guest 5171 * might write to a doorbell. This doorbell write will 5172 * go unnoticed, so let's poll the whole controller to 5173 * pick that up. 5174 */ 5175 ctrlr_kick(vu_ctrlr); 5176 } 5177 sq->sq_state = VFIO_USER_SQ_ACTIVE; 5178 } 5179 5180 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 5181 pthread_mutex_unlock(&endpoint->lock); 5182 5183 free(req->req.iov[0].iov_base); 5184 req->req.iov[0].iov_base = NULL; 5185 req->req.iovcnt = 0; 5186 5187 return 0; 5188 } 5189 5190 /* 5191 * Add the given qpair to the given poll group. New qpairs are added via 5192 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 5193 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 5194 * nvmf_transport_poll_group_add(). 5195 */ 5196 static int 5197 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 5198 struct spdk_nvmf_qpair *qpair) 5199 { 5200 struct nvmf_vfio_user_sq *sq; 5201 struct nvmf_vfio_user_req *vu_req; 5202 struct nvmf_vfio_user_ctrlr *ctrlr; 5203 struct spdk_nvmf_request *req; 5204 struct spdk_nvmf_fabric_connect_data *data; 5205 bool admin; 5206 5207 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5208 sq->group = group; 5209 ctrlr = sq->ctrlr; 5210 5211 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 5212 ctrlr_id(ctrlr), sq->qpair.qid, 5213 sq, qpair, group); 5214 5215 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 5216 5217 vu_req = get_nvmf_vfio_user_req(sq); 5218 if (vu_req == NULL) { 5219 return -1; 5220 } 5221 5222 req = &vu_req->req; 5223 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 5224 req->cmd->connect_cmd.cid = 0; 5225 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 5226 req->cmd->connect_cmd.recfmt = 0; 5227 req->cmd->connect_cmd.sqsize = sq->size - 1; 5228 req->cmd->connect_cmd.qid = admin ? 
0 : qpair->qid; 5229 5230 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 5231 5232 data = calloc(1, req->length); 5233 if (data == NULL) { 5234 nvmf_vfio_user_req_free(req); 5235 return -ENOMEM; 5236 } 5237 5238 spdk_iov_one(req->iov, &req->iovcnt, data, req->length); 5239 5240 data->cntlid = ctrlr->cntlid; 5241 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5242 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5243 5244 vu_req->cb_fn = handle_queue_connect_rsp; 5245 vu_req->cb_arg = sq; 5246 5247 SPDK_DEBUGLOG(nvmf_vfio, 5248 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5249 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5250 5251 spdk_nvmf_request_exec_fabrics(req); 5252 return 0; 5253 } 5254 5255 static int 5256 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5257 struct spdk_nvmf_qpair *qpair) 5258 { 5259 struct nvmf_vfio_user_sq *sq; 5260 struct nvmf_vfio_user_poll_group *vu_group; 5261 5262 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5263 5264 SPDK_DEBUGLOG(nvmf_vfio, 5265 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5266 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5267 5268 5269 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5270 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5271 5272 return 0; 5273 } 5274 5275 static void 5276 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5277 { 5278 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5279 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5280 vu_req->iovcnt = 0; 5281 vu_req->req.iovcnt = 0; 5282 vu_req->req.length = 0; 5283 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5284 5285 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5286 } 5287 5288 static int 5289 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5290 { 5291 struct nvmf_vfio_user_sq *sq; 5292 struct nvmf_vfio_user_req *vu_req; 5293 5294 assert(req != NULL); 5295 5296 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5297 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5298 5299 _nvmf_vfio_user_req_free(sq, vu_req); 5300 5301 return 0; 5302 } 5303 5304 static int 5305 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5306 { 5307 struct nvmf_vfio_user_sq *sq; 5308 struct nvmf_vfio_user_req *vu_req; 5309 5310 assert(req != NULL); 5311 5312 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5313 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5314 5315 if (vu_req->cb_fn != NULL) { 5316 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5317 fail_ctrlr(sq->ctrlr); 5318 } 5319 } 5320 5321 _nvmf_vfio_user_req_free(sq, vu_req); 5322 5323 return 0; 5324 } 5325 5326 static void 5327 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5328 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5329 { 5330 struct nvmf_vfio_user_sq *sq; 5331 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5332 struct nvmf_vfio_user_endpoint *endpoint; 5333 struct vfio_user_delete_sq_ctx *del_ctx; 5334 5335 assert(qpair != NULL); 5336 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5337 vu_ctrlr = sq->ctrlr; 5338 endpoint = vu_ctrlr->endpoint; 5339 del_ctx = sq->delete_ctx; 5340 sq->delete_ctx = NULL; 5341 5342 pthread_mutex_lock(&endpoint->lock); 5343 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5344 delete_sq_done(vu_ctrlr, sq); 5345 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5346 endpoint->ctrlr = NULL; 5347 if (vu_ctrlr->in_source_vm && 
endpoint->need_resume) { 5348 /* The controller will be freed, we can resume the subsystem 5349 * now so that the endpoint can be ready to accept another 5350 * new connection. 5351 */ 5352 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5353 vfio_user_endpoint_resume_done, endpoint); 5354 } 5355 free_ctrlr(vu_ctrlr); 5356 } 5357 pthread_mutex_unlock(&endpoint->lock); 5358 5359 if (del_ctx) { 5360 vfio_user_qpair_delete_cb(del_ctx); 5361 } 5362 5363 if (cb_fn) { 5364 cb_fn(cb_arg); 5365 } 5366 } 5367 5368 /** 5369 * Returns a preallocated request, or NULL if there isn't one available. 5370 */ 5371 static struct nvmf_vfio_user_req * 5372 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5373 { 5374 struct nvmf_vfio_user_req *req; 5375 5376 if (sq == NULL) { 5377 return NULL; 5378 } 5379 5380 req = TAILQ_FIRST(&sq->free_reqs); 5381 if (req == NULL) { 5382 return NULL; 5383 } 5384 5385 TAILQ_REMOVE(&sq->free_reqs, req, link); 5386 5387 return req; 5388 } 5389 5390 static int 5391 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5392 { 5393 uint16_t nr; 5394 uint32_t nlb, nsid; 5395 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5396 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5397 struct spdk_nvmf_ns *ns; 5398 5399 nsid = cmd->nsid; 5400 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5401 if (ns == NULL || ns->bdev == NULL) { 5402 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5403 return -EINVAL; 5404 } 5405 5406 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5407 nr = cmd->cdw10_bits.dsm.nr + 1; 5408 return nr * sizeof(struct spdk_nvme_dsm_range); 5409 } 5410 5411 if (cmd->opc == SPDK_NVME_OPC_COPY) { 5412 nr = (cmd->cdw12 & 0x000000ffu) + 1; 5413 return nr * sizeof(struct spdk_nvme_scc_source_range); 5414 } 5415 5416 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5417 return nlb * spdk_bdev_get_block_size(ns->bdev); 5418 } 5419 5420 static int 5421 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5422 { 5423 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5424 uint32_t len = 0, numdw = 0; 5425 uint8_t fid; 5426 int iovcnt; 5427 5428 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5429 5430 if (req->xfer == SPDK_NVME_DATA_NONE) { 5431 return 0; 5432 } 5433 5434 switch (cmd->opc) { 5435 case SPDK_NVME_OPC_IDENTIFY: 5436 len = 4096; 5437 break; 5438 case SPDK_NVME_OPC_GET_LOG_PAGE: 5439 numdw = ((((uint32_t)cmd->cdw11_bits.get_log_page.numdu << 16) | 5440 cmd->cdw10_bits.get_log_page.numdl) + 1); 5441 if (numdw > UINT32_MAX / 4) { 5442 return -EINVAL; 5443 } 5444 len = numdw * 4; 5445 break; 5446 case SPDK_NVME_OPC_GET_FEATURES: 5447 case SPDK_NVME_OPC_SET_FEATURES: 5448 fid = cmd->cdw10_bits.set_features.fid; 5449 switch (fid) { 5450 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5451 len = 4096; 5452 break; 5453 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5454 len = 256; 5455 break; 5456 case SPDK_NVME_FEAT_TIMESTAMP: 5457 len = 8; 5458 break; 5459 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5460 len = 512; 5461 break; 5462 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5463 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5464 len = 16; 5465 } else { 5466 len = 8; 5467 } 5468 break; 5469 default: 5470 return 0; 5471 } 5472 break; 5473 default: 5474 return 0; 5475 } 5476 5477 /* ADMIN command will not use SGL */ 5478 if (cmd->psdt != 0) { 5479 return -EINVAL; 5480 } 5481 5482 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5483 if (iovcnt < 0) { 5484 SPDK_ERRLOG("%s: map Admin Opc %x 
failed\n", 5485 ctrlr_id(ctrlr), cmd->opc); 5486 return -1; 5487 } 5488 req->length = len; 5489 req->iovcnt = iovcnt; 5490 5491 return 0; 5492 } 5493 5494 /* 5495 * Map an I/O command's buffers. 5496 * 5497 * Returns 0 on success and -errno on failure. 5498 */ 5499 static int 5500 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5501 { 5502 int len, iovcnt; 5503 struct spdk_nvme_cmd *cmd; 5504 5505 assert(ctrlr != NULL); 5506 assert(req != NULL); 5507 5508 cmd = &req->cmd->nvme_cmd; 5509 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5510 5511 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5512 return 0; 5513 } 5514 5515 len = get_nvmf_io_req_length(req); 5516 if (len < 0) { 5517 return -EINVAL; 5518 } 5519 req->length = len; 5520 5521 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5522 if (iovcnt < 0) { 5523 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5524 return -EFAULT; 5525 } 5526 req->iovcnt = iovcnt; 5527 5528 return 0; 5529 } 5530 5531 static int 5532 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5533 struct nvmf_vfio_user_sq *sq) 5534 { 5535 int err; 5536 struct nvmf_vfio_user_req *vu_req; 5537 struct spdk_nvmf_request *req; 5538 5539 assert(ctrlr != NULL); 5540 assert(cmd != NULL); 5541 5542 vu_req = get_nvmf_vfio_user_req(sq); 5543 if (spdk_unlikely(vu_req == NULL)) { 5544 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5545 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5546 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5547 5548 } 5549 req = &vu_req->req; 5550 5551 assert(req->qpair != NULL); 5552 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5553 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5554 5555 vu_req->cb_fn = handle_cmd_rsp; 5556 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5557 req->cmd->nvme_cmd = *cmd; 5558 5559 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5560 err = map_admin_cmd_req(ctrlr, req); 5561 } else { 5562 switch (cmd->opc) { 5563 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5564 case SPDK_NVME_OPC_RESERVATION_REPORT: 5565 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5566 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5567 err = -ENOTSUP; 5568 break; 5569 default: 5570 err = map_io_cmd_req(ctrlr, req); 5571 break; 5572 } 5573 } 5574 5575 if (spdk_unlikely(err < 0)) { 5576 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5577 ctrlr_id(ctrlr), cmd->opc); 5578 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5579 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5580 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5581 _nvmf_vfio_user_req_free(sq, vu_req); 5582 return err; 5583 } 5584 5585 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5586 spdk_nvmf_request_exec(req); 5587 5588 return 0; 5589 } 5590 5591 /* 5592 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5593 * here: if the host isn't up to date, and is apparently not actively processing 5594 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 
5595 */ 5596 static void 5597 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5598 struct nvmf_vfio_user_sq *sq) 5599 { 5600 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5601 uint32_t cq_head; 5602 uint32_t cq_tail; 5603 5604 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5605 return; 5606 } 5607 5608 cq_tail = *cq_tailp(cq); 5609 5610 /* Already sent? */ 5611 if (cq_tail == cq->last_trigger_irq_tail) { 5612 return; 5613 } 5614 5615 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5616 cq_head = *cq_dbl_headp(cq); 5617 5618 if (cq_head != cq_tail && cq_head == cq->last_head) { 5619 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5620 if (err != 0) { 5621 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5622 ctrlr_id(ctrlr)); 5623 } else { 5624 cq->last_trigger_irq_tail = cq_tail; 5625 } 5626 } 5627 5628 cq->last_head = cq_head; 5629 } 5630 5631 /* Returns the number of commands processed, or a negative value on error. */ 5632 static int 5633 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5634 { 5635 struct nvmf_vfio_user_ctrlr *ctrlr; 5636 uint32_t new_tail; 5637 int count = 0; 5638 5639 assert(sq != NULL); 5640 5641 ctrlr = sq->ctrlr; 5642 5643 /* 5644 * A quiesced, or migrating, controller should never process new 5645 * commands. 5646 */ 5647 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 5648 return SPDK_POLLER_IDLE; 5649 } 5650 5651 if (ctrlr->adaptive_irqs_enabled) { 5652 handle_suppressed_irq(ctrlr, sq); 5653 } 5654 5655 /* On aarch64 platforms, doorbells update from guest VM may not be seen 5656 * on SPDK target side. This is because there is memory type mismatch 5657 * situation here. That is on guest VM side, the doorbells are treated as 5658 * device memory while on SPDK target side, it is treated as normal 5659 * memory. And this situation cause problem on ARM platform. 5660 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5661 * Memory-aliasing-and-mismatched-memory-types". Only using spdk_mb() 5662 * cannot fix this. Use "dc civac" to invalidate cache may solve 5663 * this. 5664 */ 5665 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5666 5667 /* Load-Acquire. */ 5668 new_tail = *sq_dbl_tailp(sq); 5669 5670 new_tail = new_tail & 0xffffu; 5671 if (spdk_unlikely(new_tail >= sq->size)) { 5672 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5673 new_tail); 5674 spdk_nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE); 5675 5676 return -1; 5677 } 5678 5679 if (*sq_headp(sq) == new_tail) { 5680 return 0; 5681 } 5682 5683 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5684 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5685 if (ctrlr->sdbl != NULL) { 5686 SPDK_DEBUGLOG(nvmf_vfio, 5687 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5688 ctrlr_id(ctrlr), sq->qid, 5689 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5690 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5691 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5692 } 5693 5694 /* 5695 * Ensure that changes to the queue are visible to us. 5696 * The host driver should write the queue first, do a wmb(), and then 5697 * update the SQ tail doorbell (their Store-Release). 5698 */ 5699 spdk_rmb(); 5700 5701 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5702 if (spdk_unlikely(count < 0)) { 5703 fail_ctrlr(ctrlr); 5704 } 5705 5706 return count; 5707 } 5708 5709 /* 5710 * vfio-user transport poll handler. 

/*
 * vfio-user transport poll handler. Note that the library context is polled in
 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
 * active SQs.
 *
 * Returns the number of commands processed, or a negative value on error.
 */
static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_sq *sq, *tmp;
	int count = 0;

	assert(group != NULL);

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n");

	TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) {
		int ret;

		if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
			continue;
		}

		ret = nvmf_vfio_user_sq_poll(sq);

		if (spdk_unlikely(ret < 0)) {
			return ret;
		}

		count += ret;
	}

	vu_group->stats.polls++;
	vu_group->stats.poll_reqs += count;
	vu_group->stats.poll_reqs_squared += count * count;
	if (count == 0) {
		vu_group->stats.polls_spurious++;
	}

	return count;
}

static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
				    struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvme_transport_id *trid)
{
	return 0;
}

static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
				     struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static void
nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_request *req_to_abort = NULL;
	struct spdk_nvmf_request *temp_req = NULL;
	uint16_t cid;

	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;

	TAILQ_FOREACH(temp_req, &qpair->outstanding, link) {
		struct nvmf_vfio_user_req *vu_req;

		vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req);

		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
			req_to_abort = temp_req;
			break;
		}
	}

	if (req_to_abort == NULL) {
		spdk_nvmf_request_complete(req);
		return;
	}

	req->req_to_abort = req_to_abort;
	nvmf_ctrlr_abort_request(req);
}
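
/*
 * The running counters polls (n), poll_reqs (sum of x), and poll_reqs_squared
 * (sum of x^2) maintained by the poller above let the stats dump below derive
 * the spread of requests-per-poll without storing individual samples, using
 * the usual single-pass formula for the sample variance:
 *
 *	Var(x) = (n * sum(x^2) - (sum(x))^2) / (n * (n - 1))
 *
 * Note that "poll_reqs_variance" is reported as sqrt(Var(x)), i.e. the sample
 * standard deviation.
 */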

static void
nvmf_vfio_user_poll_group_dump_stat(struct spdk_nvmf_transport_poll_group *group,
				    struct spdk_json_write_ctx *w)
{
	struct nvmf_vfio_user_poll_group *vu_group = SPDK_CONTAINEROF(group,
			struct nvmf_vfio_user_poll_group, group);
	uint64_t polls_denom;

	spdk_json_write_named_uint64(w, "ctrlr_intr", vu_group->stats.ctrlr_intr);
	spdk_json_write_named_uint64(w, "ctrlr_kicks", vu_group->stats.ctrlr_kicks);
	spdk_json_write_named_uint64(w, "won", vu_group->stats.won);
	spdk_json_write_named_uint64(w, "lost", vu_group->stats.lost);
	spdk_json_write_named_uint64(w, "lost_count", vu_group->stats.lost_count);
	spdk_json_write_named_uint64(w, "rearms", vu_group->stats.rearms);
	spdk_json_write_named_uint64(w, "pg_process_count", vu_group->stats.pg_process_count);
	spdk_json_write_named_uint64(w, "intr", vu_group->stats.intr);
	spdk_json_write_named_uint64(w, "polls", vu_group->stats.polls);
	spdk_json_write_named_uint64(w, "polls_spurious", vu_group->stats.polls_spurious);
	spdk_json_write_named_uint64(w, "poll_reqs", vu_group->stats.poll_reqs);
	polls_denom = vu_group->stats.polls * (vu_group->stats.polls - 1);
	if (polls_denom) {
		uint64_t n = vu_group->stats.polls * vu_group->stats.poll_reqs_squared - vu_group->stats.poll_reqs *
			     vu_group->stats.poll_reqs;
		spdk_json_write_named_double(w, "poll_reqs_variance", sqrt(n / polls_denom));
	}

	spdk_json_write_named_uint64(w, "cqh_admin_writes", vu_group->stats.cqh_admin_writes);
	spdk_json_write_named_uint64(w, "cqh_io_writes", vu_group->stats.cqh_io_writes);
}

static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
	opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
	opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
	opts->in_capsule_data_size = 0;
	opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
	opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
	opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
	opts->num_shared_buffers = 0;
	opts->buf_cache_size = 0;
	opts->association_timeout = 0;
	opts->transport_specific = NULL;
}

const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
	.name = "VFIOUSER",
	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
	.opts_init = nvmf_vfio_user_opts_init,
	.create = nvmf_vfio_user_create,
	.destroy = nvmf_vfio_user_destroy,

	.listen = nvmf_vfio_user_listen,
	.stop_listen = nvmf_vfio_user_stop_listen,
	.cdata_init = nvmf_vfio_user_cdata_init,
	.listen_associate = nvmf_vfio_user_listen_associate,

	.listener_discover = nvmf_vfio_user_discover,

	.poll_group_create = nvmf_vfio_user_poll_group_create,
	.get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group,
	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
	.poll_group_add = nvmf_vfio_user_poll_group_add,
	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
	.poll_group_poll = nvmf_vfio_user_poll_group_poll,

	.req_free = nvmf_vfio_user_req_free,
	.req_complete = nvmf_vfio_user_req_complete,

	.qpair_fini = nvmf_vfio_user_close_qpair,
	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,

	.poll_group_dump_stat = nvmf_vfio_user_poll_group_dump_stat,
};

SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
SPDK_LOG_REGISTER_COMPONENT(vfio_user_db)
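
/*
 * SPDK_NVMF_TRANSPORT_REGISTER() above hooks the ops table into the generic
 * NVMf transport layer via a constructor that runs at load time; the transport
 * is subsequently looked up by its .name ("VFIOUSER"). As a rough, illustrative
 * sketch only (assuming the synchronous spdk_nvmf_transport_opts_init()/
 * spdk_nvmf_transport_create() APIs of the surrounding SPDK version), an
 * application could instantiate it directly with:
 *
 *	struct spdk_nvmf_transport_opts opts = {};
 *
 *	if (spdk_nvmf_transport_opts_init("VFIOUSER", &opts, sizeof(opts))) {
 *		struct spdk_nvmf_transport *transport =
 *			spdk_nvmf_transport_create("VFIOUSER", &opts);
 *		...
 *	}
 *
 * In practice the transport is normally created through the
 * nvmf_create_transport JSON-RPC rather than by calling these functions
 * directly.
 */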