1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2020 Intel Corporation. 3 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved. 4 * Copyright (c) 2022, 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 /* 8 * NVMe over vfio-user transport 9 */ 10 11 #include <sys/param.h> 12 13 #include <vfio-user/libvfio-user.h> 14 #include <vfio-user/pci_defs.h> 15 16 #include "spdk/barrier.h" 17 #include "spdk/stdinc.h" 18 #include "spdk/assert.h" 19 #include "spdk/thread.h" 20 #include "spdk/nvmf_transport.h" 21 #include "spdk/sock.h" 22 #include "spdk/string.h" 23 #include "spdk/util.h" 24 #include "spdk/log.h" 25 26 #include "transport.h" 27 28 #include "nvmf_internal.h" 29 30 #define SWAP(x, y) \ 31 do \ 32 { \ 33 typeof(x) _tmp = x; \ 34 x = y; \ 35 y = _tmp; \ 36 } while (0) 37 38 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 39 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 40 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 41 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 42 43 #define NVME_DOORBELLS_OFFSET 0x1000 44 #define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2 45 #define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3 46 #define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX 47 48 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 512 49 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 50 51 /* NVMe spec 1.4, section 5.21.1.7 */ 52 SPDK_STATIC_ASSERT(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR >= 2 && 53 NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR <= SPDK_NVME_MAX_IO_QUEUES, 54 "bad number of queues"); 55 56 /* 57 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 58 * available on PCI-X 2.0 and PCI Express buses 59 */ 60 #define NVME_REG_CFG_SIZE 0x1000 61 62 /* 63 * Doorbells must be page aligned so that they can memory mapped. 64 * 65 * TODO does the NVMe spec also require this? Document it. 66 */ 67 #define NVMF_VFIO_USER_DOORBELLS_SIZE \ 68 SPDK_ALIGN_CEIL( \ 69 (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2 * SPDK_NVME_DOORBELL_REGISTER_SIZE), \ 70 0x1000) 71 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 72 73 /* 74 * TODO check the PCI spec whether BAR4 and BAR5 really have to be at least one 75 * page and a multiple of page size (maybe QEMU also needs this?). Document all 76 * this. 77 */ 78 79 /* 80 * MSI-X Pending Bit Array Size 81 * 82 * TODO according to the PCI spec we need one bit per vector, document the 83 * relevant section. 84 * 85 * If the first argument to SPDK_ALIGN_CEIL is 0 then the result is 0, so we 86 * would end up with a 0-size BAR5. 
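 *
 * As a worked example (illustrative arithmetic only): with
 * NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR = 512 vectors, the PBA needs
 * 512 / CHAR_BIT = 64 bytes, and SPDK_ALIGN_CEIL(64, 0x1000) rounds that up to
 * a 4 KiB BAR5. Taking MAX(CHAR_BIT, ...) below guarantees the division yields
 * at least one byte, avoiding the 0-size case described above.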
87 */ 88 #define NVME_IRQ_MSIX_NUM MAX(CHAR_BIT, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) 89 #define NVME_BAR5_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM / CHAR_BIT), 0x1000) 90 SPDK_STATIC_ASSERT(NVME_BAR5_SIZE > 0, "Incorrect size"); 91 92 /* MSI-X Table Size */ 93 #define NVME_BAR4_SIZE SPDK_ALIGN_CEIL((NVME_IRQ_MSIX_NUM * 16), 0x1000) 94 SPDK_STATIC_ASSERT(NVME_BAR4_SIZE > 0, "Incorrect size"); 95 96 struct nvmf_vfio_user_req; 97 98 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 99 100 /* 1 more for PRP2 list itself */ 101 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 102 103 enum nvmf_vfio_user_req_state { 104 VFIO_USER_REQUEST_STATE_FREE = 0, 105 VFIO_USER_REQUEST_STATE_EXECUTING, 106 }; 107 108 /* 109 * Support for live migration in NVMf/vfio-user: live migration is implemented 110 * by stopping the NVMf subsystem when the device is instructed to enter the 111 * stop-and-copy state and then trivially, and most importantly safely, 112 * collecting migration state and providing it to the vfio-user client. We 113 * don't provide any migration state during the pre-copy state as that's too 114 * complicated to do; we might support this in the future. 115 */ 116 117 118 /* NVMe device state representation */ 119 struct nvme_migr_sq_state { 120 uint16_t sqid; 121 uint16_t cqid; 122 uint32_t head; 123 uint32_t size; 124 uint32_t reserved; 125 uint64_t dma_addr; 126 }; 127 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 128 129 struct nvme_migr_cq_state { 130 uint16_t cqid; 131 uint16_t phase; 132 uint32_t tail; 133 uint32_t size; 134 uint32_t iv; 135 uint32_t ien; 136 uint32_t reserved; 137 uint64_t dma_addr; 138 }; 139 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 140 141 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 142 143 /* The device state is in the VFIO MIGRATION BAR(9) region; keep the device state page aligned. 144 * 145 * NVMe device migration region is defined as below: 146 * ------------------------------------------------------------------------- 147 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs | 148 * ------------------------------------------------------------------------- 149 * 150 * Keep vfio_user_nvme_migr_header at a fixed 0x1000 length; all newly added fields 151 * can use the reserved space at the end of the data structure. 152 */ 153 struct vfio_user_nvme_migr_header { 154 /* Magic value to validate migration data */ 155 uint32_t magic; 156 /* Version to check that the data is the same from source to destination */ 157 uint32_t version; 158 159 /* The library uses this field to know how many fields in this 160 * structure are valid, starting at the beginning of this data 161 * structure. Newly added fields in the future use the `unused` memory 162 * space. 163 */ 164 uint32_t opts_size; 165 uint32_t reserved0; 166 167 /* BARs information */ 168 uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS]; 169 uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS]; 170 171 /* Queue pair start offset, starting at the beginning of this 172 * data structure. 173 */ 174 uint64_t qp_offset; 175 uint64_t qp_len; 176 177 /* Controller data structure */ 178 uint32_t num_io_queues; 179 uint32_t reserved1; 180 181 /* NVMf controller data offset and length, if they exist, starting at 182 * the beginning of this data structure. 183 */ 184 uint64_t nvmf_data_offset; 185 uint64_t nvmf_data_len; 186 187 /* 188 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA 189 * address.
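 * (Hence an explicit flag here, rather than treating a zero
 * shadow_doorbell_buffer as "shadow doorbells not in use".)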
190 */ 191 uint32_t sdbl; 192 193 /* Shadow doorbell DMA addresses. */ 194 uint64_t shadow_doorbell_buffer; 195 uint64_t eventidx_buffer; 196 197 /* Reserved memory space for newly added fields; this 198 * field is always at the end of this data structure. 199 */ 200 uint8_t unused[3856]; 201 }; 202 SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size"); 203 204 struct vfio_user_nvme_migr_qp { 205 struct nvme_migr_sq_state sq; 206 struct nvme_migr_cq_state cq; 207 }; 208 209 /* NVMe state definition used to load/restore from/to NVMe migration BAR region */ 210 struct vfio_user_nvme_migr_state { 211 struct vfio_user_nvme_migr_header ctrlr_header; 212 struct spdk_nvmf_ctrlr_migr_data nvmf_data; 213 struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 214 uint8_t doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE]; 215 uint8_t cfg[NVME_REG_CFG_SIZE]; 216 }; 217 218 struct nvmf_vfio_user_req { 219 struct spdk_nvmf_request req; 220 struct spdk_nvme_cpl rsp; 221 struct spdk_nvme_cmd cmd; 222 223 enum nvmf_vfio_user_req_state state; 224 nvmf_vfio_user_req_cb_fn cb_fn; 225 void *cb_arg; 226 227 /* old CC before prop_set_cc fabric command */ 228 union spdk_nvme_cc_register cc; 229 230 TAILQ_ENTRY(nvmf_vfio_user_req) link; 231 232 struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS]; 233 uint8_t iovcnt; 234 235 /* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */ 236 uint8_t sg[]; 237 }; 238 239 /* 240 * Mapping of an NVMe queue. 241 * 242 * This holds the information tracking a local process mapping of an NVMe queue 243 * shared by the client. 244 */ 245 struct nvme_q_mapping { 246 /* iov of local process mapping. */ 247 struct iovec iov; 248 /* Stored sg, needed for unmap. */ 249 dma_sg_t *sg; 250 /* Client PRP of queue. */ 251 uint64_t prp1; 252 }; 253 254 enum nvmf_vfio_user_sq_state { 255 VFIO_USER_SQ_UNUSED = 0, 256 VFIO_USER_SQ_CREATED, 257 VFIO_USER_SQ_DELETED, 258 VFIO_USER_SQ_ACTIVE, 259 VFIO_USER_SQ_INACTIVE 260 }; 261 262 enum nvmf_vfio_user_cq_state { 263 VFIO_USER_CQ_UNUSED = 0, 264 VFIO_USER_CQ_CREATED, 265 VFIO_USER_CQ_DELETED, 266 }; 267 268 enum nvmf_vfio_user_ctrlr_state { 269 VFIO_USER_CTRLR_CREATING = 0, 270 VFIO_USER_CTRLR_RUNNING, 271 /* Quiesce requested by libvfio-user */ 272 VFIO_USER_CTRLR_PAUSING, 273 /* The NVMf subsystem is paused; it's safe to do PCI reset, memory register, 274 * memory unregister, and vfio migration state transition in this state. 275 */ 276 VFIO_USER_CTRLR_PAUSED, 277 /* 278 * Implies that the NVMf subsystem is paused. The device will be unquiesced (PCI 279 * reset, memory register and unregister; the controller in the destination VM has 280 * been restored). NVMf subsystem resume has been requested. 281 */ 282 VFIO_USER_CTRLR_RESUMING, 283 /* 284 * Implies that the NVMf subsystem is paused. Both the controller in the source VM 285 * and the controller in the destination VM are in this state when doing live migration. 286 */ 287 VFIO_USER_CTRLR_MIGRATING 288 }; 289 290 struct nvmf_vfio_user_sq { 291 struct spdk_nvmf_qpair qpair; 292 struct spdk_nvmf_transport_poll_group *group; 293 struct nvmf_vfio_user_ctrlr *ctrlr; 294 295 uint32_t qid; 296 /* Number of entries in queue. */ 297 uint32_t size; 298 struct nvme_q_mapping mapping; 299 enum nvmf_vfio_user_sq_state sq_state; 300 301 uint32_t head; 302 volatile uint32_t *dbl_tailp; 303 304 /* Whether a shadow doorbell eventidx needs setting.
*/ 305 bool need_rearm; 306 307 /* multiple SQs can be mapped to the same CQ */ 308 uint16_t cqid; 309 310 /* handle_queue_connect_rsp() can be used both for the CREATE IO SQ response 311 * and the SQ re-connect response in the destination VM. In the former case 312 * we will post an NVMe completion to the VM; we will not set this flag when 313 * re-connecting SQs in the destination VM. 314 */ 315 bool post_create_io_sq_completion; 316 /* Copy of the Create IO SQ command; this field is used together with the 317 * `post_create_io_sq_completion` flag. 318 */ 319 struct spdk_nvme_cmd create_io_sq_cmd; 320 321 struct vfio_user_delete_sq_ctx *delete_ctx; 322 323 /* Currently unallocated reqs. */ 324 TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs; 325 /* Poll group entry */ 326 TAILQ_ENTRY(nvmf_vfio_user_sq) link; 327 /* Connected SQ entry */ 328 TAILQ_ENTRY(nvmf_vfio_user_sq) tailq; 329 }; 330 331 struct nvmf_vfio_user_cq { 332 struct spdk_nvmf_transport_poll_group *group; 333 int cq_ref; 334 335 uint32_t qid; 336 /* Number of entries in queue. */ 337 uint32_t size; 338 struct nvme_q_mapping mapping; 339 enum nvmf_vfio_user_cq_state cq_state; 340 341 uint32_t tail; 342 volatile uint32_t *dbl_headp; 343 344 bool phase; 345 346 uint16_t iv; 347 bool ien; 348 349 uint32_t last_head; 350 uint32_t last_trigger_irq_tail; 351 }; 352 353 struct nvmf_vfio_user_poll_group { 354 struct spdk_nvmf_transport_poll_group group; 355 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 356 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 357 struct spdk_interrupt *intr; 358 int intr_fd; 359 struct { 360 361 /* 362 * ctrlr_intr and ctrlr_kicks will be zero for all other poll 363 * groups. However, they can be zero even for the poll group 364 * the controller belongs to if no vfio-user message has been 365 * received or the controller hasn't been kicked yet. 366 */ 367 368 /* 369 * Number of times vfio_user_ctrlr_intr() has run: 370 * vfio-user file descriptor has been ready or explicitly 371 * kicked (see below). 372 */ 373 uint64_t ctrlr_intr; 374 375 /* 376 * Kicks to the controller by ctrlr_kick(). 377 * ctrlr_intr - ctrlr_kicks is the number of times the 378 * vfio-user poll file descriptor has been ready. 379 */ 380 uint64_t ctrlr_kicks; 381 382 /* 383 * How many times we won the race arming an SQ. 384 */ 385 uint64_t won; 386 387 /* 388 * How many times we lost the race arming an SQ. 389 */ 390 uint64_t lost; 391 392 /* 393 * How many requests we processed in total each time we lost 394 * the rearm race. 395 */ 396 uint64_t lost_count; 397 398 /* 399 * Number of times we attempted to rearm all the SQs in the 400 * poll group. 401 */ 402 uint64_t rearms; 403 404 uint64_t pg_process_count; 405 uint64_t intr; 406 uint64_t polls; 407 uint64_t polls_spurious; 408 uint64_t poll_reqs; 409 uint64_t poll_reqs_squared; 410 uint64_t cqh_admin_writes; 411 uint64_t cqh_io_writes; 412 } stats; 413 }; 414 415 struct nvmf_vfio_user_shadow_doorbells { 416 volatile uint32_t *shadow_doorbells; 417 volatile uint32_t *eventidxs; 418 dma_sg_t *sgs; 419 struct iovec *iovs; 420 }; 421 422 struct nvmf_vfio_user_ctrlr { 423 struct nvmf_vfio_user_endpoint *endpoint; 424 struct nvmf_vfio_user_transport *transport; 425 426 /* Connected SQs list */ 427 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 428 enum nvmf_vfio_user_ctrlr_state state; 429 430 /* 431 * Tells whether live migration data have been prepared. This is used 432 * by the get_pending_bytes callback to tell whether or not the 433 * previous iteration finished.
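 *
 * Illustrative sketch only (hypothetical helper names, not the actual
 * vfio-user migration callback): a get_pending_bytes-style callback could do
 *
 *     if (!vu_ctrlr->migr_data_prepared) {
 *         prepare_migr_data(vu_ctrlr);      // start a new iteration
 *     }
 *     return remaining_bytes;               // 0 once this iteration is consumed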
434 */ 435 bool migr_data_prepared; 436 437 /* Controller is in source VM when doing live migration */ 438 bool in_source_vm; 439 440 struct spdk_thread *thread; 441 struct spdk_poller *vfu_ctx_poller; 442 struct spdk_interrupt *intr; 443 int intr_fd; 444 445 bool queued_quiesce; 446 447 bool reset_shn; 448 bool disconnect; 449 450 uint16_t cntlid; 451 struct spdk_nvmf_ctrlr *ctrlr; 452 453 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 454 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 455 456 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 457 458 volatile uint32_t *bar0_doorbells; 459 struct nvmf_vfio_user_shadow_doorbells *sdbl; 460 /* 461 * Shadow doorbells PRPs to provide during the stop-and-copy state. 462 */ 463 uint64_t shadow_doorbell_buffer; 464 uint64_t eventidx_buffer; 465 466 bool adaptive_irqs_enabled; 467 }; 468 469 /* Endpoint in vfio-user is associated with a socket file, which 470 * is the representative of a PCI endpoint. 471 */ 472 struct nvmf_vfio_user_endpoint { 473 struct nvmf_vfio_user_transport *transport; 474 vfu_ctx_t *vfu_ctx; 475 struct spdk_poller *accept_poller; 476 struct spdk_thread *accept_thread; 477 bool interrupt_mode; 478 struct msixcap *msix; 479 vfu_pci_config_space_t *pci_config_space; 480 int devmem_fd; 481 int accept_intr_fd; 482 struct spdk_interrupt *accept_intr; 483 484 volatile uint32_t *bar0_doorbells; 485 486 int migr_fd; 487 void *migr_data; 488 489 struct spdk_nvme_transport_id trid; 490 struct spdk_nvmf_subsystem *subsystem; 491 492 /* Controller is associated with an active socket connection, 493 * the lifecycle of the controller is same as the VM. 494 * Currently we only support one active connection, as the NVMe 495 * specification defines, we may support multiple controllers in 496 * future, so that it can support e.g: RESERVATION. 497 */ 498 struct nvmf_vfio_user_ctrlr *ctrlr; 499 pthread_mutex_t lock; 500 501 bool need_async_destroy; 502 /* The subsystem is in PAUSED state and need to be resumed, TRUE 503 * only when migration is done successfully and the controller is 504 * in source VM. 505 */ 506 bool need_resume; 507 /* Start the accept poller again after destroying the controller */ 508 bool need_relisten; 509 510 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 511 }; 512 513 struct nvmf_vfio_user_transport_opts { 514 bool disable_mappable_bar0; 515 bool disable_adaptive_irq; 516 bool disable_shadow_doorbells; 517 bool disable_compare; 518 bool enable_intr_mode_sq_spreading; 519 }; 520 521 struct nvmf_vfio_user_transport { 522 struct spdk_nvmf_transport transport; 523 struct nvmf_vfio_user_transport_opts transport_opts; 524 bool intr_mode_supported; 525 pthread_mutex_t lock; 526 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 527 528 pthread_mutex_t pg_lock; 529 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 530 struct nvmf_vfio_user_poll_group *next_pg; 531 }; 532 533 /* 534 * function prototypes 535 */ 536 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 537 538 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 539 540 /* 541 * Local process virtual address of a queue. 
542 */ 543 static inline void * 544 q_addr(struct nvme_q_mapping *mapping) 545 { 546 return mapping->iov.iov_base; 547 } 548 549 static inline int 550 queue_index(uint16_t qid, bool is_cq) 551 { 552 return (qid * 2) + is_cq; 553 } 554 555 static inline volatile uint32_t * 556 sq_headp(struct nvmf_vfio_user_sq *sq) 557 { 558 assert(sq != NULL); 559 return &sq->head; 560 } 561 562 static inline volatile uint32_t * 563 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 564 { 565 assert(sq != NULL); 566 return sq->dbl_tailp; 567 } 568 569 static inline volatile uint32_t * 570 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 571 { 572 assert(cq != NULL); 573 return cq->dbl_headp; 574 } 575 576 static inline volatile uint32_t * 577 cq_tailp(struct nvmf_vfio_user_cq *cq) 578 { 579 assert(cq != NULL); 580 return &cq->tail; 581 } 582 583 static inline void 584 sq_head_advance(struct nvmf_vfio_user_sq *sq) 585 { 586 assert(sq != NULL); 587 588 assert(*sq_headp(sq) < sq->size); 589 (*sq_headp(sq))++; 590 591 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 592 *sq_headp(sq) = 0; 593 } 594 } 595 596 static inline void 597 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 598 { 599 assert(cq != NULL); 600 601 assert(*cq_tailp(cq) < cq->size); 602 (*cq_tailp(cq))++; 603 604 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 605 *cq_tailp(cq) = 0; 606 cq->phase = !cq->phase; 607 } 608 } 609 610 static bool 611 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 612 { 613 assert(vu_ctrlr != NULL); 614 615 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 616 return false; 617 } 618 619 if (is_cq) { 620 if (vu_ctrlr->cqs[qid] == NULL) { 621 return false; 622 } 623 624 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 625 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 626 } 627 628 if (vu_ctrlr->sqs[qid] == NULL) { 629 return false; 630 } 631 632 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 633 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 634 } 635 636 static char * 637 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 638 { 639 return endpoint->trid.traddr; 640 } 641 642 static char * 643 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 644 { 645 if (!ctrlr || !ctrlr->endpoint) { 646 return "Null Ctrlr"; 647 } 648 649 return endpoint_id(ctrlr->endpoint); 650 } 651 652 /* Return the poll group for the admin queue of the controller. 
*/ 653 static inline struct nvmf_vfio_user_poll_group * 654 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 655 { 656 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 657 struct nvmf_vfio_user_poll_group, 658 group); 659 } 660 661 static inline struct spdk_thread * 662 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 663 { 664 return vu_pg->group.group->thread; 665 } 666 667 static dma_sg_t * 668 index_to_sg_t(void *arr, size_t i) 669 { 670 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 671 } 672 673 static inline size_t 674 vfio_user_migr_data_len(void) 675 { 676 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 677 } 678 679 static inline bool 680 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 681 { 682 return spdk_interrupt_mode_is_enabled() && 683 vu_transport->intr_mode_supported; 684 } 685 686 static int vfio_user_ctrlr_intr(void *ctx); 687 688 static void 689 vfio_user_msg_ctrlr_intr(void *ctx) 690 { 691 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 692 struct nvmf_vfio_user_poll_group *vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 693 694 vu_ctrlr_group->stats.ctrlr_kicks++; 695 696 vfio_user_ctrlr_intr(ctx); 697 } 698 699 /* 700 * Kick (force a wakeup) of all poll groups for this controller. 701 * vfio_user_ctrlr_intr() itself arranges for kicking other poll groups if 702 * needed. 703 */ 704 static void 705 ctrlr_kick(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 706 { 707 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 708 709 SPDK_DEBUGLOG(vfio_user_db, "%s: kicked\n", ctrlr_id(vu_ctrlr)); 710 711 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 712 713 spdk_thread_send_msg(poll_group_to_thread(vu_ctrlr_group), 714 vfio_user_msg_ctrlr_intr, vu_ctrlr); 715 } 716 717 /* 718 * Make the given DMA address and length available (locally mapped) via iov. 
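 *
 * Illustrative usage sketch (error handling elided; addr, len and prot are
 * whatever the caller requires):
 *
 *     struct iovec iov;
 *     void *va = map_one(vfu_ctx, addr, len, sg, &iov, PROT_READ | PROT_WRITE);
 *     if (va != NULL) {
 *         ... access the client memory through va ...
 *         vfu_sgl_put(vfu_ctx, sg, &iov, 1);    // release the mapping when done
 *     }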
719 */ 720 static void * 721 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 722 struct iovec *iov, int prot) 723 { 724 int ret; 725 726 assert(ctx != NULL); 727 assert(sg != NULL); 728 assert(iov != NULL); 729 730 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 731 if (ret < 0) { 732 return NULL; 733 } 734 735 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 736 if (ret != 0) { 737 return NULL; 738 } 739 740 assert(iov->iov_base != NULL); 741 return iov->iov_base; 742 } 743 744 static int 745 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 746 uint32_t max_iovcnt, uint32_t len, size_t mps, 747 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 748 { 749 uint64_t prp1, prp2; 750 void *vva; 751 uint32_t i; 752 uint32_t residue_len, nents; 753 uint64_t *prp_list; 754 uint32_t iovcnt; 755 756 assert(max_iovcnt > 0); 757 758 prp1 = cmd->dptr.prp.prp1; 759 prp2 = cmd->dptr.prp.prp2; 760 761 /* PRP1 may start at an unaligned page address */ 762 residue_len = mps - (prp1 % mps); 763 residue_len = spdk_min(len, residue_len); 764 765 vva = gpa_to_vva(prv, prp1, residue_len, PROT_READ | PROT_WRITE); 766 if (spdk_unlikely(vva == NULL)) { 767 SPDK_ERRLOG("GPA to VVA failed\n"); 768 return -EINVAL; 769 } 770 len -= residue_len; 771 if (len && max_iovcnt < 2) { 772 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 773 return -ERANGE; 774 } 775 iovs[0].iov_base = vva; 776 iovs[0].iov_len = residue_len; 777 778 if (len) { 779 if (spdk_unlikely(prp2 == 0)) { 780 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 781 return -EINVAL; 782 } 783 784 if (len <= mps) { 785 /* 2 PRPs used */ 786 iovcnt = 2; 787 vva = gpa_to_vva(prv, prp2, len, PROT_READ | PROT_WRITE); 788 if (spdk_unlikely(vva == NULL)) { 789 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 790 prp2, len); 791 return -EINVAL; 792 } 793 iovs[1].iov_base = vva; 794 iovs[1].iov_len = len; 795 } else { 796 /* PRP list used */ 797 nents = (len + mps - 1) / mps; 798 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 799 SPDK_ERRLOG("Too many page entries\n"); 800 return -ERANGE; 801 } 802 803 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), PROT_READ); 804 if (spdk_unlikely(vva == NULL)) { 805 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 806 prp2, nents); 807 return -EINVAL; 808 } 809 prp_list = vva; 810 i = 0; 811 while (len != 0) { 812 residue_len = spdk_min(len, mps); 813 vva = gpa_to_vva(prv, prp_list[i], residue_len, PROT_READ | PROT_WRITE); 814 if (spdk_unlikely(vva == NULL)) { 815 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 816 prp_list[i], residue_len); 817 return -EINVAL; 818 } 819 iovs[i + 1].iov_base = vva; 820 iovs[i + 1].iov_len = residue_len; 821 len -= residue_len; 822 i++; 823 } 824 iovcnt = i + 1; 825 } 826 } else { 827 /* 1 PRP used */ 828 iovcnt = 1; 829 } 830 831 assert(iovcnt <= max_iovcnt); 832 return iovcnt; 833 } 834 835 static int 836 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 837 struct iovec *iovs, uint32_t max_iovcnt, 838 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 839 { 840 uint32_t i; 841 void *vva; 842 843 if (spdk_unlikely(max_iovcnt < num_sgls)) { 844 return -ERANGE; 845 } 846 847 for (i = 0; i < num_sgls; i++) { 848 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 849 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 850 return -EINVAL; 851 } 852 vva = gpa_to_vva(prv, sgls[i].address,
sgls[i].unkeyed.length, PROT_READ | PROT_WRITE); 853 if (spdk_unlikely(vva == NULL)) { 854 SPDK_ERRLOG("GPA to VVA failed\n"); 855 return -EINVAL; 856 } 857 iovs[i].iov_base = vva; 858 iovs[i].iov_len = sgls[i].unkeyed.length; 859 } 860 861 return num_sgls; 862 } 863 864 static int 865 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 866 uint32_t len, size_t mps, 867 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 868 { 869 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 870 uint32_t num_sgls, seg_len; 871 void *vva; 872 int ret; 873 uint32_t total_iovcnt = 0; 874 875 /* SGL cases */ 876 sgl = &cmd->dptr.sgl1; 877 878 /* only one SGL segment */ 879 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 880 assert(max_iovcnt > 0); 881 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ | PROT_WRITE); 882 if (spdk_unlikely(vva == NULL)) { 883 SPDK_ERRLOG("GPA to VVA failed\n"); 884 return -EINVAL; 885 } 886 iovs[0].iov_base = vva; 887 iovs[0].iov_len = sgl->unkeyed.length; 888 assert(sgl->unkeyed.length == len); 889 890 return 1; 891 } 892 893 for (;;) { 894 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 895 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 896 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 897 return -EINVAL; 898 } 899 900 seg_len = sgl->unkeyed.length; 901 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 902 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 903 return -EINVAL; 904 } 905 906 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 907 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, PROT_READ); 908 if (spdk_unlikely(vva == NULL)) { 909 SPDK_ERRLOG("GPA to VVA failed\n"); 910 return -EINVAL; 911 } 912 913 /* sgl point to the first segment */ 914 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 915 last_sgl = &sgl[num_sgls - 1]; 916 917 /* we are done */ 918 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 919 /* map whole sgl list */ 920 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 921 max_iovcnt - total_iovcnt, gpa_to_vva); 922 if (spdk_unlikely(ret < 0)) { 923 return ret; 924 } 925 total_iovcnt += ret; 926 927 return total_iovcnt; 928 } 929 930 if (num_sgls > 1) { 931 /* map whole sgl exclude last_sgl */ 932 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 933 max_iovcnt - total_iovcnt, gpa_to_vva); 934 if (spdk_unlikely(ret < 0)) { 935 return ret; 936 } 937 total_iovcnt += ret; 938 } 939 940 /* move to next level's segments */ 941 sgl = last_sgl; 942 } 943 944 return 0; 945 } 946 947 static int 948 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 949 uint32_t len, size_t mps, 950 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, int prot)) 951 { 952 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 953 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 954 } 955 956 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 957 } 958 959 /* 960 * For each queue, update the location of its doorbell to the correct location: 961 * either our own BAR0, or the guest's configured shadow doorbell area. 962 * 963 * The Admin queue (qid: 0) does not ever use shadow doorbells. 964 */ 965 static void 966 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 967 { 968 volatile uint32_t *doorbells = shadow ? 
ctrlr->sdbl->shadow_doorbells : 969 ctrlr->bar0_doorbells; 970 971 assert(doorbells != NULL); 972 973 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 974 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 975 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 976 977 if (sq != NULL) { 978 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 979 980 ctrlr->sqs[i]->need_rearm = shadow; 981 } 982 983 if (cq != NULL) { 984 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 985 } 986 } 987 } 988 989 static void 990 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 991 { 992 assert(vfu_ctx != NULL); 993 assert(sdbl != NULL); 994 995 /* 996 * An allocation error would result in at most one of the two being 997 * non-NULL. If that is the case, no memory should have been mapped. 998 */ 999 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 1000 return; 1001 } 1002 1003 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 1004 struct iovec *iov; 1005 dma_sg_t *sg; 1006 1007 if (!sdbl->iovs[i].iov_len) { 1008 continue; 1009 } 1010 1011 sg = index_to_sg_t(sdbl->sgs, i); 1012 iov = sdbl->iovs + i; 1013 1014 vfu_sgl_put(vfu_ctx, sg, iov, 1); 1015 } 1016 } 1017 1018 static void 1019 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1020 { 1021 if (sdbl == NULL) { 1022 return; 1023 } 1024 1025 unmap_sdbl(vfu_ctx, sdbl); 1026 1027 /* 1028 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 1029 * not allocated, so don't free() them. 1030 */ 1031 free(sdbl->sgs); 1032 free(sdbl->iovs); 1033 free(sdbl); 1034 } 1035 1036 static struct nvmf_vfio_user_shadow_doorbells * 1037 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 1038 { 1039 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 1040 dma_sg_t *sg2 = NULL; 1041 void *p; 1042 1043 assert(vfu_ctx != NULL); 1044 1045 sdbl = calloc(1, sizeof(*sdbl)); 1046 if (sdbl == NULL) { 1047 goto err; 1048 } 1049 1050 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 1051 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 1052 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 1053 goto err; 1054 } 1055 1056 /* Map shadow doorbell buffer (PRP1). */ 1057 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, 1058 PROT_READ | PROT_WRITE); 1059 1060 if (p == NULL) { 1061 goto err; 1062 } 1063 1064 /* 1065 * Map eventidx buffer (PRP2). 1066 * Should only be written to by the controller. 1067 */ 1068 1069 sg2 = index_to_sg_t(sdbl->sgs, 1); 1070 1071 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, 1072 PROT_READ | PROT_WRITE); 1073 1074 if (p == NULL) { 1075 goto err; 1076 } 1077 1078 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1079 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1080 1081 return sdbl; 1082 1083 err: 1084 free_sdbl(vfu_ctx, sdbl); 1085 return NULL; 1086 } 1087 1088 /* 1089 * Copy doorbells from one buffer to the other, during switches between BAR0 1090 * doorbells and shadow doorbells. 1091 */ 1092 static void 1093 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1094 const volatile uint32_t *from, volatile uint32_t *to) 1095 { 1096 assert(ctrlr != NULL); 1097 assert(from != NULL); 1098 assert(to != NULL); 1099 1100 SPDK_DEBUGLOG(vfio_user_db, 1101 "%s: migrating shadow doorbells from %p to %p\n", 1102 ctrlr_id(ctrlr), from, to); 1103 1104 /* Can't use memcpy because it doesn't respect volatile semantics.
*/ 1105 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1106 if (ctrlr->sqs[i] != NULL) { 1107 to[queue_index(i, false)] = from[queue_index(i, false)]; 1108 } 1109 1110 if (ctrlr->cqs[i] != NULL) { 1111 to[queue_index(i, true)] = from[queue_index(i, true)]; 1112 } 1113 } 1114 } 1115 1116 static void 1117 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1118 { 1119 const struct spdk_nvmf_registers *regs; 1120 1121 assert(vu_ctrlr != NULL); 1122 assert(vu_ctrlr->ctrlr != NULL); 1123 1124 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1125 if (regs->csts.bits.cfs == 0) { 1126 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1127 } 1128 1129 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1130 } 1131 1132 static inline bool 1133 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1134 { 1135 assert(vu_ctrlr != NULL); 1136 assert(vu_ctrlr->endpoint != NULL); 1137 1138 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1139 1140 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1141 } 1142 1143 static void 1144 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1145 { 1146 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1147 1148 spdk_interrupt_unregister(&endpoint->accept_intr); 1149 spdk_poller_unregister(&endpoint->accept_poller); 1150 1151 if (endpoint->bar0_doorbells) { 1152 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1153 } 1154 1155 if (endpoint->devmem_fd > 0) { 1156 close(endpoint->devmem_fd); 1157 } 1158 1159 if (endpoint->migr_data) { 1160 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1161 } 1162 1163 if (endpoint->migr_fd > 0) { 1164 close(endpoint->migr_fd); 1165 } 1166 1167 if (endpoint->vfu_ctx) { 1168 vfu_destroy_ctx(endpoint->vfu_ctx); 1169 } 1170 1171 pthread_mutex_destroy(&endpoint->lock); 1172 free(endpoint); 1173 } 1174 1175 /* called when process exits */ 1176 static int 1177 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1178 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1179 { 1180 struct nvmf_vfio_user_transport *vu_transport; 1181 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1182 1183 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1184 1185 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1186 transport); 1187 1188 pthread_mutex_destroy(&vu_transport->lock); 1189 pthread_mutex_destroy(&vu_transport->pg_lock); 1190 1191 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1192 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1193 nvmf_vfio_user_destroy_endpoint(endpoint); 1194 } 1195 1196 free(vu_transport); 1197 1198 if (cb_fn) { 1199 cb_fn(cb_arg); 1200 } 1201 1202 return 0; 1203 } 1204 1205 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1206 { 1207 "disable_mappable_bar0", 1208 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1209 spdk_json_decode_bool, true 1210 }, 1211 { 1212 "disable_adaptive_irq", 1213 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1214 spdk_json_decode_bool, true 1215 }, 1216 { 1217 "disable_shadow_doorbells", 1218 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1219 spdk_json_decode_bool, true 1220 }, 1221 { 1222 "disable_compare", 1223 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1224 spdk_json_decode_bool, true 1225 }, 1226 { 1227 
"enable_intr_mode_sq_spreading", 1228 offsetof(struct nvmf_vfio_user_transport, transport_opts.enable_intr_mode_sq_spreading), 1229 spdk_json_decode_bool, true 1230 }, 1231 }; 1232 1233 static struct spdk_nvmf_transport * 1234 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1235 { 1236 struct nvmf_vfio_user_transport *vu_transport; 1237 int err; 1238 1239 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1240 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1241 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1242 return NULL; 1243 } 1244 1245 vu_transport = calloc(1, sizeof(*vu_transport)); 1246 if (vu_transport == NULL) { 1247 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1248 return NULL; 1249 } 1250 1251 err = pthread_mutex_init(&vu_transport->lock, NULL); 1252 if (err != 0) { 1253 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1254 goto err; 1255 } 1256 TAILQ_INIT(&vu_transport->endpoints); 1257 1258 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1259 if (err != 0) { 1260 pthread_mutex_destroy(&vu_transport->lock); 1261 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1262 goto err; 1263 } 1264 TAILQ_INIT(&vu_transport->poll_groups); 1265 1266 if (opts->transport_specific != NULL && 1267 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1268 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1269 vu_transport)) { 1270 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1271 goto cleanup; 1272 } 1273 1274 /* 1275 * To support interrupt mode, the transport must be configured with 1276 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1277 * when a client writes new doorbell values to BAR0, via the 1278 * libvfio-user socket fd. 1279 */ 1280 vu_transport->intr_mode_supported = 1281 vu_transport->transport_opts.disable_mappable_bar0; 1282 1283 /* 1284 * If BAR0 is mappable, it doesn't make sense to support shadow 1285 * doorbells, so explicitly turn it off. 1286 */ 1287 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1288 vu_transport->transport_opts.disable_shadow_doorbells = true; 1289 } 1290 1291 if (spdk_interrupt_mode_is_enabled()) { 1292 if (!vu_transport->intr_mode_supported) { 1293 SPDK_ERRLOG("interrupt mode not supported\n"); 1294 goto cleanup; 1295 } 1296 1297 /* 1298 * If we are in interrupt mode, we cannot support adaptive IRQs, 1299 * as there is no guarantee the SQ poller will run subsequently 1300 * to send pending IRQs. 
1301 */ 1302 vu_transport->transport_opts.disable_adaptive_irq = true; 1303 } 1304 1305 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1306 vu_transport->transport_opts.disable_mappable_bar0); 1307 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1308 vu_transport->transport_opts.disable_adaptive_irq); 1309 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1310 vu_transport->transport_opts.disable_shadow_doorbells); 1311 1312 return &vu_transport->transport; 1313 1314 cleanup: 1315 pthread_mutex_destroy(&vu_transport->lock); 1316 pthread_mutex_destroy(&vu_transport->pg_lock); 1317 err: 1318 free(vu_transport); 1319 return NULL; 1320 } 1321 1322 static uint32_t 1323 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1324 { 1325 assert(vu_ctrlr != NULL); 1326 assert(vu_ctrlr->ctrlr != NULL); 1327 1328 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1329 } 1330 1331 static uint32_t 1332 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1333 { 1334 assert(vu_ctrlr != NULL); 1335 assert(vu_ctrlr->ctrlr != NULL); 1336 1337 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1338 } 1339 1340 static uintptr_t 1341 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1342 { 1343 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1344 return 1ul << memory_page_shift; 1345 } 1346 1347 static uintptr_t 1348 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1349 { 1350 return ~(memory_page_size(ctrlr) - 1); 1351 } 1352 1353 static int 1354 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1355 uint32_t q_size, bool is_cq, bool unmap) 1356 { 1357 uint64_t len; 1358 void *ret; 1359 1360 assert(q_size); 1361 assert(q_addr(mapping) == NULL); 1362 1363 if (is_cq) { 1364 len = q_size * sizeof(struct spdk_nvme_cpl); 1365 } else { 1366 len = q_size * sizeof(struct spdk_nvme_cmd); 1367 } 1368 1369 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, len, 1370 mapping->sg, &mapping->iov, 1371 is_cq ? PROT_READ | PROT_WRITE : PROT_READ); 1372 if (ret == NULL) { 1373 return -EFAULT; 1374 } 1375 1376 if (unmap) { 1377 memset(q_addr(mapping), 0, len); 1378 } 1379 1380 return 0; 1381 } 1382 1383 static inline void 1384 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1385 { 1386 if (q_addr(mapping) != NULL) { 1387 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1388 &mapping->iov, 1); 1389 mapping->iov.iov_base = NULL; 1390 } 1391 } 1392 1393 static int 1394 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1395 { 1396 struct nvmf_vfio_user_sq *sq; 1397 const struct spdk_nvmf_registers *regs; 1398 int ret; 1399 1400 assert(ctrlr != NULL); 1401 1402 sq = ctrlr->sqs[0]; 1403 1404 assert(sq != NULL); 1405 assert(q_addr(&sq->mapping) == NULL); 1406 /* XXX ctrlr->asq == 0 is a valid memory address */ 1407 1408 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1409 sq->qid = 0; 1410 sq->size = regs->aqa.bits.asqs + 1; 1411 sq->mapping.prp1 = regs->asq; 1412 *sq_headp(sq) = 0; 1413 sq->cqid = 0; 1414 1415 ret = map_q(ctrlr, &sq->mapping, sq->size, false, true); 1416 if (ret) { 1417 return ret; 1418 } 1419 1420 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1421 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1422 1423 *sq_dbl_tailp(sq) = 0; 1424 1425 return 0; 1426 } 1427 1428 /* 1429 * Updates eventidx to set an SQ into interrupt or polling mode. 
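 *
 * In outline, this is an illustrative sketch of the steps implemented below
 * (using helpers from this file; no additional behaviour):
 *
 *     old_tail = *sq_dbl_tailp(sq);
 *     *eventidx = old_tail;          // polling mode writes NVMF_VFIO_USER_EVENTIDX_POLL instead
 *     spdk_mb();                     // order the eventidx write before re-reading the tail
 *     new_tail = *sq_dbl_tailp(sq);
 *     return new_tail == *sq_headp(sq);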
1430 * 1431 * Returns false if the current SQ tail does not match the SQ head, as 1432 * this means that the host has submitted more items to the queue while we were 1433 * not looking - or during the event index update. In that case, we must retry, 1434 * or otherwise make sure we are going to wake up again. 1435 */ 1436 static bool 1437 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1438 { 1439 struct nvmf_vfio_user_ctrlr *ctrlr; 1440 volatile uint32_t *sq_tail_eidx; 1441 uint32_t old_tail, new_tail; 1442 1443 assert(sq != NULL); 1444 assert(sq->ctrlr != NULL); 1445 assert(sq->ctrlr->sdbl != NULL); 1446 assert(sq->need_rearm); 1447 assert(sq->qid != 0); 1448 1449 ctrlr = sq->ctrlr; 1450 1451 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1452 ctrlr_id(ctrlr), sq->qid); 1453 1454 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1455 1456 assert(ctrlr->endpoint != NULL); 1457 1458 if (!ctrlr->endpoint->interrupt_mode) { 1459 /* No synchronisation necessary. */ 1460 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1461 return true; 1462 } 1463 1464 old_tail = *sq_dbl_tailp(sq); 1465 *sq_tail_eidx = old_tail; 1466 1467 /* 1468 * Ensure that the event index is updated before re-reading the tail 1469 * doorbell. If it's not, then the host might race us and update the 1470 * tail after the second read but before the event index is written, so 1471 * it won't write to BAR0 and we'll miss the update. 1472 * 1473 * The driver should provide similar ordering with an mb(). 1474 */ 1475 spdk_mb(); 1476 1477 /* 1478 * Check if the host has updated the tail doorbell after we've read it 1479 * for the first time, but before the event index was written. If that's 1480 * the case, then we've lost the race and we need to update the event 1481 * index again (after polling the queue, since the host won't write to 1482 * BAR0). 1483 */ 1484 new_tail = *sq_dbl_tailp(sq); 1485 1486 /* 1487 * We might poll the queue straight after this function returns if the 1488 * tail has been updated, so we need to ensure that any changes to the 1489 * queue will be visible to us if the doorbell has been updated. 1490 * 1491 * The driver should provide similar ordering with a wmb() to ensure 1492 * that the queue is written before it updates the tail doorbell. 1493 */ 1494 spdk_rmb(); 1495 1496 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1497 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1498 new_tail, *sq_headp(sq)); 1499 1500 if (new_tail == *sq_headp(sq)) { 1501 sq->need_rearm = false; 1502 return true; 1503 } 1504 1505 /* 1506 * We've lost the race: the tail was updated since we last polled, 1507 * including if it happened within this routine. 1508 * 1509 * The caller should retry after polling (think of this as a cmpxchg 1510 * loop); if we go to sleep while the SQ is not empty, then we won't 1511 * process the remaining events. 1512 */ 1513 return false; 1514 } 1515 1516 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1517 1518 /* 1519 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1520 * processed some SQ entries. 
1521 */ 1522 static int 1523 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1524 struct nvmf_vfio_user_sq *sq, 1525 struct nvmf_vfio_user_poll_group *vu_group) 1526 { 1527 int count = 0; 1528 size_t i; 1529 1530 assert(sq->need_rearm); 1531 1532 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1533 int ret; 1534 1535 if (set_sq_eventidx(sq)) { 1536 /* We won the race and set eventidx; done. */ 1537 vu_group->stats.won++; 1538 return count; 1539 } 1540 1541 ret = nvmf_vfio_user_sq_poll(sq); 1542 1543 count += (ret < 0) ? 1 : ret; 1544 1545 /* 1546 * set_sq_eventidx() hit the race, so we expected 1547 * to process at least one command from this queue. 1548 * If there were no new commands waiting for us, then 1549 * we must have hit an unexpected race condition. 1550 */ 1551 if (ret == 0) { 1552 SPDK_ERRLOG("%s: unexpected race condition detected " 1553 "while updating the shadow doorbell buffer\n", 1554 ctrlr_id(ctrlr)); 1555 1556 fail_ctrlr(ctrlr); 1557 return count; 1558 } 1559 } 1560 1561 SPDK_DEBUGLOG(vfio_user_db, 1562 "%s: set_sq_eventidx() lost the race %zu times\n", 1563 ctrlr_id(ctrlr), i); 1564 1565 vu_group->stats.lost++; 1566 vu_group->stats.lost_count += count; 1567 1568 /* 1569 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1570 * we raced with the producer too many times; force ourselves to wake up 1571 * instead. We'll process all queues at that point. 1572 */ 1573 ctrlr_kick(ctrlr); 1574 1575 return count; 1576 } 1577 1578 /* 1579 * We're in interrupt mode, and potentially about to go to sleep. We need to 1580 * make sure any further I/O submissions are guaranteed to wake us up: for 1581 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1582 * every SQ that needs re-arming. 1583 * 1584 * Returns non-zero if we processed something. 1585 */ 1586 static int 1587 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1588 { 1589 struct nvmf_vfio_user_sq *sq; 1590 int count = 0; 1591 1592 vu_group->stats.rearms++; 1593 1594 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1595 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1596 continue; 1597 } 1598 1599 if (sq->need_rearm) { 1600 count += vfio_user_sq_rearm(sq->ctrlr, sq, vu_group); 1601 } 1602 } 1603 1604 return count; 1605 } 1606 1607 static int 1608 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1609 { 1610 struct nvmf_vfio_user_cq *cq; 1611 const struct spdk_nvmf_registers *regs; 1612 int ret; 1613 1614 assert(ctrlr != NULL); 1615 1616 cq = ctrlr->cqs[0]; 1617 1618 assert(cq != NULL); 1619 1620 assert(q_addr(&cq->mapping) == NULL); 1621 1622 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1623 assert(regs != NULL); 1624 cq->qid = 0; 1625 cq->size = regs->aqa.bits.acqs + 1; 1626 cq->mapping.prp1 = regs->acq; 1627 *cq_tailp(cq) = 0; 1628 cq->ien = true; 1629 cq->phase = true; 1630 1631 ret = map_q(ctrlr, &cq->mapping, cq->size, true, true); 1632 if (ret) { 1633 return ret; 1634 } 1635 1636 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
*/ 1637 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1638 1639 *cq_dbl_headp(cq) = 0; 1640 1641 return 0; 1642 } 1643 1644 static void * 1645 _map_one(void *prv, uint64_t addr, uint64_t len, int prot) 1646 { 1647 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1648 struct spdk_nvmf_qpair *qpair; 1649 struct nvmf_vfio_user_req *vu_req; 1650 struct nvmf_vfio_user_sq *sq; 1651 void *ret; 1652 1653 assert(req != NULL); 1654 qpair = req->qpair; 1655 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1656 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1657 1658 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1659 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1660 index_to_sg_t(vu_req->sg, vu_req->iovcnt), 1661 &vu_req->iov[vu_req->iovcnt], prot); 1662 if (spdk_likely(ret != NULL)) { 1663 vu_req->iovcnt++; 1664 } 1665 return ret; 1666 } 1667 1668 static int 1669 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1670 struct iovec *iov, uint32_t length) 1671 { 1672 /* Map the PRP list from guest physical memory to 1673 * local process virtual memory addresses. 1674 */ 1675 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1676 length, 4096, _map_one); 1677 } 1678 1679 static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1680 struct nvmf_vfio_user_sq *sq); 1681 1682 static uint32_t 1683 cq_free_slots(struct nvmf_vfio_user_cq *cq) 1684 { 1685 uint32_t free_slots; 1686 1687 assert(cq != NULL); 1688 1689 if (cq->tail == cq->last_head) { 1690 free_slots = cq->size; 1691 } else if (cq->tail > cq->last_head) { 1692 free_slots = cq->size - (cq->tail - cq->last_head); 1693 } else { 1694 free_slots = cq->last_head - cq->tail; 1695 } 1696 assert(free_slots > 0); 1697 1698 return free_slots - 1; 1699 } 1700 1701 /* 1702 * Since reading the head doorbell is relatively expensive, we use the cached 1703 * value, so we only have to read it for real if it appears that we are full. 1704 */ 1705 static inline bool 1706 cq_is_full(struct nvmf_vfio_user_cq *cq) 1707 { 1708 uint32_t free_cq_slots; 1709 1710 assert(cq != NULL); 1711 1712 free_cq_slots = cq_free_slots(cq); 1713 1714 if (spdk_unlikely(free_cq_slots == 0)) { 1715 cq->last_head = *cq_dbl_headp(cq); 1716 free_cq_slots = cq_free_slots(cq); 1717 } 1718 1719 return free_cq_slots == 0; 1720 } 1721 1722 /* 1723 * Posts a CQE in the completion queue. 1724 * 1725 * @ctrlr: the vfio-user controller 1726 * @cq: the completion queue 1727 * @cdw0: cdw0 as reported by NVMf 1728 * @sqid: submission queue ID 1729 * @cid: command identifier in NVMe command 1730 * @sc: the NVMe CQE status code 1731 * @sct: the NVMe CQE status code type 1732 */ 1733 static int 1734 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1735 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1736 { 1737 struct spdk_nvme_status cpl_status = { 0 }; 1738 struct spdk_nvme_cpl *cpl; 1739 int err; 1740 1741 assert(ctrlr != NULL); 1742 1743 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1744 return 0; 1745 } 1746 1747 if (cq->qid == 0) { 1748 assert(spdk_get_thread() == cq->group->group->thread); 1749 } 1750 1751 /* 1752 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 1753 * control: if there is no space in the CQ, we should wait until there is.
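 *
 * (As a worked example of cq_free_slots() above: with cq->size = 8, tail = 5
 * and a cached last_head of 2, the free space is 8 - (5 - 2) = 5 slots, minus
 * the one slot always kept unused, i.e. 4 usable entries.)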
1754 * 1755 * In practice, we just fail the controller instead: as it happens, all host 1756 * implementations we care about right-size the CQ; this is required anyway for 1757 * NVMEoF support (see 3.3.2.8). 1758 */ 1759 if (cq_is_full(cq)) { 1760 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1761 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1762 *cq_dbl_headp(cq)); 1763 return -1; 1764 } 1765 1766 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1767 1768 assert(ctrlr->sqs[sqid] != NULL); 1769 SPDK_DEBUGLOG(nvmf_vfio, 1770 "%s: request complete sqid:%d cid=%d status=%#x " 1771 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1772 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1773 1774 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1775 cpl->sqid = sqid; 1776 cpl->cid = cid; 1777 cpl->cdw0 = cdw0; 1778 1779 /* 1780 * This is a bitfield: instead of setting the individual bits we need 1781 * directly in cpl->status, which would cause a read-modify-write cycle, 1782 * we'll avoid reading from the CPL altogether by filling in a local 1783 * cpl_status variable, then writing the whole thing. 1784 */ 1785 cpl_status.sct = sct; 1786 cpl_status.sc = sc; 1787 cpl_status.p = cq->phase; 1788 cpl->status = cpl_status; 1789 1790 /* Ensure the Completion Queue Entry is visible. */ 1791 spdk_wmb(); 1792 cq_tail_advance(cq); 1793 1794 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1795 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1796 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1797 if (err != 0) { 1798 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1799 ctrlr_id(ctrlr)); 1800 return err; 1801 } 1802 } 1803 1804 return 0; 1805 } 1806 1807 static void 1808 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1809 { 1810 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1811 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1812 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1813 free(vu_req); 1814 } 1815 } 1816 1817 static void 1818 delete_cq_done(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq) 1819 { 1820 assert(cq->cq_ref == 0); 1821 unmap_q(ctrlr, &cq->mapping); 1822 cq->size = 0; 1823 cq->cq_state = VFIO_USER_CQ_DELETED; 1824 cq->group = NULL; 1825 } 1826 1827 /* Deletes an SQ. If this SQ is the last user of the associated CQ 1828 * and the controller is being shut down/reset or the vfio-user client disconnects, 1829 * then the CQ is also deleted. 1830 */ 1831 static void 1832 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1833 { 1834 struct nvmf_vfio_user_cq *cq; 1835 uint16_t cqid; 1836 1837 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1838 sq->qid, sq); 1839 1840 /* Free SQ resources */ 1841 unmap_q(vu_ctrlr, &sq->mapping); 1842 1843 free_sq_reqs(sq); 1844 1845 sq->size = 0; 1846 1847 sq->sq_state = VFIO_USER_SQ_DELETED; 1848 1849 /* Controller RESET and SHUTDOWN are special cases: 1850 * the VM may not send DELETE IO SQ/CQ commands, so the NVMf library 1851 * will disconnect the IO queue pairs.
1852 */ 1853 if (vu_ctrlr->reset_shn || vu_ctrlr->disconnect) { 1854 cqid = sq->cqid; 1855 cq = vu_ctrlr->cqs[cqid]; 1856 1857 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1858 cq->qid, cq); 1859 1860 assert(cq->cq_ref > 0); 1861 if (--cq->cq_ref == 0) { 1862 delete_cq_done(vu_ctrlr, cq); 1863 } 1864 } 1865 } 1866 1867 static void 1868 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1869 { 1870 struct nvmf_vfio_user_sq *sq; 1871 struct nvmf_vfio_user_cq *cq; 1872 1873 if (ctrlr == NULL) { 1874 return; 1875 } 1876 1877 sq = ctrlr->sqs[qid]; 1878 if (sq) { 1879 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free sqid:%u\n", ctrlr_id(ctrlr), qid); 1880 unmap_q(ctrlr, &sq->mapping); 1881 1882 free_sq_reqs(sq); 1883 1884 free(sq->mapping.sg); 1885 free(sq); 1886 ctrlr->sqs[qid] = NULL; 1887 } 1888 1889 cq = ctrlr->cqs[qid]; 1890 if (cq) { 1891 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1892 unmap_q(ctrlr, &cq->mapping); 1893 free(cq->mapping.sg); 1894 free(cq); 1895 ctrlr->cqs[qid] = NULL; 1896 } 1897 } 1898 1899 static int 1900 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1901 const uint16_t id) 1902 { 1903 struct nvmf_vfio_user_sq *sq; 1904 1905 assert(ctrlr != NULL); 1906 assert(transport != NULL); 1907 assert(ctrlr->sqs[id] == NULL); 1908 1909 sq = calloc(1, sizeof(*sq)); 1910 if (sq == NULL) { 1911 return -ENOMEM; 1912 } 1913 sq->mapping.sg = calloc(1, dma_sg_size()); 1914 if (sq->mapping.sg == NULL) { 1915 free(sq); 1916 return -ENOMEM; 1917 } 1918 1919 sq->qid = id; 1920 sq->qpair.qid = id; 1921 sq->qpair.transport = transport; 1922 sq->ctrlr = ctrlr; 1923 ctrlr->sqs[id] = sq; 1924 1925 TAILQ_INIT(&sq->free_reqs); 1926 1927 return 0; 1928 } 1929 1930 static int 1931 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1932 { 1933 struct nvmf_vfio_user_cq *cq; 1934 1935 assert(vu_ctrlr != NULL); 1936 assert(vu_ctrlr->cqs[id] == NULL); 1937 1938 cq = calloc(1, sizeof(*cq)); 1939 if (cq == NULL) { 1940 return -ENOMEM; 1941 } 1942 cq->mapping.sg = calloc(1, dma_sg_size()); 1943 if (cq->mapping.sg == NULL) { 1944 free(cq); 1945 return -ENOMEM; 1946 } 1947 1948 cq->qid = id; 1949 vu_ctrlr->cqs[id] = cq; 1950 1951 return 0; 1952 } 1953 1954 static int 1955 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1956 { 1957 struct nvmf_vfio_user_req *vu_req, *tmp; 1958 size_t req_size; 1959 uint32_t i; 1960 1961 req_size = sizeof(struct nvmf_vfio_user_req) + 1962 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1963 1964 for (i = 0; i < sq->size; i++) { 1965 struct spdk_nvmf_request *req; 1966 1967 vu_req = calloc(1, req_size); 1968 if (vu_req == NULL) { 1969 goto err; 1970 } 1971 1972 req = &vu_req->req; 1973 req->qpair = &sq->qpair; 1974 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1975 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1976 req->stripped_data = NULL; 1977 1978 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1979 } 1980 1981 return 0; 1982 1983 err: 1984 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1985 free(vu_req); 1986 } 1987 return -ENOMEM; 1988 } 1989 1990 static volatile uint32_t * 1991 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 1992 { 1993 return ctrlr->sdbl != NULL ? 
1994 ctrlr->sdbl->shadow_doorbells : 1995 ctrlr->bar0_doorbells; 1996 } 1997 1998 static uint16_t 1999 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 2000 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2001 { 2002 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2003 struct nvmf_vfio_user_sq *sq; 2004 uint32_t qsize; 2005 uint16_t cqid; 2006 uint16_t qid; 2007 int err; 2008 2009 qid = cmd->cdw10_bits.create_io_q.qid; 2010 cqid = cmd->cdw11_bits.create_io_sq.cqid; 2011 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2012 2013 if (ctrlr->sqs[qid] == NULL) { 2014 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 2015 if (err != 0) { 2016 *sct = SPDK_NVME_SCT_GENERIC; 2017 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2018 } 2019 } 2020 2021 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2022 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 2023 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2024 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2025 } 2026 2027 /* CQ must be created before SQ. */ 2028 if (!io_q_exists(ctrlr, cqid, true)) { 2029 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 2030 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2031 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 2032 } 2033 2034 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 2035 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 2036 *sct = SPDK_NVME_SCT_GENERIC; 2037 return SPDK_NVME_SC_INVALID_FIELD; 2038 } 2039 2040 sq = ctrlr->sqs[qid]; 2041 sq->size = qsize; 2042 2043 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 2044 qid, cqid); 2045 2046 sq->mapping.prp1 = cmd->dptr.prp.prp1; 2047 2048 err = map_q(ctrlr, &sq->mapping, sq->size, false, true); 2049 if (err) { 2050 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2051 *sct = SPDK_NVME_SCT_GENERIC; 2052 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2053 } 2054 2055 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 2056 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2057 q_addr(&sq->mapping)); 2058 2059 err = alloc_sq_reqs(ctrlr, sq); 2060 if (err < 0) { 2061 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 2062 *sct = SPDK_NVME_SCT_GENERIC; 2063 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2064 } 2065 2066 sq->cqid = cqid; 2067 ctrlr->cqs[sq->cqid]->cq_ref++; 2068 sq->sq_state = VFIO_USER_SQ_CREATED; 2069 *sq_headp(sq) = 0; 2070 2071 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 2072 2073 /* 2074 * We should always reset the doorbells. 2075 * 2076 * The Specification prohibits the controller from writing to the shadow 2077 * doorbell buffer, however older versions of the Linux NVMe driver 2078 * don't reset the shadow doorbell buffer after a Queue-Level or 2079 * Controller-Level reset, which means that we're left with garbage 2080 * doorbell values. 2081 */ 2082 *sq_dbl_tailp(sq) = 0; 2083 2084 if (ctrlr->sdbl != NULL) { 2085 sq->need_rearm = true; 2086 2087 if (!set_sq_eventidx(sq)) { 2088 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 2089 "sqid:%hu was initialized\n", 2090 ctrlr_id(ctrlr), qid); 2091 fail_ctrlr(ctrlr); 2092 *sct = SPDK_NVME_SCT_GENERIC; 2093 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2094 } 2095 } 2096 2097 /* 2098 * Create our new I/O qpair. This asynchronously invokes, on a suitable 2099 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 2100 * call spdk_nvmf_request_exec_fabrics() with a generated fabrics 2101 * connect command. 
This command is then eventually completed via 2102 * handle_queue_connect_rsp(). 2103 */ 2104 sq->create_io_sq_cmd = *cmd; 2105 sq->post_create_io_sq_completion = true; 2106 2107 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 2108 &sq->qpair); 2109 2110 *sct = SPDK_NVME_SCT_GENERIC; 2111 return SPDK_NVME_SC_SUCCESS; 2112 } 2113 2114 static uint16_t 2115 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 2116 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2117 { 2118 struct nvmf_vfio_user_cq *cq; 2119 uint32_t qsize; 2120 uint16_t qid; 2121 int err; 2122 2123 qid = cmd->cdw10_bits.create_io_q.qid; 2124 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2125 2126 if (ctrlr->cqs[qid] == NULL) { 2127 err = init_cq(ctrlr, qid); 2128 if (err != 0) { 2129 *sct = SPDK_NVME_SCT_GENERIC; 2130 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2131 } 2132 } 2133 2134 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2135 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2136 *sct = SPDK_NVME_SCT_GENERIC; 2137 return SPDK_NVME_SC_INVALID_FIELD; 2138 } 2139 2140 if (cmd->cdw11_bits.create_io_cq.iv > NVME_IRQ_MSIX_NUM - 1) { 2141 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2142 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2143 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2144 } 2145 2146 cq = ctrlr->cqs[qid]; 2147 cq->size = qsize; 2148 2149 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2150 2151 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2152 2153 err = map_q(ctrlr, &cq->mapping, cq->size, true, true); 2154 if (err) { 2155 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2156 *sct = SPDK_NVME_SCT_GENERIC; 2157 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2158 } 2159 2160 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2161 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2162 q_addr(&cq->mapping)); 2163 2164 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2165 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2166 cq->phase = true; 2167 cq->cq_state = VFIO_USER_CQ_CREATED; 2168 2169 *cq_tailp(cq) = 0; 2170 2171 /* 2172 * We should always reset the doorbells. 2173 * 2174 * The Specification prohibits the controller from writing to the shadow 2175 * doorbell buffer, however older versions of the Linux NVMe driver 2176 * don't reset the shadow doorbell buffer after a Queue-Level or 2177 * Controller-Level reset, which means that we're left with garbage 2178 * doorbell values. 2179 */ 2180 *cq_dbl_headp(cq) = 0; 2181 2182 *sct = SPDK_NVME_SCT_GENERIC; 2183 return SPDK_NVME_SC_SUCCESS; 2184 } 2185 2186 /* 2187 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2188 * on error. 
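* Note that a 0 return does not necessarily mean the admin command has been completed: for SQ creation the completion is posted asynchronously once the new qpair has been connected.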
2189 */ 2190 static int 2191 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2192 struct spdk_nvme_cmd *cmd, const bool is_cq) 2193 { 2194 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2195 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2196 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2197 uint32_t qsize; 2198 uint16_t qid; 2199 2200 assert(ctrlr != NULL); 2201 assert(cmd != NULL); 2202 2203 qid = cmd->cdw10_bits.create_io_q.qid; 2204 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2205 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2206 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2207 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2208 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2209 goto out; 2210 } 2211 2212 if (io_q_exists(ctrlr, qid, is_cq)) { 2213 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2214 is_cq ? 'c' : 's', qid); 2215 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2216 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2217 goto out; 2218 } 2219 2220 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2221 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2222 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2223 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2224 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2225 goto out; 2226 } 2227 2228 if (is_cq) { 2229 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2230 } else { 2231 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2232 2233 if (sct == SPDK_NVME_SCT_GENERIC && 2234 sc == SPDK_NVME_SC_SUCCESS) { 2235 /* Completion posted asynchronously. */ 2236 return 0; 2237 } 2238 } 2239 2240 out: 2241 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2242 } 2243 2244 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2245 * queue pair, so save the command id and controller in a context. 2246 */ 2247 struct vfio_user_delete_sq_ctx { 2248 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2249 uint16_t cid; 2250 }; 2251 2252 static void 2253 vfio_user_qpair_delete_cb(void *cb_arg) 2254 { 2255 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2256 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2257 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2258 2259 assert(admin_cq != NULL); 2260 assert(admin_cq->group != NULL); 2261 assert(admin_cq->group->group->thread != NULL); 2262 if (admin_cq->group->group->thread != spdk_get_thread()) { 2263 spdk_thread_send_msg(admin_cq->group->group->thread, 2264 vfio_user_qpair_delete_cb, 2265 cb_arg); 2266 } else { 2267 post_completion(vu_ctrlr, admin_cq, 0, 0, 2268 ctx->cid, 2269 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2270 free(ctx); 2271 } 2272 } 2273 2274 /* 2275 * Deletes a completion or submission I/O queue. 2276 */ 2277 static int 2278 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2279 struct spdk_nvme_cmd *cmd, const bool is_cq) 2280 { 2281 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2282 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2283 struct nvmf_vfio_user_sq *sq; 2284 struct nvmf_vfio_user_cq *cq; 2285 2286 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2287 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2288 cmd->cdw10_bits.delete_io_q.qid); 2289 2290 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2291 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2292 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2293 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2294 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2295 goto out; 2296 } 2297 2298 if (is_cq) { 2299 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2300 if (cq->cq_ref) { 2301 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2302 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2303 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2304 goto out; 2305 } 2306 delete_cq_done(ctrlr, cq); 2307 } else { 2308 /* 2309 * Deletion of the CQ is only deferred to delete_sq_done() on 2310 * VM reboot or CC.EN change, so we have to delete it in all 2311 * other cases. 2312 */ 2313 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2314 sq->delete_ctx = calloc(1, sizeof(*sq->delete_ctx)); 2315 if (!sq->delete_ctx) { 2316 sct = SPDK_NVME_SCT_GENERIC; 2317 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2318 goto out; 2319 } 2320 sq->delete_ctx->vu_ctrlr = ctrlr; 2321 sq->delete_ctx->cid = cmd->cid; 2322 sq->sq_state = VFIO_USER_SQ_DELETED; 2323 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2324 ctrlr->cqs[sq->cqid]->cq_ref--; 2325 2326 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 2327 return 0; 2328 } 2329 2330 out: 2331 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2332 } 2333 2334 /* 2335 * Configures Shadow Doorbells. 2336 */ 2337 static int 2338 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2339 { 2340 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2341 uint32_t dstrd; 2342 uintptr_t page_size, page_mask; 2343 uint64_t prp1, prp2; 2344 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2345 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2346 2347 assert(ctrlr != NULL); 2348 assert(ctrlr->endpoint != NULL); 2349 assert(cmd != NULL); 2350 2351 dstrd = doorbell_stride(ctrlr); 2352 page_size = memory_page_size(ctrlr); 2353 page_mask = memory_page_mask(ctrlr); 2354 2355 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2356 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2357 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2358 ctrlr_id(ctrlr)); 2359 2360 goto out; 2361 } 2362 2363 /* Verify guest physical addresses passed as PRPs. */ 2364 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2365 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2366 ctrlr_id(ctrlr)); 2367 2368 goto out; 2369 } 2370 2371 prp1 = cmd->dptr.prp.prp1; 2372 prp2 = cmd->dptr.prp.prp2; 2373 2374 SPDK_DEBUGLOG(nvmf_vfio, 2375 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2376 ctrlr_id(ctrlr), prp1, prp2); 2377 2378 if (prp1 == prp2 2379 || prp1 != (prp1 & page_mask) 2380 || prp2 != (prp2 & page_mask)) { 2381 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2382 ctrlr_id(ctrlr)); 2383 2384 goto out; 2385 } 2386 2387 /* Map guest physical addresses to our virtual address space. 
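* PRP1 holds the GPA of the shadow doorbell buffer and PRP2 that of the EventIdx buffer, as recorded below in shadow_doorbell_buffer and eventidx_buffer.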
*/ 2388 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2389 if (sdbl == NULL) { 2390 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2391 ctrlr_id(ctrlr)); 2392 2393 goto out; 2394 } 2395 2396 ctrlr->shadow_doorbell_buffer = prp1; 2397 ctrlr->eventidx_buffer = prp2; 2398 2399 SPDK_DEBUGLOG(nvmf_vfio, 2400 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2401 ctrlr_id(ctrlr), 2402 sdbl->iovs[0].iov_base, 2403 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2404 sdbl->iovs[1].iov_base, 2405 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2406 2407 2408 /* 2409 * Set all possible CQ head doorbells to polling mode now, such that we 2410 * don't have to worry about it later if the host creates more queues. 2411 * 2412 * We only ever want interrupts for writes to the SQ tail doorbells 2413 * (which are initialised in set_ctrlr_intr_mode() below). 2414 */ 2415 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2416 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2417 } 2418 2419 /* Update controller. */ 2420 SWAP(ctrlr->sdbl, sdbl); 2421 2422 /* 2423 * Copy doorbells from either the previous shadow doorbell buffer or the 2424 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2425 * 2426 * This needs to account for older versions of the Linux NVMe driver, 2427 * which don't clear out the buffer after a controller reset. 2428 */ 2429 copy_doorbells(ctrlr, sdbl != NULL ? 2430 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2431 ctrlr->sdbl->shadow_doorbells); 2432 2433 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2434 2435 ctrlr_kick(ctrlr); 2436 2437 sc = SPDK_NVME_SC_SUCCESS; 2438 2439 out: 2440 /* 2441 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2442 * more than once (pointless, but not prohibited by the spec), or 2443 * in case of an error. 2444 * 2445 * If this is the first time Doorbell Buffer Config was processed, 2446 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2447 * free_sdbl() becomes a noop. 2448 */ 2449 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2450 2451 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2452 } 2453 2454 /* Returns 0 on success and -errno on error. */ 2455 static int 2456 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2457 { 2458 assert(ctrlr != NULL); 2459 assert(cmd != NULL); 2460 2461 if (cmd->fuse != 0) { 2462 /* Fused admin commands are not supported. 
*/ 2463 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2464 SPDK_NVME_SC_INVALID_FIELD, 2465 SPDK_NVME_SCT_GENERIC); 2466 } 2467 2468 switch (cmd->opc) { 2469 case SPDK_NVME_OPC_CREATE_IO_CQ: 2470 case SPDK_NVME_OPC_CREATE_IO_SQ: 2471 return handle_create_io_q(ctrlr, cmd, 2472 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2473 case SPDK_NVME_OPC_DELETE_IO_SQ: 2474 case SPDK_NVME_OPC_DELETE_IO_CQ: 2475 return handle_del_io_q(ctrlr, cmd, 2476 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2477 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2478 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2479 return handle_doorbell_buffer_config(ctrlr, cmd); 2480 } 2481 /* FALLTHROUGH */ 2482 default: 2483 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2484 } 2485 } 2486 2487 static int 2488 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2489 { 2490 struct nvmf_vfio_user_sq *sq = cb_arg; 2491 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2492 uint16_t sqid, cqid; 2493 2494 assert(sq != NULL); 2495 assert(vu_req != NULL); 2496 assert(vu_ctrlr != NULL); 2497 2498 if (spdk_likely(vu_req->iovcnt)) { 2499 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2500 index_to_sg_t(vu_req->sg, 0), 2501 vu_req->iov, vu_req->iovcnt); 2502 } 2503 sqid = sq->qid; 2504 cqid = sq->cqid; 2505 2506 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2507 vu_req->req.rsp->nvme_cpl.cdw0, 2508 sqid, 2509 vu_req->req.cmd->nvme_cmd.cid, 2510 vu_req->req.rsp->nvme_cpl.status.sc, 2511 vu_req->req.rsp->nvme_cpl.status.sct); 2512 } 2513 2514 static int 2515 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2516 struct spdk_nvme_cmd *cmd) 2517 { 2518 assert(sq != NULL); 2519 if (spdk_unlikely(nvmf_qpair_is_admin_queue(&sq->qpair))) { 2520 return consume_admin_cmd(ctrlr, cmd); 2521 } 2522 2523 return handle_cmd_req(ctrlr, cmd, sq); 2524 } 2525 2526 /* Returns the number of commands processed, or a negative value on error. */ 2527 static int 2528 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2529 struct nvmf_vfio_user_sq *sq) 2530 { 2531 struct spdk_nvme_cmd *queue; 2532 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 2533 int count = 0; 2534 uint32_t free_cq_slots; 2535 2536 assert(ctrlr != NULL); 2537 assert(sq != NULL); 2538 2539 if (ctrlr->sdbl != NULL && sq->qid != 0) { 2540 /* 2541 * Submission queue index has moved past the event index, so it 2542 * needs to be re-armed before we go to sleep. 2543 */ 2544 sq->need_rearm = true; 2545 } 2546 2547 free_cq_slots = cq_free_slots(cq); 2548 queue = q_addr(&sq->mapping); 2549 while (*sq_headp(sq) != new_tail) { 2550 int err; 2551 struct spdk_nvme_cmd *cmd; 2552 2553 /* 2554 * The Linux host nvme driver can submit more commands than there are 2555 * free CQ slots available, so only process those that have a free CQ slot. 2556 */ 2557 if (free_cq_slots-- == 0) { 2558 cq->last_head = *cq_dbl_headp(cq); 2559 2560 free_cq_slots = cq_free_slots(cq); 2561 if (free_cq_slots > 0) { 2562 continue; 2563 } 2564 2565 /* 2566 * If there are no free CQ slots then kick the interrupt FD to loop 2567 * again and process the remaining SQ commands. 2568 * In polling mode we will process the remaining SQ commands during the 2569 * next polling iteration. 2570 * The SQ head is advanced only for consumed commands.
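* The eventfd write below simply re-triggers our own interrupt handler, so this poller runs again without waiting for another doorbell write from the guest.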
2571 */ 2572 if (in_interrupt_mode(ctrlr->transport)) { 2573 eventfd_write(ctrlr->intr_fd, 1); 2574 } 2575 break; 2576 } 2577 2578 cmd = &queue[*sq_headp(sq)]; 2579 count++; 2580 2581 /* 2582 * SQHD must contain the new head pointer, so we must increase 2583 * it before we generate a completion. 2584 */ 2585 sq_head_advance(sq); 2586 2587 err = consume_cmd(ctrlr, sq, cmd); 2588 if (spdk_unlikely(err != 0)) { 2589 return err; 2590 } 2591 } 2592 2593 return count; 2594 } 2595 2596 /* Checks whether endpoint is connected from the same process */ 2597 static bool 2598 is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint) 2599 { 2600 struct ucred ucred; 2601 socklen_t ucredlen = sizeof(ucred); 2602 2603 if (endpoint == NULL) { 2604 return false; 2605 } 2606 2607 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred, 2608 &ucredlen) < 0) { 2609 SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno)); 2610 return false; 2611 } 2612 2613 return ucred.pid == getpid(); 2614 } 2615 2616 static void 2617 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2618 { 2619 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2620 struct nvmf_vfio_user_ctrlr *ctrlr; 2621 struct nvmf_vfio_user_sq *sq; 2622 struct nvmf_vfio_user_cq *cq; 2623 void *map_start, *map_end; 2624 int ret; 2625 2626 /* 2627 * We're not interested in any DMA regions that aren't mappable (we don't 2628 * support clients that don't share their memory). 2629 */ 2630 if (!info->vaddr) { 2631 return; 2632 } 2633 2634 map_start = info->mapping.iov_base; 2635 map_end = info->mapping.iov_base + info->mapping.iov_len; 2636 2637 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2638 (info->mapping.iov_len & MASK_2MB)) { 2639 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2640 info->vaddr, map_start, map_end); 2641 return; 2642 } 2643 2644 assert(endpoint != NULL); 2645 if (endpoint->ctrlr == NULL) { 2646 return; 2647 } 2648 ctrlr = endpoint->ctrlr; 2649 2650 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2651 map_start, map_end); 2652 2653 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2654 * check the protection bits before registering. When vfio client and server are run in same process 2655 * there is no need to register the same memory again. 
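* is_peer_same_process() detects that case by comparing the SO_PEERCRED pid of the vfio-user socket with our own pid.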
2656 */ 2657 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2658 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2659 if (ret) { 2660 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2661 map_start, map_end, ret); 2662 } 2663 } 2664 2665 pthread_mutex_lock(&endpoint->lock); 2666 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2667 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2668 continue; 2669 } 2670 2671 cq = ctrlr->cqs[sq->cqid]; 2672 2673 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2674 if (cq->size && q_addr(&cq->mapping) == NULL) { 2675 ret = map_q(ctrlr, &cq->mapping, cq->size, true, false); 2676 if (ret) { 2677 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2678 cq->qid, cq->mapping.prp1, 2679 cq->mapping.prp1 + cq->size * sizeof(struct spdk_nvme_cpl)); 2680 continue; 2681 } 2682 } 2683 2684 if (sq->size) { 2685 ret = map_q(ctrlr, &sq->mapping, sq->size, false, false); 2686 if (ret) { 2687 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2688 sq->qid, sq->mapping.prp1, 2689 sq->mapping.prp1 + sq->size * sizeof(struct spdk_nvme_cmd)); 2690 continue; 2691 } 2692 } 2693 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2694 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2695 } 2696 pthread_mutex_unlock(&endpoint->lock); 2697 } 2698 2699 static void 2700 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2701 { 2702 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2703 struct nvmf_vfio_user_sq *sq; 2704 struct nvmf_vfio_user_cq *cq; 2705 void *map_start, *map_end; 2706 int ret = 0; 2707 2708 if (!info->vaddr) { 2709 return; 2710 } 2711 2712 map_start = info->mapping.iov_base; 2713 map_end = info->mapping.iov_base + info->mapping.iov_len; 2714 2715 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2716 (info->mapping.iov_len & MASK_2MB)) { 2717 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2718 info->vaddr, map_start, map_end); 2719 return; 2720 } 2721 2722 assert(endpoint != NULL); 2723 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2724 map_start, map_end); 2725 2726 if (endpoint->ctrlr != NULL) { 2727 struct nvmf_vfio_user_ctrlr *ctrlr; 2728 ctrlr = endpoint->ctrlr; 2729 2730 pthread_mutex_lock(&endpoint->lock); 2731 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2732 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2733 unmap_q(ctrlr, &sq->mapping); 2734 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2735 } 2736 2737 cq = ctrlr->cqs[sq->cqid]; 2738 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2739 unmap_q(ctrlr, &cq->mapping); 2740 } 2741 } 2742 2743 if (ctrlr->sdbl != NULL) { 2744 size_t i; 2745 2746 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2747 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2748 2749 if (iov_base >= map_start && iov_base < map_end) { 2750 copy_doorbells(ctrlr, 2751 ctrlr->sdbl->shadow_doorbells, 2752 ctrlr->bar0_doorbells); 2753 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2754 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2755 ctrlr->sdbl = NULL; 2756 break; 2757 } 2758 } 2759 } 2760 2761 pthread_mutex_unlock(&endpoint->lock); 2762 } 2763 2764 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2765 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2766 if 
(ret) { 2767 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n", 2768 map_start, map_end, ret); 2769 } 2770 } 2771 } 2772 2773 /* Used to initiate a controller-level reset or a controller shutdown. */ 2774 static void 2775 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2776 { 2777 SPDK_DEBUGLOG(nvmf_vfio, "%s: disabling controller\n", 2778 ctrlr_id(vu_ctrlr)); 2779 2780 /* Unmap Admin queue. */ 2781 2782 assert(vu_ctrlr->sqs[0] != NULL); 2783 assert(vu_ctrlr->cqs[0] != NULL); 2784 2785 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2786 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2787 2788 vu_ctrlr->sqs[0]->size = 0; 2789 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2790 2791 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2792 2793 vu_ctrlr->cqs[0]->size = 0; 2794 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2795 2796 /* 2797 * For PCIe controller reset or shutdown, we will drop all AER 2798 * responses. 2799 */ 2800 spdk_nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2801 2802 /* Free the shadow doorbell buffer. */ 2803 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 2804 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2805 vu_ctrlr->sdbl = NULL; 2806 } 2807 2808 /* Used to re-enable the controller after a controller-level reset. */ 2809 static int 2810 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2811 { 2812 int err; 2813 2814 assert(vu_ctrlr != NULL); 2815 2816 SPDK_DEBUGLOG(nvmf_vfio, "%s: enabling controller\n", 2817 ctrlr_id(vu_ctrlr)); 2818 2819 err = acq_setup(vu_ctrlr); 2820 if (err != 0) { 2821 return err; 2822 } 2823 2824 err = asq_setup(vu_ctrlr); 2825 if (err != 0) { 2826 return err; 2827 } 2828 2829 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2830 2831 return 0; 2832 } 2833 2834 static int 2835 nvmf_vfio_user_prop_req_rsp_set(struct nvmf_vfio_user_req *req, 2836 struct nvmf_vfio_user_sq *sq) 2837 { 2838 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2839 union spdk_nvme_cc_register cc, diff; 2840 2841 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2842 assert(sq->ctrlr != NULL); 2843 vu_ctrlr = sq->ctrlr; 2844 2845 if (req->req.cmd->prop_set_cmd.ofst != offsetof(struct spdk_nvme_registers, cc)) { 2846 return 0; 2847 } 2848 2849 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2850 diff.raw = cc.raw ^ req->cc.raw; 2851 2852 if (diff.bits.en) { 2853 if (cc.bits.en) { 2854 int ret = enable_ctrlr(vu_ctrlr); 2855 if (ret) { 2856 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2857 return ret; 2858 } 2859 vu_ctrlr->reset_shn = false; 2860 } else { 2861 vu_ctrlr->reset_shn = true; 2862 } 2863 } 2864 2865 if (diff.bits.shn) { 2866 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2867 vu_ctrlr->reset_shn = true; 2868 } 2869 } 2870 2871 if (vu_ctrlr->reset_shn) { 2872 disable_ctrlr(vu_ctrlr); 2873 } 2874 return 0; 2875 } 2876 2877 static int 2878 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2879 { 2880 struct nvmf_vfio_user_sq *sq = cb_arg; 2881 2882 assert(sq != NULL); 2883 assert(req != NULL); 2884 2885 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2886 assert(sq->ctrlr != NULL); 2887 assert(req != NULL); 2888 2889 memcpy(req->req.iov[0].iov_base, 2890 &req->req.rsp->prop_get_rsp.value.u64, 2891 req->req.length); 2892 return 0; 2893 } 2894 2895 return nvmf_vfio_user_prop_req_rsp_set(req, sq); 2896 } 2897 2898 /* 2899 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2900 * doorbell is written via 
access_bar0_fn(). 2901 * 2902 * DSTRD is set to fixed value 0 for NVMf. 2903 * 2904 */ 2905 static int 2906 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2907 const size_t count, loff_t pos, const bool is_write) 2908 { 2909 struct nvmf_vfio_user_poll_group *group; 2910 2911 assert(ctrlr != NULL); 2912 assert(buf != NULL); 2913 2914 if (spdk_unlikely(!is_write)) { 2915 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2916 ctrlr_id(ctrlr), pos); 2917 errno = EPERM; 2918 return -1; 2919 } 2920 2921 if (spdk_unlikely(count != sizeof(uint32_t))) { 2922 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2923 ctrlr_id(ctrlr), count); 2924 errno = EINVAL; 2925 return -1; 2926 } 2927 2928 pos -= NVME_DOORBELLS_OFFSET; 2929 2930 /* pos must be dword aligned */ 2931 if (spdk_unlikely((pos & 0x3) != 0)) { 2932 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2933 errno = EINVAL; 2934 return -1; 2935 } 2936 2937 /* convert byte offset to array index */ 2938 pos >>= 2; 2939 2940 if (spdk_unlikely(pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2)) { 2941 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2942 errno = EINVAL; 2943 return -1; 2944 } 2945 2946 ctrlr->bar0_doorbells[pos] = *buf; 2947 spdk_wmb(); 2948 2949 group = ctrlr_to_poll_group(ctrlr); 2950 if (pos == 1) { 2951 group->stats.cqh_admin_writes++; 2952 } else if (pos & 1) { 2953 group->stats.cqh_io_writes++; 2954 } 2955 2956 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2957 ctrlr_id(ctrlr), (pos & 1) ? "cqid" : "sqid", 2958 pos / 2, *buf); 2959 2960 2961 return 0; 2962 } 2963 2964 static size_t 2965 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2966 char *buf, size_t count, loff_t pos, 2967 bool is_write) 2968 { 2969 struct nvmf_vfio_user_req *req; 2970 const struct spdk_nvmf_registers *regs; 2971 2972 if ((count != 4) && (count != 8)) { 2973 errno = EINVAL; 2974 return -1; 2975 } 2976 2977 /* Construct a Fabric Property Get/Set command and send it */ 2978 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2979 if (req == NULL) { 2980 errno = ENOBUFS; 2981 return -1; 2982 } 2983 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2984 req->cc.raw = regs->cc.raw; 2985 2986 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 2987 req->cb_arg = vu_ctrlr->sqs[0]; 2988 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 2989 req->req.cmd->prop_set_cmd.cid = 0; 2990 if (count == 4) { 2991 req->req.cmd->prop_set_cmd.attrib.size = 0; 2992 } else { 2993 req->req.cmd->prop_set_cmd.attrib.size = 1; 2994 } 2995 req->req.cmd->prop_set_cmd.ofst = pos; 2996 if (is_write) { 2997 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 2998 if (req->req.cmd->prop_set_cmd.attrib.size) { 2999 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 3000 } else { 3001 req->req.cmd->prop_set_cmd.value.u32.high = 0; 3002 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 3003 } 3004 } else { 3005 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 3006 } 3007 req->req.length = count; 3008 spdk_iov_one(req->req.iov, &req->req.iovcnt, buf, req->req.length); 3009 req->req.data = buf; 3010 3011 spdk_nvmf_request_exec_fabrics(&req->req); 3012 3013 return count; 3014 } 3015 3016 static ssize_t 3017 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 3018 bool is_write) 3019 { 3020 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3021 struct nvmf_vfio_user_ctrlr *ctrlr; 3022 
int ret; 3023 3024 ctrlr = endpoint->ctrlr; 3025 if (spdk_unlikely(endpoint->need_async_destroy || !ctrlr)) { 3026 errno = EIO; 3027 return -1; 3028 } 3029 3030 if (pos >= NVME_DOORBELLS_OFFSET) { 3031 /* 3032 * The fact that the doorbells can be memory mapped doesn't mean 3033 * that the client (VFIO in QEMU) is obliged to memory map them, 3034 * it might still elect to access them via regular read/write; 3035 * we might also have had disable_mappable_bar0 set. 3036 */ 3037 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 3038 pos, is_write); 3039 if (ret == 0) { 3040 return count; 3041 } 3042 return ret; 3043 } 3044 3045 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 3046 } 3047 3048 static ssize_t 3049 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 3050 bool is_write) 3051 { 3052 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3053 3054 if (is_write) { 3055 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 3056 endpoint_id(endpoint), offset, offset + count); 3057 errno = EINVAL; 3058 return -1; 3059 } 3060 3061 if (offset + count > NVME_REG_CFG_SIZE) { 3062 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 3063 endpoint_id(endpoint), offset, count, 3064 NVME_REG_CFG_SIZE); 3065 errno = ERANGE; 3066 return -1; 3067 } 3068 3069 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 3070 3071 return count; 3072 } 3073 3074 static void 3075 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 3076 { 3077 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3078 3079 if (level >= LOG_DEBUG) { 3080 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3081 } else if (level >= LOG_INFO) { 3082 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3083 } else if (level >= LOG_NOTICE) { 3084 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 3085 } else if (level >= LOG_WARNING) { 3086 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 3087 } else { 3088 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 3089 } 3090 } 3091 3092 static int 3093 vfio_user_get_log_level(void) 3094 { 3095 int level; 3096 3097 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3098 return LOG_DEBUG; 3099 } 3100 3101 level = spdk_log_to_syslog_level(spdk_log_get_level()); 3102 if (level < 0) { 3103 return LOG_ERR; 3104 } 3105 3106 return level; 3107 } 3108 3109 static void 3110 init_pci_config_space(vfu_pci_config_space_t *p) 3111 { 3112 /* MLBAR */ 3113 p->hdr.bars[0].raw = 0x0; 3114 /* MUBAR */ 3115 p->hdr.bars[1].raw = 0x0; 3116 3117 /* vendor specific, let's set them to zero for now */ 3118 p->hdr.bars[3].raw = 0x0; 3119 p->hdr.bars[4].raw = 0x0; 3120 p->hdr.bars[5].raw = 0x0; 3121 3122 /* enable INTx */ 3123 p->hdr.intr.ipin = 0x1; 3124 } 3125 3126 struct ctrlr_quiesce_ctx { 3127 struct nvmf_vfio_user_endpoint *endpoint; 3128 struct nvmf_vfio_user_poll_group *group; 3129 int status; 3130 }; 3131 3132 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 3133 3134 static void 3135 _vfio_user_endpoint_resume_done_msg(void *ctx) 3136 { 3137 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3138 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3139 3140 endpoint->need_resume = false; 3141 3142 if (!vu_ctrlr) { 3143 return; 3144 } 3145 3146 if (!vu_ctrlr->queued_quiesce) { 3147 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3148 3149 /* 3150 * We might have ignored new SQ entries while we were quiesced: 3151 
* kick ourselves so we'll definitely check again while in 3152 * VFIO_USER_CTRLR_RUNNING state. 3153 */ 3154 if (in_interrupt_mode(endpoint->transport)) { 3155 ctrlr_kick(vu_ctrlr); 3156 } 3157 return; 3158 } 3159 3160 3161 /* 3162 * Basically, once we call `vfu_device_quiesced` the device is 3163 * unquiesced from libvfio-user's perspective, so from the moment 3164 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device 3165 * again. However, because resuming the NVMf subsystem is an asynchronous 3166 * operation, this quiesce might come _before_ the NVMf subsystem has 3167 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 3168 * need to check whether a quiesce was requested. 3169 */ 3170 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 3171 ctrlr_id(vu_ctrlr)); 3172 ctrlr_quiesce(vu_ctrlr); 3173 } 3174 3175 static void 3176 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 3177 void *cb_arg, int status) 3178 { 3179 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 3180 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3181 3182 SPDK_DEBUGLOG(nvmf_vfio, "%s resume done with status %d\n", endpoint_id(endpoint), status); 3183 3184 if (!vu_ctrlr) { 3185 return; 3186 } 3187 3188 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 3189 } 3190 3191 static void 3192 vfio_user_quiesce_done(void *ctx) 3193 { 3194 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3195 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3196 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3197 int ret; 3198 3199 if (!vu_ctrlr) { 3200 free(quiesce_ctx); 3201 return; 3202 } 3203 3204 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 3205 3206 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 3207 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3208 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 3209 vu_ctrlr->queued_quiesce = false; 3210 free(quiesce_ctx); 3211 3212 /* `vfu_device_quiesced` can change the migration state, 3213 * so we need to re-check `vu_ctrlr->state`.
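* In particular, if the controller has moved to the MIGRATING state we must not try to resume the subsystem here.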
3214 */ 3215 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3216 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 3217 return; 3218 } 3219 3220 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 3221 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3222 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3223 vfio_user_endpoint_resume_done, endpoint); 3224 if (ret < 0) { 3225 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3226 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3227 } 3228 } 3229 3230 static void 3231 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3232 void *ctx, int status) 3233 { 3234 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3235 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3236 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3237 3238 if (!vu_ctrlr) { 3239 free(quiesce_ctx); 3240 return; 3241 } 3242 3243 quiesce_ctx->status = status; 3244 3245 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3246 ctrlr_id(vu_ctrlr), status); 3247 3248 spdk_thread_send_msg(vu_ctrlr->thread, 3249 vfio_user_quiesce_done, ctx); 3250 } 3251 3252 /* 3253 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3254 * we've already set ctrlr->state, so we won't process new entries, but we need 3255 * to ensure that this PG is quiesced. This only works because there's no 3256 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3257 * 3258 * Once we've walked all PGs, we need to pause any submitted I/O via 3259 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3260 */ 3261 static void 3262 vfio_user_quiesce_pg(void *ctx) 3263 { 3264 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3265 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3266 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3267 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3268 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3269 int ret; 3270 3271 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3272 3273 if (!vu_ctrlr) { 3274 free(quiesce_ctx); 3275 return; 3276 } 3277 3278 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3279 if (quiesce_ctx->group != NULL) { 3280 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3281 vfio_user_quiesce_pg, quiesce_ctx); 3282 return; 3283 } 3284 3285 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3286 vfio_user_pause_done, quiesce_ctx); 3287 if (ret < 0) { 3288 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3289 endpoint_id(endpoint), ret); 3290 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3291 fail_ctrlr(vu_ctrlr); 3292 free(quiesce_ctx); 3293 } 3294 } 3295 3296 static void 3297 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3298 { 3299 struct ctrlr_quiesce_ctx *quiesce_ctx; 3300 3301 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3302 3303 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3304 if (!quiesce_ctx) { 3305 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3306 assert(false); 3307 return; 3308 } 3309 3310 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3311 quiesce_ctx->status = 0; 3312 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3313 3314 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3315 vfio_user_quiesce_pg, quiesce_ctx); 3316 } 3317 3318 static int 3319 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3320 { 3321 struct nvmf_vfio_user_endpoint 
*endpoint = vfu_get_private(vfu_ctx); 3322 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3323 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3324 3325 if (!vu_ctrlr) { 3326 return 0; 3327 } 3328 3329 /* NVMf library will destruct controller when no 3330 * connected queue pairs. 3331 */ 3332 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3333 return 0; 3334 } 3335 3336 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3337 3338 /* There is no race condition here as device quiesce callback 3339 * and nvmf_prop_set_cc() are running in the same thread context. 3340 */ 3341 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3342 return 0; 3343 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3344 return 0; 3345 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3346 return 0; 3347 } 3348 3349 switch (vu_ctrlr->state) { 3350 case VFIO_USER_CTRLR_PAUSED: 3351 case VFIO_USER_CTRLR_MIGRATING: 3352 return 0; 3353 case VFIO_USER_CTRLR_RUNNING: 3354 ctrlr_quiesce(vu_ctrlr); 3355 break; 3356 case VFIO_USER_CTRLR_RESUMING: 3357 vu_ctrlr->queued_quiesce = true; 3358 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3359 vu_ctrlr->state); 3360 break; 3361 default: 3362 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3363 break; 3364 } 3365 3366 errno = EBUSY; 3367 return -1; 3368 } 3369 3370 static void 3371 vfio_user_ctrlr_dump_migr_data(const char *name, 3372 struct vfio_user_nvme_migr_state *migr_data, 3373 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3374 { 3375 struct spdk_nvmf_registers *regs; 3376 struct nvme_migr_sq_state *sq; 3377 struct nvme_migr_cq_state *cq; 3378 uint32_t *doorbell_base; 3379 uint32_t i; 3380 3381 SPDK_NOTICELOG("Dump %s\n", name); 3382 3383 regs = &migr_data->nvmf_data.regs; 3384 doorbell_base = (uint32_t *)&migr_data->doorbells; 3385 3386 SPDK_NOTICELOG("Registers\n"); 3387 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3388 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3389 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3390 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3391 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3392 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3393 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3394 3395 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3396 3397 if (sdbl != NULL) { 3398 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3399 migr_data->ctrlr_header.shadow_doorbell_buffer); 3400 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3401 migr_data->ctrlr_header.eventidx_buffer); 3402 } 3403 3404 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3405 sq = &migr_data->qps[i].sq; 3406 cq = &migr_data->qps[i].cq; 3407 3408 if (sq->size) { 3409 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3410 if (i > 0 && sdbl != NULL) { 3411 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3412 sq->sqid, 3413 sdbl->shadow_doorbells[queue_index(i, false)], 3414 sdbl->eventidxs[queue_index(i, false)]); 3415 } 3416 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3417 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3418 } 3419 3420 if (cq->size) { 3421 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3422 if (i > 0 && sdbl != NULL) { 3423 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3424 cq->cqid, 3425 sdbl->shadow_doorbells[queue_index(i, true)], 3426 
sdbl->eventidxs[queue_index(i, true)]); 3427 } 3428 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3429 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3430 } 3431 } 3432 3433 SPDK_NOTICELOG("%s Dump Done\n", name); 3434 } 3435 3436 /* Read region 9 content and restore it to migration data structures */ 3437 static int 3438 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3439 struct vfio_user_nvme_migr_state *migr_state) 3440 { 3441 void *data_ptr = endpoint->migr_data; 3442 3443 /* Load vfio_user_nvme_migr_header first */ 3444 memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3445 /* TODO: version check */ 3446 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3447 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3448 return -EINVAL; 3449 } 3450 3451 /* Load nvmf controller data */ 3452 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3453 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3454 3455 /* Load queue pairs */ 3456 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3457 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3458 3459 /* Load doorbells */ 3460 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3461 memcpy(&migr_state->doorbells, data_ptr, 3462 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3463 3464 /* Load CFG */ 3465 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3466 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3467 3468 return 0; 3469 } 3470 3471 3472 static void 3473 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3474 { 3475 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3476 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3477 struct nvmf_vfio_user_sq *sq; 3478 struct nvmf_vfio_user_cq *cq; 3479 uint64_t data_offset; 3480 void *data_ptr; 3481 uint32_t *doorbell_base; 3482 uint32_t i = 0; 3483 uint16_t sqid, cqid; 3484 struct vfio_user_nvme_migr_state migr_state = { 3485 .nvmf_data = { 3486 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3487 .regs_size = sizeof(struct spdk_nvmf_registers), 3488 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3489 } 3490 }; 3491 3492 /* Save all data to vfio_user_nvme_migr_state first, then we will 3493 * copy it to device migration region at last. 
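* The nvmf controller data, the queue pair state, the BAR0 doorbells and the PCI config space are copied in that order, and the header that records their offsets is written last.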
3494 */ 3495 3496 /* save magic number */ 3497 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3498 3499 /* save controller data */ 3500 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3501 3502 /* save connected queue pairs */ 3503 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3504 /* save sq */ 3505 sqid = sq->qid; 3506 migr_state.qps[sqid].sq.sqid = sq->qid; 3507 migr_state.qps[sqid].sq.cqid = sq->cqid; 3508 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3509 migr_state.qps[sqid].sq.size = sq->size; 3510 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3511 3512 /* save cq, for shared cq case, cq may be saved multiple times */ 3513 cqid = sq->cqid; 3514 cq = vu_ctrlr->cqs[cqid]; 3515 migr_state.qps[cqid].cq.cqid = cqid; 3516 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3517 migr_state.qps[cqid].cq.ien = cq->ien; 3518 migr_state.qps[cqid].cq.iv = cq->iv; 3519 migr_state.qps[cqid].cq.size = cq->size; 3520 migr_state.qps[cqid].cq.phase = cq->phase; 3521 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3522 i++; 3523 } 3524 3525 assert(i > 0); 3526 migr_state.ctrlr_header.num_io_queues = i - 1; 3527 3528 /* Save doorbells */ 3529 doorbell_base = (uint32_t *)&migr_state.doorbells; 3530 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3531 3532 /* Save PCI configuration space */ 3533 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3534 3535 /* Save all data to device migration region */ 3536 data_ptr = endpoint->migr_data; 3537 3538 /* Copy nvmf controller data */ 3539 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3540 data_ptr += data_offset; 3541 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3542 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3543 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3544 3545 /* Copy queue pairs */ 3546 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3547 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3548 migr_state.ctrlr_header.qp_offset = data_offset; 3549 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3550 struct nvme_migr_cq_state)); 3551 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3552 3553 /* Copy doorbells */ 3554 data_offset += migr_state.ctrlr_header.qp_len; 3555 data_ptr += migr_state.ctrlr_header.qp_len; 3556 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3557 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3558 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3559 3560 /* Copy CFG */ 3561 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3562 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3563 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3564 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3565 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3566 3567 /* copy shadow doorbells */ 3568 if (vu_ctrlr->sdbl != NULL) { 3569 migr_state.ctrlr_header.sdbl = true; 3570 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3571 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3572 } 3573 3574 /* Copy nvme migration header finally */ 3575 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3576 3577 if 
(SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3578 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3579 } 3580 } 3581 3582 /* 3583 * If we are about to close the connection, we need to unregister the interrupt, 3584 * as the library will subsequently close the file descriptor we registered. 3585 */ 3586 static int 3587 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3588 { 3589 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3590 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3591 3592 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3593 3594 if (type == VFU_RESET_LOST_CONN) { 3595 if (ctrlr != NULL) { 3596 spdk_interrupt_unregister(&ctrlr->intr); 3597 ctrlr->intr_fd = -1; 3598 } 3599 return 0; 3600 } 3601 3602 /* FIXME: LOST_CONN case ? */ 3603 if (ctrlr->sdbl != NULL) { 3604 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 3605 free_sdbl(vfu_ctx, ctrlr->sdbl); 3606 ctrlr->sdbl = NULL; 3607 } 3608 3609 /* FIXME: much more needed here. */ 3610 3611 return 0; 3612 } 3613 3614 static int 3615 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3616 struct vfio_user_nvme_migr_state *migr_state) 3617 { 3618 uint32_t i, qsize = 0; 3619 uint16_t sqid, cqid; 3620 struct vfio_user_nvme_migr_qp migr_qp; 3621 void *addr; 3622 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3623 int ret; 3624 3625 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3626 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3627 } 3628 3629 /* restore submission queues */ 3630 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3631 migr_qp = migr_state->qps[i]; 3632 3633 qsize = migr_qp.sq.size; 3634 if (qsize) { 3635 struct nvmf_vfio_user_sq *sq; 3636 3637 sqid = migr_qp.sq.sqid; 3638 if (sqid != i) { 3639 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3640 return -EINVAL; 3641 } 3642 3643 /* allocate sq if necessary */ 3644 if (vu_ctrlr->sqs[sqid] == NULL) { 3645 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3646 if (ret) { 3647 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3648 return -EFAULT; 3649 } 3650 } 3651 3652 sq = vu_ctrlr->sqs[sqid]; 3653 sq->size = qsize; 3654 3655 ret = alloc_sq_reqs(vu_ctrlr, sq); 3656 if (ret) { 3657 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3658 return -EFAULT; 3659 } 3660 3661 /* restore sq */ 3662 sq->sq_state = VFIO_USER_SQ_CREATED; 3663 sq->cqid = migr_qp.sq.cqid; 3664 *sq_headp(sq) = migr_qp.sq.head; 3665 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3666 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3667 sq->mapping.prp1, sq->size * 64, 3668 sq->mapping.sg, &sq->mapping.iov, 3669 PROT_READ); 3670 if (addr == NULL) { 3671 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3672 sqid, sq->mapping.prp1, sq->size); 3673 return -EFAULT; 3674 } 3675 cqs_ref[sq->cqid]++; 3676 } 3677 } 3678 3679 /* restore completion queues */ 3680 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3681 migr_qp = migr_state->qps[i]; 3682 3683 qsize = migr_qp.cq.size; 3684 if (qsize) { 3685 struct nvmf_vfio_user_cq *cq; 3686 3687 /* restore cq */ 3688 cqid = migr_qp.sq.cqid; 3689 assert(cqid == i); 3690 3691 /* allocate cq if necessary */ 3692 if (vu_ctrlr->cqs[cqid] == NULL) { 3693 ret = init_cq(vu_ctrlr, cqid); 3694 if (ret) { 3695 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3696 return -EFAULT; 3697 } 3698 } 3699 3700 cq = vu_ctrlr->cqs[cqid]; 3701 3702 cq->size = qsize; 3703 3704 
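/* Restore the saved CQ state (phase, tail, IV, IEN) and re-map the queue memory read/write so completions can be posted after the switch-over. */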
cq->cq_state = VFIO_USER_CQ_CREATED; 3705 cq->cq_ref = cqs_ref[cqid]; 3706 *cq_tailp(cq) = migr_qp.cq.tail; 3707 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3708 cq->ien = migr_qp.cq.ien; 3709 cq->iv = migr_qp.cq.iv; 3710 cq->phase = migr_qp.cq.phase; 3711 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3712 cq->mapping.prp1, cq->size * 16, 3713 cq->mapping.sg, &cq->mapping.iov, 3714 PROT_READ | PROT_WRITE); 3715 if (addr == NULL) { 3716 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3717 cqid, cq->mapping.prp1, cq->size); 3718 return -EFAULT; 3719 } 3720 } 3721 } 3722 3723 return 0; 3724 } 3725 3726 static int 3727 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3728 { 3729 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3730 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3731 uint32_t *doorbell_base; 3732 struct spdk_nvme_cmd cmd; 3733 uint16_t i; 3734 int rc = 0; 3735 struct vfio_user_nvme_migr_state migr_state = { 3736 .nvmf_data = { 3737 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3738 .regs_size = sizeof(struct spdk_nvmf_registers), 3739 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3740 } 3741 }; 3742 3743 assert(endpoint->migr_data != NULL); 3744 assert(ctrlr != NULL); 3745 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3746 if (rc) { 3747 return rc; 3748 } 3749 3750 /* restore shadow doorbells */ 3751 if (migr_state.ctrlr_header.sdbl) { 3752 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3753 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3754 migr_state.ctrlr_header.shadow_doorbell_buffer, 3755 migr_state.ctrlr_header.eventidx_buffer, 3756 memory_page_size(vu_ctrlr)); 3757 if (sdbl == NULL) { 3758 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3759 ctrlr_id(vu_ctrlr)); 3760 return -1; 3761 } 3762 3763 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3764 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3765 3766 SWAP(vu_ctrlr->sdbl, sdbl); 3767 } 3768 3769 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3770 if (rc) { 3771 return rc; 3772 } 3773 3774 /* restore PCI configuration space */ 3775 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3776 3777 doorbell_base = (uint32_t *)&migr_state.doorbells; 3778 /* restore doorbells from saved registers */ 3779 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3780 3781 /* restore nvmf controller data */ 3782 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3783 if (rc) { 3784 return rc; 3785 } 3786 3787 /* resubmit pending AERs */ 3788 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) { 3789 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3790 migr_state.nvmf_data.aer_cids[i]); 3791 memset(&cmd, 0, sizeof(cmd)); 3792 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3793 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3794 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3795 if (spdk_unlikely(rc)) { 3796 break; 3797 } 3798 } 3799 3800 return rc; 3801 } 3802 3803 static void 3804 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3805 { 3806 uint32_t i; 3807 struct nvmf_vfio_user_sq *sq; 3808 3809 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
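* Its doorbell pointers are therefore wired directly to BAR0 below; the I/O queue doorbells are then switched depending on whether a shadow doorbell buffer was restored.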
*/ 3810 3811 if (vu_ctrlr->sqs[0] != NULL) { 3812 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3813 queue_index(0, false); 3814 } 3815 3816 if (vu_ctrlr->cqs[0] != NULL) { 3817 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3818 queue_index(0, true); 3819 } 3820 3821 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3822 3823 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3824 sq = vu_ctrlr->sqs[i]; 3825 if (!sq || !sq->size) { 3826 continue; 3827 } 3828 3829 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3830 /* ADMIN queue pair is always in the poll group, just enable it */ 3831 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3832 } else { 3833 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3834 } 3835 } 3836 } 3837 3838 /* 3839 * We are in stop-and-copy state, but still potentially have some current dirty 3840 * sgls: while we're quiesced and thus should have no active requests, we still 3841 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3842 * mapped read only). 3843 * 3844 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3845 * mark them dirty now. 3846 */ 3847 static void 3848 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3849 { 3850 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3851 3852 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3853 3854 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3855 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3856 3857 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3858 continue; 3859 } 3860 3861 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3862 } 3863 3864 if (vu_ctrlr->sdbl != NULL) { 3865 dma_sg_t *sg; 3866 size_t i; 3867 3868 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3869 ++i) { 3870 3871 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3872 continue; 3873 } 3874 3875 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3876 3877 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3878 } 3879 } 3880 } 3881 3882 static int 3883 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3884 { 3885 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3886 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3887 struct nvmf_vfio_user_sq *sq; 3888 int ret = 0; 3889 3890 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3891 vu_ctrlr->state, state); 3892 3893 switch (state) { 3894 case VFU_MIGR_STATE_STOP_AND_COPY: 3895 vu_ctrlr->in_source_vm = true; 3896 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3897 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3898 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3899 break; 3900 case VFU_MIGR_STATE_STOP: 3901 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3902 /* The controller associated with the source VM is dead now; we will resume 3903 * the subsystem after destroying the controller data structure, so that the 3904 * subsystem can be re-used by a new client. 3905 */ 3906 if (vu_ctrlr->in_source_vm) { 3907 endpoint->need_resume = true; 3908 } 3909 break; 3910 case VFU_MIGR_STATE_PRE_COPY: 3911 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3912 break; 3913 case VFU_MIGR_STATE_RESUME: 3914 /* 3915 * The destination ADMIN queue pair is connected when the VM starts, 3916 * but it isn't enabled in the destination VM yet, so the poll 3917 * group will do nothing with the ADMIN queue pair for now.
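* If the controller is not RUNNING yet there is nothing to tear down here, so the transition is simply accepted.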
3918 */ 3919 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3920 break; 3921 } 3922 3923 assert(!vu_ctrlr->in_source_vm); 3924 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3925 3926 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3927 assert(sq != NULL); 3928 assert(sq->qpair.qid == 0); 3929 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3930 3931 /* Free ADMIN SQ resources first, SQ resources will be 3932 * allocated based on queue size from source VM. 3933 */ 3934 free_sq_reqs(sq); 3935 sq->size = 0; 3936 break; 3937 case VFU_MIGR_STATE_RUNNING: 3938 3939 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3940 break; 3941 } 3942 3943 if (!vu_ctrlr->in_source_vm) { 3944 /* Restore destination VM from BAR9 */ 3945 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3946 if (ret) { 3947 break; 3948 } 3949 3950 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3951 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3952 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3953 /* FIXME where do we resume nvmf? */ 3954 } else { 3955 /* Rollback source VM */ 3956 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3957 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3958 vfio_user_endpoint_resume_done, endpoint); 3959 if (ret < 0) { 3960 /* TODO: fail controller with CFS bit set */ 3961 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3962 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3963 } 3964 } 3965 vu_ctrlr->migr_data_prepared = false; 3966 vu_ctrlr->in_source_vm = false; 3967 break; 3968 3969 default: 3970 return -EINVAL; 3971 } 3972 3973 return ret; 3974 } 3975 3976 static uint64_t 3977 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3978 { 3979 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3980 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3981 uint64_t pending_bytes; 3982 3983 if (ctrlr->migr_data_prepared) { 3984 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3985 pending_bytes = 0; 3986 } else { 3987 pending_bytes = vfio_user_migr_data_len(); 3988 } 3989 3990 SPDK_DEBUGLOG(nvmf_vfio, 3991 "%s current state %u, pending bytes 0x%"PRIx64"\n", 3992 endpoint_id(endpoint), ctrlr->state, pending_bytes); 3993 3994 return pending_bytes; 3995 } 3996 3997 static int 3998 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 3999 { 4000 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 4001 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 4002 4003 /* 4004 * When transitioning to pre-copy state we set pending_bytes to 0, 4005 * so the vfio-user client shouldn't attempt to read any migration 4006 * data. This is not yet guaranteed by libvfio-user. 
4007 */ 4008 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 4009 assert(size != NULL); 4010 *offset = 0; 4011 *size = 0; 4012 return 0; 4013 } 4014 4015 if (ctrlr->in_source_vm) { /* migration source */ 4016 assert(size != NULL); 4017 *size = vfio_user_migr_data_len(); 4018 vfio_user_migr_ctrlr_save_data(ctrlr); 4019 } else { /* migration destination */ 4020 assert(size == NULL); 4021 assert(!ctrlr->migr_data_prepared); 4022 } 4023 *offset = 0; 4024 ctrlr->migr_data_prepared = true; 4025 4026 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 4027 4028 return 0; 4029 } 4030 4031 static ssize_t 4032 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4033 void *buf __attribute__((unused)), 4034 uint64_t count __attribute__((unused)), 4035 uint64_t offset __attribute__((unused))) 4036 { 4037 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 4038 endpoint_id(vfu_get_private(vfu_ctx))); 4039 errno = ENOTSUP; 4040 return -1; 4041 } 4042 4043 static ssize_t 4044 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4045 void *buf __attribute__((unused)), 4046 uint64_t count __attribute__((unused)), 4047 uint64_t offset __attribute__((unused))) 4048 { 4049 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 4050 endpoint_id(vfu_get_private(vfu_ctx))); 4051 errno = ENOTSUP; 4052 return -1; 4053 } 4054 4055 static int 4056 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4057 uint64_t count) 4058 { 4059 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 4060 4061 if (count != vfio_user_migr_data_len()) { 4062 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 4063 endpoint_id(vfu_get_private(vfu_ctx)), count); 4064 errno = EINVAL; 4065 return -1; 4066 } 4067 4068 return 0; 4069 } 4070 4071 static int 4072 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 4073 struct nvmf_vfio_user_endpoint *endpoint) 4074 { 4075 int ret; 4076 ssize_t cap_offset; 4077 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 4078 struct iovec migr_sparse_mmap = {}; 4079 4080 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 4081 struct pxcap pxcap = { 4082 .hdr.id = PCI_CAP_ID_EXP, 4083 .pxcaps.ver = 0x2, 4084 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 4085 .pxdcap2.ctds = 0x1 4086 }; 4087 4088 struct msixcap msixcap = { 4089 .hdr.id = PCI_CAP_ID_MSIX, 4090 .mxc.ts = NVME_IRQ_MSIX_NUM - 1, 4091 .mtab = {.tbir = 0x4, .to = 0x0}, 4092 .mpba = {.pbir = 0x5, .pbao = 0x0} 4093 }; 4094 4095 struct iovec sparse_mmap[] = { 4096 { 4097 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 4098 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 4099 }, 4100 }; 4101 4102 const vfu_migration_callbacks_t migr_callbacks = { 4103 .version = VFU_MIGR_CALLBACKS_VERS, 4104 .transition = &vfio_user_migration_device_state_transition, 4105 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 4106 .prepare_data = &vfio_user_migration_prepare_data, 4107 .read_data = &vfio_user_migration_read_data, 4108 .data_written = &vfio_user_migration_data_written, 4109 .write_data = &vfio_user_migration_write_data 4110 }; 4111 4112 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 4113 if (ret < 0) { 4114 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 4115 return ret; 4116 } 4117 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 4118 /* 4119 * 0x02, controller uses the NVM Express programming interface 4120 * 0x08, 
non-volatile memory controller
	 * 0x01, mass storage controller
	 */
	vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add pmcap\n", vfu_ctx);
		return cap_offset;
	}

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add pxcap\n", vfu_ctx);
		return cap_offset;
	}

	cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
	if (cap_offset < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to add msixcap\n", vfu_ctx);
		return cap_offset;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
			       access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
		return ret;
	}

	if (vu_transport->transport_opts.disable_mappable_bar0) {
		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
				       NULL, 0, -1, 0);
	} else {
		ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
				       access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
				       sparse_mmap, 1, endpoint->devmem_fd, 0);
	}

	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVME_BAR4_SIZE,
			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVME_BAR5_SIZE,
			       NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
		return ret;
	}

	vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb);

	migr_sparse_mmap.iov_base = (void *)4096;
	migr_sparse_mmap.iov_len = vfio_user_migr_data_len();
	ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX,
			       vfu_get_migr_register_area_size() + vfio_user_migr_data_len(),
			       NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap,
			       1, endpoint->migr_fd, 0);
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx);
		return ret;
	}

	ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks,
			vfu_get_migr_register_area_size());
	if (ret < 0) {
		SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx);
		return ret;
	}
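
	/*
	 * At this point all capabilities, regions, IRQs, the DMA callbacks and
	 * the migration callbacks have been configured. Realizing the context
	 * below finalizes this device setup: only afterwards do we fetch the
	 * PCI configuration space (vfu_pci_get_config_space() below), and only
	 * then can the accept poller attach a client via vfu_attach_ctx().
	 */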
4222 4223 ret = vfu_realize_ctx(vfu_ctx); 4224 if (ret < 0) { 4225 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4226 return ret; 4227 } 4228 4229 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4230 assert(endpoint->pci_config_space != NULL); 4231 init_pci_config_space(endpoint->pci_config_space); 4232 4233 assert(cap_offset != 0); 4234 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4235 4236 return 0; 4237 } 4238 4239 static int nvmf_vfio_user_accept(void *ctx); 4240 4241 static void 4242 set_intr_mode_noop(struct spdk_poller *poller, void *arg, bool interrupt_mode) 4243 { 4244 /* Nothing for us to do here. */ 4245 } 4246 4247 /* 4248 * Register an "accept" poller: this is polling for incoming vfio-user socket 4249 * connections (on the listening socket). 4250 * 4251 * We need to do this on first listening, and also after destroying a 4252 * controller, so we can accept another connection. 4253 */ 4254 static int 4255 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4256 { 4257 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4258 4259 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4260 4261 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4262 endpoint, poll_rate_us); 4263 4264 if (!endpoint->accept_poller) { 4265 return -1; 4266 } 4267 4268 endpoint->accept_thread = spdk_get_thread(); 4269 endpoint->need_relisten = false; 4270 4271 if (!spdk_interrupt_mode_is_enabled()) { 4272 return 0; 4273 } 4274 4275 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4276 assert(endpoint->accept_intr_fd != -1); 4277 4278 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4279 nvmf_vfio_user_accept, endpoint); 4280 4281 assert(endpoint->accept_intr != NULL); 4282 4283 spdk_poller_register_interrupt(endpoint->accept_poller, 4284 set_intr_mode_noop, NULL); 4285 return 0; 4286 } 4287 4288 static void 4289 _vfio_user_relisten(void *ctx) 4290 { 4291 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4292 4293 vfio_user_register_accept_poller(endpoint); 4294 } 4295 4296 static void 4297 _free_ctrlr(void *ctx) 4298 { 4299 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4300 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4301 4302 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 4303 4304 spdk_interrupt_unregister(&ctrlr->intr); 4305 ctrlr->intr_fd = -1; 4306 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4307 4308 free(ctrlr); 4309 4310 if (endpoint->need_async_destroy) { 4311 nvmf_vfio_user_destroy_endpoint(endpoint); 4312 } else if (endpoint->need_relisten) { 4313 spdk_thread_send_msg(endpoint->accept_thread, 4314 _vfio_user_relisten, endpoint); 4315 } 4316 } 4317 4318 static void 4319 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4320 { 4321 int i; 4322 assert(ctrlr != NULL); 4323 4324 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4325 4326 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4327 free_qp(ctrlr, i); 4328 } 4329 4330 spdk_thread_exec_msg(ctrlr->thread, _free_ctrlr, ctrlr); 4331 } 4332 4333 static int 4334 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4335 struct nvmf_vfio_user_endpoint *endpoint) 4336 { 4337 struct nvmf_vfio_user_ctrlr *ctrlr; 4338 int err = 0; 4339 4340 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4341 4342 /* First, construct a vfio-user CUSTOM transport controller */ 4343 ctrlr = calloc(1, sizeof(*ctrlr)); 4344 if 
(ctrlr == NULL) { 4345 err = -ENOMEM; 4346 goto out; 4347 } 4348 /* We can only support one connection for now */ 4349 ctrlr->cntlid = 0x1; 4350 ctrlr->intr_fd = -1; 4351 ctrlr->transport = transport; 4352 ctrlr->endpoint = endpoint; 4353 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4354 TAILQ_INIT(&ctrlr->connected_sqs); 4355 4356 ctrlr->adaptive_irqs_enabled = 4357 !transport->transport_opts.disable_adaptive_irq; 4358 4359 /* Then, construct an admin queue pair */ 4360 err = init_sq(ctrlr, &transport->transport, 0); 4361 if (err != 0) { 4362 free(ctrlr); 4363 goto out; 4364 } 4365 4366 err = init_cq(ctrlr, 0); 4367 if (err != 0) { 4368 free(ctrlr); 4369 goto out; 4370 } 4371 4372 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4373 4374 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4375 if (err != 0) { 4376 free(ctrlr); 4377 goto out; 4378 } 4379 endpoint->ctrlr = ctrlr; 4380 4381 /* Notify the generic layer about the new admin queue pair */ 4382 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4383 4384 out: 4385 if (err != 0) { 4386 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4387 endpoint_id(endpoint), strerror(-err)); 4388 } 4389 4390 return err; 4391 } 4392 4393 static int 4394 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4395 const struct spdk_nvme_transport_id *trid, 4396 struct spdk_nvmf_listen_opts *listen_opts) 4397 { 4398 struct nvmf_vfio_user_transport *vu_transport; 4399 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4400 char path[PATH_MAX] = {}; 4401 char uuid[PATH_MAX] = {}; 4402 int ret; 4403 4404 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4405 transport); 4406 4407 pthread_mutex_lock(&vu_transport->lock); 4408 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4409 /* Only compare traddr */ 4410 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4411 pthread_mutex_unlock(&vu_transport->lock); 4412 return -EEXIST; 4413 } 4414 } 4415 pthread_mutex_unlock(&vu_transport->lock); 4416 4417 endpoint = calloc(1, sizeof(*endpoint)); 4418 if (!endpoint) { 4419 return -ENOMEM; 4420 } 4421 4422 pthread_mutex_init(&endpoint->lock, NULL); 4423 endpoint->devmem_fd = -1; 4424 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4425 endpoint->transport = vu_transport; 4426 4427 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4428 if (ret < 0 || ret >= PATH_MAX) { 4429 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4430 ret = -1; 4431 goto out; 4432 } 4433 4434 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4435 if (ret == -1) { 4436 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4437 endpoint_id(endpoint), path, spdk_strerror(errno)); 4438 goto out; 4439 } 4440 unlink(path); 4441 4442 endpoint->devmem_fd = ret; 4443 ret = ftruncate(endpoint->devmem_fd, 4444 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4445 if (ret != 0) { 4446 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4447 spdk_strerror(errno)); 4448 goto out; 4449 } 4450 4451 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4452 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4453 if (endpoint->bar0_doorbells == MAP_FAILED) { 4454 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4455 endpoint->bar0_doorbells = NULL; 4456 ret = -1; 4457 
		goto out;
	}

	ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint));
	if (ret < 0 || ret >= PATH_MAX) {
		SPDK_ERRLOG("%s: failed to get migration file path: %s.\n", endpoint_id(endpoint),
			    spdk_strerror(errno));
		ret = -1;
		goto out;
	}
	ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	if (ret == -1) {
		SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n",
			    endpoint_id(endpoint), path, spdk_strerror(errno));
		goto out;
	}
	unlink(path);

	endpoint->migr_fd = ret;
	ret = ftruncate(endpoint->migr_fd,
			vfu_get_migr_register_area_size() + vfio_user_migr_data_len());
	if (ret != 0) {
		SPDK_ERRLOG("%s: failed to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path,
			    spdk_strerror(errno));
		goto out;
	}

	endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(),
				   PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size());
	if (endpoint->migr_data == MAP_FAILED) {
		SPDK_ERRLOG("%s: failed to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno));
		endpoint->migr_data = NULL;
		ret = -1;
		goto out;
	}

	ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
	if (ret < 0 || ret >= PATH_MAX) {
		SPDK_ERRLOG("%s: failed to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno));
		ret = -1;
		goto out;
	}

	endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
					   endpoint, VFU_DEV_TYPE_PCI);
	if (endpoint->vfu_ctx == NULL) {
		SPDK_ERRLOG("%s: error creating libvfio-user context: %m\n",
			    endpoint_id(endpoint));
		ret = -1;
		goto out;
	}

	ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log,
			    vfio_user_get_log_level());
	if (ret < 0) {
		goto out;
	}

	ret = vfio_user_dev_info_fill(vu_transport, endpoint);
	if (ret < 0) {
		goto out;
	}

	ret = vfio_user_register_accept_poller(endpoint);

	if (ret != 0) {
		goto out;
	}

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
	pthread_mutex_unlock(&vu_transport->lock);

out:
	if (ret != 0) {
		nvmf_vfio_user_destroy_endpoint(endpoint);
	}

	return ret;
}

static void
nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
			   const struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_transport *vu_transport;
	struct nvmf_vfio_user_endpoint *endpoint, *tmp;

	assert(trid != NULL);
	assert(trid->traddr != NULL);

	SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);

	vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
					transport);

	pthread_mutex_lock(&vu_transport->lock);
	TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
		if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
			TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
			/* Defer freeing endpoint resources until the controller
			 * is freed. There are two cases when running here:
			 * 1. kill nvmf target while VM is connected
			 * 2. remove listener via RPC call
			 * The nvmf library will disconnect all queue pairs.
4563 */ 4564 if (endpoint->ctrlr) { 4565 assert(!endpoint->need_async_destroy); 4566 endpoint->need_async_destroy = true; 4567 pthread_mutex_unlock(&vu_transport->lock); 4568 return; 4569 } 4570 4571 nvmf_vfio_user_destroy_endpoint(endpoint); 4572 pthread_mutex_unlock(&vu_transport->lock); 4573 return; 4574 } 4575 } 4576 pthread_mutex_unlock(&vu_transport->lock); 4577 4578 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4579 } 4580 4581 static void 4582 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4583 struct spdk_nvmf_subsystem *subsystem, 4584 struct spdk_nvmf_ctrlr_data *cdata) 4585 { 4586 struct nvmf_vfio_user_transport *vu_transport; 4587 4588 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4589 4590 cdata->vid = SPDK_PCI_VID_NUTANIX; 4591 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4592 cdata->ieee[0] = 0x8d; 4593 cdata->ieee[1] = 0x6b; 4594 cdata->ieee[2] = 0x50; 4595 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4596 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4597 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4598 /* libvfio-user can only support 1 connection for now */ 4599 cdata->oncs.reservations = 0; 4600 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4601 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4602 } 4603 4604 static int 4605 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4606 const struct spdk_nvmf_subsystem *subsystem, 4607 const struct spdk_nvme_transport_id *trid) 4608 { 4609 struct nvmf_vfio_user_transport *vu_transport; 4610 struct nvmf_vfio_user_endpoint *endpoint; 4611 4612 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4613 4614 pthread_mutex_lock(&vu_transport->lock); 4615 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4616 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4617 break; 4618 } 4619 } 4620 pthread_mutex_unlock(&vu_transport->lock); 4621 4622 if (endpoint == NULL) { 4623 return -ENOENT; 4624 } 4625 4626 /* Drop const - we will later need to pause/unpause. */ 4627 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4628 4629 return 0; 4630 } 4631 4632 /* 4633 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4634 * frequency. 4635 * 4636 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4637 * if we don't currently have a controller set up, peek to see if the socket is 4638 * able to accept a new connection. 4639 */ 4640 static int 4641 nvmf_vfio_user_accept(void *ctx) 4642 { 4643 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4644 struct nvmf_vfio_user_transport *vu_transport; 4645 int err; 4646 4647 vu_transport = endpoint->transport; 4648 4649 if (endpoint->ctrlr != NULL) { 4650 return SPDK_POLLER_IDLE; 4651 } 4652 4653 /* While we're here, the controller is already destroyed, 4654 * subsystem may still be in RESUMING state, we will wait 4655 * until the subsystem is in RUNNING state. 
4656 */ 4657 if (endpoint->need_resume) { 4658 return SPDK_POLLER_IDLE; 4659 } 4660 4661 err = vfu_attach_ctx(endpoint->vfu_ctx); 4662 if (err == 0) { 4663 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4664 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4665 if (err == 0) { 4666 /* 4667 * Unregister ourselves: now we've accepted a 4668 * connection, there is nothing for us to poll for, and 4669 * we will poll the connection via vfu_run_ctx() 4670 * instead. 4671 */ 4672 spdk_interrupt_unregister(&endpoint->accept_intr); 4673 spdk_poller_unregister(&endpoint->accept_poller); 4674 } 4675 return SPDK_POLLER_BUSY; 4676 } 4677 4678 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4679 return SPDK_POLLER_IDLE; 4680 } 4681 4682 return SPDK_POLLER_BUSY; 4683 } 4684 4685 static void 4686 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4687 struct spdk_nvme_transport_id *trid, 4688 struct spdk_nvmf_discovery_log_page_entry *entry) 4689 { } 4690 4691 static int vfio_user_poll_group_intr(void *ctx); 4692 4693 static void 4694 vfio_user_poll_group_add_intr(struct nvmf_vfio_user_poll_group *vu_group, 4695 struct spdk_nvmf_poll_group *group) 4696 { 4697 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK); 4698 assert(vu_group->intr_fd != -1); 4699 4700 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd, 4701 vfio_user_poll_group_intr, vu_group); 4702 assert(vu_group->intr != NULL); 4703 4704 spdk_poller_register_interrupt(group->poller, set_intr_mode_noop, 4705 vu_group); 4706 } 4707 4708 static struct spdk_nvmf_transport_poll_group * 4709 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4710 struct spdk_nvmf_poll_group *group) 4711 { 4712 struct nvmf_vfio_user_transport *vu_transport; 4713 struct nvmf_vfio_user_poll_group *vu_group; 4714 4715 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4716 transport); 4717 4718 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4719 4720 vu_group = calloc(1, sizeof(*vu_group)); 4721 if (vu_group == NULL) { 4722 SPDK_ERRLOG("Error allocating poll group: %m"); 4723 return NULL; 4724 } 4725 4726 if (in_interrupt_mode(vu_transport)) { 4727 vfio_user_poll_group_add_intr(vu_group, group); 4728 } 4729 4730 TAILQ_INIT(&vu_group->sqs); 4731 4732 pthread_mutex_lock(&vu_transport->pg_lock); 4733 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4734 if (vu_transport->next_pg == NULL) { 4735 vu_transport->next_pg = vu_group; 4736 } 4737 pthread_mutex_unlock(&vu_transport->pg_lock); 4738 4739 return &vu_group->group; 4740 } 4741 4742 static struct spdk_nvmf_transport_poll_group * 4743 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4744 { 4745 struct nvmf_vfio_user_transport *vu_transport; 4746 struct nvmf_vfio_user_poll_group **vu_group; 4747 struct nvmf_vfio_user_sq *sq; 4748 struct nvmf_vfio_user_cq *cq; 4749 4750 struct spdk_nvmf_transport_poll_group *result = NULL; 4751 4752 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4753 cq = sq->ctrlr->cqs[sq->cqid]; 4754 assert(cq != NULL); 4755 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport); 4756 4757 pthread_mutex_lock(&vu_transport->pg_lock); 4758 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4759 goto out; 4760 } 4761 4762 if (!nvmf_qpair_is_admin_queue(qpair)) { 4763 /* 4764 * If this is shared IO CQ case, just return the used CQ's poll 4765 * group, so I/O completions don't have to use 4766 * spdk_thread_send_msg(). 
4767 */ 4768 if (cq->group != NULL) { 4769 result = cq->group; 4770 goto out; 4771 } 4772 4773 /* 4774 * If we're in interrupt mode, align all qpairs for a controller 4775 * on the same poll group by default, unless requested. This can 4776 * be lower in performance than running on a single poll group, 4777 * so we disable spreading by default. 4778 */ 4779 if (in_interrupt_mode(vu_transport) && 4780 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) { 4781 result = sq->ctrlr->sqs[0]->group; 4782 goto out; 4783 } 4784 4785 } 4786 4787 vu_group = &vu_transport->next_pg; 4788 assert(*vu_group != NULL); 4789 4790 result = &(*vu_group)->group; 4791 *vu_group = TAILQ_NEXT(*vu_group, link); 4792 if (*vu_group == NULL) { 4793 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4794 } 4795 4796 out: 4797 if (cq->group == NULL) { 4798 cq->group = result; 4799 } 4800 4801 pthread_mutex_unlock(&vu_transport->pg_lock); 4802 return result; 4803 } 4804 4805 static void 4806 vfio_user_poll_group_del_intr(struct nvmf_vfio_user_poll_group *vu_group) 4807 { 4808 assert(vu_group->intr_fd != -1); 4809 4810 spdk_interrupt_unregister(&vu_group->intr); 4811 4812 close(vu_group->intr_fd); 4813 vu_group->intr_fd = -1; 4814 } 4815 4816 /* called when process exits */ 4817 static void 4818 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4819 { 4820 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4821 struct nvmf_vfio_user_transport *vu_transport; 4822 4823 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4824 4825 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4826 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4827 transport); 4828 4829 if (in_interrupt_mode(vu_transport)) { 4830 vfio_user_poll_group_del_intr(vu_group); 4831 } 4832 4833 pthread_mutex_lock(&vu_transport->pg_lock); 4834 next_tgroup = TAILQ_NEXT(vu_group, link); 4835 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4836 if (next_tgroup == NULL) { 4837 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4838 } 4839 if (vu_transport->next_pg == vu_group) { 4840 vu_transport->next_pg = next_tgroup; 4841 } 4842 pthread_mutex_unlock(&vu_transport->pg_lock); 4843 4844 free(vu_group); 4845 } 4846 4847 static void 4848 _vfio_user_qpair_disconnect(void *ctx) 4849 { 4850 struct nvmf_vfio_user_sq *sq = ctx; 4851 4852 spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL); 4853 } 4854 4855 /* The function is used when socket connection is destroyed */ 4856 static int 4857 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4858 { 4859 struct nvmf_vfio_user_sq *sq; 4860 struct nvmf_vfio_user_endpoint *endpoint; 4861 4862 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4863 4864 endpoint = ctrlr->endpoint; 4865 assert(endpoint != NULL); 4866 4867 pthread_mutex_lock(&endpoint->lock); 4868 endpoint->need_relisten = true; 4869 ctrlr->disconnect = true; 4870 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4871 endpoint->ctrlr = NULL; 4872 free_ctrlr(ctrlr); 4873 pthread_mutex_unlock(&endpoint->lock); 4874 return 0; 4875 } 4876 4877 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 4878 /* add another round thread poll to avoid recursive endpoint lock */ 4879 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4880 } 4881 pthread_mutex_unlock(&endpoint->lock); 4882 4883 return 0; 4884 } 4885 4886 /* 4887 * Poll for and process any incoming vfio-user messages. 
4888 */ 4889 static int 4890 vfio_user_poll_vfu_ctx(void *ctx) 4891 { 4892 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4893 int ret; 4894 4895 assert(ctrlr != NULL); 4896 4897 /* This will call access_bar0_fn() if there are any writes 4898 * to the portion of the BAR that is not mmap'd */ 4899 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4900 if (spdk_unlikely(ret == -1)) { 4901 if (errno == EBUSY) { 4902 return SPDK_POLLER_IDLE; 4903 } 4904 4905 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4906 4907 /* 4908 * We lost the client; the reset callback will already have 4909 * unregistered the interrupt. 4910 */ 4911 if (errno == ENOTCONN) { 4912 vfio_user_destroy_ctrlr(ctrlr); 4913 return SPDK_POLLER_BUSY; 4914 } 4915 4916 /* 4917 * We might not have got a reset callback in this case, so 4918 * explicitly unregister the interrupt here. 4919 */ 4920 spdk_interrupt_unregister(&ctrlr->intr); 4921 ctrlr->intr_fd = -1; 4922 fail_ctrlr(ctrlr); 4923 } 4924 4925 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4926 } 4927 4928 struct vfio_user_post_cpl_ctx { 4929 struct nvmf_vfio_user_ctrlr *ctrlr; 4930 struct nvmf_vfio_user_cq *cq; 4931 struct spdk_nvme_cpl cpl; 4932 }; 4933 4934 static void 4935 _post_completion_msg(void *ctx) 4936 { 4937 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4938 4939 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4940 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4941 free(cpl_ctx); 4942 } 4943 4944 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4945 4946 static int 4947 vfio_user_poll_group_process(void *ctx) 4948 { 4949 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4950 int ret = 0; 4951 4952 SPDK_DEBUGLOG(vfio_user_db, "pg:%p got intr\n", vu_group); 4953 4954 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4955 4956 /* 4957 * Re-arm the event indexes. NB: this also could rearm other 4958 * controller's SQs. 4959 */ 4960 ret |= vfio_user_poll_group_rearm(vu_group); 4961 4962 vu_group->stats.pg_process_count++; 4963 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4964 } 4965 4966 static int 4967 vfio_user_poll_group_intr(void *ctx) 4968 { 4969 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4970 eventfd_t val; 4971 4972 eventfd_read(vu_group->intr_fd, &val); 4973 4974 vu_group->stats.intr++; 4975 4976 return vfio_user_poll_group_process(ctx); 4977 } 4978 4979 /* 4980 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4981 * the SQs assigned to our own poll group. Other poll groups are handled via 4982 * vfio_user_poll_group_intr(). 4983 */ 4984 static int 4985 vfio_user_ctrlr_intr(void *ctx) 4986 { 4987 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 4988 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 4989 struct nvmf_vfio_user_poll_group *vu_group; 4990 int ret = SPDK_POLLER_IDLE; 4991 4992 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 4993 4994 SPDK_DEBUGLOG(vfio_user_db, "ctrlr pg:%p got intr\n", vu_ctrlr_group); 4995 4996 vu_ctrlr_group->stats.ctrlr_intr++; 4997 4998 /* 4999 * Poll vfio-user for this controller. We need to do this before polling 5000 * any SQs, as this is where doorbell writes may be handled. 5001 */ 5002 ret = vfio_user_poll_vfu_ctx(vu_ctrlr); 5003 5004 /* 5005 * `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, 5006 * just return for this case. 
5007 */ 5008 if (vu_ctrlr->sqs[0] == NULL) { 5009 return ret; 5010 } 5011 5012 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) { 5013 /* 5014 * We may have just written to a doorbell owned by another 5015 * reactor: we need to prod them to make sure its SQs are polled 5016 * *after* the doorbell value is updated. 5017 */ 5018 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) { 5019 if (vu_group != vu_ctrlr_group) { 5020 SPDK_DEBUGLOG(vfio_user_db, "prodding pg:%p\n", vu_group); 5021 eventfd_write(vu_group->intr_fd, 1); 5022 } 5023 } 5024 } 5025 5026 ret |= vfio_user_poll_group_process(vu_ctrlr_group); 5027 5028 return ret; 5029 } 5030 5031 static void 5032 vfio_user_ctrlr_set_intr_mode(struct spdk_poller *poller, void *ctx, 5033 bool interrupt_mode) 5034 { 5035 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 5036 assert(ctrlr != NULL); 5037 assert(ctrlr->endpoint != NULL); 5038 5039 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 5040 ctrlr_id(ctrlr), interrupt_mode); 5041 5042 /* 5043 * interrupt_mode needs to persist across controller resets, so store 5044 * it in the endpoint instead. 5045 */ 5046 ctrlr->endpoint->interrupt_mode = interrupt_mode; 5047 5048 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 5049 } 5050 5051 /* 5052 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 5053 * set up and we can start operating on this controller. 5054 */ 5055 static void 5056 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 5057 struct spdk_nvmf_ctrlr *ctrlr) 5058 { 5059 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 5060 5061 vu_ctrlr->ctrlr = ctrlr; 5062 vu_ctrlr->cntlid = ctrlr->cntlid; 5063 vu_ctrlr->thread = spdk_get_thread(); 5064 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 5065 5066 if (!in_interrupt_mode(endpoint->transport)) { 5067 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5068 vu_ctrlr, 1000); 5069 return; 5070 } 5071 5072 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5073 vu_ctrlr, 0); 5074 5075 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 5076 assert(vu_ctrlr->intr_fd != -1); 5077 5078 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 5079 vfio_user_ctrlr_intr, vu_ctrlr); 5080 5081 assert(vu_ctrlr->intr != NULL); 5082 5083 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 5084 vfio_user_ctrlr_set_intr_mode, 5085 vu_ctrlr); 5086 } 5087 5088 static int 5089 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 5090 { 5091 struct nvmf_vfio_user_poll_group *vu_group; 5092 struct nvmf_vfio_user_sq *sq = cb_arg; 5093 struct nvmf_vfio_user_cq *admin_cq; 5094 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5095 struct nvmf_vfio_user_endpoint *endpoint; 5096 5097 assert(sq != NULL); 5098 assert(req != NULL); 5099 5100 vu_ctrlr = sq->ctrlr; 5101 assert(vu_ctrlr != NULL); 5102 endpoint = vu_ctrlr->endpoint; 5103 assert(endpoint != NULL); 5104 5105 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 5106 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 5107 endpoint->ctrlr = NULL; 5108 free_ctrlr(vu_ctrlr); 5109 return -1; 5110 } 5111 5112 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 5113 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 5114 5115 admin_cq = vu_ctrlr->cqs[0]; 5116 assert(admin_cq != NULL); 5117 assert(admin_cq->group != NULL); 5118 assert(admin_cq->group->group->thread != NULL); 5119 5120 
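	/*
	 * The endpoint lock serializes changes to the controller's
	 * connected_sqs list and controller start-up against the disconnect
	 * path (vfio_user_destroy_ctrlr() and nvmf_vfio_user_close_qpair()),
	 * which walk and modify the same list under this lock.
	 */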
pthread_mutex_lock(&endpoint->lock); 5121 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 5122 assert(admin_cq->group->group->thread == spdk_get_thread()); 5123 /* 5124 * The admin queue is special as SQ0 and CQ0 are created 5125 * together. 5126 */ 5127 admin_cq->cq_ref = 1; 5128 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 5129 } else { 5130 /* For I/O queues this command was generated in response to an 5131 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 5132 * been completed. Complete it now. 5133 */ 5134 if (sq->post_create_io_sq_completion) { 5135 if (admin_cq->group->group->thread != spdk_get_thread()) { 5136 struct vfio_user_post_cpl_ctx *cpl_ctx; 5137 5138 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 5139 if (!cpl_ctx) { 5140 return -ENOMEM; 5141 } 5142 cpl_ctx->ctrlr = vu_ctrlr; 5143 cpl_ctx->cq = admin_cq; 5144 cpl_ctx->cpl.sqid = 0; 5145 cpl_ctx->cpl.cdw0 = 0; 5146 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 5147 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 5148 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5149 5150 spdk_thread_send_msg(admin_cq->group->group->thread, 5151 _post_completion_msg, 5152 cpl_ctx); 5153 } else { 5154 post_completion(vu_ctrlr, admin_cq, 0, 0, 5155 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 5156 } 5157 sq->post_create_io_sq_completion = false; 5158 } else if (in_interrupt_mode(endpoint->transport)) { 5159 /* 5160 * If we're live migrating a guest, there is a window 5161 * where the I/O queues haven't been set up but the 5162 * device is in running state, during which the guest 5163 * might write to a doorbell. This doorbell write will 5164 * go unnoticed, so let's poll the whole controller to 5165 * pick that up. 5166 */ 5167 ctrlr_kick(vu_ctrlr); 5168 } 5169 sq->sq_state = VFIO_USER_SQ_ACTIVE; 5170 } 5171 5172 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 5173 pthread_mutex_unlock(&endpoint->lock); 5174 5175 free(req->req.iov[0].iov_base); 5176 req->req.iov[0].iov_base = NULL; 5177 req->req.iovcnt = 0; 5178 req->req.data = NULL; 5179 5180 return 0; 5181 } 5182 5183 /* 5184 * Add the given qpair to the given poll group. New qpairs are added via 5185 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 5186 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 5187 * nvmf_transport_poll_group_add(). 5188 */ 5189 static int 5190 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 5191 struct spdk_nvmf_qpair *qpair) 5192 { 5193 struct nvmf_vfio_user_sq *sq; 5194 struct nvmf_vfio_user_req *vu_req; 5195 struct nvmf_vfio_user_ctrlr *ctrlr; 5196 struct spdk_nvmf_request *req; 5197 struct spdk_nvmf_fabric_connect_data *data; 5198 bool admin; 5199 5200 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5201 sq->group = group; 5202 ctrlr = sq->ctrlr; 5203 5204 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 5205 ctrlr_id(ctrlr), sq->qpair.qid, 5206 sq, qpair, group); 5207 5208 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 5209 5210 vu_req = get_nvmf_vfio_user_req(sq); 5211 if (vu_req == NULL) { 5212 return -1; 5213 } 5214 5215 req = &vu_req->req; 5216 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 5217 req->cmd->connect_cmd.cid = 0; 5218 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 5219 req->cmd->connect_cmd.recfmt = 0; 5220 req->cmd->connect_cmd.sqsize = sq->size - 1; 5221 req->cmd->connect_cmd.qid = admin ? 
0 : qpair->qid; 5222 5223 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 5224 5225 data = calloc(1, req->length); 5226 if (data == NULL) { 5227 nvmf_vfio_user_req_free(req); 5228 return -ENOMEM; 5229 } 5230 5231 spdk_iov_one(req->iov, &req->iovcnt, data, req->length); 5232 req->data = data; 5233 5234 data->cntlid = ctrlr->cntlid; 5235 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5236 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5237 5238 vu_req->cb_fn = handle_queue_connect_rsp; 5239 vu_req->cb_arg = sq; 5240 5241 SPDK_DEBUGLOG(nvmf_vfio, 5242 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5243 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5244 5245 spdk_nvmf_request_exec_fabrics(req); 5246 return 0; 5247 } 5248 5249 static int 5250 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5251 struct spdk_nvmf_qpair *qpair) 5252 { 5253 struct nvmf_vfio_user_sq *sq; 5254 struct nvmf_vfio_user_poll_group *vu_group; 5255 5256 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5257 5258 SPDK_DEBUGLOG(nvmf_vfio, 5259 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5260 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5261 5262 5263 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5264 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5265 5266 return 0; 5267 } 5268 5269 static void 5270 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5271 { 5272 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5273 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5274 vu_req->iovcnt = 0; 5275 vu_req->req.iovcnt = 0; 5276 vu_req->req.data = NULL; 5277 vu_req->req.length = 0; 5278 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5279 5280 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5281 } 5282 5283 static int 5284 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5285 { 5286 struct nvmf_vfio_user_sq *sq; 5287 struct nvmf_vfio_user_req *vu_req; 5288 5289 assert(req != NULL); 5290 5291 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5292 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5293 5294 _nvmf_vfio_user_req_free(sq, vu_req); 5295 5296 return 0; 5297 } 5298 5299 static int 5300 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5301 { 5302 struct nvmf_vfio_user_sq *sq; 5303 struct nvmf_vfio_user_req *vu_req; 5304 5305 assert(req != NULL); 5306 5307 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5308 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5309 5310 if (vu_req->cb_fn != NULL) { 5311 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5312 fail_ctrlr(sq->ctrlr); 5313 } 5314 } 5315 5316 _nvmf_vfio_user_req_free(sq, vu_req); 5317 5318 return 0; 5319 } 5320 5321 static void 5322 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5323 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5324 { 5325 struct nvmf_vfio_user_sq *sq; 5326 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5327 struct nvmf_vfio_user_endpoint *endpoint; 5328 struct vfio_user_delete_sq_ctx *del_ctx; 5329 5330 assert(qpair != NULL); 5331 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5332 vu_ctrlr = sq->ctrlr; 5333 endpoint = vu_ctrlr->endpoint; 5334 del_ctx = sq->delete_ctx; 5335 sq->delete_ctx = NULL; 5336 5337 pthread_mutex_lock(&endpoint->lock); 5338 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5339 delete_sq_done(vu_ctrlr, sq); 5340 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5341 
endpoint->ctrlr = NULL; 5342 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5343 /* The controller will be freed, we can resume the subsystem 5344 * now so that the endpoint can be ready to accept another 5345 * new connection. 5346 */ 5347 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5348 vfio_user_endpoint_resume_done, endpoint); 5349 } 5350 free_ctrlr(vu_ctrlr); 5351 } 5352 pthread_mutex_unlock(&endpoint->lock); 5353 5354 if (del_ctx) { 5355 vfio_user_qpair_delete_cb(del_ctx); 5356 } 5357 5358 if (cb_fn) { 5359 cb_fn(cb_arg); 5360 } 5361 } 5362 5363 /** 5364 * Returns a preallocated request, or NULL if there isn't one available. 5365 */ 5366 static struct nvmf_vfio_user_req * 5367 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5368 { 5369 struct nvmf_vfio_user_req *req; 5370 5371 if (sq == NULL) { 5372 return NULL; 5373 } 5374 5375 req = TAILQ_FIRST(&sq->free_reqs); 5376 if (req == NULL) { 5377 return NULL; 5378 } 5379 5380 TAILQ_REMOVE(&sq->free_reqs, req, link); 5381 5382 return req; 5383 } 5384 5385 static int 5386 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5387 { 5388 uint16_t nr; 5389 uint32_t nlb, nsid; 5390 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5391 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5392 struct spdk_nvmf_ns *ns; 5393 5394 nsid = cmd->nsid; 5395 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5396 if (ns == NULL || ns->bdev == NULL) { 5397 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5398 return -EINVAL; 5399 } 5400 5401 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5402 nr = cmd->cdw10_bits.dsm.nr + 1; 5403 return nr * sizeof(struct spdk_nvme_dsm_range); 5404 } 5405 5406 if (cmd->opc == SPDK_NVME_OPC_COPY) { 5407 nr = (cmd->cdw12 & 0x000000ffu) + 1; 5408 return nr * sizeof(struct spdk_nvme_scc_source_range); 5409 } 5410 5411 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5412 return nlb * spdk_bdev_get_block_size(ns->bdev); 5413 } 5414 5415 static int 5416 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5417 { 5418 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5419 uint32_t len = 0, numdw = 0; 5420 uint8_t fid; 5421 int iovcnt; 5422 5423 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5424 5425 if (req->xfer == SPDK_NVME_DATA_NONE) { 5426 return 0; 5427 } 5428 5429 switch (cmd->opc) { 5430 case SPDK_NVME_OPC_IDENTIFY: 5431 len = 4096; 5432 break; 5433 case SPDK_NVME_OPC_GET_LOG_PAGE: 5434 numdw = ((((uint32_t)cmd->cdw11_bits.get_log_page.numdu << 16) | 5435 cmd->cdw10_bits.get_log_page.numdl) + 1); 5436 if (numdw > UINT32_MAX / 4) { 5437 return -EINVAL; 5438 } 5439 len = numdw * 4; 5440 break; 5441 case SPDK_NVME_OPC_GET_FEATURES: 5442 case SPDK_NVME_OPC_SET_FEATURES: 5443 fid = cmd->cdw10_bits.set_features.fid; 5444 switch (fid) { 5445 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5446 len = 4096; 5447 break; 5448 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5449 len = 256; 5450 break; 5451 case SPDK_NVME_FEAT_TIMESTAMP: 5452 len = 8; 5453 break; 5454 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5455 len = 512; 5456 break; 5457 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5458 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5459 len = 16; 5460 } else { 5461 len = 8; 5462 } 5463 break; 5464 default: 5465 return 0; 5466 } 5467 break; 5468 default: 5469 return 0; 5470 } 5471 5472 /* ADMIN command will not use SGL */ 5473 if (cmd->psdt != 0) { 5474 return -EINVAL; 5475 } 5476 5477 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5478 
if (iovcnt < 0) { 5479 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5480 ctrlr_id(ctrlr), cmd->opc); 5481 return -1; 5482 } 5483 req->length = len; 5484 req->data = req->iov[0].iov_base; 5485 req->iovcnt = iovcnt; 5486 5487 return 0; 5488 } 5489 5490 /* 5491 * Map an I/O command's buffers. 5492 * 5493 * Returns 0 on success and -errno on failure. 5494 */ 5495 static int 5496 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5497 { 5498 int len, iovcnt; 5499 struct spdk_nvme_cmd *cmd; 5500 5501 assert(ctrlr != NULL); 5502 assert(req != NULL); 5503 5504 cmd = &req->cmd->nvme_cmd; 5505 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5506 5507 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5508 return 0; 5509 } 5510 5511 len = get_nvmf_io_req_length(req); 5512 if (len < 0) { 5513 return -EINVAL; 5514 } 5515 req->length = len; 5516 5517 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5518 if (iovcnt < 0) { 5519 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5520 return -EFAULT; 5521 } 5522 req->data = req->iov[0].iov_base; 5523 req->iovcnt = iovcnt; 5524 5525 return 0; 5526 } 5527 5528 static int 5529 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5530 struct nvmf_vfio_user_sq *sq) 5531 { 5532 int err; 5533 struct nvmf_vfio_user_req *vu_req; 5534 struct spdk_nvmf_request *req; 5535 5536 assert(ctrlr != NULL); 5537 assert(cmd != NULL); 5538 5539 vu_req = get_nvmf_vfio_user_req(sq); 5540 if (spdk_unlikely(vu_req == NULL)) { 5541 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5542 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5543 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5544 5545 } 5546 req = &vu_req->req; 5547 5548 assert(req->qpair != NULL); 5549 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5550 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5551 5552 vu_req->cb_fn = handle_cmd_rsp; 5553 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5554 req->cmd->nvme_cmd = *cmd; 5555 5556 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5557 err = map_admin_cmd_req(ctrlr, req); 5558 } else { 5559 switch (cmd->opc) { 5560 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5561 case SPDK_NVME_OPC_RESERVATION_REPORT: 5562 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5563 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5564 err = -ENOTSUP; 5565 break; 5566 default: 5567 err = map_io_cmd_req(ctrlr, req); 5568 break; 5569 } 5570 } 5571 5572 if (spdk_unlikely(err < 0)) { 5573 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5574 ctrlr_id(ctrlr), cmd->opc); 5575 req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5576 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5577 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5578 _nvmf_vfio_user_req_free(sq, vu_req); 5579 return err; 5580 } 5581 5582 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5583 spdk_nvmf_request_exec(req); 5584 5585 return 0; 5586 } 5587 5588 /* 5589 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5590 * here: if the host isn't up to date, and is apparently not actively processing 5591 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 
5592 */ 5593 static void 5594 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5595 struct nvmf_vfio_user_sq *sq) 5596 { 5597 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5598 uint32_t cq_head; 5599 uint32_t cq_tail; 5600 5601 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5602 return; 5603 } 5604 5605 cq_tail = *cq_tailp(cq); 5606 5607 /* Already sent? */ 5608 if (cq_tail == cq->last_trigger_irq_tail) { 5609 return; 5610 } 5611 5612 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5613 cq_head = *cq_dbl_headp(cq); 5614 5615 if (cq_head != cq_tail && cq_head == cq->last_head) { 5616 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5617 if (err != 0) { 5618 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5619 ctrlr_id(ctrlr)); 5620 } else { 5621 cq->last_trigger_irq_tail = cq_tail; 5622 } 5623 } 5624 5625 cq->last_head = cq_head; 5626 } 5627 5628 /* Returns the number of commands processed, or a negative value on error. */ 5629 static int 5630 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5631 { 5632 struct nvmf_vfio_user_ctrlr *ctrlr; 5633 uint32_t new_tail; 5634 int count = 0; 5635 5636 assert(sq != NULL); 5637 5638 ctrlr = sq->ctrlr; 5639 5640 /* 5641 * A quiesced, or migrating, controller should never process new 5642 * commands. 5643 */ 5644 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 5645 return SPDK_POLLER_IDLE; 5646 } 5647 5648 if (ctrlr->adaptive_irqs_enabled) { 5649 handle_suppressed_irq(ctrlr, sq); 5650 } 5651 5652 /* On aarch64 platforms, doorbells update from guest VM may not be seen 5653 * on SPDK target side. This is because there is memory type mismatch 5654 * situation here. That is on guest VM side, the doorbells are treated as 5655 * device memory while on SPDK target side, it is treated as normal 5656 * memory. And this situation cause problem on ARM platform. 5657 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5658 * Memory-aliasing-and-mismatched-memory-types". Only using spdk_mb() 5659 * cannot fix this. Use "dc civac" to invalidate cache may solve 5660 * this. 5661 */ 5662 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5663 5664 /* Load-Acquire. */ 5665 new_tail = *sq_dbl_tailp(sq); 5666 5667 new_tail = new_tail & 0xffffu; 5668 if (spdk_unlikely(new_tail >= sq->size)) { 5669 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5670 new_tail); 5671 spdk_nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE); 5672 5673 return -1; 5674 } 5675 5676 if (*sq_headp(sq) == new_tail) { 5677 return 0; 5678 } 5679 5680 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5681 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5682 if (ctrlr->sdbl != NULL) { 5683 SPDK_DEBUGLOG(nvmf_vfio, 5684 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5685 ctrlr_id(ctrlr), sq->qid, 5686 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5687 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5688 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5689 } 5690 5691 /* 5692 * Ensure that changes to the queue are visible to us. 5693 * The host driver should write the queue first, do a wmb(), and then 5694 * update the SQ tail doorbell (their Store-Release). 5695 */ 5696 spdk_rmb(); 5697 5698 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5699 if (spdk_unlikely(count < 0)) { 5700 fail_ctrlr(ctrlr); 5701 } 5702 5703 return count; 5704 } 5705 5706 /* 5707 * vfio-user transport poll handler. 
Note that the library context is polled in 5708 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the 5709 * active SQs. 5710 * 5711 * Returns the number of commands processed, or a negative value on error. 5712 */ 5713 static int 5714 nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) 5715 { 5716 struct nvmf_vfio_user_poll_group *vu_group; 5717 struct nvmf_vfio_user_sq *sq, *tmp; 5718 int count = 0; 5719 5720 assert(group != NULL); 5721 5722 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5723 5724 SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n"); 5725 5726 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) { 5727 int ret; 5728 5729 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 5730 continue; 5731 } 5732 5733 ret = nvmf_vfio_user_sq_poll(sq); 5734 5735 if (spdk_unlikely(ret < 0)) { 5736 return ret; 5737 } 5738 5739 count += ret; 5740 } 5741 5742 vu_group->stats.polls++; 5743 vu_group->stats.poll_reqs += count; 5744 vu_group->stats.poll_reqs_squared += count * count; 5745 if (count == 0) { 5746 vu_group->stats.polls_spurious++; 5747 } 5748 5749 return count; 5750 } 5751 5752 static int 5753 nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, 5754 struct spdk_nvme_transport_id *trid) 5755 { 5756 struct nvmf_vfio_user_sq *sq; 5757 struct nvmf_vfio_user_ctrlr *ctrlr; 5758 5759 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5760 ctrlr = sq->ctrlr; 5761 5762 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5763 return 0; 5764 } 5765 5766 static int 5767 nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, 5768 struct spdk_nvme_transport_id *trid) 5769 { 5770 return 0; 5771 } 5772 5773 static int 5774 nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, 5775 struct spdk_nvme_transport_id *trid) 5776 { 5777 struct nvmf_vfio_user_sq *sq; 5778 struct nvmf_vfio_user_ctrlr *ctrlr; 5779 5780 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5781 ctrlr = sq->ctrlr; 5782 5783 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid)); 5784 return 0; 5785 } 5786 5787 static void 5788 nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair, 5789 struct spdk_nvmf_request *req) 5790 { 5791 struct spdk_nvmf_request *req_to_abort = NULL; 5792 struct spdk_nvmf_request *temp_req = NULL; 5793 uint16_t cid; 5794 5795 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; 5796 5797 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) { 5798 struct nvmf_vfio_user_req *vu_req; 5799 5800 vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req); 5801 5802 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) { 5803 req_to_abort = temp_req; 5804 break; 5805 } 5806 } 5807 5808 if (req_to_abort == NULL) { 5809 spdk_nvmf_request_complete(req); 5810 return; 5811 } 5812 5813 req->req_to_abort = req_to_abort; 5814 nvmf_ctrlr_abort_request(req); 5815 } 5816 5817 static void 5818 nvmf_vfio_user_poll_group_dump_stat(struct spdk_nvmf_transport_poll_group *group, 5819 struct spdk_json_write_ctx *w) 5820 { 5821 struct nvmf_vfio_user_poll_group *vu_group = SPDK_CONTAINEROF(group, 5822 struct nvmf_vfio_user_poll_group, group); 5823 uint64_t polls_denom; 5824 5825 spdk_json_write_named_uint64(w, "ctrlr_intr", vu_group->stats.ctrlr_intr); 5826 spdk_json_write_named_uint64(w, "ctrlr_kicks", vu_group->stats.ctrlr_kicks); 5827 spdk_json_write_named_uint64(w, "won", vu_group->stats.won); 5828 spdk_json_write_named_uint64(w, "lost", 
vu_group->stats.lost); 5829 spdk_json_write_named_uint64(w, "lost_count", vu_group->stats.lost_count); 5830 spdk_json_write_named_uint64(w, "rearms", vu_group->stats.rearms); 5831 spdk_json_write_named_uint64(w, "pg_process_count", vu_group->stats.pg_process_count); 5832 spdk_json_write_named_uint64(w, "intr", vu_group->stats.intr); 5833 spdk_json_write_named_uint64(w, "polls", vu_group->stats.polls); 5834 spdk_json_write_named_uint64(w, "polls_spurious", vu_group->stats.polls_spurious); 5835 spdk_json_write_named_uint64(w, "poll_reqs", vu_group->stats.poll_reqs); 5836 polls_denom = vu_group->stats.polls * (vu_group->stats.polls - 1); 5837 if (polls_denom) { 5838 uint64_t n = vu_group->stats.polls * vu_group->stats.poll_reqs_squared - vu_group->stats.poll_reqs * 5839 vu_group->stats.poll_reqs; 5840 spdk_json_write_named_double(w, "poll_reqs_variance", sqrt(n / polls_denom)); 5841 } 5842 5843 spdk_json_write_named_uint64(w, "cqh_admin_writes", vu_group->stats.cqh_admin_writes); 5844 spdk_json_write_named_uint64(w, "cqh_io_writes", vu_group->stats.cqh_io_writes); 5845 } 5846 5847 static void 5848 nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts) 5849 { 5850 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH; 5851 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; 5852 opts->in_capsule_data_size = 0; 5853 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE; 5854 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE; 5855 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 5856 opts->num_shared_buffers = 0; 5857 opts->buf_cache_size = 0; 5858 opts->association_timeout = 0; 5859 opts->transport_specific = NULL; 5860 } 5861 5862 const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = { 5863 .name = "VFIOUSER", 5864 .type = SPDK_NVME_TRANSPORT_VFIOUSER, 5865 .opts_init = nvmf_vfio_user_opts_init, 5866 .create = nvmf_vfio_user_create, 5867 .destroy = nvmf_vfio_user_destroy, 5868 5869 .listen = nvmf_vfio_user_listen, 5870 .stop_listen = nvmf_vfio_user_stop_listen, 5871 .cdata_init = nvmf_vfio_user_cdata_init, 5872 .listen_associate = nvmf_vfio_user_listen_associate, 5873 5874 .listener_discover = nvmf_vfio_user_discover, 5875 5876 .poll_group_create = nvmf_vfio_user_poll_group_create, 5877 .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group, 5878 .poll_group_destroy = nvmf_vfio_user_poll_group_destroy, 5879 .poll_group_add = nvmf_vfio_user_poll_group_add, 5880 .poll_group_remove = nvmf_vfio_user_poll_group_remove, 5881 .poll_group_poll = nvmf_vfio_user_poll_group_poll, 5882 5883 .req_free = nvmf_vfio_user_req_free, 5884 .req_complete = nvmf_vfio_user_req_complete, 5885 5886 .qpair_fini = nvmf_vfio_user_close_qpair, 5887 .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid, 5888 .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid, 5889 .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid, 5890 .qpair_abort_request = nvmf_vfio_user_qpair_abort_request, 5891 5892 .poll_group_dump_stat = nvmf_vfio_user_poll_group_dump_stat, 5893 }; 5894 5895 SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user); 5896 SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio) 5897 SPDK_LOG_REGISTER_COMPONENT(vfio_user_db) 5898
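
/*
 * Typical usage (an illustrative sketch only; the NQN, bdev name and paths
 * below are placeholders, not values used by this file): the transport
 * registers above under the name "VFIOUSER", so it is normally enabled
 * through the standard SPDK RPCs, e.g. with scripts/rpc.py:
 *
 *   nvmf_create_transport -t VFIOUSER
 *   nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a -s SPDK0
 *   nvmf_subsystem_add_ns nqn.2019-07.io.spdk:cnode0 Malloc0
 *   nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 -t VFIOUSER \
 *       -a /var/run/vfio-user -s 0
 *
 * The listener's traddr is a directory: nvmf_vfio_user_listen() creates the
 * bar0 and migr files plus the "cntrl" vfio-user socket inside it, and a
 * vfio-user client (e.g. a vfio-user capable QEMU) then connects to
 * <traddr>/cntrl.
 */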