/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2020 Intel Corporation.
 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved.
 * Copyright (c) 2022, 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

/*
 * NVMe over vfio-user transport
 */

#include <sys/param.h>

#include <vfio-user/libvfio-user.h>
#include <vfio-user/pci_defs.h>

#include "spdk/barrier.h"
#include "spdk/stdinc.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf_transport.h"
#include "spdk/sock.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"

#include "transport.h"

#include "nvmf_internal.h"

#define SWAP(x, y)                  \
	do                          \
	{                           \
		typeof(x) _tmp = x; \
		x = y;              \
		y = _tmp;           \
	} while (0)

#define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
#define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
#define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
#define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE

#define NVME_DOORBELLS_OFFSET 0x1000
#define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2
#define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3
#define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX

#define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 512
#define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4)

/* NVMe spec 1.4, section 5.21.1.7 */
SPDK_STATIC_ASSERT(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR >= 2 &&
		   NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR <= SPDK_NVME_MAX_IO_QUEUES,
		   "bad number of queues");

/*
 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space
 * available on PCI-X 2.0 and PCI Express buses
 */
#define NVME_REG_CFG_SIZE 0x1000

/*
 * Doorbells must be page aligned so that they can be memory mapped.
 *
 * TODO does the NVMe spec also require this? Document it.
 */
#define NVMF_VFIO_USER_DOORBELLS_SIZE \
	SPDK_ALIGN_CEIL( \
		(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2 * SPDK_NVME_DOORBELL_REGISTER_SIZE), \
		0x1000)
#define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE)

/*
 * TODO check the PCI spec whether BAR4 and BAR5 really have to be at least one
 * page and a multiple of page size (maybe QEMU also needs this?). Document all
 * this.
 */

#define NVMF_VFIO_USER_MSIX_NUM MAX(CHAR_BIT, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR)

#define NVMF_VFIO_USER_MSIX_TABLE_BIR (4)
#define NVMF_VFIO_USER_BAR4_SIZE SPDK_ALIGN_CEIL((NVMF_VFIO_USER_MSIX_NUM * 16), 0x1000)
SPDK_STATIC_ASSERT(NVMF_VFIO_USER_BAR4_SIZE > 0, "Incorrect size");

/*
 * TODO according to the PCI spec we need one bit per vector, document the
 * relevant section.
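 *
 * For example, with NVMF_VFIO_USER_MSIX_NUM == MAX(8, 512) == 512 vectors,
 * one pending bit per vector is only 512 / 8 == 64 bytes, but
 * SPDK_ALIGN_CEIL below still rounds BAR5 up to a full 0x1000-byte page
 * (just as BAR4 above is rounded up for the 16-byte-per-vector MSI-X table).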
 */
#define NVMF_VFIO_USER_MSIX_PBA_BIR (5)
#define NVMF_VFIO_USER_BAR5_SIZE SPDK_ALIGN_CEIL((NVMF_VFIO_USER_MSIX_NUM / CHAR_BIT), 0x1000)
SPDK_STATIC_ASSERT(NVMF_VFIO_USER_BAR5_SIZE > 0, "Incorrect size");
struct nvmf_vfio_user_req;

typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);

/* 1 more for PRP2 list itself */
#define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1)

enum nvmf_vfio_user_req_state {
	VFIO_USER_REQUEST_STATE_FREE = 0,
	VFIO_USER_REQUEST_STATE_EXECUTING,
};

/*
 * Support for live migration in NVMf/vfio-user: live migration is implemented
 * by stopping the NVMf subsystem when the device is instructed to enter the
 * stop-and-copy state and then trivially, and most importantly safely,
 * collecting migration state and providing it to the vfio-user client. We
 * don't provide any migration state at the pre-copy state as that's too
 * complicated to do; we might support this in the future.
 */


/* NVMe device state representation */
struct nvme_migr_sq_state {
	uint16_t sqid;
	uint16_t cqid;
	uint32_t head;
	uint32_t size;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size");

struct nvme_migr_cq_state {
	uint16_t cqid;
	uint16_t phase;
	uint32_t tail;
	uint32_t size;
	uint32_t iv;
	uint32_t ien;
	uint32_t reserved;
	uint64_t dma_addr;
};
SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size");

#define VFIO_USER_MIGR_CALLBACK_VERS 1
#define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23

/* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned.
 *
 * NVMe device migration region is defined as below:
 * -------------------------------------------------------------------------
 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs |
 * -------------------------------------------------------------------------
 *
 * Keep vfio_user_nvme_migr_header at a fixed 0x1000-byte length; all newly
 * added fields can use the reserved space at the end of the data structure.
 */
struct vfio_user_nvme_migr_header {
	/* Magic value to validate migration data */
	uint32_t magic;
	/* Version to check that the data is the same from source to destination */
	uint32_t version;

	/* The library uses this field to know how many fields in this
	 * structure are valid, starting at the beginning of this data
	 * structure. Fields added in the future should use the `unused`
	 * memory space.
	 */
	uint32_t opts_size;
	uint32_t reserved0;

	/* BARs information */
	uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS];
	uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS];

	/* Queue pair start offset, starting at the beginning of this
	 * data structure.
	 */
	uint64_t qp_offset;
	uint64_t qp_len;

	/* Controller data structure */
	uint32_t num_io_queues;
	uint32_t reserved1;

	/* NVMf controller data offset and length, if present, starting at
	 * the beginning of this data structure.
	 */
	uint64_t nvmf_data_offset;
	uint64_t nvmf_data_len;

	/*
	 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA
	 * address.
	 */
	uint32_t sdbl;

	/* Shadow doorbell DMA addresses.
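	 * These are the guest-physical buffer addresses (PRP1 = shadow
	 * doorbells, PRP2 = eventidxs, as mapped by map_sdbl()) that the
	 * guest configured; they are saved verbatim so that the destination
	 * can re-map the same buffers when the controller is restored.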
	 */
	uint64_t shadow_doorbell_buffer;
	uint64_t eventidx_buffer;

	/* Reserved memory space for newly added fields; this field is
	 * always at the end of this data structure.
	 */
	uint8_t unused[3856];
};
SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size");

struct vfio_user_nvme_migr_qp {
	struct nvme_migr_sq_state sq;
	struct nvme_migr_cq_state cq;
};

/* NVMe state definition used to load/restore from/to NVMe migration BAR region */
struct vfio_user_nvme_migr_state {
	struct vfio_user_nvme_migr_header ctrlr_header;
	struct spdk_nvmf_ctrlr_migr_data nvmf_data;
	struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	uint8_t doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE];
	uint8_t cfg[NVME_REG_CFG_SIZE];
};

struct nvmf_vfio_user_req {
	struct spdk_nvmf_request req;
	struct spdk_nvme_cpl rsp;
	struct spdk_nvme_cmd cmd;

	enum nvmf_vfio_user_req_state state;
	nvmf_vfio_user_req_cb_fn cb_fn;
	void *cb_arg;

	/* old CC before prop_set_cc fabric command */
	union spdk_nvme_cc_register cc;

	TAILQ_ENTRY(nvmf_vfio_user_req) link;

	struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS];
	uint8_t iovcnt;

	/* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */
	uint8_t sg[];
};

#define MAP_R (0)
#define MAP_RW (1 << 0)
#define MAP_INITIALIZE (1 << 1)
#define MAP_QUIET (1 << 2)

/*
 * Mapping of an NVMe queue.
 *
 * This holds the information tracking a local process mapping of an NVMe queue
 * shared by the client.
 */
struct nvme_q_mapping {
	/* iov of local process mapping. */
	struct iovec iov;
	/* Stored sg, needed for unmap. */
	dma_sg_t *sg;
	/* Client PRP of queue. */
	uint64_t prp1;
	/* Total length in bytes. */
	uint64_t len;
};

enum nvmf_vfio_user_sq_state {
	VFIO_USER_SQ_UNUSED = 0,
	VFIO_USER_SQ_CREATED,
	VFIO_USER_SQ_DELETED,
	VFIO_USER_SQ_ACTIVE,
	VFIO_USER_SQ_INACTIVE
};

enum nvmf_vfio_user_cq_state {
	VFIO_USER_CQ_UNUSED = 0,
	VFIO_USER_CQ_CREATED,
	VFIO_USER_CQ_DELETED,
};

enum nvmf_vfio_user_ctrlr_state {
	VFIO_USER_CTRLR_CREATING = 0,
	VFIO_USER_CTRLR_RUNNING,
	/* Quiesce requested by libvfio-user */
	VFIO_USER_CTRLR_PAUSING,
	/* NVMf subsystem is paused, it's safe to do PCI reset, memory register,
	 * memory unregister, and vfio migration state transition in this state.
	 */
	VFIO_USER_CTRLR_PAUSED,
	/*
	 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI
	 * reset, memory register and unregister, controller in destination VM has
	 * been restored). NVMf subsystem resume has been requested.
	 */
	VFIO_USER_CTRLR_RESUMING,
	/*
	 * Implies that the NVMf subsystem is paused. Both the controller in the
	 * source VM and the one in the destination VM are in this state when doing
	 * live migration.
	 */
	VFIO_USER_CTRLR_MIGRATING
};

struct nvmf_vfio_user_sq {
	struct spdk_nvmf_qpair qpair;
	struct spdk_nvmf_transport_poll_group *group;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_sq_state sq_state;

	uint32_t head;
	volatile uint32_t *dbl_tailp;

	/* Whether a shadow doorbell eventidx needs setting.
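	 * When set, this SQ might otherwise go idle without a guaranteed BAR0
	 * doorbell write to wake us; see set_sq_eventidx() and
	 * vfio_user_sq_rearm() below.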
	 */
	bool need_rearm;

	/* multiple SQs can be mapped to the same CQ */
	uint16_t cqid;

	/* handle_queue_connect_rsp() can be used both for the CREATE IO SQ response
	 * and for the SQ re-connect response in the destination VM. In the former
	 * case we will post an NVMe completion to the VM; we will not set this flag
	 * when re-connecting SQs in the destination VM.
	 */
	bool post_create_io_sq_completion;
	/* Copy of Create IO SQ command, this field is used together with
	 * the `post_create_io_sq_completion` flag.
	 */
	struct spdk_nvme_cmd create_io_sq_cmd;

	struct vfio_user_delete_sq_ctx *delete_ctx;

	/* Currently unallocated reqs. */
	TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs;
	/* Poll group entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) link;
	/* Connected SQ entry */
	TAILQ_ENTRY(nvmf_vfio_user_sq) tailq;
};

struct nvmf_vfio_user_cq {
	struct spdk_nvmf_transport_poll_group *group;
	int cq_ref;

	uint32_t qid;
	/* Number of entries in queue. */
	uint32_t size;
	struct nvme_q_mapping mapping;
	enum nvmf_vfio_user_cq_state cq_state;

	uint32_t tail;
	volatile uint32_t *dbl_headp;

	bool phase;

	uint16_t iv;
	bool ien;

	uint32_t last_head;
	uint32_t last_trigger_irq_tail;
};

struct nvmf_vfio_user_poll_group {
	struct spdk_nvmf_transport_poll_group group;
	TAILQ_ENTRY(nvmf_vfio_user_poll_group) link;
	TAILQ_HEAD(, nvmf_vfio_user_sq) sqs;
	struct spdk_interrupt *intr;
	int intr_fd;
	struct {

		/*
		 * ctrlr_intr and ctrlr_kicks will be zero for all other poll
		 * groups. However, they can be zero even for the poll group
		 * the controller belongs to, if no vfio-user message has been
		 * received or the controller hasn't been kicked yet.
		 */

		/*
		 * Number of times vfio_user_ctrlr_intr() has run:
		 * vfio-user file descriptor has been ready or explicitly
		 * kicked (see below).
		 */
		uint64_t ctrlr_intr;

		/*
		 * Kicks to the controller by ctrlr_kick().
		 * ctrlr_intr - ctrlr_kicks is the number of times the
		 * vfio-user poll file descriptor has been ready.
		 */
		uint64_t ctrlr_kicks;

		/*
		 * How many times we won the race arming an SQ.
		 */
		uint64_t won;

		/*
		 * How many times we lost the race arming an SQ.
		 */
		uint64_t lost;

		/*
		 * How many requests we processed in total each time we lost
		 * the rearm race.
		 */
		uint64_t lost_count;

		/*
		 * Number of times we attempted to rearm all the SQs in the
		 * poll group.
		 */
		uint64_t rearms;

		uint64_t pg_process_count;
		uint64_t intr;
		uint64_t polls;
		uint64_t polls_spurious;
		uint64_t poll_reqs;
		uint64_t poll_reqs_squared;
		uint64_t cqh_admin_writes;
		uint64_t cqh_io_writes;
	} stats;
};

struct nvmf_vfio_user_shadow_doorbells {
	volatile uint32_t *shadow_doorbells;
	volatile uint32_t *eventidxs;
	dma_sg_t *sgs;
	struct iovec *iovs;
};

struct nvmf_vfio_user_ctrlr {
	struct nvmf_vfio_user_endpoint *endpoint;
	struct nvmf_vfio_user_transport *transport;

	/* Connected SQs list */
	TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs;
	enum nvmf_vfio_user_ctrlr_state state;

	/*
	 * Tells whether live migration data have been prepared. This is used
	 * by the get_pending_bytes callback to tell whether or not the
	 * previous iteration finished.
	 */
	bool migr_data_prepared;

	/* Controller is in source VM when doing live migration */
	bool in_source_vm;

	struct spdk_thread *thread;
	struct spdk_poller *vfu_ctx_poller;
	struct spdk_interrupt *intr;
	int intr_fd;

	bool queued_quiesce;

	bool reset_shn;
	bool disconnect;

	uint16_t cntlid;
	struct spdk_nvmf_ctrlr *ctrlr;

	struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];
	struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR];

	TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link;

	volatile uint32_t *bar0_doorbells;
	struct nvmf_vfio_user_shadow_doorbells *sdbl;
	/*
	 * Shadow doorbells PRPs to provide during the stop-and-copy state.
	 */
	uint64_t shadow_doorbell_buffer;
	uint64_t eventidx_buffer;

	bool adaptive_irqs_enabled;
};

/* Endpoint in vfio-user is associated with a socket file, which
 * is the representative of a PCI endpoint.
 */
struct nvmf_vfio_user_endpoint {
	struct nvmf_vfio_user_transport *transport;
	vfu_ctx_t *vfu_ctx;
	struct spdk_poller *accept_poller;
	struct spdk_thread *accept_thread;
	bool interrupt_mode;
	struct msixcap *msix;
	vfu_pci_config_space_t *pci_config_space;
	int devmem_fd;
	int accept_intr_fd;
	struct spdk_interrupt *accept_intr;

	volatile uint32_t *bar0_doorbells;

	int migr_fd;
	void *migr_data;

	struct spdk_nvme_transport_id trid;
	struct spdk_nvmf_subsystem *subsystem;

	/* Controller is associated with an active socket connection;
	 * the lifecycle of the controller is the same as the VM's.
	 * Currently we only support one active connection; as the NVMe
	 * specification allows multiple controllers, we may support them
	 * in the future, e.g. to support RESERVATION.
	 */
	struct nvmf_vfio_user_ctrlr *ctrlr;
	pthread_mutex_t lock;

	bool need_async_destroy;
	/* The subsystem is in PAUSED state and needs to be resumed; TRUE
	 * only when migration is done successfully and the controller is
	 * in the source VM.
	 */
	bool need_resume;
	/* Start the accept poller again after destroying the controller */
	bool need_relisten;

	TAILQ_ENTRY(nvmf_vfio_user_endpoint) link;
};

struct nvmf_vfio_user_transport_opts {
	bool disable_mappable_bar0;
	bool disable_adaptive_irq;
	bool disable_shadow_doorbells;
	bool disable_compare;
	bool enable_intr_mode_sq_spreading;
};

struct nvmf_vfio_user_transport {
	struct spdk_nvmf_transport transport;
	struct nvmf_vfio_user_transport_opts transport_opts;
	bool intr_mode_supported;
	pthread_mutex_t lock;
	TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints;

	pthread_mutex_t pg_lock;
	TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups;
	struct nvmf_vfio_user_poll_group *next_pg;
};

/*
 * function prototypes
 */
static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);

static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq);

/*
 * Local process virtual address of a queue.
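 *
 * NULL while the queue is not currently mapped (see map_q() and unmap_q()).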
546 */ 547 static inline void * 548 q_addr(struct nvme_q_mapping *mapping) 549 { 550 return mapping->iov.iov_base; 551 } 552 553 static inline int 554 queue_index(uint16_t qid, bool is_cq) 555 { 556 return (qid * 2) + is_cq; 557 } 558 559 static inline volatile uint32_t * 560 sq_headp(struct nvmf_vfio_user_sq *sq) 561 { 562 assert(sq != NULL); 563 return &sq->head; 564 } 565 566 static inline volatile uint32_t * 567 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 568 { 569 assert(sq != NULL); 570 return sq->dbl_tailp; 571 } 572 573 static inline volatile uint32_t * 574 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 575 { 576 assert(cq != NULL); 577 return cq->dbl_headp; 578 } 579 580 static inline volatile uint32_t * 581 cq_tailp(struct nvmf_vfio_user_cq *cq) 582 { 583 assert(cq != NULL); 584 return &cq->tail; 585 } 586 587 static inline void 588 sq_head_advance(struct nvmf_vfio_user_sq *sq) 589 { 590 assert(sq != NULL); 591 592 assert(*sq_headp(sq) < sq->size); 593 (*sq_headp(sq))++; 594 595 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 596 *sq_headp(sq) = 0; 597 } 598 } 599 600 static inline void 601 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 602 { 603 assert(cq != NULL); 604 605 assert(*cq_tailp(cq) < cq->size); 606 (*cq_tailp(cq))++; 607 608 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 609 *cq_tailp(cq) = 0; 610 cq->phase = !cq->phase; 611 } 612 } 613 614 static bool 615 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 616 { 617 assert(vu_ctrlr != NULL); 618 619 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 620 return false; 621 } 622 623 if (is_cq) { 624 if (vu_ctrlr->cqs[qid] == NULL) { 625 return false; 626 } 627 628 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 629 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 630 } 631 632 if (vu_ctrlr->sqs[qid] == NULL) { 633 return false; 634 } 635 636 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 637 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 638 } 639 640 static char * 641 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 642 { 643 return endpoint->trid.traddr; 644 } 645 646 static char * 647 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 648 { 649 if (!ctrlr || !ctrlr->endpoint) { 650 return "Null Ctrlr"; 651 } 652 653 return endpoint_id(ctrlr->endpoint); 654 } 655 656 /* Return the poll group for the admin queue of the controller. 
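 * Only valid once the admin SQ (vu_ctrlr->sqs[0]) exists and has been
 * assigned to a poll group.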
*/ 657 static inline struct nvmf_vfio_user_poll_group * 658 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 659 { 660 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 661 struct nvmf_vfio_user_poll_group, 662 group); 663 } 664 665 static inline struct spdk_thread * 666 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 667 { 668 return vu_pg->group.group->thread; 669 } 670 671 static dma_sg_t * 672 index_to_sg_t(void *arr, size_t i) 673 { 674 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 675 } 676 677 static inline size_t 678 vfio_user_migr_data_len(void) 679 { 680 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 681 } 682 683 static inline bool 684 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 685 { 686 return spdk_interrupt_mode_is_enabled() && 687 vu_transport->intr_mode_supported; 688 } 689 690 static int vfio_user_ctrlr_intr(void *ctx); 691 692 static void 693 vfio_user_msg_ctrlr_intr(void *ctx) 694 { 695 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 696 struct nvmf_vfio_user_poll_group *vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 697 698 vu_ctrlr_group->stats.ctrlr_kicks++; 699 700 vfio_user_ctrlr_intr(ctx); 701 } 702 703 /* 704 * Kick (force a wakeup) of all poll groups for this controller. 705 * vfio_user_ctrlr_intr() itself arranges for kicking other poll groups if 706 * needed. 707 */ 708 static void 709 ctrlr_kick(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 710 { 711 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 712 713 SPDK_DEBUGLOG(vfio_user_db, "%s: kicked\n", ctrlr_id(vu_ctrlr)); 714 715 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 716 717 spdk_thread_send_msg(poll_group_to_thread(vu_ctrlr_group), 718 vfio_user_msg_ctrlr_intr, vu_ctrlr); 719 } 720 721 /* 722 * Make the given DMA address and length available (locally mapped) via iov. 
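 *
 * On success the translation is also recorded in `sg`; callers are expected
 * to release the mapping later via vfu_sgl_put() (see unmap_q() and
 * unmap_sdbl()).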
 */
static void *
map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg,
	struct iovec *iov, int32_t flags)
{
	int prot = PROT_READ;
	int ret;

	if (flags & MAP_RW) {
		prot |= PROT_WRITE;
	}

	assert(ctx != NULL);
	assert(sg != NULL);
	assert(iov != NULL);

	ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot);
	if (ret < 0) {
		if (ret == -1) {
			if (!(flags & MAP_QUIET)) {
				SPDK_ERRLOG("failed to translate IOVA [%#lx, %#lx) (prot=%d) to local VA: %m\n",
					    addr, addr + len, prot);
			}
		} else {
			SPDK_ERRLOG("failed to translate IOVA [%#lx, %#lx) (prot=%d) to local VA: %d segments needed\n",
				    addr, addr + len, prot, -(ret + 1));
		}
		return NULL;
	}

	ret = vfu_sgl_get(ctx, sg, iov, 1, 0);
	if (ret != 0) {
		SPDK_ERRLOG("failed to get iovec for IOVA [%#lx, %#lx): %m\n",
			    addr, addr + len);
		return NULL;
	}

	assert(iov->iov_base != NULL);
	return iov->iov_base;
}

static int
nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
		  uint32_t max_iovcnt, uint32_t len, size_t mps,
		  void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags))
{
	uint64_t prp1, prp2;
	void *vva;
	uint32_t i;
	uint32_t residue_len, nents;
	uint64_t *prp_list;
	uint32_t iovcnt;

	assert(max_iovcnt > 0);

	prp1 = cmd->dptr.prp.prp1;
	prp2 = cmd->dptr.prp.prp2;

	/* PRP1 may start at an unaligned page address */
	residue_len = mps - (prp1 % mps);
	residue_len = spdk_min(len, residue_len);

	vva = gpa_to_vva(prv, prp1, residue_len, MAP_RW);
	if (spdk_unlikely(vva == NULL)) {
		SPDK_ERRLOG("GPA to VVA failed\n");
		return -EINVAL;
	}
	len -= residue_len;
	if (len && max_iovcnt < 2) {
		SPDK_ERRLOG("Too many page entries, at least two iovs are required\n");
		return -ERANGE;
	}
	iovs[0].iov_base = vva;
	iovs[0].iov_len = residue_len;

	if (len) {
		if (spdk_unlikely(prp2 == 0)) {
			SPDK_ERRLOG("no PRP2, %d remaining\n", len);
			return -EINVAL;
		}

		if (len <= mps) {
			/* 2 PRP used */
			iovcnt = 2;
			vva = gpa_to_vva(prv, prp2, len, MAP_RW);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n",
					    prp2, len);
				return -EINVAL;
			}
			iovs[1].iov_base = vva;
			iovs[1].iov_len = len;
		} else {
			/* PRP list used */
			nents = (len + mps - 1) / mps;
			if (spdk_unlikely(nents + 1 > max_iovcnt)) {
				SPDK_ERRLOG("Too many page entries\n");
				return -ERANGE;
			}

			vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), MAP_R);
			if (spdk_unlikely(vva == NULL)) {
				SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n",
					    prp2, nents);
				return -EINVAL;
			}
			prp_list = vva;
			i = 0;
			while (len != 0) {
				residue_len = spdk_min(len, mps);
				vva = gpa_to_vva(prv, prp_list[i], residue_len, MAP_RW);
				if (spdk_unlikely(vva == NULL)) {
					SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n",
						    prp_list[i], residue_len);
					return -EINVAL;
				}
				iovs[i + 1].iov_base = vva;
				iovs[i + 1].iov_len = residue_len;
				len -= residue_len;
				i++;
			}
			iovcnt = i + 1;
		}
	} else {
		/* 1 PRP used */
		iovcnt = 1;
	}

	assert(iovcnt <= max_iovcnt);
	return iovcnt;
}

static int
nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls,
		       struct iovec *iovs,
uint32_t max_iovcnt, 858 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 859 { 860 uint32_t i; 861 void *vva; 862 863 if (spdk_unlikely(max_iovcnt < num_sgls)) { 864 return -ERANGE; 865 } 866 867 for (i = 0; i < num_sgls; i++) { 868 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 869 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 870 return -EINVAL; 871 } 872 vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, MAP_RW); 873 if (spdk_unlikely(vva == NULL)) { 874 SPDK_ERRLOG("GPA to VVA failed\n"); 875 return -EINVAL; 876 } 877 iovs[i].iov_base = vva; 878 iovs[i].iov_len = sgls[i].unkeyed.length; 879 } 880 881 return num_sgls; 882 } 883 884 static int 885 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 886 uint32_t len, size_t mps, 887 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 888 { 889 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 890 uint32_t num_sgls, seg_len; 891 void *vva; 892 int ret; 893 uint32_t total_iovcnt = 0; 894 895 /* SGL cases */ 896 sgl = &cmd->dptr.sgl1; 897 898 /* only one SGL segment */ 899 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 900 assert(max_iovcnt > 0); 901 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, MAP_RW); 902 if (spdk_unlikely(vva == NULL)) { 903 SPDK_ERRLOG("GPA to VVA failed\n"); 904 return -EINVAL; 905 } 906 iovs[0].iov_base = vva; 907 iovs[0].iov_len = sgl->unkeyed.length; 908 assert(sgl->unkeyed.length == len); 909 910 return 1; 911 } 912 913 for (;;) { 914 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 915 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 916 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 917 return -EINVAL; 918 } 919 920 seg_len = sgl->unkeyed.length; 921 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 922 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 923 return -EINVAL; 924 } 925 926 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 927 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, MAP_R); 928 if (spdk_unlikely(vva == NULL)) { 929 SPDK_ERRLOG("GPA to VVA failed\n"); 930 return -EINVAL; 931 } 932 933 /* sgl point to the first segment */ 934 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 935 last_sgl = &sgl[num_sgls - 1]; 936 937 /* we are done */ 938 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 939 /* map whole sgl list */ 940 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 941 max_iovcnt - total_iovcnt, gpa_to_vva); 942 if (spdk_unlikely(ret < 0)) { 943 return ret; 944 } 945 total_iovcnt += ret; 946 947 return total_iovcnt; 948 } 949 950 if (num_sgls > 1) { 951 /* map whole sgl exclude last_sgl */ 952 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 953 max_iovcnt - total_iovcnt, gpa_to_vva); 954 if (spdk_unlikely(ret < 0)) { 955 return ret; 956 } 957 total_iovcnt += ret; 958 } 959 960 /* move to next level's segments */ 961 sgl = last_sgl; 962 } 963 964 return 0; 965 } 966 967 static int 968 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 969 uint32_t len, size_t mps, 970 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 971 { 972 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 973 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 974 } 975 976 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, 
len, mps, gpa_to_vva); 977 } 978 979 /* 980 * For each queue, update the location of its doorbell to the correct location: 981 * either our own BAR0, or the guest's configured shadow doorbell area. 982 * 983 * The Admin queue (qid: 0) does not ever use shadow doorbells. 984 */ 985 static void 986 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 987 { 988 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 989 ctrlr->bar0_doorbells; 990 991 assert(doorbells != NULL); 992 993 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 994 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 995 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 996 997 if (sq != NULL) { 998 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 999 1000 ctrlr->sqs[i]->need_rearm = shadow; 1001 } 1002 1003 if (cq != NULL) { 1004 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 1005 } 1006 } 1007 } 1008 1009 static void 1010 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1011 { 1012 assert(vfu_ctx != NULL); 1013 assert(sdbl != NULL); 1014 1015 /* 1016 * An allocation error would result in only one of the two being 1017 * non-NULL. If that is the case, no memory should have been mapped. 1018 */ 1019 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 1020 return; 1021 } 1022 1023 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 1024 struct iovec *iov; 1025 dma_sg_t *sg; 1026 1027 if (!sdbl->iovs[i].iov_len) { 1028 continue; 1029 } 1030 1031 sg = index_to_sg_t(sdbl->sgs, i); 1032 iov = sdbl->iovs + i; 1033 1034 vfu_sgl_put(vfu_ctx, sg, iov, 1); 1035 } 1036 } 1037 1038 static void 1039 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1040 { 1041 if (sdbl == NULL) { 1042 return; 1043 } 1044 1045 unmap_sdbl(vfu_ctx, sdbl); 1046 1047 /* 1048 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 1049 * not allocated, so don't free() them. 1050 */ 1051 free(sdbl->sgs); 1052 free(sdbl->iovs); 1053 free(sdbl); 1054 } 1055 1056 static struct nvmf_vfio_user_shadow_doorbells * 1057 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 1058 { 1059 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 1060 dma_sg_t *sg2 = NULL; 1061 void *p; 1062 1063 assert(vfu_ctx != NULL); 1064 1065 sdbl = calloc(1, sizeof(*sdbl)); 1066 if (sdbl == NULL) { 1067 goto err; 1068 } 1069 1070 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 1071 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 1072 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 1073 goto err; 1074 } 1075 1076 /* Map shadow doorbell buffer (PRP1). */ 1077 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, MAP_RW); 1078 1079 if (p == NULL) { 1080 goto err; 1081 } 1082 1083 /* 1084 * Map eventidx buffer (PRP2). 1085 * Should only be written to by the controller. 1086 */ 1087 1088 sg2 = index_to_sg_t(sdbl->sgs, 1); 1089 1090 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, MAP_RW); 1091 1092 if (p == NULL) { 1093 goto err; 1094 } 1095 1096 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1097 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1098 1099 return sdbl; 1100 1101 err: 1102 free_sdbl(vfu_ctx, sdbl); 1103 return NULL; 1104 } 1105 1106 /* 1107 * Copy doorbells from one buffer to the other, during switches between BAR0 1108 * doorbells and shadow doorbells. 
1109 */ 1110 static void 1111 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1112 const volatile uint32_t *from, volatile uint32_t *to) 1113 { 1114 assert(ctrlr != NULL); 1115 assert(from != NULL); 1116 assert(to != NULL); 1117 1118 SPDK_DEBUGLOG(vfio_user_db, 1119 "%s: migrating shadow doorbells from %p to %p\n", 1120 ctrlr_id(ctrlr), from, to); 1121 1122 /* Can't use memcpy because it doesn't respect volatile semantics. */ 1123 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1124 if (ctrlr->sqs[i] != NULL) { 1125 to[queue_index(i, false)] = from[queue_index(i, false)]; 1126 } 1127 1128 if (ctrlr->cqs[i] != NULL) { 1129 to[queue_index(i, true)] = from[queue_index(i, true)]; 1130 } 1131 } 1132 } 1133 1134 static void 1135 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1136 { 1137 const struct spdk_nvmf_registers *regs; 1138 1139 assert(vu_ctrlr != NULL); 1140 assert(vu_ctrlr->ctrlr != NULL); 1141 1142 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1143 if (regs->csts.bits.cfs == 0) { 1144 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1145 } 1146 1147 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1148 } 1149 1150 static inline bool 1151 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1152 { 1153 assert(vu_ctrlr != NULL); 1154 assert(vu_ctrlr->endpoint != NULL); 1155 1156 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1157 1158 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1159 } 1160 1161 static void 1162 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1163 { 1164 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1165 1166 spdk_interrupt_unregister(&endpoint->accept_intr); 1167 spdk_poller_unregister(&endpoint->accept_poller); 1168 1169 if (endpoint->bar0_doorbells) { 1170 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1171 } 1172 1173 if (endpoint->devmem_fd > 0) { 1174 close(endpoint->devmem_fd); 1175 } 1176 1177 if (endpoint->migr_data) { 1178 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1179 } 1180 1181 if (endpoint->migr_fd > 0) { 1182 close(endpoint->migr_fd); 1183 } 1184 1185 if (endpoint->vfu_ctx) { 1186 vfu_destroy_ctx(endpoint->vfu_ctx); 1187 } 1188 1189 pthread_mutex_destroy(&endpoint->lock); 1190 free(endpoint); 1191 } 1192 1193 /* called when process exits */ 1194 static int 1195 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1196 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1197 { 1198 struct nvmf_vfio_user_transport *vu_transport; 1199 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1200 1201 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1202 1203 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1204 transport); 1205 1206 pthread_mutex_destroy(&vu_transport->lock); 1207 pthread_mutex_destroy(&vu_transport->pg_lock); 1208 1209 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1210 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1211 nvmf_vfio_user_destroy_endpoint(endpoint); 1212 } 1213 1214 free(vu_transport); 1215 1216 if (cb_fn) { 1217 cb_fn(cb_arg); 1218 } 1219 1220 return 0; 1221 } 1222 1223 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1224 { 1225 "disable_mappable_bar0", 1226 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1227 spdk_json_decode_bool, true 1228 }, 1229 { 1230 "disable_adaptive_irq", 1231 offsetof(struct 
nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1232 spdk_json_decode_bool, true 1233 }, 1234 { 1235 "disable_shadow_doorbells", 1236 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1237 spdk_json_decode_bool, true 1238 }, 1239 { 1240 "disable_compare", 1241 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1242 spdk_json_decode_bool, true 1243 }, 1244 { 1245 "enable_intr_mode_sq_spreading", 1246 offsetof(struct nvmf_vfio_user_transport, transport_opts.enable_intr_mode_sq_spreading), 1247 spdk_json_decode_bool, true 1248 }, 1249 }; 1250 1251 static struct spdk_nvmf_transport * 1252 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1253 { 1254 struct nvmf_vfio_user_transport *vu_transport; 1255 int err; 1256 1257 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1258 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1259 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1260 return NULL; 1261 } 1262 1263 vu_transport = calloc(1, sizeof(*vu_transport)); 1264 if (vu_transport == NULL) { 1265 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1266 return NULL; 1267 } 1268 1269 err = pthread_mutex_init(&vu_transport->lock, NULL); 1270 if (err != 0) { 1271 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1272 goto err; 1273 } 1274 TAILQ_INIT(&vu_transport->endpoints); 1275 1276 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1277 if (err != 0) { 1278 pthread_mutex_destroy(&vu_transport->lock); 1279 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1280 goto err; 1281 } 1282 TAILQ_INIT(&vu_transport->poll_groups); 1283 1284 if (opts->transport_specific != NULL && 1285 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1286 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1287 vu_transport)) { 1288 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1289 goto cleanup; 1290 } 1291 1292 /* 1293 * To support interrupt mode, the transport must be configured with 1294 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1295 * when a client writes new doorbell values to BAR0, via the 1296 * libvfio-user socket fd. 1297 */ 1298 vu_transport->intr_mode_supported = 1299 vu_transport->transport_opts.disable_mappable_bar0; 1300 1301 /* 1302 * If BAR0 is mappable, it doesn't make sense to support shadow 1303 * doorbells, so explicitly turn it off. 1304 */ 1305 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1306 vu_transport->transport_opts.disable_shadow_doorbells = true; 1307 } 1308 1309 if (spdk_interrupt_mode_is_enabled()) { 1310 if (!vu_transport->intr_mode_supported) { 1311 SPDK_ERRLOG("interrupt mode not supported\n"); 1312 goto cleanup; 1313 } 1314 1315 /* 1316 * If we are in interrupt mode, we cannot support adaptive IRQs, 1317 * as there is no guarantee the SQ poller will run subsequently 1318 * to send pending IRQs. 
1319 */ 1320 vu_transport->transport_opts.disable_adaptive_irq = true; 1321 } 1322 1323 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1324 vu_transport->transport_opts.disable_mappable_bar0); 1325 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1326 vu_transport->transport_opts.disable_adaptive_irq); 1327 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1328 vu_transport->transport_opts.disable_shadow_doorbells); 1329 1330 return &vu_transport->transport; 1331 1332 cleanup: 1333 pthread_mutex_destroy(&vu_transport->lock); 1334 pthread_mutex_destroy(&vu_transport->pg_lock); 1335 err: 1336 free(vu_transport); 1337 return NULL; 1338 } 1339 1340 static uint32_t 1341 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1342 { 1343 assert(vu_ctrlr != NULL); 1344 assert(vu_ctrlr->ctrlr != NULL); 1345 1346 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1347 } 1348 1349 static uint32_t 1350 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1351 { 1352 assert(vu_ctrlr != NULL); 1353 assert(vu_ctrlr->ctrlr != NULL); 1354 1355 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1356 } 1357 1358 static uintptr_t 1359 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1360 { 1361 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1362 return 1ul << memory_page_shift; 1363 } 1364 1365 static uintptr_t 1366 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1367 { 1368 return ~(memory_page_size(ctrlr) - 1); 1369 } 1370 1371 static int 1372 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1373 uint32_t flags) 1374 { 1375 void *ret; 1376 1377 assert(mapping->len != 0); 1378 assert(q_addr(mapping) == NULL); 1379 1380 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, mapping->len, 1381 mapping->sg, &mapping->iov, flags); 1382 if (ret == NULL) { 1383 return -EFAULT; 1384 } 1385 1386 if (flags & MAP_INITIALIZE) { 1387 memset(q_addr(mapping), 0, mapping->len); 1388 } 1389 1390 return 0; 1391 } 1392 1393 static inline void 1394 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1395 { 1396 if (q_addr(mapping) != NULL) { 1397 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1398 &mapping->iov, 1); 1399 mapping->iov.iov_base = NULL; 1400 } 1401 } 1402 1403 static int 1404 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1405 { 1406 struct nvmf_vfio_user_sq *sq; 1407 const struct spdk_nvmf_registers *regs; 1408 int ret; 1409 1410 assert(ctrlr != NULL); 1411 1412 sq = ctrlr->sqs[0]; 1413 1414 assert(sq != NULL); 1415 assert(q_addr(&sq->mapping) == NULL); 1416 /* XXX ctrlr->asq == 0 is a valid memory address */ 1417 1418 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1419 sq->qid = 0; 1420 sq->size = regs->aqa.bits.asqs + 1; 1421 sq->mapping.prp1 = regs->asq; 1422 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 1423 *sq_headp(sq) = 0; 1424 sq->cqid = 0; 1425 1426 ret = map_q(ctrlr, &sq->mapping, MAP_INITIALIZE); 1427 if (ret) { 1428 return ret; 1429 } 1430 1431 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1432 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1433 1434 *sq_dbl_tailp(sq) = 0; 1435 1436 return 0; 1437 } 1438 1439 /* 1440 * Updates eventidx to set an SQ into interrupt or polling mode. 
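 * (In polling mode we write NVMF_VFIO_USER_EVENTIDX_POLL, a value the tail
 * should never cross, so the host is not expected to write BAR0 for this SQ
 * at all; in interrupt mode we write the last-seen tail, so the next
 * submission should cross the eventidx and produce a BAR0 doorbell write
 * that reaches us through the vfio-user socket.)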
1441 * 1442 * Returns false if the current SQ tail does not match the SQ head, as 1443 * this means that the host has submitted more items to the queue while we were 1444 * not looking - or during the event index update. In that case, we must retry, 1445 * or otherwise make sure we are going to wake up again. 1446 */ 1447 static bool 1448 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1449 { 1450 struct nvmf_vfio_user_ctrlr *ctrlr; 1451 volatile uint32_t *sq_tail_eidx; 1452 uint32_t old_tail, new_tail; 1453 1454 assert(sq != NULL); 1455 assert(sq->ctrlr != NULL); 1456 assert(sq->ctrlr->sdbl != NULL); 1457 assert(sq->need_rearm); 1458 assert(sq->qid != 0); 1459 1460 ctrlr = sq->ctrlr; 1461 1462 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1463 ctrlr_id(ctrlr), sq->qid); 1464 1465 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1466 1467 assert(ctrlr->endpoint != NULL); 1468 1469 if (!ctrlr->endpoint->interrupt_mode) { 1470 /* No synchronisation necessary. */ 1471 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1472 return true; 1473 } 1474 1475 old_tail = *sq_dbl_tailp(sq); 1476 *sq_tail_eidx = old_tail; 1477 1478 /* 1479 * Ensure that the event index is updated before re-reading the tail 1480 * doorbell. If it's not, then the host might race us and update the 1481 * tail after the second read but before the event index is written, so 1482 * it won't write to BAR0 and we'll miss the update. 1483 * 1484 * The driver should provide similar ordering with an mb(). 1485 */ 1486 spdk_mb(); 1487 1488 /* 1489 * Check if the host has updated the tail doorbell after we've read it 1490 * for the first time, but before the event index was written. If that's 1491 * the case, then we've lost the race and we need to update the event 1492 * index again (after polling the queue, since the host won't write to 1493 * BAR0). 1494 */ 1495 new_tail = *sq_dbl_tailp(sq); 1496 1497 /* 1498 * We might poll the queue straight after this function returns if the 1499 * tail has been updated, so we need to ensure that any changes to the 1500 * queue will be visible to us if the doorbell has been updated. 1501 * 1502 * The driver should provide similar ordering with a wmb() to ensure 1503 * that the queue is written before it updates the tail doorbell. 1504 */ 1505 spdk_rmb(); 1506 1507 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1508 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1509 new_tail, *sq_headp(sq)); 1510 1511 if (new_tail == *sq_headp(sq)) { 1512 sq->need_rearm = false; 1513 return true; 1514 } 1515 1516 /* 1517 * We've lost the race: the tail was updated since we last polled, 1518 * including if it happened within this routine. 1519 * 1520 * The caller should retry after polling (think of this as a cmpxchg 1521 * loop); if we go to sleep while the SQ is not empty, then we won't 1522 * process the remaining events. 1523 */ 1524 return false; 1525 } 1526 1527 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1528 1529 /* 1530 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1531 * processed some SQ entries. 
1532 */ 1533 static int 1534 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1535 struct nvmf_vfio_user_sq *sq, 1536 struct nvmf_vfio_user_poll_group *vu_group) 1537 { 1538 int count = 0; 1539 size_t i; 1540 1541 assert(sq->need_rearm); 1542 1543 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1544 int ret; 1545 1546 if (set_sq_eventidx(sq)) { 1547 /* We won the race and set eventidx; done. */ 1548 vu_group->stats.won++; 1549 return count; 1550 } 1551 1552 ret = nvmf_vfio_user_sq_poll(sq); 1553 1554 count += (ret < 0) ? 1 : ret; 1555 1556 /* 1557 * set_sq_eventidx() hit the race, so we expected 1558 * to process at least one command from this queue. 1559 * If there were no new commands waiting for us, then 1560 * we must have hit an unexpected race condition. 1561 */ 1562 if (ret == 0) { 1563 SPDK_ERRLOG("%s: unexpected race condition detected " 1564 "while updating the shadow doorbell buffer\n", 1565 ctrlr_id(ctrlr)); 1566 1567 fail_ctrlr(ctrlr); 1568 return count; 1569 } 1570 } 1571 1572 SPDK_DEBUGLOG(vfio_user_db, 1573 "%s: set_sq_eventidx() lost the race %zu times\n", 1574 ctrlr_id(ctrlr), i); 1575 1576 vu_group->stats.lost++; 1577 vu_group->stats.lost_count += count; 1578 1579 /* 1580 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1581 * we raced with the producer too many times; force ourselves to wake up 1582 * instead. We'll process all queues at that point. 1583 */ 1584 ctrlr_kick(ctrlr); 1585 1586 return count; 1587 } 1588 1589 /* 1590 * We're in interrupt mode, and potentially about to go to sleep. We need to 1591 * make sure any further I/O submissions are guaranteed to wake us up: for 1592 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1593 * every SQ that needs re-arming. 1594 * 1595 * Returns non-zero if we processed something. 1596 */ 1597 static int 1598 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1599 { 1600 struct nvmf_vfio_user_sq *sq; 1601 int count = 0; 1602 1603 vu_group->stats.rearms++; 1604 1605 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1606 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1607 continue; 1608 } 1609 1610 if (sq->need_rearm) { 1611 count += vfio_user_sq_rearm(sq->ctrlr, sq, vu_group); 1612 } 1613 } 1614 1615 return count; 1616 } 1617 1618 static int 1619 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1620 { 1621 struct nvmf_vfio_user_cq *cq; 1622 const struct spdk_nvmf_registers *regs; 1623 int ret; 1624 1625 assert(ctrlr != NULL); 1626 1627 cq = ctrlr->cqs[0]; 1628 1629 assert(cq != NULL); 1630 1631 assert(q_addr(&cq->mapping) == NULL); 1632 1633 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1634 assert(regs != NULL); 1635 cq->qid = 0; 1636 cq->size = regs->aqa.bits.acqs + 1; 1637 cq->mapping.prp1 = regs->acq; 1638 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 1639 *cq_tailp(cq) = 0; 1640 cq->ien = true; 1641 cq->phase = true; 1642 1643 ret = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_INITIALIZE); 1644 if (ret) { 1645 return ret; 1646 } 1647 1648 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
	 */
	cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true);

	*cq_dbl_headp(cq) = 0;

	return 0;
}

static void *
_map_one(void *prv, uint64_t addr, uint64_t len, uint32_t flags)
{
	struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
	struct spdk_nvmf_qpair *qpair;
	struct nvmf_vfio_user_req *vu_req;
	struct nvmf_vfio_user_sq *sq;
	void *ret;

	assert(req != NULL);
	qpair = req->qpair;
	vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);

	assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
	ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len,
		      index_to_sg_t(vu_req->sg, vu_req->iovcnt),
		      &vu_req->iov[vu_req->iovcnt], flags);
	if (spdk_likely(ret != NULL)) {
		vu_req->iovcnt++;
	}
	return ret;
}

static int
vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
		  struct iovec *iov, uint32_t length)
{
	/* Map the PRP list from guest physical memory to local
	 * virtual memory addresses.
	 */
	return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
			    length, 4096, _map_one);
}

static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
			  struct nvmf_vfio_user_sq *sq);

static uint32_t
cq_free_slots(struct nvmf_vfio_user_cq *cq)
{
	uint32_t free_slots;

	assert(cq != NULL);

	if (cq->tail == cq->last_head) {
		free_slots = cq->size;
	} else if (cq->tail > cq->last_head) {
		free_slots = cq->size - (cq->tail - cq->last_head);
	} else {
		free_slots = cq->last_head - cq->tail;
	}
	assert(free_slots > 0);

	return free_slots - 1;
}

/*
 * Since reading the head doorbell is relatively expensive, we use the cached
 * value, so we only have to read it for real if it appears that we are full.
 */
static inline bool
cq_is_full(struct nvmf_vfio_user_cq *cq)
{
	uint32_t free_cq_slots;

	assert(cq != NULL);

	free_cq_slots = cq_free_slots(cq);

	if (spdk_unlikely(free_cq_slots == 0)) {
		cq->last_head = *cq_dbl_headp(cq);
		free_cq_slots = cq_free_slots(cq);
	}

	return free_cq_slots == 0;
}

/*
 * Posts a CQE in the completion queue.
 *
 * @ctrlr: the vfio-user controller
 * @cq: the completion queue
 * @cdw0: cdw0 as reported by NVMf
 * @sqid: submission queue ID
 * @cid: command identifier in NVMe command
 * @sc: the NVMe CQE status code
 * @sct: the NVMe CQE status code type
 */
static int
post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq,
		uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct)
{
	struct spdk_nvme_status cpl_status = { 0 };
	struct spdk_nvme_cpl *cpl;
	int err;

	assert(ctrlr != NULL);

	if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) {
		return 0;
	}

	if (cq->qid == 0) {
		assert(spdk_get_thread() == cq->group->group->thread);
	}

	/*
	 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow
	 * control: if there is no space in the CQ, we should wait until there is.
1766 * 1767 * In practice, we just fail the controller instead: as it happens, all host 1768 * implementations we care about right-size the CQ: this is required anyway for 1769 * NVMEoF support (see 3.3.2.8). 1770 */ 1771 if (cq_is_full(cq)) { 1772 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1773 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1774 *cq_dbl_headp(cq)); 1775 return -1; 1776 } 1777 1778 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1779 1780 assert(ctrlr->sqs[sqid] != NULL); 1781 SPDK_DEBUGLOG(nvmf_vfio, 1782 "%s: request complete sqid:%d cid=%d status=%#x " 1783 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1784 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1785 1786 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1787 cpl->sqid = sqid; 1788 cpl->cid = cid; 1789 cpl->cdw0 = cdw0; 1790 1791 /* 1792 * This is a bitfield: instead of setting the individual bits we need 1793 * directly in cpl->status, which would cause a read-modify-write cycle, 1794 * we'll avoid reading from the CPL altogether by filling in a local 1795 * cpl_status variable, then writing the whole thing. 1796 */ 1797 cpl_status.sct = sct; 1798 cpl_status.sc = sc; 1799 cpl_status.p = cq->phase; 1800 cpl->status = cpl_status; 1801 1802 /* Ensure the Completion Queue Entry is visible. */ 1803 spdk_wmb(); 1804 cq_tail_advance(cq); 1805 1806 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1807 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1808 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1809 if (err != 0) { 1810 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1811 ctrlr_id(ctrlr)); 1812 return err; 1813 } 1814 } 1815 1816 return 0; 1817 } 1818 1819 static void 1820 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1821 { 1822 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1823 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1824 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1825 free(vu_req); 1826 } 1827 } 1828 1829 static void 1830 delete_cq_done(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq) 1831 { 1832 assert(cq->cq_ref == 0); 1833 unmap_q(ctrlr, &cq->mapping); 1834 cq->size = 0; 1835 cq->cq_state = VFIO_USER_CQ_DELETED; 1836 cq->group = NULL; 1837 } 1838 1839 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1840 * and the controller is being shut down/reset or vfio-user client disconnects, 1841 * then the CQ is also deleted. 1842 */ 1843 static void 1844 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1845 { 1846 struct nvmf_vfio_user_cq *cq; 1847 uint16_t cqid; 1848 1849 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1850 sq->qid, sq); 1851 1852 /* Free SQ resources */ 1853 unmap_q(vu_ctrlr, &sq->mapping); 1854 1855 free_sq_reqs(sq); 1856 1857 sq->size = 0; 1858 1859 sq->sq_state = VFIO_USER_SQ_DELETED; 1860 1861 /* Controller RESET and SHUTDOWN are special cases, 1862 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1863 * will disconnect IO queue pairs. 
1864 */ 1865 if (vu_ctrlr->reset_shn || vu_ctrlr->disconnect) { 1866 cqid = sq->cqid; 1867 cq = vu_ctrlr->cqs[cqid]; 1868 1869 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1870 cq->qid, cq); 1871 1872 assert(cq->cq_ref > 0); 1873 if (--cq->cq_ref == 0) { 1874 delete_cq_done(vu_ctrlr, cq); 1875 } 1876 } 1877 } 1878 1879 static void 1880 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1881 { 1882 struct nvmf_vfio_user_sq *sq; 1883 struct nvmf_vfio_user_cq *cq; 1884 1885 if (ctrlr == NULL) { 1886 return; 1887 } 1888 1889 sq = ctrlr->sqs[qid]; 1890 if (sq) { 1891 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free sqid:%u\n", ctrlr_id(ctrlr), qid); 1892 unmap_q(ctrlr, &sq->mapping); 1893 1894 free_sq_reqs(sq); 1895 1896 free(sq->mapping.sg); 1897 free(sq); 1898 ctrlr->sqs[qid] = NULL; 1899 } 1900 1901 cq = ctrlr->cqs[qid]; 1902 if (cq) { 1903 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1904 unmap_q(ctrlr, &cq->mapping); 1905 free(cq->mapping.sg); 1906 free(cq); 1907 ctrlr->cqs[qid] = NULL; 1908 } 1909 } 1910 1911 static int 1912 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1913 const uint16_t id) 1914 { 1915 struct nvmf_vfio_user_sq *sq; 1916 1917 assert(ctrlr != NULL); 1918 assert(transport != NULL); 1919 assert(ctrlr->sqs[id] == NULL); 1920 1921 sq = calloc(1, sizeof(*sq)); 1922 if (sq == NULL) { 1923 return -ENOMEM; 1924 } 1925 sq->mapping.sg = calloc(1, dma_sg_size()); 1926 if (sq->mapping.sg == NULL) { 1927 free(sq); 1928 return -ENOMEM; 1929 } 1930 1931 sq->qid = id; 1932 sq->qpair.qid = id; 1933 sq->qpair.transport = transport; 1934 sq->ctrlr = ctrlr; 1935 ctrlr->sqs[id] = sq; 1936 1937 TAILQ_INIT(&sq->free_reqs); 1938 1939 return 0; 1940 } 1941 1942 static int 1943 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1944 { 1945 struct nvmf_vfio_user_cq *cq; 1946 1947 assert(vu_ctrlr != NULL); 1948 assert(vu_ctrlr->cqs[id] == NULL); 1949 1950 cq = calloc(1, sizeof(*cq)); 1951 if (cq == NULL) { 1952 return -ENOMEM; 1953 } 1954 cq->mapping.sg = calloc(1, dma_sg_size()); 1955 if (cq->mapping.sg == NULL) { 1956 free(cq); 1957 return -ENOMEM; 1958 } 1959 1960 cq->qid = id; 1961 vu_ctrlr->cqs[id] = cq; 1962 1963 return 0; 1964 } 1965 1966 static int 1967 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1968 { 1969 struct nvmf_vfio_user_req *vu_req, *tmp; 1970 size_t req_size; 1971 uint32_t i; 1972 1973 req_size = sizeof(struct nvmf_vfio_user_req) + 1974 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1975 1976 for (i = 0; i < sq->size; i++) { 1977 struct spdk_nvmf_request *req; 1978 1979 vu_req = calloc(1, req_size); 1980 if (vu_req == NULL) { 1981 goto err; 1982 } 1983 1984 req = &vu_req->req; 1985 req->qpair = &sq->qpair; 1986 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1987 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1988 req->stripped_data = NULL; 1989 1990 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1991 } 1992 1993 return 0; 1994 1995 err: 1996 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 1997 free(vu_req); 1998 } 1999 return -ENOMEM; 2000 } 2001 2002 static volatile uint32_t * 2003 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 2004 { 2005 return ctrlr->sdbl != NULL ? 
2006 ctrlr->sdbl->shadow_doorbells : 2007 ctrlr->bar0_doorbells; 2008 } 2009 2010 static uint16_t 2011 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 2012 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2013 { 2014 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2015 struct nvmf_vfio_user_sq *sq; 2016 uint32_t qsize; 2017 uint16_t cqid; 2018 uint16_t qid; 2019 int err; 2020 2021 qid = cmd->cdw10_bits.create_io_q.qid; 2022 cqid = cmd->cdw11_bits.create_io_sq.cqid; 2023 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2024 2025 if (ctrlr->sqs[qid] == NULL) { 2026 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 2027 if (err != 0) { 2028 *sct = SPDK_NVME_SCT_GENERIC; 2029 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2030 } 2031 } 2032 2033 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2034 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 2035 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2036 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2037 } 2038 2039 /* CQ must be created before SQ. */ 2040 if (!io_q_exists(ctrlr, cqid, true)) { 2041 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 2042 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2043 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 2044 } 2045 2046 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 2047 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 2048 *sct = SPDK_NVME_SCT_GENERIC; 2049 return SPDK_NVME_SC_INVALID_FIELD; 2050 } 2051 2052 sq = ctrlr->sqs[qid]; 2053 sq->size = qsize; 2054 2055 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 2056 qid, cqid); 2057 2058 sq->mapping.prp1 = cmd->dptr.prp.prp1; 2059 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 2060 2061 err = map_q(ctrlr, &sq->mapping, MAP_INITIALIZE); 2062 if (err) { 2063 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2064 *sct = SPDK_NVME_SCT_GENERIC; 2065 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2066 } 2067 2068 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 2069 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2070 q_addr(&sq->mapping)); 2071 2072 err = alloc_sq_reqs(ctrlr, sq); 2073 if (err < 0) { 2074 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 2075 *sct = SPDK_NVME_SCT_GENERIC; 2076 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2077 } 2078 2079 sq->cqid = cqid; 2080 ctrlr->cqs[sq->cqid]->cq_ref++; 2081 sq->sq_state = VFIO_USER_SQ_CREATED; 2082 *sq_headp(sq) = 0; 2083 2084 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 2085 2086 /* 2087 * We should always reset the doorbells. 2088 * 2089 * The Specification prohibits the controller from writing to the shadow 2090 * doorbell buffer, however older versions of the Linux NVMe driver 2091 * don't reset the shadow doorbell buffer after a Queue-Level or 2092 * Controller-Level reset, which means that we're left with garbage 2093 * doorbell values. 2094 */ 2095 *sq_dbl_tailp(sq) = 0; 2096 2097 if (ctrlr->sdbl != NULL) { 2098 sq->need_rearm = true; 2099 2100 if (!set_sq_eventidx(sq)) { 2101 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 2102 "sqid:%hu was initialized\n", 2103 ctrlr_id(ctrlr), qid); 2104 fail_ctrlr(ctrlr); 2105 *sct = SPDK_NVME_SCT_GENERIC; 2106 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2107 } 2108 } 2109 2110 /* 2111 * Create our new I/O qpair. 
This asynchronously invokes, on a suitable 2112 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 2113 * call spdk_nvmf_request_exec() with a generated fabrics 2114 * connect command. This command is then eventually completed via 2115 * handle_queue_connect_rsp(). 2116 */ 2117 sq->create_io_sq_cmd = *cmd; 2118 sq->post_create_io_sq_completion = true; 2119 2120 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 2121 &sq->qpair); 2122 2123 *sct = SPDK_NVME_SCT_GENERIC; 2124 return SPDK_NVME_SC_SUCCESS; 2125 } 2126 2127 static uint16_t 2128 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 2129 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2130 { 2131 struct nvmf_vfio_user_cq *cq; 2132 uint32_t qsize; 2133 uint16_t qid; 2134 int err; 2135 2136 qid = cmd->cdw10_bits.create_io_q.qid; 2137 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2138 2139 if (ctrlr->cqs[qid] == NULL) { 2140 err = init_cq(ctrlr, qid); 2141 if (err != 0) { 2142 *sct = SPDK_NVME_SCT_GENERIC; 2143 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2144 } 2145 } 2146 2147 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2148 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2149 *sct = SPDK_NVME_SCT_GENERIC; 2150 return SPDK_NVME_SC_INVALID_FIELD; 2151 } 2152 2153 if (cmd->cdw11_bits.create_io_cq.iv > NVMF_VFIO_USER_MSIX_NUM - 1) { 2154 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2155 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2156 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2157 } 2158 2159 cq = ctrlr->cqs[qid]; 2160 cq->size = qsize; 2161 2162 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2163 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 2164 2165 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2166 2167 err = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_INITIALIZE); 2168 if (err) { 2169 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2170 *sct = SPDK_NVME_SCT_GENERIC; 2171 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2172 } 2173 2174 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2175 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2176 q_addr(&cq->mapping)); 2177 2178 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2179 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2180 cq->phase = true; 2181 cq->cq_state = VFIO_USER_CQ_CREATED; 2182 2183 *cq_tailp(cq) = 0; 2184 2185 /* 2186 * We should always reset the doorbells. 2187 * 2188 * The Specification prohibits the controller from writing to the shadow 2189 * doorbell buffer, however older versions of the Linux NVMe driver 2190 * don't reset the shadow doorbell buffer after a Queue-Level or 2191 * Controller-Level reset, which means that we're left with garbage 2192 * doorbell values. 2193 */ 2194 *cq_dbl_headp(cq) = 0; 2195 2196 *sct = SPDK_NVME_SCT_GENERIC; 2197 return SPDK_NVME_SC_SUCCESS; 2198 } 2199 2200 /* 2201 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2202 * on error. 
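 *
 * Both variants share the CDW10 layout (QID plus a 0's-based QSIZE),
 * while CDW11 carries PC and CQID for a Create I/O SQ, and PC, IEN and
 * IV for a Create I/O CQ. Roughly, the common decoding done below is:
 *
 *   qid   = cmd->cdw10_bits.create_io_q.qid;
 *   qsize = cmd->cdw10_bits.create_io_q.qsize + 1;    (0's-based field)
 *
 * with the CDW11 bits checked in the per-SQ/per-CQ handlers above.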
2203 */ 2204 static int 2205 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2206 struct spdk_nvme_cmd *cmd, const bool is_cq) 2207 { 2208 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2209 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2210 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2211 uint32_t qsize; 2212 uint16_t qid; 2213 2214 assert(ctrlr != NULL); 2215 assert(cmd != NULL); 2216 2217 qid = cmd->cdw10_bits.create_io_q.qid; 2218 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2219 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2220 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2221 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2222 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2223 goto out; 2224 } 2225 2226 if (io_q_exists(ctrlr, qid, is_cq)) { 2227 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2228 is_cq ? 'c' : 's', qid); 2229 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2230 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2231 goto out; 2232 } 2233 2234 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2235 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2236 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2237 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2238 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2239 goto out; 2240 } 2241 2242 if (is_cq) { 2243 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2244 } else { 2245 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2246 2247 if (sct == SPDK_NVME_SCT_GENERIC && 2248 sc == SPDK_NVME_SC_SUCCESS) { 2249 /* Completion posted asynchronously. */ 2250 return 0; 2251 } 2252 } 2253 2254 out: 2255 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2256 } 2257 2258 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2259 * queue pair, so save the command id and controller in a context. 2260 */ 2261 struct vfio_user_delete_sq_ctx { 2262 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2263 uint16_t cid; 2264 }; 2265 2266 static void 2267 vfio_user_qpair_delete_cb(void *cb_arg) 2268 { 2269 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2270 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2271 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2272 2273 assert(admin_cq != NULL); 2274 assert(admin_cq->group != NULL); 2275 assert(admin_cq->group->group->thread != NULL); 2276 if (admin_cq->group->group->thread != spdk_get_thread()) { 2277 spdk_thread_send_msg(admin_cq->group->group->thread, 2278 vfio_user_qpair_delete_cb, 2279 cb_arg); 2280 } else { 2281 post_completion(vu_ctrlr, admin_cq, 0, 0, 2282 ctx->cid, 2283 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2284 free(ctx); 2285 } 2286 } 2287 2288 /* 2289 * Deletes a completion or submission I/O queue. 2290 */ 2291 static int 2292 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2293 struct spdk_nvme_cmd *cmd, const bool is_cq) 2294 { 2295 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2296 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2297 struct nvmf_vfio_user_sq *sq; 2298 struct nvmf_vfio_user_cq *cq; 2299 2300 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2301 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2302 cmd->cdw10_bits.delete_io_q.qid); 2303 2304 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2305 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2306 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2307 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2308 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2309 goto out; 2310 } 2311 2312 if (is_cq) { 2313 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2314 if (cq->cq_ref) { 2315 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2316 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2317 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2318 goto out; 2319 } 2320 delete_cq_done(ctrlr, cq); 2321 } else { 2322 /* 2323 * Deletion of the CQ is only deferred to delete_sq_done() on 2324 * VM reboot or CC.EN change, so we have to delete it in all 2325 * other cases. 2326 */ 2327 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2328 sq->delete_ctx = calloc(1, sizeof(*sq->delete_ctx)); 2329 if (!sq->delete_ctx) { 2330 sct = SPDK_NVME_SCT_GENERIC; 2331 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2332 goto out; 2333 } 2334 sq->delete_ctx->vu_ctrlr = ctrlr; 2335 sq->delete_ctx->cid = cmd->cid; 2336 sq->sq_state = VFIO_USER_SQ_DELETED; 2337 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2338 ctrlr->cqs[sq->cqid]->cq_ref--; 2339 2340 spdk_nvmf_qpair_disconnect(&sq->qpair); 2341 return 0; 2342 } 2343 2344 out: 2345 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2346 } 2347 2348 /* 2349 * Configures Shadow Doorbells. 2350 */ 2351 static int 2352 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2353 { 2354 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2355 uint32_t dstrd; 2356 uintptr_t page_size, page_mask; 2357 uint64_t prp1, prp2; 2358 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2359 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2360 2361 assert(ctrlr != NULL); 2362 assert(ctrlr->endpoint != NULL); 2363 assert(cmd != NULL); 2364 2365 dstrd = doorbell_stride(ctrlr); 2366 page_size = memory_page_size(ctrlr); 2367 page_mask = memory_page_mask(ctrlr); 2368 2369 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2370 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2371 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2372 ctrlr_id(ctrlr)); 2373 2374 goto out; 2375 } 2376 2377 /* Verify guest physical addresses passed as PRPs. */ 2378 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2379 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2380 ctrlr_id(ctrlr)); 2381 2382 goto out; 2383 } 2384 2385 prp1 = cmd->dptr.prp.prp1; 2386 prp2 = cmd->dptr.prp.prp2; 2387 2388 SPDK_DEBUGLOG(nvmf_vfio, 2389 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2390 ctrlr_id(ctrlr), prp1, prp2); 2391 2392 if (prp1 == prp2 2393 || prp1 != (prp1 & page_mask) 2394 || prp2 != (prp2 & page_mask)) { 2395 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2396 ctrlr_id(ctrlr)); 2397 2398 goto out; 2399 } 2400 2401 /* Map guest physical addresses to our virtual address space. 
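	 *
	 * PRP1 points at the shadow doorbell buffer and PRP2 at the EventIdx
	 * buffer; each is a single, page-aligned host page, so map_sdbl() is
	 * expected to hand back one iovec per buffer for us to track in
	 * ctrlr->sdbl.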
*/ 2402 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2403 if (sdbl == NULL) { 2404 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2405 ctrlr_id(ctrlr)); 2406 2407 goto out; 2408 } 2409 2410 ctrlr->shadow_doorbell_buffer = prp1; 2411 ctrlr->eventidx_buffer = prp2; 2412 2413 SPDK_DEBUGLOG(nvmf_vfio, 2414 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2415 ctrlr_id(ctrlr), 2416 sdbl->iovs[0].iov_base, 2417 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2418 sdbl->iovs[1].iov_base, 2419 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2420 2421 2422 /* 2423 * Set all possible CQ head doorbells to polling mode now, such that we 2424 * don't have to worry about it later if the host creates more queues. 2425 * 2426 * We only ever want interrupts for writes to the SQ tail doorbells 2427 * (which are initialised in set_ctrlr_intr_mode() below). 2428 */ 2429 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2430 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2431 } 2432 2433 /* Update controller. */ 2434 SWAP(ctrlr->sdbl, sdbl); 2435 2436 /* 2437 * Copy doorbells from either the previous shadow doorbell buffer or the 2438 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2439 * 2440 * This needs to account for older versions of the Linux NVMe driver, 2441 * which don't clear out the buffer after a controller reset. 2442 */ 2443 copy_doorbells(ctrlr, sdbl != NULL ? 2444 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2445 ctrlr->sdbl->shadow_doorbells); 2446 2447 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2448 2449 ctrlr_kick(ctrlr); 2450 2451 sc = SPDK_NVME_SC_SUCCESS; 2452 2453 out: 2454 /* 2455 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2456 * more than once (pointless, but not prohibited by the spec), or 2457 * in case of an error. 2458 * 2459 * If this is the first time Doorbell Buffer Config was processed, 2460 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2461 * free_sdbl() becomes a noop. 2462 */ 2463 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2464 2465 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2466 } 2467 2468 /* Returns 0 on success and -errno on error. */ 2469 static int 2470 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2471 { 2472 assert(ctrlr != NULL); 2473 assert(cmd != NULL); 2474 2475 if (cmd->fuse != 0) { 2476 /* Fused admin commands are not supported. 
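		 * A non-zero FUSE field (i.e. either half of a fused pair) is
		 * completed with Invalid Field in Command instead of being
		 * forwarded to the NVMf layer.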
*/ 2477 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2478 SPDK_NVME_SC_INVALID_FIELD, 2479 SPDK_NVME_SCT_GENERIC); 2480 } 2481 2482 switch (cmd->opc) { 2483 case SPDK_NVME_OPC_CREATE_IO_CQ: 2484 case SPDK_NVME_OPC_CREATE_IO_SQ: 2485 return handle_create_io_q(ctrlr, cmd, 2486 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2487 case SPDK_NVME_OPC_DELETE_IO_SQ: 2488 case SPDK_NVME_OPC_DELETE_IO_CQ: 2489 return handle_del_io_q(ctrlr, cmd, 2490 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2491 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2492 SPDK_NOTICELOG("%s: requested shadow doorbells (supported: %d)\n", 2493 ctrlr_id(ctrlr), 2494 !ctrlr->transport->transport_opts.disable_shadow_doorbells); 2495 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2496 return handle_doorbell_buffer_config(ctrlr, cmd); 2497 } 2498 /* FALLTHROUGH */ 2499 default: 2500 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2501 } 2502 } 2503 2504 static int 2505 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2506 { 2507 struct nvmf_vfio_user_sq *sq = cb_arg; 2508 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2509 uint16_t sqid, cqid; 2510 2511 assert(sq != NULL); 2512 assert(vu_req != NULL); 2513 assert(vu_ctrlr != NULL); 2514 2515 if (spdk_likely(vu_req->iovcnt)) { 2516 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2517 index_to_sg_t(vu_req->sg, 0), 2518 vu_req->iov, vu_req->iovcnt); 2519 } 2520 sqid = sq->qid; 2521 cqid = sq->cqid; 2522 2523 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2524 vu_req->req.rsp->nvme_cpl.cdw0, 2525 sqid, 2526 vu_req->req.cmd->nvme_cmd.cid, 2527 vu_req->req.rsp->nvme_cpl.status.sc, 2528 vu_req->req.rsp->nvme_cpl.status.sct); 2529 } 2530 2531 static int 2532 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2533 struct spdk_nvme_cmd *cmd) 2534 { 2535 assert(sq != NULL); 2536 if (spdk_unlikely(nvmf_qpair_is_admin_queue(&sq->qpair))) { 2537 return consume_admin_cmd(ctrlr, cmd); 2538 } 2539 2540 return handle_cmd_req(ctrlr, cmd, sq); 2541 } 2542 2543 /* Returns the number of commands processed, or a negative value on error. */ 2544 static int 2545 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2546 struct nvmf_vfio_user_sq *sq) 2547 { 2548 struct spdk_nvme_cmd *queue; 2549 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 2550 int count = 0; 2551 uint32_t free_cq_slots; 2552 2553 assert(ctrlr != NULL); 2554 assert(sq != NULL); 2555 2556 if (ctrlr->sdbl != NULL && sq->qid != 0) { 2557 /* 2558 * Submission queue index has moved past the event index, so it 2559 * needs to be re-armed before we go to sleep. 2560 */ 2561 sq->need_rearm = true; 2562 } 2563 2564 free_cq_slots = cq_free_slots(cq); 2565 queue = q_addr(&sq->mapping); 2566 while (*sq_headp(sq) != new_tail) { 2567 int err; 2568 struct spdk_nvme_cmd *cmd; 2569 2570 /* 2571 * Linux host nvme driver can submit cmd's more than free cq slots 2572 * available. So process only those who have cq slots available. 2573 */ 2574 if (free_cq_slots-- == 0) { 2575 cq->last_head = *cq_dbl_headp(cq); 2576 2577 free_cq_slots = cq_free_slots(cq); 2578 if (free_cq_slots > 0) { 2579 continue; 2580 } 2581 2582 /* 2583 * If there are no free cq slots then kick interrupt FD to loop 2584 * again to process remaining sq cmds. 2585 * In case of polling mode we will process remaining sq cmds during 2586 * next polling iteration. 2587 * sq head is advanced only for consumed commands. 
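			 *
			 * In effect the loop throttles itself to the consumer: commands
			 * beyond the currently free CQ slots stay on the SQ, and are
			 * picked up on a later pass once the host has moved the CQ
			 * head doorbell forward.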
2588 */ 2589 if (in_interrupt_mode(ctrlr->transport)) { 2590 eventfd_write(ctrlr->intr_fd, 1); 2591 } 2592 break; 2593 } 2594 2595 cmd = &queue[*sq_headp(sq)]; 2596 count++; 2597 2598 /* 2599 * SQHD must contain the new head pointer, so we must increase 2600 * it before we generate a completion. 2601 */ 2602 sq_head_advance(sq); 2603 2604 err = consume_cmd(ctrlr, sq, cmd); 2605 if (spdk_unlikely(err != 0)) { 2606 return err; 2607 } 2608 } 2609 2610 return count; 2611 } 2612 2613 /* Checks whether endpoint is connected from the same process */ 2614 static bool 2615 is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint) 2616 { 2617 struct ucred ucred; 2618 socklen_t ucredlen = sizeof(ucred); 2619 2620 if (endpoint == NULL) { 2621 return false; 2622 } 2623 2624 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred, 2625 &ucredlen) < 0) { 2626 SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno)); 2627 return false; 2628 } 2629 2630 return ucred.pid == getpid(); 2631 } 2632 2633 static void 2634 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2635 { 2636 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2637 struct nvmf_vfio_user_ctrlr *ctrlr; 2638 struct nvmf_vfio_user_sq *sq; 2639 struct nvmf_vfio_user_cq *cq; 2640 void *map_start, *map_end; 2641 int ret; 2642 2643 /* 2644 * We're not interested in any DMA regions that aren't mappable (we don't 2645 * support clients that don't share their memory). 2646 */ 2647 if (!info->vaddr) { 2648 return; 2649 } 2650 2651 map_start = info->mapping.iov_base; 2652 map_end = info->mapping.iov_base + info->mapping.iov_len; 2653 2654 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2655 (info->mapping.iov_len & MASK_2MB)) { 2656 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2657 info->vaddr, map_start, map_end); 2658 return; 2659 } 2660 2661 assert(endpoint != NULL); 2662 if (endpoint->ctrlr == NULL) { 2663 return; 2664 } 2665 ctrlr = endpoint->ctrlr; 2666 2667 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2668 map_start, map_end); 2669 2670 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2671 * check the protection bits before registering. When vfio client and server are run in same process 2672 * there is no need to register the same memory again. 
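	 * (The 2MB alignment check above also matters here: spdk_mem_register()
	 * expects 2MB-aligned regions, so unaligned client mappings were
	 * already skipped before reaching this point.)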
2673 */ 2674 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2675 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2676 if (ret) { 2677 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2678 map_start, map_end, ret); 2679 } 2680 } 2681 2682 pthread_mutex_lock(&endpoint->lock); 2683 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2684 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2685 continue; 2686 } 2687 2688 cq = ctrlr->cqs[sq->cqid]; 2689 2690 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2691 if (cq->size && q_addr(&cq->mapping) == NULL) { 2692 ret = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_QUIET); 2693 if (ret) { 2694 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2695 cq->qid, cq->mapping.prp1, 2696 cq->mapping.prp1 + cq->mapping.len); 2697 continue; 2698 } 2699 } 2700 2701 if (sq->size) { 2702 ret = map_q(ctrlr, &sq->mapping, MAP_R | MAP_QUIET); 2703 if (ret) { 2704 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2705 sq->qid, sq->mapping.prp1, 2706 sq->mapping.prp1 + sq->mapping.len); 2707 continue; 2708 } 2709 } 2710 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2711 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2712 } 2713 pthread_mutex_unlock(&endpoint->lock); 2714 } 2715 2716 static void 2717 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2718 { 2719 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2720 struct nvmf_vfio_user_sq *sq; 2721 struct nvmf_vfio_user_cq *cq; 2722 void *map_start, *map_end; 2723 int ret = 0; 2724 2725 if (!info->vaddr) { 2726 return; 2727 } 2728 2729 map_start = info->mapping.iov_base; 2730 map_end = info->mapping.iov_base + info->mapping.iov_len; 2731 2732 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2733 (info->mapping.iov_len & MASK_2MB)) { 2734 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2735 info->vaddr, map_start, map_end); 2736 return; 2737 } 2738 2739 assert(endpoint != NULL); 2740 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2741 map_start, map_end); 2742 2743 if (endpoint->ctrlr != NULL) { 2744 struct nvmf_vfio_user_ctrlr *ctrlr; 2745 ctrlr = endpoint->ctrlr; 2746 2747 pthread_mutex_lock(&endpoint->lock); 2748 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2749 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2750 unmap_q(ctrlr, &sq->mapping); 2751 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2752 } 2753 2754 cq = ctrlr->cqs[sq->cqid]; 2755 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2756 unmap_q(ctrlr, &cq->mapping); 2757 } 2758 } 2759 2760 if (ctrlr->sdbl != NULL) { 2761 size_t i; 2762 2763 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2764 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2765 2766 if (iov_base >= map_start && iov_base < map_end) { 2767 copy_doorbells(ctrlr, 2768 ctrlr->sdbl->shadow_doorbells, 2769 ctrlr->bar0_doorbells); 2770 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2771 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2772 ctrlr->sdbl = NULL; 2773 break; 2774 } 2775 } 2776 } 2777 2778 pthread_mutex_unlock(&endpoint->lock); 2779 } 2780 2781 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2782 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2783 if (ret) { 2784 SPDK_ERRLOG("Memory region unregister %p-%p 
failed, ret=%d\n", 2785 map_start, map_end, ret); 2786 } 2787 } 2788 } 2789 2790 /* Used to initiate a controller-level reset or a controller shutdown. */ 2791 static void 2792 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2793 { 2794 SPDK_NOTICELOG("%s: disabling controller\n", ctrlr_id(vu_ctrlr)); 2795 2796 /* Unmap Admin queue. */ 2797 2798 assert(vu_ctrlr->sqs[0] != NULL); 2799 assert(vu_ctrlr->cqs[0] != NULL); 2800 2801 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2802 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2803 2804 vu_ctrlr->sqs[0]->size = 0; 2805 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2806 2807 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2808 2809 vu_ctrlr->cqs[0]->size = 0; 2810 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2811 2812 /* 2813 * For PCIe controller reset or shutdown, we will drop all AER 2814 * responses. 2815 */ 2816 spdk_nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2817 2818 /* Free the shadow doorbell buffer. */ 2819 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 2820 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2821 vu_ctrlr->sdbl = NULL; 2822 } 2823 2824 /* Used to re-enable the controller after a controller-level reset. */ 2825 static int 2826 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2827 { 2828 int err; 2829 2830 assert(vu_ctrlr != NULL); 2831 2832 SPDK_NOTICELOG("%s: enabling controller\n", ctrlr_id(vu_ctrlr)); 2833 2834 err = acq_setup(vu_ctrlr); 2835 if (err != 0) { 2836 return err; 2837 } 2838 2839 err = asq_setup(vu_ctrlr); 2840 if (err != 0) { 2841 return err; 2842 } 2843 2844 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2845 2846 return 0; 2847 } 2848 2849 static int 2850 nvmf_vfio_user_prop_req_rsp_set(struct nvmf_vfio_user_req *req, 2851 struct nvmf_vfio_user_sq *sq) 2852 { 2853 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2854 union spdk_nvme_cc_register cc, diff; 2855 2856 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2857 assert(sq->ctrlr != NULL); 2858 vu_ctrlr = sq->ctrlr; 2859 2860 if (req->req.cmd->prop_set_cmd.ofst != offsetof(struct spdk_nvme_registers, cc)) { 2861 return 0; 2862 } 2863 2864 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2865 diff.raw = cc.raw ^ req->cc.raw; 2866 2867 if (diff.bits.en) { 2868 if (cc.bits.en) { 2869 int ret = enable_ctrlr(vu_ctrlr); 2870 if (ret) { 2871 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2872 return ret; 2873 } 2874 vu_ctrlr->reset_shn = false; 2875 } else { 2876 vu_ctrlr->reset_shn = true; 2877 } 2878 } 2879 2880 if (diff.bits.shn) { 2881 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2882 vu_ctrlr->reset_shn = true; 2883 } 2884 } 2885 2886 if (vu_ctrlr->reset_shn) { 2887 disable_ctrlr(vu_ctrlr); 2888 } 2889 return 0; 2890 } 2891 2892 static int 2893 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2894 { 2895 struct nvmf_vfio_user_sq *sq = cb_arg; 2896 2897 assert(sq != NULL); 2898 assert(req != NULL); 2899 2900 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2901 assert(sq->ctrlr != NULL); 2902 assert(req != NULL); 2903 2904 memcpy(req->req.iov[0].iov_base, 2905 &req->req.rsp->prop_get_rsp.value.u64, 2906 req->req.length); 2907 return 0; 2908 } 2909 2910 return nvmf_vfio_user_prop_req_rsp_set(req, sq); 2911 } 2912 2913 /* 2914 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2915 * doorbell is written via access_bar0_fn(). 2916 * 2917 * DSTRD is set to fixed value 0 for NVMf. 
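 *
 * With DSTRD = 0 the standard doorbell layout applies, roughly:
 *
 *   SQ y tail doorbell: 0x1000 + (2 * y) * 4
 *   CQ y head doorbell: 0x1000 + (2 * y + 1) * 4
 *
 * which is why the handler below simply subtracts NVME_DOORBELLS_OFFSET
 * and shifts right by two to index bar0_doorbells[], odd indices being
 * CQ head doorbells.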
2918 * 2919 */ 2920 static int 2921 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2922 const size_t count, loff_t pos, const bool is_write) 2923 { 2924 struct nvmf_vfio_user_poll_group *group; 2925 2926 assert(ctrlr != NULL); 2927 assert(buf != NULL); 2928 2929 if (spdk_unlikely(!is_write)) { 2930 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2931 ctrlr_id(ctrlr), pos); 2932 errno = EPERM; 2933 return -1; 2934 } 2935 2936 if (spdk_unlikely(count != sizeof(uint32_t))) { 2937 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2938 ctrlr_id(ctrlr), count); 2939 errno = EINVAL; 2940 return -1; 2941 } 2942 2943 pos -= NVME_DOORBELLS_OFFSET; 2944 2945 /* pos must be dword aligned */ 2946 if (spdk_unlikely((pos & 0x3) != 0)) { 2947 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2948 errno = EINVAL; 2949 return -1; 2950 } 2951 2952 /* convert byte offset to array index */ 2953 pos >>= 2; 2954 2955 if (spdk_unlikely(pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2)) { 2956 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2957 errno = EINVAL; 2958 return -1; 2959 } 2960 2961 ctrlr->bar0_doorbells[pos] = *buf; 2962 spdk_wmb(); 2963 2964 group = ctrlr_to_poll_group(ctrlr); 2965 if (pos == 1) { 2966 group->stats.cqh_admin_writes++; 2967 } else if (pos & 1) { 2968 group->stats.cqh_io_writes++; 2969 } 2970 2971 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2972 ctrlr_id(ctrlr), (pos & 1) ? "cqid" : "sqid", 2973 pos / 2, *buf); 2974 2975 2976 return 0; 2977 } 2978 2979 static size_t 2980 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2981 char *buf, size_t count, loff_t pos, 2982 bool is_write) 2983 { 2984 struct nvmf_vfio_user_req *req; 2985 const struct spdk_nvmf_registers *regs; 2986 2987 if ((count != 4) && (count != 8)) { 2988 errno = EINVAL; 2989 return -1; 2990 } 2991 2992 /* Construct a Fabric Property Get/Set command and send it */ 2993 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 2994 if (req == NULL) { 2995 errno = ENOBUFS; 2996 return -1; 2997 } 2998 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 2999 req->cc.raw = regs->cc.raw; 3000 3001 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 3002 req->cb_arg = vu_ctrlr->sqs[0]; 3003 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 3004 req->req.cmd->prop_set_cmd.cid = 0; 3005 if (count == 4) { 3006 req->req.cmd->prop_set_cmd.attrib.size = 0; 3007 } else { 3008 req->req.cmd->prop_set_cmd.attrib.size = 1; 3009 } 3010 req->req.cmd->prop_set_cmd.ofst = pos; 3011 if (is_write) { 3012 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 3013 if (req->req.cmd->prop_set_cmd.attrib.size) { 3014 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 3015 } else { 3016 req->req.cmd->prop_set_cmd.value.u32.high = 0; 3017 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 3018 } 3019 } else { 3020 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 3021 } 3022 req->req.length = count; 3023 SPDK_IOV_ONE(req->req.iov, &req->req.iovcnt, buf, req->req.length); 3024 3025 spdk_nvmf_request_exec(&req->req); 3026 3027 return count; 3028 } 3029 3030 static ssize_t 3031 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 3032 bool is_write) 3033 { 3034 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3035 struct nvmf_vfio_user_ctrlr *ctrlr; 3036 int ret; 3037 3038 ctrlr = endpoint->ctrlr; 3039 if (spdk_unlikely(endpoint->need_async_destroy || 
!ctrlr)) { 3040 errno = EIO; 3041 return -1; 3042 } 3043 3044 if (pos >= NVME_DOORBELLS_OFFSET) { 3045 /* 3046 * The fact that the doorbells can be memory mapped doesn't mean 3047 * that the client (VFIO in QEMU) is obliged to memory map them, 3048 * it might still elect to access them via regular read/write; 3049 * we might also have had disable_mappable_bar0 set. 3050 */ 3051 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 3052 pos, is_write); 3053 if (ret == 0) { 3054 return count; 3055 } 3056 return ret; 3057 } 3058 3059 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 3060 } 3061 3062 static ssize_t 3063 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 3064 bool is_write) 3065 { 3066 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3067 3068 if (is_write) { 3069 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 3070 endpoint_id(endpoint), offset, offset + count); 3071 errno = EINVAL; 3072 return -1; 3073 } 3074 3075 if (offset + count > NVME_REG_CFG_SIZE) { 3076 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 3077 endpoint_id(endpoint), offset, count, 3078 NVME_REG_CFG_SIZE); 3079 errno = ERANGE; 3080 return -1; 3081 } 3082 3083 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 3084 3085 return count; 3086 } 3087 3088 static void 3089 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 3090 { 3091 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3092 3093 if (level >= LOG_DEBUG) { 3094 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3095 } else if (level >= LOG_INFO) { 3096 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3097 } else if (level >= LOG_NOTICE) { 3098 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 3099 } else if (level >= LOG_WARNING) { 3100 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 3101 } else { 3102 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 3103 } 3104 } 3105 3106 static int 3107 vfio_user_get_log_level(void) 3108 { 3109 int level; 3110 3111 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3112 return LOG_DEBUG; 3113 } 3114 3115 level = spdk_log_to_syslog_level(spdk_log_get_level()); 3116 if (level < 0) { 3117 return LOG_ERR; 3118 } 3119 3120 return level; 3121 } 3122 3123 static void 3124 init_pci_config_space(vfu_pci_config_space_t *p) 3125 { 3126 /* MLBAR */ 3127 p->hdr.bars[0].raw = 0x0; 3128 /* MUBAR */ 3129 p->hdr.bars[1].raw = 0x0; 3130 3131 /* vendor specific, let's set them to zero for now */ 3132 p->hdr.bars[3].raw = 0x0; 3133 p->hdr.bars[4].raw = 0x0; 3134 p->hdr.bars[5].raw = 0x0; 3135 3136 /* enable INTx */ 3137 p->hdr.intr.ipin = 0x1; 3138 } 3139 3140 struct ctrlr_quiesce_ctx { 3141 struct nvmf_vfio_user_endpoint *endpoint; 3142 struct nvmf_vfio_user_poll_group *group; 3143 int status; 3144 }; 3145 3146 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 3147 3148 static void 3149 _vfio_user_endpoint_resume_done_msg(void *ctx) 3150 { 3151 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3152 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3153 3154 endpoint->need_resume = false; 3155 3156 if (!vu_ctrlr) { 3157 return; 3158 } 3159 3160 if (!vu_ctrlr->queued_quiesce) { 3161 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3162 3163 /* 3164 * We might have ignored new SQ entries while we were quiesced: 3165 * kick ourselves so we'll definitely check again while in 3166 * VFIO_USER_CTRLR_RUNNING state. 
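		 * (Interrupt mode only: when polling, the next poller iteration
		 * picks up anything we skipped while paused anyway.)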
3167 */ 3168 if (in_interrupt_mode(endpoint->transport)) { 3169 ctrlr_kick(vu_ctrlr); 3170 } 3171 return; 3172 } 3173 3174 3175 /* 3176 * Basically, once we call `vfu_device_quiesced` the device is 3177 * unquiesced from libvfio-user's perspective so from the moment 3178 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device 3179 * again. However, because the NVMf subsystem is an asynchronous 3180 * operation, this quiesce might come _before_ the NVMf subsystem has 3181 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 3182 * need to check whether a quiesce was requested. 3183 */ 3184 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 3185 ctrlr_id(vu_ctrlr)); 3186 ctrlr_quiesce(vu_ctrlr); 3187 } 3188 3189 static void 3190 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 3191 void *cb_arg, int status) 3192 { 3193 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 3194 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3195 3196 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status); 3197 3198 if (!vu_ctrlr) { 3199 return; 3200 } 3201 3202 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 3203 } 3204 3205 static void 3206 vfio_user_quiesce_done(void *ctx) 3207 { 3208 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3209 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3210 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3211 int ret; 3212 3213 if (!vu_ctrlr) { 3214 free(quiesce_ctx); 3215 return; 3216 } 3217 3218 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 3219 3220 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 3221 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3222 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 3223 vu_ctrlr->queued_quiesce = false; 3224 free(quiesce_ctx); 3225 3226 /* `vfu_device_quiesced` can change the migration state, 3227 * so we need to re-check `vu_ctrlr->state`. 3228 */ 3229 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3230 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 3231 return; 3232 } 3233 3234 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 3235 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3236 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3237 vfio_user_endpoint_resume_done, endpoint); 3238 if (ret < 0) { 3239 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3240 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3241 } 3242 } 3243 3244 static void 3245 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3246 void *ctx, int status) 3247 { 3248 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3249 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3250 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3251 3252 if (!vu_ctrlr) { 3253 free(quiesce_ctx); 3254 return; 3255 } 3256 3257 quiesce_ctx->status = status; 3258 3259 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3260 ctrlr_id(vu_ctrlr), status); 3261 3262 spdk_thread_send_msg(vu_ctrlr->thread, 3263 vfio_user_quiesce_done, ctx); 3264 } 3265 3266 /* 3267 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3268 * we've already set ctrlr->state, so we won't process new entries, but we need 3269 * to ensure that this PG is quiesced. 
This only works because there's no 3270 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3271 * 3272 * Once we've walked all PGs, we need to pause any submitted I/O via 3273 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3274 */ 3275 static void 3276 vfio_user_quiesce_pg(void *ctx) 3277 { 3278 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3279 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3280 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3281 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3282 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3283 int ret; 3284 3285 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3286 3287 if (!vu_ctrlr) { 3288 free(quiesce_ctx); 3289 return; 3290 } 3291 3292 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3293 if (quiesce_ctx->group != NULL) { 3294 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3295 vfio_user_quiesce_pg, quiesce_ctx); 3296 return; 3297 } 3298 3299 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3300 vfio_user_pause_done, quiesce_ctx); 3301 if (ret < 0) { 3302 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3303 endpoint_id(endpoint), ret); 3304 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3305 fail_ctrlr(vu_ctrlr); 3306 free(quiesce_ctx); 3307 } 3308 } 3309 3310 static void 3311 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3312 { 3313 struct ctrlr_quiesce_ctx *quiesce_ctx; 3314 3315 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3316 3317 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3318 if (!quiesce_ctx) { 3319 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3320 assert(false); 3321 return; 3322 } 3323 3324 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3325 quiesce_ctx->status = 0; 3326 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3327 3328 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3329 vfio_user_quiesce_pg, quiesce_ctx); 3330 } 3331 3332 static int 3333 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3334 { 3335 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3336 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3337 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3338 3339 if (!vu_ctrlr) { 3340 return 0; 3341 } 3342 3343 /* NVMf library will destruct controller when no 3344 * connected queue pairs. 3345 */ 3346 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3347 return 0; 3348 } 3349 3350 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3351 3352 /* There is no race condition here as device quiesce callback 3353 * and nvmf_prop_set_cc() are running in the same thread context. 
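	 *
	 * The CC.EN / CSTS.RDY / CSTS.SHST checks below therefore see a stable
	 * snapshot: a controller that is disabled, not yet ready, or already
	 * shut down has nothing to quiesce, so we return 0 immediately.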
3354 */ 3355 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3356 return 0; 3357 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3358 return 0; 3359 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3360 return 0; 3361 } 3362 3363 switch (vu_ctrlr->state) { 3364 case VFIO_USER_CTRLR_PAUSED: 3365 case VFIO_USER_CTRLR_MIGRATING: 3366 return 0; 3367 case VFIO_USER_CTRLR_RUNNING: 3368 ctrlr_quiesce(vu_ctrlr); 3369 break; 3370 case VFIO_USER_CTRLR_RESUMING: 3371 vu_ctrlr->queued_quiesce = true; 3372 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3373 vu_ctrlr->state); 3374 break; 3375 default: 3376 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3377 break; 3378 } 3379 3380 errno = EBUSY; 3381 return -1; 3382 } 3383 3384 static void 3385 vfio_user_ctrlr_dump_migr_data(const char *name, 3386 struct vfio_user_nvme_migr_state *migr_data, 3387 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3388 { 3389 struct spdk_nvmf_registers *regs; 3390 struct nvme_migr_sq_state *sq; 3391 struct nvme_migr_cq_state *cq; 3392 uint32_t *doorbell_base; 3393 uint32_t i; 3394 3395 SPDK_NOTICELOG("Dump %s\n", name); 3396 3397 regs = &migr_data->nvmf_data.regs; 3398 doorbell_base = (uint32_t *)&migr_data->doorbells; 3399 3400 SPDK_NOTICELOG("Registers\n"); 3401 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3402 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3403 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3404 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3405 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3406 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3407 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3408 3409 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3410 3411 if (sdbl != NULL) { 3412 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3413 migr_data->ctrlr_header.shadow_doorbell_buffer); 3414 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3415 migr_data->ctrlr_header.eventidx_buffer); 3416 } 3417 3418 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3419 sq = &migr_data->qps[i].sq; 3420 cq = &migr_data->qps[i].cq; 3421 3422 if (sq->size) { 3423 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3424 if (i > 0 && sdbl != NULL) { 3425 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3426 sq->sqid, 3427 sdbl->shadow_doorbells[queue_index(i, false)], 3428 sdbl->eventidxs[queue_index(i, false)]); 3429 } 3430 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3431 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3432 } 3433 3434 if (cq->size) { 3435 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3436 if (i > 0 && sdbl != NULL) { 3437 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3438 cq->cqid, 3439 sdbl->shadow_doorbells[queue_index(i, true)], 3440 sdbl->eventidxs[queue_index(i, true)]); 3441 } 3442 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3443 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3444 } 3445 } 3446 3447 SPDK_NOTICELOG("%s Dump Done\n", name); 3448 } 3449 3450 /* Read region 9 content and restore it to migration data structures */ 3451 static int 3452 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3453 struct vfio_user_nvme_migr_state *migr_state) 3454 { 3455 void *data_ptr = endpoint->migr_data; 3456 3457 /* Load vfio_user_nvme_migr_header first */ 3458 
memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3459 /* TODO: version check */ 3460 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3461 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3462 return -EINVAL; 3463 } 3464 3465 /* Load nvmf controller data */ 3466 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3467 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3468 3469 /* Load queue pairs */ 3470 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3471 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3472 3473 /* Load doorbells */ 3474 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3475 memcpy(&migr_state->doorbells, data_ptr, 3476 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3477 3478 /* Load CFG */ 3479 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3480 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3481 3482 return 0; 3483 } 3484 3485 3486 static void 3487 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3488 { 3489 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3490 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3491 struct nvmf_vfio_user_sq *sq; 3492 struct nvmf_vfio_user_cq *cq; 3493 uint64_t data_offset; 3494 void *data_ptr; 3495 uint32_t *doorbell_base; 3496 uint32_t i = 0; 3497 uint16_t sqid, cqid; 3498 struct vfio_user_nvme_migr_state migr_state = { 3499 .nvmf_data = { 3500 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3501 .regs_size = sizeof(struct spdk_nvmf_registers), 3502 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3503 } 3504 }; 3505 3506 /* Save all data to vfio_user_nvme_migr_state first, then we will 3507 * copy it to device migration region at last. 
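	 *
	 * The layout produced below is: fixed-size header (written last, at
	 * offset 0), NVMf controller data, per-queue SQ/CQ state, BAR0
	 * doorbells, then the PCI configuration space.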
3508 */ 3509 3510 /* save magic number */ 3511 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3512 3513 /* save controller data */ 3514 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3515 3516 /* save connected queue pairs */ 3517 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3518 /* save sq */ 3519 sqid = sq->qid; 3520 migr_state.qps[sqid].sq.sqid = sq->qid; 3521 migr_state.qps[sqid].sq.cqid = sq->cqid; 3522 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3523 migr_state.qps[sqid].sq.size = sq->size; 3524 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3525 3526 /* save cq, for shared cq case, cq may be saved multiple times */ 3527 cqid = sq->cqid; 3528 cq = vu_ctrlr->cqs[cqid]; 3529 migr_state.qps[cqid].cq.cqid = cqid; 3530 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3531 migr_state.qps[cqid].cq.ien = cq->ien; 3532 migr_state.qps[cqid].cq.iv = cq->iv; 3533 migr_state.qps[cqid].cq.size = cq->size; 3534 migr_state.qps[cqid].cq.phase = cq->phase; 3535 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3536 i++; 3537 } 3538 3539 assert(i > 0); 3540 migr_state.ctrlr_header.num_io_queues = i - 1; 3541 3542 /* Save doorbells */ 3543 doorbell_base = (uint32_t *)&migr_state.doorbells; 3544 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3545 3546 /* Save PCI configuration space */ 3547 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3548 3549 /* Save all data to device migration region */ 3550 data_ptr = endpoint->migr_data; 3551 3552 /* Copy nvmf controller data */ 3553 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3554 data_ptr += data_offset; 3555 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3556 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3557 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3558 3559 /* Copy queue pairs */ 3560 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3561 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3562 migr_state.ctrlr_header.qp_offset = data_offset; 3563 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3564 struct nvme_migr_cq_state)); 3565 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3566 3567 /* Copy doorbells */ 3568 data_offset += migr_state.ctrlr_header.qp_len; 3569 data_ptr += migr_state.ctrlr_header.qp_len; 3570 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3571 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3572 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3573 3574 /* Copy CFG */ 3575 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3576 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3577 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3578 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3579 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3580 3581 /* copy shadow doorbells */ 3582 if (vu_ctrlr->sdbl != NULL) { 3583 migr_state.ctrlr_header.sdbl = true; 3584 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3585 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3586 } 3587 3588 /* Copy nvme migration header finally */ 3589 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3590 3591 if 
(SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3592 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3593 } 3594 } 3595 3596 /* 3597 * If we are about to close the connection, we need to unregister the interrupt, 3598 * as the library will subsequently close the file descriptor we registered. 3599 */ 3600 static int 3601 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3602 { 3603 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3604 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3605 3606 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3607 3608 if (type == VFU_RESET_LOST_CONN) { 3609 if (ctrlr != NULL) { 3610 spdk_interrupt_unregister(&ctrlr->intr); 3611 ctrlr->intr_fd = -1; 3612 } 3613 return 0; 3614 } 3615 3616 /* FIXME: LOST_CONN case ? */ 3617 if (ctrlr->sdbl != NULL) { 3618 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 3619 free_sdbl(vfu_ctx, ctrlr->sdbl); 3620 ctrlr->sdbl = NULL; 3621 } 3622 3623 /* FIXME: much more needed here. */ 3624 3625 return 0; 3626 } 3627 3628 static int 3629 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3630 struct vfio_user_nvme_migr_state *migr_state) 3631 { 3632 uint32_t i, qsize = 0; 3633 uint16_t sqid, cqid; 3634 struct vfio_user_nvme_migr_qp migr_qp; 3635 void *addr; 3636 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3637 int ret; 3638 3639 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3640 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3641 } 3642 3643 /* restore submission queues */ 3644 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3645 migr_qp = migr_state->qps[i]; 3646 3647 qsize = migr_qp.sq.size; 3648 if (qsize) { 3649 struct nvmf_vfio_user_sq *sq; 3650 3651 sqid = migr_qp.sq.sqid; 3652 if (sqid != i) { 3653 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3654 return -EINVAL; 3655 } 3656 3657 /* allocate sq if necessary */ 3658 if (vu_ctrlr->sqs[sqid] == NULL) { 3659 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3660 if (ret) { 3661 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3662 return -EFAULT; 3663 } 3664 } 3665 3666 sq = vu_ctrlr->sqs[sqid]; 3667 sq->size = qsize; 3668 3669 ret = alloc_sq_reqs(vu_ctrlr, sq); 3670 if (ret) { 3671 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3672 return -EFAULT; 3673 } 3674 3675 /* restore sq */ 3676 sq->sq_state = VFIO_USER_SQ_CREATED; 3677 sq->cqid = migr_qp.sq.cqid; 3678 *sq_headp(sq) = migr_qp.sq.head; 3679 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3680 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 3681 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3682 sq->mapping.prp1, sq->mapping.len, 3683 sq->mapping.sg, &sq->mapping.iov, 3684 PROT_READ); 3685 if (addr == NULL) { 3686 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3687 sqid, sq->mapping.prp1, sq->size); 3688 return -EFAULT; 3689 } 3690 cqs_ref[sq->cqid]++; 3691 } 3692 } 3693 3694 /* restore completion queues */ 3695 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3696 migr_qp = migr_state->qps[i]; 3697 3698 qsize = migr_qp.cq.size; 3699 if (qsize) { 3700 struct nvmf_vfio_user_cq *cq; 3701 3702 /* restore cq */ 3703 cqid = migr_qp.sq.cqid; 3704 assert(cqid == i); 3705 3706 /* allocate cq if necessary */ 3707 if (vu_ctrlr->cqs[cqid] == NULL) { 3708 ret = init_cq(vu_ctrlr, cqid); 3709 if (ret) { 3710 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3711 return -EFAULT; 3712 } 3713 } 3714 3715 
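			/* Unlike SQs (mapped read-only above), the CQ buffer is re-mapped
			 * read-write below, since we must be able to post completions to
			 * it once the controller resumes.
			 */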
cq = vu_ctrlr->cqs[cqid]; 3716 3717 cq->size = qsize; 3718 3719 cq->cq_state = VFIO_USER_CQ_CREATED; 3720 cq->cq_ref = cqs_ref[cqid]; 3721 *cq_tailp(cq) = migr_qp.cq.tail; 3722 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3723 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 3724 cq->ien = migr_qp.cq.ien; 3725 cq->iv = migr_qp.cq.iv; 3726 cq->phase = migr_qp.cq.phase; 3727 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3728 cq->mapping.prp1, cq->mapping.len, 3729 cq->mapping.sg, &cq->mapping.iov, 3730 PROT_READ | PROT_WRITE); 3731 if (addr == NULL) { 3732 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3733 cqid, cq->mapping.prp1, cq->size); 3734 return -EFAULT; 3735 } 3736 } 3737 } 3738 3739 return 0; 3740 } 3741 3742 static int 3743 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3744 { 3745 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3746 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3747 uint32_t *doorbell_base; 3748 struct spdk_nvme_cmd cmd; 3749 uint16_t i; 3750 int rc = 0; 3751 struct vfio_user_nvme_migr_state migr_state = { 3752 .nvmf_data = { 3753 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3754 .regs_size = sizeof(struct spdk_nvmf_registers), 3755 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3756 } 3757 }; 3758 3759 assert(endpoint->migr_data != NULL); 3760 assert(ctrlr != NULL); 3761 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3762 if (rc) { 3763 return rc; 3764 } 3765 3766 /* restore shadow doorbells */ 3767 if (migr_state.ctrlr_header.sdbl) { 3768 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3769 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3770 migr_state.ctrlr_header.shadow_doorbell_buffer, 3771 migr_state.ctrlr_header.eventidx_buffer, 3772 memory_page_size(vu_ctrlr)); 3773 if (sdbl == NULL) { 3774 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3775 ctrlr_id(vu_ctrlr)); 3776 return -1; 3777 } 3778 3779 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3780 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3781 3782 SWAP(vu_ctrlr->sdbl, sdbl); 3783 } 3784 3785 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3786 if (rc) { 3787 return rc; 3788 } 3789 3790 /* restore PCI configuration space */ 3791 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3792 3793 doorbell_base = (uint32_t *)&migr_state.doorbells; 3794 /* restore doorbells from saved registers */ 3795 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3796 3797 /* restore nvmf controller data */ 3798 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3799 if (rc) { 3800 return rc; 3801 } 3802 3803 /* resubmit pending AERs */ 3804 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) { 3805 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3806 migr_state.nvmf_data.aer_cids[i]); 3807 memset(&cmd, 0, sizeof(cmd)); 3808 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3809 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3810 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3811 if (spdk_unlikely(rc)) { 3812 break; 3813 } 3814 } 3815 3816 return rc; 3817 } 3818 3819 static void 3820 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3821 { 3822 uint32_t i; 3823 struct nvmf_vfio_user_sq *sq; 3824 3825 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
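	 * Its doorbell pointers are therefore re-pointed straight at BAR0
	 * below, while the I/O queues are switched to whichever buffer
	 * vfio_user_ctrlr_switch_doorbells() selects based on ctrlr->sdbl.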
*/ 3826 3827 if (vu_ctrlr->sqs[0] != NULL) { 3828 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3829 queue_index(0, false); 3830 } 3831 3832 if (vu_ctrlr->cqs[0] != NULL) { 3833 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3834 queue_index(0, true); 3835 } 3836 3837 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3838 3839 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3840 sq = vu_ctrlr->sqs[i]; 3841 if (!sq || !sq->size) { 3842 continue; 3843 } 3844 3845 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3846 /* ADMIN queue pair is always in the poll group, just enable it */ 3847 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3848 } else { 3849 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3850 } 3851 } 3852 } 3853 3854 /* 3855 * We are in stop-and-copy state, but still potentially have some current dirty 3856 * sgls: while we're quiesced and thus should have no active requests, we still 3857 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3858 * mapped read only). 3859 * 3860 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3861 * mark them dirty now. 3862 */ 3863 static void 3864 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3865 { 3866 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3867 3868 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3869 3870 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3871 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3872 3873 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3874 continue; 3875 } 3876 3877 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3878 } 3879 3880 if (vu_ctrlr->sdbl != NULL) { 3881 dma_sg_t *sg; 3882 size_t i; 3883 3884 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3885 ++i) { 3886 3887 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3888 continue; 3889 } 3890 3891 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3892 3893 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3894 } 3895 } 3896 } 3897 3898 static int 3899 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3900 { 3901 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3902 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3903 struct nvmf_vfio_user_sq *sq; 3904 int ret = 0; 3905 3906 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3907 vu_ctrlr->state, state); 3908 3909 switch (state) { 3910 case VFU_MIGR_STATE_STOP_AND_COPY: 3911 vu_ctrlr->in_source_vm = true; 3912 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3913 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3914 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3915 break; 3916 case VFU_MIGR_STATE_STOP: 3917 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3918 /* The controller associated with the source VM is dead now; we will resume 3919 * the subsystem after destroying the controller data structure, so that the 3920 * subsystem can be re-used by another new client. 3921 */ 3922 if (vu_ctrlr->in_source_vm) { 3923 endpoint->need_resume = true; 3924 } 3925 break; 3926 case VFU_MIGR_STATE_PRE_COPY: 3927 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3928 break; 3929 case VFU_MIGR_STATE_RESUME: 3930 /* 3931 * The destination ADMIN queue pair is connected when the VM is started, 3932 * but it isn't enabled in the destination VM yet, so the poll 3933 * group will do nothing with the ADMIN queue pair for now.
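 * Only when the controller is already RUNNING do we free the admin SQ resources below, so that they can be reallocated from the queue size carried over from the source VM.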
3934 */ 3935 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3936 break; 3937 } 3938 3939 assert(!vu_ctrlr->in_source_vm); 3940 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3941 3942 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3943 assert(sq != NULL); 3944 assert(sq->qpair.qid == 0); 3945 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3946 3947 /* Free ADMIN SQ resources first, SQ resources will be 3948 * allocated based on queue size from source VM. 3949 */ 3950 free_sq_reqs(sq); 3951 sq->size = 0; 3952 break; 3953 case VFU_MIGR_STATE_RUNNING: 3954 3955 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3956 break; 3957 } 3958 3959 if (!vu_ctrlr->in_source_vm) { 3960 /* Restore destination VM from BAR9 */ 3961 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3962 if (ret) { 3963 break; 3964 } 3965 3966 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3967 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3968 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3969 /* FIXME where do we resume nvmf? */ 3970 } else { 3971 /* Rollback source VM */ 3972 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3973 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3974 vfio_user_endpoint_resume_done, endpoint); 3975 if (ret < 0) { 3976 /* TODO: fail controller with CFS bit set */ 3977 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3978 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3979 } 3980 } 3981 vu_ctrlr->migr_data_prepared = false; 3982 vu_ctrlr->in_source_vm = false; 3983 break; 3984 3985 default: 3986 return -EINVAL; 3987 } 3988 3989 return ret; 3990 } 3991 3992 static uint64_t 3993 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 3994 { 3995 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3996 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3997 uint64_t pending_bytes; 3998 3999 if (ctrlr->migr_data_prepared) { 4000 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 4001 pending_bytes = 0; 4002 } else { 4003 pending_bytes = vfio_user_migr_data_len(); 4004 } 4005 4006 SPDK_DEBUGLOG(nvmf_vfio, 4007 "%s current state %u, pending bytes 0x%"PRIx64"\n", 4008 endpoint_id(endpoint), ctrlr->state, pending_bytes); 4009 4010 return pending_bytes; 4011 } 4012 4013 static int 4014 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 4015 { 4016 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 4017 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 4018 4019 /* 4020 * When transitioning to pre-copy state we set pending_bytes to 0, 4021 * so the vfio-user client shouldn't attempt to read any migration 4022 * data. This is not yet guaranteed by libvfio-user. 
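 * Be defensive anyway: if the controller is not in the MIGRATING state, report an offset and size of zero so there is nothing for the client to read.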
4023 */ 4024 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 4025 assert(size != NULL); 4026 *offset = 0; 4027 *size = 0; 4028 return 0; 4029 } 4030 4031 if (ctrlr->in_source_vm) { /* migration source */ 4032 assert(size != NULL); 4033 *size = vfio_user_migr_data_len(); 4034 vfio_user_migr_ctrlr_save_data(ctrlr); 4035 } else { /* migration destination */ 4036 assert(size == NULL); 4037 assert(!ctrlr->migr_data_prepared); 4038 } 4039 *offset = 0; 4040 ctrlr->migr_data_prepared = true; 4041 4042 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 4043 4044 return 0; 4045 } 4046 4047 static ssize_t 4048 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4049 void *buf __attribute__((unused)), 4050 uint64_t count __attribute__((unused)), 4051 uint64_t offset __attribute__((unused))) 4052 { 4053 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 4054 endpoint_id(vfu_get_private(vfu_ctx))); 4055 errno = ENOTSUP; 4056 return -1; 4057 } 4058 4059 static ssize_t 4060 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4061 void *buf __attribute__((unused)), 4062 uint64_t count __attribute__((unused)), 4063 uint64_t offset __attribute__((unused))) 4064 { 4065 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 4066 endpoint_id(vfu_get_private(vfu_ctx))); 4067 errno = ENOTSUP; 4068 return -1; 4069 } 4070 4071 static int 4072 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4073 uint64_t count) 4074 { 4075 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 4076 4077 if (count != vfio_user_migr_data_len()) { 4078 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 4079 endpoint_id(vfu_get_private(vfu_ctx)), count); 4080 errno = EINVAL; 4081 return -1; 4082 } 4083 4084 return 0; 4085 } 4086 4087 static int 4088 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 4089 struct nvmf_vfio_user_endpoint *endpoint) 4090 { 4091 int ret; 4092 ssize_t cap_offset; 4093 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 4094 struct iovec migr_sparse_mmap = {}; 4095 4096 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 4097 struct pxcap pxcap = { 4098 .hdr.id = PCI_CAP_ID_EXP, 4099 .pxcaps.ver = 0x2, 4100 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 4101 .pxdcap2.ctds = 0x1 4102 }; 4103 4104 struct msixcap msixcap = { 4105 .hdr.id = PCI_CAP_ID_MSIX, 4106 .mxc.ts = NVMF_VFIO_USER_MSIX_NUM - 1, 4107 .mtab = {.tbir = NVMF_VFIO_USER_MSIX_TABLE_BIR, .to = 0x0}, 4108 .mpba = {.pbir = NVMF_VFIO_USER_MSIX_PBA_BIR, .pbao = 0x0} 4109 }; 4110 4111 struct iovec sparse_mmap[] = { 4112 { 4113 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 4114 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 4115 }, 4116 }; 4117 4118 const vfu_migration_callbacks_t migr_callbacks = { 4119 .version = VFIO_USER_MIGR_CALLBACK_VERS, 4120 .transition = &vfio_user_migration_device_state_transition, 4121 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 4122 .prepare_data = &vfio_user_migration_prepare_data, 4123 .read_data = &vfio_user_migration_read_data, 4124 .data_written = &vfio_user_migration_data_written, 4125 .write_data = &vfio_user_migration_write_data 4126 }; 4127 4128 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 4129 if (ret < 0) { 4130 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 4131 return ret; 4132 } 4133 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 4134 /* 4135 * 0x02, 
controller uses the NVM Express programming interface 4136 * 0x08, non-volatile memory controller 4137 * 0x01, mass storage controller 4138 */ 4139 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 4140 4141 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 4142 if (cap_offset < 0) { 4143 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 4144 return ret; 4145 } 4146 4147 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 4148 if (cap_offset < 0) { 4149 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 4150 return ret; 4151 } 4152 4153 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 4154 if (cap_offset < 0) { 4155 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 4156 return ret; 4157 } 4158 4159 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 4160 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4161 if (ret < 0) { 4162 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 4163 return ret; 4164 } 4165 4166 if (vu_transport->transport_opts.disable_mappable_bar0) { 4167 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4168 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4169 NULL, 0, -1, 0); 4170 } else { 4171 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4172 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4173 sparse_mmap, 1, endpoint->devmem_fd, 0); 4174 } 4175 4176 if (ret < 0) { 4177 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 4178 return ret; 4179 } 4180 4181 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVMF_VFIO_USER_BAR4_SIZE, 4182 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4183 if (ret < 0) { 4184 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 4185 return ret; 4186 } 4187 4188 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVMF_VFIO_USER_BAR5_SIZE, 4189 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4190 if (ret < 0) { 4191 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 4192 return ret; 4193 } 4194 4195 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 4196 if (ret < 0) { 4197 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 4198 return ret; 4199 } 4200 4201 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 4202 if (ret < 0) { 4203 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 4204 return ret; 4205 } 4206 4207 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 4208 if (ret < 0) { 4209 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 4210 return ret; 4211 } 4212 4213 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVMF_VFIO_USER_MSIX_NUM); 4214 if (ret < 0) { 4215 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 4216 return ret; 4217 } 4218 4219 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 4220 4221 migr_sparse_mmap.iov_base = (void *)4096; 4222 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 4223 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 4224 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 4225 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 4226 1, endpoint->migr_fd, 0); 4227 if (ret < 0) { 4228 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 4229 return ret; 4230 } 4231 4232 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 4233 vfu_get_migr_register_area_size()); 4234 if (ret < 0) { 4235 
SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 4236 return ret; 4237 } 4238 4239 ret = vfu_realize_ctx(vfu_ctx); 4240 if (ret < 0) { 4241 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4242 return ret; 4243 } 4244 4245 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4246 assert(endpoint->pci_config_space != NULL); 4247 init_pci_config_space(endpoint->pci_config_space); 4248 4249 assert(cap_offset != 0); 4250 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4251 4252 return 0; 4253 } 4254 4255 static int nvmf_vfio_user_accept(void *ctx); 4256 4257 /* 4258 * Register an "accept" poller: this is polling for incoming vfio-user socket 4259 * connections (on the listening socket). 4260 * 4261 * We need to do this on first listening, and also after destroying a 4262 * controller, so we can accept another connection. 4263 */ 4264 static int 4265 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4266 { 4267 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4268 4269 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4270 4271 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4272 endpoint, poll_rate_us); 4273 4274 if (!endpoint->accept_poller) { 4275 return -1; 4276 } 4277 4278 endpoint->accept_thread = spdk_get_thread(); 4279 endpoint->need_relisten = false; 4280 4281 if (!spdk_interrupt_mode_is_enabled()) { 4282 return 0; 4283 } 4284 4285 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4286 assert(endpoint->accept_intr_fd != -1); 4287 4288 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4289 nvmf_vfio_user_accept, endpoint); 4290 4291 assert(endpoint->accept_intr != NULL); 4292 4293 spdk_poller_register_interrupt(endpoint->accept_poller, NULL, NULL); 4294 return 0; 4295 } 4296 4297 static void 4298 _vfio_user_relisten(void *ctx) 4299 { 4300 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4301 4302 vfio_user_register_accept_poller(endpoint); 4303 } 4304 4305 static void 4306 _free_ctrlr(void *ctx) 4307 { 4308 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4309 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4310 4311 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 4312 4313 spdk_interrupt_unregister(&ctrlr->intr); 4314 ctrlr->intr_fd = -1; 4315 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4316 4317 free(ctrlr); 4318 4319 if (endpoint->need_async_destroy) { 4320 nvmf_vfio_user_destroy_endpoint(endpoint); 4321 } else if (endpoint->need_relisten) { 4322 spdk_thread_send_msg(endpoint->accept_thread, 4323 _vfio_user_relisten, endpoint); 4324 } 4325 } 4326 4327 static void 4328 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4329 { 4330 struct spdk_thread *thread; 4331 int i; 4332 4333 assert(ctrlr != NULL); 4334 thread = ctrlr->thread ? 
ctrlr->thread : spdk_get_thread(); 4335 4336 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4337 4338 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4339 free_qp(ctrlr, i); 4340 } 4341 4342 spdk_thread_exec_msg(thread, _free_ctrlr, ctrlr); 4343 } 4344 4345 static int 4346 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4347 struct nvmf_vfio_user_endpoint *endpoint) 4348 { 4349 struct nvmf_vfio_user_ctrlr *ctrlr; 4350 int err = 0; 4351 4352 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4353 4354 /* First, construct a vfio-user CUSTOM transport controller */ 4355 ctrlr = calloc(1, sizeof(*ctrlr)); 4356 if (ctrlr == NULL) { 4357 err = -ENOMEM; 4358 goto out; 4359 } 4360 /* 4361 * We can only support one connection for now, but generate a unique cntlid in case vfio-user 4362 * transport is used together with RDMA or TCP transports in the same target 4363 */ 4364 ctrlr->cntlid = nvmf_subsystem_gen_cntlid(endpoint->subsystem); 4365 ctrlr->intr_fd = -1; 4366 ctrlr->transport = transport; 4367 ctrlr->endpoint = endpoint; 4368 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4369 TAILQ_INIT(&ctrlr->connected_sqs); 4370 4371 ctrlr->adaptive_irqs_enabled = 4372 !transport->transport_opts.disable_adaptive_irq; 4373 4374 /* Then, construct an admin queue pair */ 4375 err = init_sq(ctrlr, &transport->transport, 0); 4376 if (err != 0) { 4377 free(ctrlr); 4378 goto out; 4379 } 4380 4381 err = init_cq(ctrlr, 0); 4382 if (err != 0) { 4383 free(ctrlr); 4384 goto out; 4385 } 4386 4387 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4388 4389 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4390 if (err != 0) { 4391 free(ctrlr); 4392 goto out; 4393 } 4394 endpoint->ctrlr = ctrlr; 4395 4396 /* Notify the generic layer about the new admin queue pair */ 4397 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4398 4399 out: 4400 if (err != 0) { 4401 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4402 endpoint_id(endpoint), strerror(-err)); 4403 } 4404 4405 return err; 4406 } 4407 4408 static int 4409 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4410 const struct spdk_nvme_transport_id *trid, 4411 struct spdk_nvmf_listen_opts *listen_opts) 4412 { 4413 struct nvmf_vfio_user_transport *vu_transport; 4414 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4415 char path[PATH_MAX] = {}; 4416 char uuid[PATH_MAX] = {}; 4417 int ret; 4418 4419 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4420 transport); 4421 4422 pthread_mutex_lock(&vu_transport->lock); 4423 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4424 /* Only compare traddr */ 4425 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4426 pthread_mutex_unlock(&vu_transport->lock); 4427 return -EEXIST; 4428 } 4429 } 4430 pthread_mutex_unlock(&vu_transport->lock); 4431 4432 endpoint = calloc(1, sizeof(*endpoint)); 4433 if (!endpoint) { 4434 return -ENOMEM; 4435 } 4436 4437 pthread_mutex_init(&endpoint->lock, NULL); 4438 endpoint->devmem_fd = -1; 4439 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4440 endpoint->transport = vu_transport; 4441 4442 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4443 if (ret < 0 || ret >= PATH_MAX) { 4444 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4445 ret = -1; 4446 goto out; 4447 } 4448 4449 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4450 if (ret 
== -1) { 4451 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4452 endpoint_id(endpoint), path, spdk_strerror(errno)); 4453 goto out; 4454 } 4455 unlink(path); 4456 4457 endpoint->devmem_fd = ret; 4458 ret = ftruncate(endpoint->devmem_fd, 4459 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4460 if (ret != 0) { 4461 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4462 spdk_strerror(errno)); 4463 goto out; 4464 } 4465 4466 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4467 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4468 if (endpoint->bar0_doorbells == MAP_FAILED) { 4469 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4470 endpoint->bar0_doorbells = NULL; 4471 ret = -1; 4472 goto out; 4473 } 4474 4475 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4476 if (ret < 0 || ret >= PATH_MAX) { 4477 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 4478 spdk_strerror(errno)); 4479 ret = -1; 4480 goto out; 4481 } 4482 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4483 if (ret == -1) { 4484 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4485 endpoint_id(endpoint), path, spdk_strerror(errno)); 4486 goto out; 4487 } 4488 unlink(path); 4489 4490 endpoint->migr_fd = ret; 4491 ret = ftruncate(endpoint->migr_fd, 4492 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4493 if (ret != 0) { 4494 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4495 spdk_strerror(errno)); 4496 goto out; 4497 } 4498 4499 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4500 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4501 if (endpoint->migr_data == MAP_FAILED) { 4502 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4503 endpoint->migr_data = NULL; 4504 ret = -1; 4505 goto out; 4506 } 4507 4508 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 4509 if (ret < 0 || ret >= PATH_MAX) { 4510 SPDK_ERRLOG("%s: error to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 4511 ret = -1; 4512 goto out; 4513 } 4514 4515 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 4516 endpoint, VFU_DEV_TYPE_PCI); 4517 if (endpoint->vfu_ctx == NULL) { 4518 SPDK_ERRLOG("%s: error creating libmuser context: %m\n", 4519 endpoint_id(endpoint)); 4520 ret = -1; 4521 goto out; 4522 } 4523 4524 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, 4525 vfio_user_get_log_level()); 4526 if (ret < 0) { 4527 goto out; 4528 } 4529 4530 4531 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 4532 if (ret < 0) { 4533 goto out; 4534 } 4535 4536 ret = vfio_user_register_accept_poller(endpoint); 4537 4538 if (ret != 0) { 4539 goto out; 4540 } 4541 4542 pthread_mutex_lock(&vu_transport->lock); 4543 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 4544 pthread_mutex_unlock(&vu_transport->lock); 4545 4546 out: 4547 if (ret != 0) { 4548 nvmf_vfio_user_destroy_endpoint(endpoint); 4549 } 4550 4551 return ret; 4552 } 4553 4554 static void 4555 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 4556 const struct spdk_nvme_transport_id *trid) 4557 { 4558 struct nvmf_vfio_user_transport *vu_transport; 4559 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4560 4561 assert(trid != 
NULL); 4562 assert(trid->traddr != NULL); 4563 4564 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 4565 4566 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4567 transport); 4568 4569 pthread_mutex_lock(&vu_transport->lock); 4570 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4571 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 4572 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 4573 /* Defer freeing endpoint resources until the controller 4574 * is freed. There are two cases that reach here: 4575 * 1. the nvmf target is killed while a VM is connected 4576 * 2. the listener is removed via an RPC call 4577 * In both cases the nvmf library will disconnect all queue pairs. 4578 */ 4579 if (endpoint->ctrlr) { 4580 assert(!endpoint->need_async_destroy); 4581 endpoint->need_async_destroy = true; 4582 pthread_mutex_unlock(&vu_transport->lock); 4583 return; 4584 } 4585 4586 nvmf_vfio_user_destroy_endpoint(endpoint); 4587 pthread_mutex_unlock(&vu_transport->lock); 4588 return; 4589 } 4590 } 4591 pthread_mutex_unlock(&vu_transport->lock); 4592 4593 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4594 } 4595 4596 static void 4597 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4598 struct spdk_nvmf_subsystem *subsystem, 4599 struct spdk_nvmf_ctrlr_data *cdata) 4600 { 4601 struct nvmf_vfio_user_transport *vu_transport; 4602 4603 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4604 4605 cdata->vid = SPDK_PCI_VID_NUTANIX; 4606 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4607 cdata->ieee[0] = 0x8d; 4608 cdata->ieee[1] = 0x6b; 4609 cdata->ieee[2] = 0x50; 4610 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4611 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4612 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4613 /* libvfio-user can only support 1 connection for now */ 4614 cdata->oncs.reservations = 0; 4615 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4616 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4617 } 4618 4619 static int 4620 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4621 const struct spdk_nvmf_subsystem *subsystem, 4622 const struct spdk_nvme_transport_id *trid) 4623 { 4624 struct nvmf_vfio_user_transport *vu_transport; 4625 struct nvmf_vfio_user_endpoint *endpoint; 4626 4627 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4628 4629 pthread_mutex_lock(&vu_transport->lock); 4630 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4631 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4632 break; 4633 } 4634 } 4635 pthread_mutex_unlock(&vu_transport->lock); 4636 4637 if (endpoint == NULL) { 4638 return -ENOENT; 4639 } 4640 4641 /* Drop const - we will later need to pause/unpause. */ 4642 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4643 4644 return 0; 4645 } 4646 4647 /* 4648 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4649 * frequency. 4650 * 4651 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4652 * if we don't currently have a controller set up, peek to see if the socket is 4653 * able to accept a new connection.
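 * On a successful attach the controller is created and this poller, along with its interrupt source, is unregistered; the established connection is then serviced via vfu_run_ctx() from the controller's own poller.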
4654 */ 4655 static int 4656 nvmf_vfio_user_accept(void *ctx) 4657 { 4658 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4659 struct nvmf_vfio_user_transport *vu_transport; 4660 int err; 4661 4662 vu_transport = endpoint->transport; 4663 4664 if (endpoint->ctrlr != NULL) { 4665 return SPDK_POLLER_IDLE; 4666 } 4667 4668 /* If we get here, the controller has already been destroyed, but the 4669 * subsystem may still be in RESUMING state; wait 4670 * until the subsystem reaches the RUNNING state. 4671 */ 4672 if (endpoint->need_resume) { 4673 return SPDK_POLLER_IDLE; 4674 } 4675 4676 err = vfu_attach_ctx(endpoint->vfu_ctx); 4677 if (err == 0) { 4678 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4679 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4680 if (err == 0) { 4681 /* 4682 * Unregister ourselves: now we've accepted a 4683 * connection, there is nothing for us to poll for, and 4684 * we will poll the connection via vfu_run_ctx() 4685 * instead. 4686 */ 4687 spdk_interrupt_unregister(&endpoint->accept_intr); 4688 spdk_poller_unregister(&endpoint->accept_poller); 4689 } 4690 return SPDK_POLLER_BUSY; 4691 } 4692 4693 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4694 return SPDK_POLLER_IDLE; 4695 } 4696 4697 return SPDK_POLLER_BUSY; 4698 } 4699 4700 static void 4701 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4702 struct spdk_nvme_transport_id *trid, 4703 struct spdk_nvmf_discovery_log_page_entry *entry) 4704 { } 4705 4706 static int vfio_user_poll_group_intr(void *ctx); 4707 4708 static void 4709 vfio_user_poll_group_add_intr(struct nvmf_vfio_user_poll_group *vu_group, 4710 struct spdk_nvmf_poll_group *group) 4711 { 4712 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK); 4713 assert(vu_group->intr_fd != -1); 4714 4715 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd, 4716 vfio_user_poll_group_intr, vu_group); 4717 assert(vu_group->intr != NULL); 4718 } 4719 4720 static struct spdk_nvmf_transport_poll_group * 4721 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4722 struct spdk_nvmf_poll_group *group) 4723 { 4724 struct nvmf_vfio_user_transport *vu_transport; 4725 struct nvmf_vfio_user_poll_group *vu_group; 4726 4727 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4728 transport); 4729 4730 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4731 4732 vu_group = calloc(1, sizeof(*vu_group)); 4733 if (vu_group == NULL) { 4734 SPDK_ERRLOG("Error allocating poll group: %m"); 4735 return NULL; 4736 } 4737 4738 if (in_interrupt_mode(vu_transport)) { 4739 vfio_user_poll_group_add_intr(vu_group, group); 4740 } 4741 4742 TAILQ_INIT(&vu_group->sqs); 4743 4744 pthread_mutex_lock(&vu_transport->pg_lock); 4745 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4746 if (vu_transport->next_pg == NULL) { 4747 vu_transport->next_pg = vu_group; 4748 } 4749 pthread_mutex_unlock(&vu_transport->pg_lock); 4750 4751 return &vu_group->group; 4752 } 4753 4754 static struct spdk_nvmf_transport_poll_group * 4755 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4756 { 4757 struct nvmf_vfio_user_transport *vu_transport; 4758 struct nvmf_vfio_user_poll_group **vu_group; 4759 struct nvmf_vfio_user_sq *sq; 4760 struct nvmf_vfio_user_cq *cq; 4761 4762 struct spdk_nvmf_transport_poll_group *result = NULL; 4763 4764 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4765 cq = sq->ctrlr->cqs[sq->cqid]; 4766 assert(cq != NULL); 4767 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct
nvmf_vfio_user_transport, transport); 4768 4769 pthread_mutex_lock(&vu_transport->pg_lock); 4770 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4771 goto out; 4772 } 4773 4774 if (!nvmf_qpair_is_admin_queue(qpair)) { 4775 /* 4776 * If this is shared IO CQ case, just return the used CQ's poll 4777 * group, so I/O completions don't have to use 4778 * spdk_thread_send_msg(). 4779 */ 4780 if (cq->group != NULL) { 4781 result = cq->group; 4782 goto out; 4783 } 4784 4785 /* 4786 * If we're in interrupt mode, align all qpairs for a controller 4787 * on the same poll group by default, unless requested. This can 4788 * be lower in performance than running on a single poll group, 4789 * so we disable spreading by default. 4790 */ 4791 if (in_interrupt_mode(vu_transport) && 4792 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) { 4793 result = sq->ctrlr->sqs[0]->group; 4794 goto out; 4795 } 4796 4797 } 4798 4799 vu_group = &vu_transport->next_pg; 4800 assert(*vu_group != NULL); 4801 4802 result = &(*vu_group)->group; 4803 *vu_group = TAILQ_NEXT(*vu_group, link); 4804 if (*vu_group == NULL) { 4805 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4806 } 4807 4808 out: 4809 if (cq->group == NULL) { 4810 cq->group = result; 4811 } 4812 4813 pthread_mutex_unlock(&vu_transport->pg_lock); 4814 return result; 4815 } 4816 4817 static void 4818 vfio_user_poll_group_del_intr(struct nvmf_vfio_user_poll_group *vu_group) 4819 { 4820 assert(vu_group->intr_fd != -1); 4821 4822 spdk_interrupt_unregister(&vu_group->intr); 4823 4824 close(vu_group->intr_fd); 4825 vu_group->intr_fd = -1; 4826 } 4827 4828 /* called when process exits */ 4829 static void 4830 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4831 { 4832 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4833 struct nvmf_vfio_user_transport *vu_transport; 4834 4835 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4836 4837 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4838 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4839 transport); 4840 4841 if (in_interrupt_mode(vu_transport)) { 4842 vfio_user_poll_group_del_intr(vu_group); 4843 } 4844 4845 pthread_mutex_lock(&vu_transport->pg_lock); 4846 next_tgroup = TAILQ_NEXT(vu_group, link); 4847 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4848 if (next_tgroup == NULL) { 4849 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4850 } 4851 if (vu_transport->next_pg == vu_group) { 4852 vu_transport->next_pg = next_tgroup; 4853 } 4854 pthread_mutex_unlock(&vu_transport->pg_lock); 4855 4856 free(vu_group); 4857 } 4858 4859 static void 4860 _vfio_user_qpair_disconnect(void *ctx) 4861 { 4862 struct nvmf_vfio_user_sq *sq = ctx; 4863 4864 spdk_nvmf_qpair_disconnect(&sq->qpair); 4865 } 4866 4867 /* The function is used when socket connection is destroyed */ 4868 static int 4869 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4870 { 4871 struct nvmf_vfio_user_sq *sq; 4872 struct nvmf_vfio_user_endpoint *endpoint; 4873 4874 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4875 4876 endpoint = ctrlr->endpoint; 4877 assert(endpoint != NULL); 4878 4879 pthread_mutex_lock(&endpoint->lock); 4880 endpoint->need_relisten = true; 4881 ctrlr->disconnect = true; 4882 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4883 endpoint->ctrlr = NULL; 4884 free_ctrlr(ctrlr); 4885 pthread_mutex_unlock(&endpoint->lock); 4886 return 0; 4887 } 4888 4889 TAILQ_FOREACH(sq, 
&ctrlr->connected_sqs, tailq) { 4890 /* add another round thread poll to avoid recursive endpoint lock */ 4891 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4892 } 4893 pthread_mutex_unlock(&endpoint->lock); 4894 4895 return 0; 4896 } 4897 4898 /* 4899 * Poll for and process any incoming vfio-user messages. 4900 */ 4901 static int 4902 vfio_user_poll_vfu_ctx(void *ctx) 4903 { 4904 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4905 int ret; 4906 4907 assert(ctrlr != NULL); 4908 4909 /* This will call access_bar0_fn() if there are any writes 4910 * to the portion of the BAR that is not mmap'd */ 4911 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4912 if (spdk_unlikely(ret == -1)) { 4913 if (errno == EBUSY) { 4914 return SPDK_POLLER_IDLE; 4915 } 4916 4917 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4918 4919 /* 4920 * We lost the client; the reset callback will already have 4921 * unregistered the interrupt. 4922 */ 4923 if (errno == ENOTCONN) { 4924 vfio_user_destroy_ctrlr(ctrlr); 4925 return SPDK_POLLER_BUSY; 4926 } 4927 4928 /* 4929 * We might not have got a reset callback in this case, so 4930 * explicitly unregister the interrupt here. 4931 */ 4932 spdk_interrupt_unregister(&ctrlr->intr); 4933 ctrlr->intr_fd = -1; 4934 fail_ctrlr(ctrlr); 4935 } 4936 4937 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4938 } 4939 4940 struct vfio_user_post_cpl_ctx { 4941 struct nvmf_vfio_user_ctrlr *ctrlr; 4942 struct nvmf_vfio_user_cq *cq; 4943 struct spdk_nvme_cpl cpl; 4944 }; 4945 4946 static void 4947 _post_completion_msg(void *ctx) 4948 { 4949 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4950 4951 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4952 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4953 free(cpl_ctx); 4954 } 4955 4956 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4957 4958 static int 4959 vfio_user_poll_group_process(void *ctx) 4960 { 4961 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4962 int ret = 0; 4963 4964 SPDK_DEBUGLOG(vfio_user_db, "pg:%p got intr\n", vu_group); 4965 4966 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4967 4968 /* 4969 * Re-arm the event indexes. NB: this also could rearm other 4970 * controller's SQs. 4971 */ 4972 ret |= vfio_user_poll_group_rearm(vu_group); 4973 4974 vu_group->stats.pg_process_count++; 4975 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4976 } 4977 4978 static int 4979 vfio_user_poll_group_intr(void *ctx) 4980 { 4981 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4982 eventfd_t val; 4983 4984 eventfd_read(vu_group->intr_fd, &val); 4985 4986 vu_group->stats.intr++; 4987 4988 return vfio_user_poll_group_process(ctx); 4989 } 4990 4991 /* 4992 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 4993 * the SQs assigned to our own poll group. Other poll groups are handled via 4994 * vfio_user_poll_group_intr(). 4995 */ 4996 static int 4997 vfio_user_ctrlr_intr(void *ctx) 4998 { 4999 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 5000 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 5001 struct nvmf_vfio_user_poll_group *vu_group; 5002 int ret = SPDK_POLLER_IDLE; 5003 5004 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 5005 5006 SPDK_DEBUGLOG(vfio_user_db, "ctrlr pg:%p got intr\n", vu_ctrlr_group); 5007 5008 vu_ctrlr_group->stats.ctrlr_intr++; 5009 5010 /* 5011 * Poll vfio-user for this controller. 
We need to do this before polling 5012 * any SQs, as this is where doorbell writes may be handled. 5013 */ 5014 ret = vfio_user_poll_vfu_ctx(vu_ctrlr); 5015 5016 /* 5017 * `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, 5018 * just return for this case. 5019 */ 5020 if (vu_ctrlr->sqs[0] == NULL) { 5021 return ret; 5022 } 5023 5024 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) { 5025 /* 5026 * We may have just written to a doorbell owned by another 5027 * reactor: we need to prod them to make sure its SQs are polled 5028 * *after* the doorbell value is updated. 5029 */ 5030 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) { 5031 if (vu_group != vu_ctrlr_group) { 5032 SPDK_DEBUGLOG(vfio_user_db, "prodding pg:%p\n", vu_group); 5033 eventfd_write(vu_group->intr_fd, 1); 5034 } 5035 } 5036 } 5037 5038 ret |= vfio_user_poll_group_process(vu_ctrlr_group); 5039 5040 return ret; 5041 } 5042 5043 static void 5044 vfio_user_ctrlr_set_intr_mode(struct spdk_poller *poller, void *ctx, 5045 bool interrupt_mode) 5046 { 5047 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 5048 assert(ctrlr != NULL); 5049 assert(ctrlr->endpoint != NULL); 5050 5051 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 5052 ctrlr_id(ctrlr), interrupt_mode); 5053 5054 /* 5055 * interrupt_mode needs to persist across controller resets, so store 5056 * it in the endpoint instead. 5057 */ 5058 ctrlr->endpoint->interrupt_mode = interrupt_mode; 5059 5060 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 5061 } 5062 5063 /* 5064 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 5065 * set up and we can start operating on this controller. 5066 */ 5067 static void 5068 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 5069 struct spdk_nvmf_ctrlr *ctrlr) 5070 { 5071 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 5072 5073 vu_ctrlr->ctrlr = ctrlr; 5074 vu_ctrlr->cntlid = ctrlr->cntlid; 5075 vu_ctrlr->thread = spdk_get_thread(); 5076 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 5077 5078 if (!in_interrupt_mode(endpoint->transport)) { 5079 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5080 vu_ctrlr, 1000); 5081 return; 5082 } 5083 5084 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5085 vu_ctrlr, 0); 5086 5087 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 5088 assert(vu_ctrlr->intr_fd != -1); 5089 5090 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 5091 vfio_user_ctrlr_intr, vu_ctrlr); 5092 5093 assert(vu_ctrlr->intr != NULL); 5094 5095 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 5096 vfio_user_ctrlr_set_intr_mode, 5097 vu_ctrlr); 5098 } 5099 5100 static int 5101 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 5102 { 5103 struct nvmf_vfio_user_poll_group *vu_group; 5104 struct nvmf_vfio_user_sq *sq = cb_arg; 5105 struct nvmf_vfio_user_cq *admin_cq; 5106 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5107 struct nvmf_vfio_user_endpoint *endpoint; 5108 5109 assert(sq != NULL); 5110 assert(req != NULL); 5111 5112 vu_ctrlr = sq->ctrlr; 5113 assert(vu_ctrlr != NULL); 5114 endpoint = vu_ctrlr->endpoint; 5115 assert(endpoint != NULL); 5116 5117 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 5118 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 5119 endpoint->ctrlr = NULL; 5120 free_ctrlr(vu_ctrlr); 5121 return -1; 5122 } 5123 5124 vu_group = 
SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 5125 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 5126 5127 admin_cq = vu_ctrlr->cqs[0]; 5128 assert(admin_cq != NULL); 5129 assert(admin_cq->group != NULL); 5130 assert(admin_cq->group->group->thread != NULL); 5131 5132 pthread_mutex_lock(&endpoint->lock); 5133 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 5134 assert(admin_cq->group->group->thread == spdk_get_thread()); 5135 /* 5136 * The admin queue is special as SQ0 and CQ0 are created 5137 * together. 5138 */ 5139 admin_cq->cq_ref = 1; 5140 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 5141 } else { 5142 /* For I/O queues this command was generated in response to an 5143 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 5144 * been completed. Complete it now. 5145 */ 5146 if (sq->post_create_io_sq_completion) { 5147 if (admin_cq->group->group->thread != spdk_get_thread()) { 5148 struct vfio_user_post_cpl_ctx *cpl_ctx; 5149 5150 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 5151 if (!cpl_ctx) { 5152 return -ENOMEM; 5153 } 5154 cpl_ctx->ctrlr = vu_ctrlr; 5155 cpl_ctx->cq = admin_cq; 5156 cpl_ctx->cpl.sqid = 0; 5157 cpl_ctx->cpl.cdw0 = 0; 5158 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 5159 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 5160 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5161 5162 spdk_thread_send_msg(admin_cq->group->group->thread, 5163 _post_completion_msg, 5164 cpl_ctx); 5165 } else { 5166 post_completion(vu_ctrlr, admin_cq, 0, 0, 5167 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 5168 } 5169 sq->post_create_io_sq_completion = false; 5170 } else if (in_interrupt_mode(endpoint->transport)) { 5171 /* 5172 * If we're live migrating a guest, there is a window 5173 * where the I/O queues haven't been set up but the 5174 * device is in running state, during which the guest 5175 * might write to a doorbell. This doorbell write will 5176 * go unnoticed, so let's poll the whole controller to 5177 * pick that up. 5178 */ 5179 ctrlr_kick(vu_ctrlr); 5180 } 5181 sq->sq_state = VFIO_USER_SQ_ACTIVE; 5182 } 5183 5184 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 5185 pthread_mutex_unlock(&endpoint->lock); 5186 5187 free(req->req.iov[0].iov_base); 5188 req->req.iov[0].iov_base = NULL; 5189 req->req.iovcnt = 0; 5190 5191 return 0; 5192 } 5193 5194 static void 5195 _nvmf_vfio_user_poll_group_add(void *req) 5196 { 5197 spdk_nvmf_request_exec(req); 5198 } 5199 5200 /* 5201 * Add the given qpair to the given poll group. New qpairs are added via 5202 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 5203 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 5204 * nvmf_transport_poll_group_add(). 
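 * The add is implemented by issuing an internal Fabrics CONNECT command on a preallocated vfio-user request, so the generic nvmf layer treats this queue pair like one created by any other transport.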
5205 */ 5206 static int 5207 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 5208 struct spdk_nvmf_qpair *qpair) 5209 { 5210 struct nvmf_vfio_user_sq *sq; 5211 struct nvmf_vfio_user_req *vu_req; 5212 struct nvmf_vfio_user_ctrlr *ctrlr; 5213 struct spdk_nvmf_request *req; 5214 struct spdk_nvmf_fabric_connect_data *data; 5215 bool admin; 5216 5217 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5218 sq->group = group; 5219 ctrlr = sq->ctrlr; 5220 5221 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 5222 ctrlr_id(ctrlr), sq->qpair.qid, 5223 sq, qpair, group); 5224 5225 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 5226 5227 vu_req = get_nvmf_vfio_user_req(sq); 5228 if (vu_req == NULL) { 5229 return -1; 5230 } 5231 5232 req = &vu_req->req; 5233 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 5234 req->cmd->connect_cmd.cid = 0; 5235 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 5236 req->cmd->connect_cmd.recfmt = 0; 5237 req->cmd->connect_cmd.sqsize = sq->size - 1; 5238 req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid; 5239 5240 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 5241 5242 data = calloc(1, req->length); 5243 if (data == NULL) { 5244 nvmf_vfio_user_req_free(req); 5245 return -ENOMEM; 5246 } 5247 5248 SPDK_IOV_ONE(req->iov, &req->iovcnt, data, req->length); 5249 5250 data->cntlid = ctrlr->cntlid; 5251 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5252 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5253 5254 vu_req->cb_fn = handle_queue_connect_rsp; 5255 vu_req->cb_arg = sq; 5256 5257 SPDK_DEBUGLOG(nvmf_vfio, 5258 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5259 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5260 5261 /* 5262 * By the time transport's poll_group_add() callback is executed, the 5263 * qpair isn't in the ACTIVE state yet, so spdk_nvmf_request_exec() 5264 * would fail. The state changes to ACTIVE immediately after the 5265 * callback finishes, so delay spdk_nvmf_request_exec() by sending a 5266 * message. 
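 * The message runs on this same thread once the callback has returned, by which point the qpair has moved to ACTIVE and the CONNECT command can be executed.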
5267 */ 5268 spdk_thread_send_msg(spdk_get_thread(), _nvmf_vfio_user_poll_group_add, req); 5269 return 0; 5270 } 5271 5272 static int 5273 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5274 struct spdk_nvmf_qpair *qpair) 5275 { 5276 struct nvmf_vfio_user_sq *sq; 5277 struct nvmf_vfio_user_poll_group *vu_group; 5278 5279 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5280 5281 SPDK_DEBUGLOG(nvmf_vfio, 5282 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5283 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5284 5285 5286 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5287 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5288 5289 return 0; 5290 } 5291 5292 static void 5293 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5294 { 5295 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5296 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5297 vu_req->iovcnt = 0; 5298 vu_req->req.iovcnt = 0; 5299 vu_req->req.length = 0; 5300 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5301 5302 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5303 } 5304 5305 static int 5306 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5307 { 5308 struct nvmf_vfio_user_sq *sq; 5309 struct nvmf_vfio_user_req *vu_req; 5310 5311 assert(req != NULL); 5312 5313 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5314 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5315 5316 _nvmf_vfio_user_req_free(sq, vu_req); 5317 5318 return 0; 5319 } 5320 5321 static int 5322 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5323 { 5324 struct nvmf_vfio_user_sq *sq; 5325 struct nvmf_vfio_user_req *vu_req; 5326 5327 assert(req != NULL); 5328 5329 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5330 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5331 5332 if (vu_req->cb_fn != NULL) { 5333 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5334 fail_ctrlr(sq->ctrlr); 5335 } 5336 } 5337 5338 _nvmf_vfio_user_req_free(sq, vu_req); 5339 5340 return 0; 5341 } 5342 5343 static void 5344 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5345 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5346 { 5347 struct nvmf_vfio_user_sq *sq; 5348 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5349 struct nvmf_vfio_user_endpoint *endpoint; 5350 struct vfio_user_delete_sq_ctx *del_ctx; 5351 5352 assert(qpair != NULL); 5353 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5354 vu_ctrlr = sq->ctrlr; 5355 endpoint = vu_ctrlr->endpoint; 5356 del_ctx = sq->delete_ctx; 5357 sq->delete_ctx = NULL; 5358 5359 pthread_mutex_lock(&endpoint->lock); 5360 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5361 delete_sq_done(vu_ctrlr, sq); 5362 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5363 endpoint->ctrlr = NULL; 5364 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5365 /* The controller will be freed, we can resume the subsystem 5366 * now so that the endpoint can be ready to accept another 5367 * new connection. 5368 */ 5369 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5370 vfio_user_endpoint_resume_done, endpoint); 5371 } 5372 free_ctrlr(vu_ctrlr); 5373 } 5374 pthread_mutex_unlock(&endpoint->lock); 5375 5376 if (del_ctx) { 5377 vfio_user_qpair_delete_cb(del_ctx); 5378 } 5379 5380 if (cb_fn) { 5381 cb_fn(cb_arg); 5382 } 5383 } 5384 5385 /** 5386 * Returns a preallocated request, or NULL if there isn't one available. 
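 * Requests are recycled through sq->free_reqs: they are removed from the list here and returned to it by _nvmf_vfio_user_req_free().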
5387 */ 5388 static struct nvmf_vfio_user_req * 5389 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5390 { 5391 struct nvmf_vfio_user_req *req; 5392 5393 if (sq == NULL) { 5394 return NULL; 5395 } 5396 5397 req = TAILQ_FIRST(&sq->free_reqs); 5398 if (req == NULL) { 5399 return NULL; 5400 } 5401 5402 TAILQ_REMOVE(&sq->free_reqs, req, link); 5403 5404 return req; 5405 } 5406 5407 static int 5408 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5409 { 5410 uint16_t nr; 5411 uint32_t nlb, nsid; 5412 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5413 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5414 struct spdk_nvmf_ns *ns; 5415 5416 nsid = cmd->nsid; 5417 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5418 if (ns == NULL || ns->bdev == NULL) { 5419 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5420 return -EINVAL; 5421 } 5422 5423 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5424 nr = cmd->cdw10_bits.dsm.nr + 1; 5425 return nr * sizeof(struct spdk_nvme_dsm_range); 5426 } 5427 5428 if (cmd->opc == SPDK_NVME_OPC_COPY) { 5429 nr = (cmd->cdw12 & 0x000000ffu) + 1; 5430 return nr * sizeof(struct spdk_nvme_scc_source_range); 5431 } 5432 5433 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5434 return nlb * spdk_bdev_get_block_size(ns->bdev); 5435 } 5436 5437 static int 5438 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5439 { 5440 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5441 uint32_t len = 0, numdw = 0; 5442 uint8_t fid; 5443 int iovcnt; 5444 5445 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5446 5447 if (req->xfer == SPDK_NVME_DATA_NONE) { 5448 return 0; 5449 } 5450 5451 switch (cmd->opc) { 5452 case SPDK_NVME_OPC_IDENTIFY: 5453 len = 4096; 5454 break; 5455 case SPDK_NVME_OPC_GET_LOG_PAGE: 5456 numdw = ((((uint32_t)cmd->cdw11_bits.get_log_page.numdu << 16) | 5457 cmd->cdw10_bits.get_log_page.numdl) + 1); 5458 if (numdw > UINT32_MAX / 4) { 5459 return -EINVAL; 5460 } 5461 len = numdw * 4; 5462 break; 5463 case SPDK_NVME_OPC_GET_FEATURES: 5464 case SPDK_NVME_OPC_SET_FEATURES: 5465 fid = cmd->cdw10_bits.set_features.fid; 5466 switch (fid) { 5467 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5468 len = 4096; 5469 break; 5470 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5471 len = 256; 5472 break; 5473 case SPDK_NVME_FEAT_TIMESTAMP: 5474 len = 8; 5475 break; 5476 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5477 len = 512; 5478 break; 5479 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5480 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5481 len = 16; 5482 } else { 5483 len = 8; 5484 } 5485 break; 5486 default: 5487 return 0; 5488 } 5489 break; 5490 case SPDK_NVME_OPC_FABRIC: 5491 return -ENOTSUP; 5492 default: 5493 return 0; 5494 } 5495 5496 /* ADMIN command will not use SGL */ 5497 if (cmd->psdt != 0) { 5498 return -EINVAL; 5499 } 5500 5501 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5502 if (iovcnt < 0) { 5503 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5504 ctrlr_id(ctrlr), cmd->opc); 5505 return -1; 5506 } 5507 req->length = len; 5508 req->iovcnt = iovcnt; 5509 5510 return 0; 5511 } 5512 5513 /* 5514 * Map an I/O command's buffers. 5515 * 5516 * Returns 0 on success and -errno on failure. 
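 * The transfer length is derived from the command itself (DSM range count, Copy source-range count, or NLB multiplied by the namespace block size) before the buffers are mapped via vfio_user_map_cmd().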
5517 */ 5518 static int 5519 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5520 { 5521 int len, iovcnt; 5522 struct spdk_nvme_cmd *cmd; 5523 5524 assert(ctrlr != NULL); 5525 assert(req != NULL); 5526 5527 cmd = &req->cmd->nvme_cmd; 5528 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5529 5530 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5531 return 0; 5532 } 5533 5534 len = get_nvmf_io_req_length(req); 5535 if (len < 0) { 5536 return -EINVAL; 5537 } 5538 req->length = len; 5539 5540 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5541 if (iovcnt < 0) { 5542 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5543 return -EFAULT; 5544 } 5545 req->iovcnt = iovcnt; 5546 5547 return 0; 5548 } 5549 5550 static int 5551 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5552 struct nvmf_vfio_user_sq *sq) 5553 { 5554 int err; 5555 struct nvmf_vfio_user_req *vu_req; 5556 struct spdk_nvmf_request *req; 5557 5558 assert(ctrlr != NULL); 5559 assert(cmd != NULL); 5560 5561 vu_req = get_nvmf_vfio_user_req(sq); 5562 if (spdk_unlikely(vu_req == NULL)) { 5563 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5564 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5565 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5566 5567 } 5568 req = &vu_req->req; 5569 5570 assert(req->qpair != NULL); 5571 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5572 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5573 5574 vu_req->cb_fn = handle_cmd_rsp; 5575 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5576 req->cmd->nvme_cmd = *cmd; 5577 5578 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5579 err = map_admin_cmd_req(ctrlr, req); 5580 } else { 5581 switch (cmd->opc) { 5582 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5583 case SPDK_NVME_OPC_RESERVATION_REPORT: 5584 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5585 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5586 case SPDK_NVME_OPC_FABRIC: 5587 err = -ENOTSUP; 5588 break; 5589 default: 5590 err = map_io_cmd_req(ctrlr, req); 5591 break; 5592 } 5593 } 5594 5595 if (spdk_unlikely(err < 0)) { 5596 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5597 ctrlr_id(ctrlr), cmd->opc); 5598 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5599 req->rsp->nvme_cpl.status.sc = err == -ENOTSUP ? 5600 SPDK_NVME_SC_INVALID_OPCODE : 5601 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5602 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5603 _nvmf_vfio_user_req_free(sq, vu_req); 5604 return err; 5605 } 5606 5607 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5608 spdk_nvmf_request_exec(req); 5609 5610 return 0; 5611 } 5612 5613 /* 5614 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5615 * here: if the host isn't up to date, and is apparently not actively processing 5616 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5617 */ 5618 static void 5619 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5620 struct nvmf_vfio_user_sq *sq) 5621 { 5622 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5623 uint32_t cq_head; 5624 uint32_t cq_tail; 5625 5626 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5627 return; 5628 } 5629 5630 cq_tail = *cq_tailp(cq); 5631 5632 /* Already sent? 
*/ 5633 if (cq_tail == cq->last_trigger_irq_tail) { 5634 return; 5635 } 5636 5637 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5638 cq_head = *cq_dbl_headp(cq); 5639 5640 if (cq_head != cq_tail && cq_head == cq->last_head) { 5641 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5642 if (err != 0) { 5643 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5644 ctrlr_id(ctrlr)); 5645 } else { 5646 cq->last_trigger_irq_tail = cq_tail; 5647 } 5648 } 5649 5650 cq->last_head = cq_head; 5651 } 5652 5653 /* Returns the number of commands processed, or a negative value on error. */ 5654 static int 5655 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5656 { 5657 struct nvmf_vfio_user_ctrlr *ctrlr; 5658 uint32_t new_tail; 5659 int count = 0; 5660 5661 assert(sq != NULL); 5662 5663 ctrlr = sq->ctrlr; 5664 5665 /* 5666 * A quiesced, or migrating, controller should never process new 5667 * commands. 5668 */ 5669 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 5670 return SPDK_POLLER_IDLE; 5671 } 5672 5673 if (ctrlr->adaptive_irqs_enabled) { 5674 handle_suppressed_irq(ctrlr, sq); 5675 } 5676 5677 /* On aarch64 platforms, doorbell updates from the guest VM may not be seen 5678 * on the SPDK target side. This is caused by a memory type mismatch: the 5679 * guest maps the doorbells as device memory, while the SPDK target treats 5680 * them as normal memory, which causes problems on ARM platforms. 5681 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5682 * Memory-aliasing-and-mismatched-memory-types". 5683 * Using spdk_mb() alone cannot fix this; invalidating the cache 5684 * with "dc civac" may, which is why spdk_ivdt_dcache() is called 5685 * below. 5686 */ 5687 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5688 5689 /* Load-Acquire. */ 5690 new_tail = *sq_dbl_tailp(sq); 5691 5692 new_tail = new_tail & 0xffffu; 5693 if (spdk_unlikely(new_tail >= sq->size)) { 5694 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5695 new_tail); 5696 spdk_nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE); 5697 5698 return -1; 5699 } 5700 5701 if (*sq_headp(sq) == new_tail) { 5702 return 0; 5703 } 5704 5705 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5706 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5707 if (ctrlr->sdbl != NULL) { 5708 SPDK_DEBUGLOG(nvmf_vfio, 5709 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5710 ctrlr_id(ctrlr), sq->qid, 5711 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5712 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5713 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5714 } 5715 5716 /* 5717 * Ensure that changes to the queue are visible to us. 5718 * The host driver should write the queue first, do a wmb(), and then 5719 * update the SQ tail doorbell (their Store-Release). 5720 */ 5721 spdk_rmb(); 5722 5723 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5724 if (spdk_unlikely(count < 0)) { 5725 fail_ctrlr(ctrlr); 5726 } 5727 5728 return count; 5729 } 5730 5731 /* 5732 * vfio-user transport poll handler. Note that the library context is polled in 5733 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the 5734 * active SQs. 5735 * 5736 * Returns the number of commands processed, or a negative value on error.
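 * Per-group statistics (polls, poll_reqs, poll_reqs_squared and polls_spurious) are updated on every invocation.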
/* Returns the number of commands processed, or a negative value on error. */
static int
nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq)
{
        struct nvmf_vfio_user_ctrlr *ctrlr;
        uint32_t new_tail;
        int count = 0;

        assert(sq != NULL);

        ctrlr = sq->ctrlr;

        /*
         * A quiesced, or migrating, controller should never process new
         * commands.
         */
        if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
                return SPDK_POLLER_IDLE;
        }

        if (ctrlr->adaptive_irqs_enabled) {
                handle_suppressed_irq(ctrlr, sq);
        }

        /*
         * On aarch64 platforms, a doorbell update from the guest VM may not be
         * visible on the SPDK target side because of a memory type mismatch:
         * the guest maps the doorbells as device memory, while the SPDK target
         * treats them as normal memory. See
         * "https://developer.arm.com/documentation/102376/0100/
         * Memory-aliasing-and-mismatched-memory-types". A plain spdk_mb() is
         * not sufficient here; the cache line must also be invalidated
         * (e.g. with "dc civac").
         */
        spdk_ivdt_dcache(sq_dbl_tailp(sq));

        /* Load-Acquire. */
        new_tail = *sq_dbl_tailp(sq);

        new_tail = new_tail & 0xffffu;
        if (spdk_unlikely(new_tail >= sq->size)) {
                SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid,
                              new_tail);
                spdk_nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE);

                return -1;
        }

        if (*sq_headp(sq) == new_tail) {
                return 0;
        }

        SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n",
                      ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail);
        if (ctrlr->sdbl != NULL) {
                SPDK_DEBUGLOG(nvmf_vfio,
                              "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n",
                              ctrlr_id(ctrlr), sq->qid,
                              ctrlr->bar0_doorbells[queue_index(sq->qid, false)],
                              ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)],
                              ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]);
        }

        /*
         * Ensure that changes to the queue are visible to us.
         * The host driver should write the queue first, do a wmb(), and then
         * update the SQ tail doorbell (their Store-Release).
         */
        spdk_rmb();

        count = handle_sq_tdbl_write(ctrlr, new_tail, sq);
        if (spdk_unlikely(count < 0)) {
                fail_ctrlr(ctrlr);
        }

        return count;
}

/*
 * vfio-user transport poll handler. Note that the library context is polled in
 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
 * active SQs.
 *
 * Returns the number of commands processed, or a negative value on error.
 */
static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
        struct nvmf_vfio_user_poll_group *vu_group;
        struct nvmf_vfio_user_sq *sq, *tmp;
        int count = 0;

        assert(group != NULL);

        vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

        SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n");

        TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) {
                int ret;

                if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
                        continue;
                }

                ret = nvmf_vfio_user_sq_poll(sq);

                if (spdk_unlikely(ret < 0)) {
                        return ret;
                }

                count += ret;
        }

        vu_group->stats.polls++;
        vu_group->stats.poll_reqs += count;
        vu_group->stats.poll_reqs_squared += count * count;
        if (count == 0) {
                vu_group->stats.polls_spurious++;
        }

        return count;
}

static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
                                    struct spdk_nvme_transport_id *trid)
{
        struct nvmf_vfio_user_sq *sq;
        struct nvmf_vfio_user_ctrlr *ctrlr;

        sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
        ctrlr = sq->ctrlr;

        memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
        return 0;
}

static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
                                   struct spdk_nvme_transport_id *trid)
{
        return 0;
}

static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
                                     struct spdk_nvme_transport_id *trid)
{
        struct nvmf_vfio_user_sq *sq;
        struct nvmf_vfio_user_ctrlr *ctrlr;

        sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
        ctrlr = sq->ctrlr;

        memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
        return 0;
}

static void
nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
                                   struct spdk_nvmf_request *req)
{
        struct spdk_nvmf_request *req_to_abort = NULL;
        struct spdk_nvmf_request *temp_req = NULL;
        uint16_t cid;

        cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;

        TAILQ_FOREACH(temp_req, &qpair->outstanding, link) {
                struct nvmf_vfio_user_req *vu_req;

                vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req);

                if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
                        req_to_abort = temp_req;
                        break;
                }
        }

        if (req_to_abort == NULL) {
                spdk_nvmf_request_complete(req);
                return;
        }

        req->req_to_abort = req_to_abort;
        nvmf_ctrlr_abort_request(req);
}

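/*
 * Note on the statistics dumped below (derivation added for clarity): with
 * n = polls, S1 = poll_reqs (sum of per-poll request counts) and
 * S2 = poll_reqs_squared (sum of their squares), the sample variance of
 * requests handled per poll is (n * S2 - S1 * S1) / (n * (n - 1)).
 * "poll_reqs_variance" below is the square root of that quantity (i.e. a
 * standard deviation), computed with integer division before sqrt().
 */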
static void
nvmf_vfio_user_poll_group_dump_stat(struct spdk_nvmf_transport_poll_group *group,
                                    struct spdk_json_write_ctx *w)
{
        struct nvmf_vfio_user_poll_group *vu_group = SPDK_CONTAINEROF(group,
                        struct nvmf_vfio_user_poll_group, group);
        uint64_t polls_denom;

        spdk_json_write_named_uint64(w, "ctrlr_intr", vu_group->stats.ctrlr_intr);
        spdk_json_write_named_uint64(w, "ctrlr_kicks", vu_group->stats.ctrlr_kicks);
        spdk_json_write_named_uint64(w, "won", vu_group->stats.won);
        spdk_json_write_named_uint64(w, "lost", vu_group->stats.lost);
        spdk_json_write_named_uint64(w, "lost_count", vu_group->stats.lost_count);
        spdk_json_write_named_uint64(w, "rearms", vu_group->stats.rearms);
        spdk_json_write_named_uint64(w, "pg_process_count", vu_group->stats.pg_process_count);
        spdk_json_write_named_uint64(w, "intr", vu_group->stats.intr);
        spdk_json_write_named_uint64(w, "polls", vu_group->stats.polls);
        spdk_json_write_named_uint64(w, "polls_spurious", vu_group->stats.polls_spurious);
        spdk_json_write_named_uint64(w, "poll_reqs", vu_group->stats.poll_reqs);
        polls_denom = vu_group->stats.polls * (vu_group->stats.polls - 1);
        if (polls_denom) {
                uint64_t n = vu_group->stats.polls * vu_group->stats.poll_reqs_squared -
                             vu_group->stats.poll_reqs * vu_group->stats.poll_reqs;
                spdk_json_write_named_double(w, "poll_reqs_variance", sqrt(n / polls_denom));
        }

        spdk_json_write_named_uint64(w, "cqh_admin_writes", vu_group->stats.cqh_admin_writes);
        spdk_json_write_named_uint64(w, "cqh_io_writes", vu_group->stats.cqh_io_writes);
}

static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
        opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
        opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
        opts->in_capsule_data_size = 0;
        opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
        opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
        opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
        opts->num_shared_buffers = 0;
        opts->buf_cache_size = 0;
        opts->association_timeout = 0;
        opts->transport_specific = NULL;
}

const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
        .name = "VFIOUSER",
        .type = SPDK_NVME_TRANSPORT_VFIOUSER,
        .opts_init = nvmf_vfio_user_opts_init,
        .create = nvmf_vfio_user_create,
        .destroy = nvmf_vfio_user_destroy,

        .listen = nvmf_vfio_user_listen,
        .stop_listen = nvmf_vfio_user_stop_listen,
        .cdata_init = nvmf_vfio_user_cdata_init,
        .listen_associate = nvmf_vfio_user_listen_associate,

        .listener_discover = nvmf_vfio_user_discover,

        .poll_group_create = nvmf_vfio_user_poll_group_create,
        .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group,
        .poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
        .poll_group_add = nvmf_vfio_user_poll_group_add,
        .poll_group_remove = nvmf_vfio_user_poll_group_remove,
        .poll_group_poll = nvmf_vfio_user_poll_group_poll,

        .req_free = nvmf_vfio_user_req_free,
        .req_complete = nvmf_vfio_user_req_complete,

        .qpair_fini = nvmf_vfio_user_close_qpair,
        .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
        .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
        .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
        .qpair_abort_request = nvmf_vfio_user_qpair_abort_request,

        .poll_group_dump_stat = nvmf_vfio_user_poll_group_dump_stat,
};

SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
SPDK_LOG_REGISTER_COMPONENT(vfio_user_db)
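/*
 * Usage note (illustrative; consult the SPDK vfio-user documentation for the
 * authoritative steps): the transport registered above is selected by name,
 * e.g. "rpc.py nvmf_create_transport -t VFIOUSER". The debug log components
 * registered here can be enabled in debug builds, e.g. with
 * "rpc.py log_set_flag nvmf_vfio" or the -L command-line option.
 */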