/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2020 Intel Corporation.
 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved.
 * Copyright (c) 2022, 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

/*
 * NVMe over vfio-user transport
 */

#include <sys/param.h>

#include <vfio-user/libvfio-user.h>
#include <vfio-user/pci_defs.h>

#include "spdk/barrier.h"
#include "spdk/stdinc.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf_transport.h"
#include "spdk/sock.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"

#include "transport.h"

#include "nvmf_internal.h"

#define SWAP(x, y)                  \
	do                          \
	{                           \
		typeof(x) _tmp = x; \
		x = y;              \
		y = _tmp;           \
	} while (0)

#define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
#define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
#define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
#define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE

#define NVME_DOORBELLS_OFFSET 0x1000
#define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2
#define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3
#define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX

#define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 512
#define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4)

/* NVMe spec 1.4, section 5.21.1.7 */
SPDK_STATIC_ASSERT(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR >= 2 &&
		   NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR <= SPDK_NVME_MAX_IO_QUEUES,
		   "bad number of queues");

/*
 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space
 * available on PCI-X 2.0 and PCI Express buses
 */
#define NVME_REG_CFG_SIZE 0x1000

/*
 * Doorbells must be page aligned so that they can be memory mapped.
 *
 * TODO does the NVMe spec also require this? Document it.
 */
#define NVMF_VFIO_USER_DOORBELLS_SIZE \
	SPDK_ALIGN_CEIL( \
		(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2 * SPDK_NVME_DOORBELL_REGISTER_SIZE), \
		0x1000)
#define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE)

/*
 * TODO check the PCI spec whether BAR4 and BAR5 really have to be at least one
 * page and a multiple of page size (maybe QEMU also needs this?). Document all
 * this.
 */

#define NVMF_VFIO_USER_MSIX_NUM MAX(CHAR_BIT, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR)

#define NVMF_VFIO_USER_MSIX_TABLE_BIR (4)
#define NVMF_VFIO_USER_BAR4_SIZE SPDK_ALIGN_CEIL((NVMF_VFIO_USER_MSIX_NUM * 16), 0x1000)
SPDK_STATIC_ASSERT(NVMF_VFIO_USER_BAR4_SIZE > 0, "Incorrect size");

/*
 * TODO according to the PCI spec we need one bit per vector, document the
 * relevant section.
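 *
 * For illustration, with the defaults above (512 queue pairs) and the 4-byte
 * doorbell registers defined by the NVMe spec, the sizes work out as:
 *
 *   doorbell region: 512 queues * 2 doorbells * 4 bytes = 4096 bytes
 *   BAR0:            0x1000 of registers + 0x1000 of doorbells = 0x2000
 *   MSI-X table:     512 vectors * 16 bytes = 8192 bytes (BAR4, already a
 *                    multiple of the page size)
 *   MSI-X PBA:       512 bits = 64 bytes, padded to one 4 KiB page (BAR5)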
88 */ 89 #define NVMF_VFIO_USER_MSIX_PBA_BIR (5) 90 #define NVMF_VFIO_USER_BAR5_SIZE SPDK_ALIGN_CEIL((NVMF_VFIO_USER_MSIX_NUM / CHAR_BIT), 0x1000) 91 SPDK_STATIC_ASSERT(NVMF_VFIO_USER_BAR5_SIZE > 0, "Incorrect size"); 92 struct nvmf_vfio_user_req; 93 94 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 95 96 /* 1 more for PRP2 list itself */ 97 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 98 99 enum nvmf_vfio_user_req_state { 100 VFIO_USER_REQUEST_STATE_FREE = 0, 101 VFIO_USER_REQUEST_STATE_EXECUTING, 102 }; 103 104 /* 105 * Support for live migration in NVMf/vfio-user: live migration is implemented 106 * by stopping the NVMf subsystem when the device is instructed to enter the 107 * stop-and-copy state and then trivially, and most importantly safely, 108 * collecting migration state and providing it to the vfio-user client. We 109 * don't provide any migration state at the pre-copy state as that's too 110 * complicated to do, we might support this in the future. 111 */ 112 113 114 /* NVMe device state representation */ 115 struct nvme_migr_sq_state { 116 uint16_t sqid; 117 uint16_t cqid; 118 uint32_t head; 119 uint32_t size; 120 uint32_t reserved; 121 uint64_t dma_addr; 122 }; 123 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 124 125 struct nvme_migr_cq_state { 126 uint16_t cqid; 127 uint16_t phase; 128 uint32_t tail; 129 uint32_t size; 130 uint32_t iv; 131 uint32_t ien; 132 uint32_t reserved; 133 uint64_t dma_addr; 134 }; 135 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 136 137 #define VFIO_USER_MIGR_CALLBACK_VERS 1 138 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 139 140 /* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned. 141 * 142 * NVMe device migration region is defined as below: 143 * ------------------------------------------------------------------------- 144 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs | 145 * ------------------------------------------------------------------------- 146 * 147 * Keep vfio_user_nvme_migr_header as a fixed 0x1000 length, all new added fields 148 * can use the reserved space at the end of the data structure. 149 */ 150 struct vfio_user_nvme_migr_header { 151 /* Magic value to validate migration data */ 152 uint32_t magic; 153 /* Version to check the data is same from source to destination */ 154 uint32_t version; 155 156 /* The library uses this field to know how many fields in this 157 * structure are valid, starting at the beginning of this data 158 * structure. New added fields in future use `unused` memory 159 * spaces. 160 */ 161 uint32_t opts_size; 162 uint32_t reserved0; 163 164 /* BARs information */ 165 uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS]; 166 uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS]; 167 168 /* Queue pair start offset, starting at the beginning of this 169 * data structure. 170 */ 171 uint64_t qp_offset; 172 uint64_t qp_len; 173 174 /* Controller data structure */ 175 uint32_t num_io_queues; 176 uint32_t reserved1; 177 178 /* NVMf controller data offset and length if exist, starting at 179 * the beginning of this data structure. 180 */ 181 uint64_t nvmf_data_offset; 182 uint64_t nvmf_data_len; 183 184 /* 185 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA 186 * address. 187 */ 188 uint32_t sdbl; 189 190 /* Shadow doorbell DMA addresses. 
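	 * These are the guest-supplied PRP1/PRP2 of the Doorbell Buffer
	 * Config command, saved so that the destination can re-map the
	 * shadow doorbell and eventidx pages after migration.
	 *
	 * Overall, the migration region mirrors struct
	 * vfio_user_nvme_migr_state defined below: this fixed 0x1000-byte
	 * header at offset 0, then the NVMf controller data, then the
	 * queue-pair array, then the saved BAR0 doorbells and PCI config
	 * space, with the *_offset/*_len fields above recording where each
	 * piece landed.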
*/ 191 uint64_t shadow_doorbell_buffer; 192 uint64_t eventidx_buffer; 193 194 /* Reserved memory space for new added fields, the 195 * field is always at the end of this data structure. 196 */ 197 uint8_t unused[3856]; 198 }; 199 SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size"); 200 201 struct vfio_user_nvme_migr_qp { 202 struct nvme_migr_sq_state sq; 203 struct nvme_migr_cq_state cq; 204 }; 205 206 /* NVMe state definition used to load/restore from/to NVMe migration BAR region */ 207 struct vfio_user_nvme_migr_state { 208 struct vfio_user_nvme_migr_header ctrlr_header; 209 struct spdk_nvmf_ctrlr_migr_data nvmf_data; 210 struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 211 uint8_t doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE]; 212 uint8_t cfg[NVME_REG_CFG_SIZE]; 213 }; 214 215 struct nvmf_vfio_user_req { 216 struct spdk_nvmf_request req; 217 struct spdk_nvme_cpl rsp; 218 struct spdk_nvme_cmd cmd; 219 220 enum nvmf_vfio_user_req_state state; 221 nvmf_vfio_user_req_cb_fn cb_fn; 222 void *cb_arg; 223 224 /* old CC before prop_set_cc fabric command */ 225 union spdk_nvme_cc_register cc; 226 227 TAILQ_ENTRY(nvmf_vfio_user_req) link; 228 229 struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS]; 230 uint8_t iovcnt; 231 232 /* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */ 233 uint8_t sg[]; 234 }; 235 236 #define MAP_R (0) 237 #define MAP_RW (1 << 0) 238 #define MAP_INITIALIZE (1 << 1) 239 #define MAP_QUIET (1 << 2) 240 241 /* 242 * Mapping of an NVMe queue. 243 * 244 * This holds the information tracking a local process mapping of an NVMe queue 245 * shared by the client. 246 */ 247 struct nvme_q_mapping { 248 /* iov of local process mapping. */ 249 struct iovec iov; 250 /* Stored sg, needed for unmap. */ 251 dma_sg_t *sg; 252 /* Client PRP of queue. */ 253 uint64_t prp1; 254 /* Total length in bytes. */ 255 uint64_t len; 256 }; 257 258 enum nvmf_vfio_user_sq_state { 259 VFIO_USER_SQ_UNUSED = 0, 260 VFIO_USER_SQ_CREATED, 261 VFIO_USER_SQ_DELETED, 262 VFIO_USER_SQ_ACTIVE, 263 VFIO_USER_SQ_INACTIVE 264 }; 265 266 enum nvmf_vfio_user_cq_state { 267 VFIO_USER_CQ_UNUSED = 0, 268 VFIO_USER_CQ_CREATED, 269 VFIO_USER_CQ_DELETED, 270 }; 271 272 enum nvmf_vfio_user_ctrlr_state { 273 VFIO_USER_CTRLR_CREATING = 0, 274 VFIO_USER_CTRLR_RUNNING, 275 /* Quiesce requested by libvfio-user */ 276 VFIO_USER_CTRLR_PAUSING, 277 /* NVMf subsystem is paused, it's safe to do PCI reset, memory register, 278 * memory unergister, and vfio migration state transition in this state. 279 */ 280 VFIO_USER_CTRLR_PAUSED, 281 /* 282 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI 283 * reset, memory register and unregister, controller in destination VM has 284 * been restored). NVMf subsystem resume has been requested. 285 */ 286 VFIO_USER_CTRLR_RESUMING, 287 /* 288 * Implies that the NVMf subsystem is paused. Both controller in source VM and 289 * destinatiom VM is in this state when doing live migration. 290 */ 291 VFIO_USER_CTRLR_MIGRATING 292 }; 293 294 struct nvmf_vfio_user_sq { 295 struct spdk_nvmf_qpair qpair; 296 struct spdk_nvmf_transport_poll_group *group; 297 struct nvmf_vfio_user_ctrlr *ctrlr; 298 299 uint32_t qid; 300 /* Number of entries in queue. */ 301 uint32_t size; 302 struct nvme_q_mapping mapping; 303 enum nvmf_vfio_user_sq_state sq_state; 304 305 uint32_t head; 306 volatile uint32_t *dbl_tailp; 307 308 /* Whether a shadow doorbell eventidx needs setting. 
*/ 309 bool need_rearm; 310 311 /* multiple SQs can be mapped to the same CQ */ 312 uint16_t cqid; 313 314 /* handle_queue_connect_rsp() can be used both for CREATE IO SQ response 315 * and SQ re-connect response in the destination VM, for the prior case, 316 * we will post a NVMe completion to VM, we will not set this flag when 317 * re-connecting SQs in the destination VM. 318 */ 319 bool post_create_io_sq_completion; 320 /* Copy of Create IO SQ command, this field is used together with 321 * `post_create_io_sq_completion` flag. 322 */ 323 struct spdk_nvme_cmd create_io_sq_cmd; 324 325 struct vfio_user_delete_sq_ctx *delete_ctx; 326 327 /* Currently unallocated reqs. */ 328 TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs; 329 /* Poll group entry */ 330 TAILQ_ENTRY(nvmf_vfio_user_sq) link; 331 /* Connected SQ entry */ 332 TAILQ_ENTRY(nvmf_vfio_user_sq) tailq; 333 }; 334 335 struct nvmf_vfio_user_cq { 336 struct spdk_nvmf_transport_poll_group *group; 337 int cq_ref; 338 339 uint32_t qid; 340 /* Number of entries in queue. */ 341 uint32_t size; 342 struct nvme_q_mapping mapping; 343 enum nvmf_vfio_user_cq_state cq_state; 344 345 uint32_t tail; 346 volatile uint32_t *dbl_headp; 347 348 bool phase; 349 350 uint16_t iv; 351 bool ien; 352 353 uint32_t last_head; 354 uint32_t last_trigger_irq_tail; 355 }; 356 357 struct nvmf_vfio_user_poll_group { 358 struct spdk_nvmf_transport_poll_group group; 359 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 360 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 361 struct spdk_interrupt *intr; 362 int intr_fd; 363 struct { 364 365 /* 366 * ctrlr_intr and ctrlr_kicks will be zero for all other poll 367 * groups. However, they can be zero even for the poll group 368 * the controller belongs are if no vfio-user message has been 369 * received or the controller hasn't been kicked yet. 370 */ 371 372 /* 373 * Number of times vfio_user_ctrlr_intr() has run: 374 * vfio-user file descriptor has been ready or explicitly 375 * kicked (see below). 376 */ 377 uint64_t ctrlr_intr; 378 379 /* 380 * Kicks to the controller by ctrlr_kick(). 381 * ctrlr_intr - ctrlr_kicks is the number of times the 382 * vfio-user poll file descriptor has been ready. 383 */ 384 uint64_t ctrlr_kicks; 385 386 /* 387 * How many times we won the race arming an SQ. 388 */ 389 uint64_t won; 390 391 /* 392 * How many times we lost the race arming an SQ 393 */ 394 uint64_t lost; 395 396 /* 397 * How many requests we processed in total each time we lost 398 * the rearm race. 399 */ 400 uint64_t lost_count; 401 402 /* 403 * Number of attempts we attempted to rearm all the SQs in the 404 * poll group. 405 */ 406 uint64_t rearms; 407 408 uint64_t pg_process_count; 409 uint64_t intr; 410 uint64_t polls; 411 uint64_t polls_spurious; 412 uint64_t poll_reqs; 413 uint64_t poll_reqs_squared; 414 uint64_t cqh_admin_writes; 415 uint64_t cqh_io_writes; 416 } stats; 417 }; 418 419 struct nvmf_vfio_user_shadow_doorbells { 420 volatile uint32_t *shadow_doorbells; 421 volatile uint32_t *eventidxs; 422 dma_sg_t *sgs; 423 struct iovec *iovs; 424 }; 425 426 struct nvmf_vfio_user_ctrlr { 427 struct nvmf_vfio_user_endpoint *endpoint; 428 struct nvmf_vfio_user_transport *transport; 429 430 /* Connected SQs list */ 431 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 432 enum nvmf_vfio_user_ctrlr_state state; 433 434 /* 435 * Tells whether live migration data have been prepared. This is used 436 * by the get_pending_bytes callback to tell whether or not the 437 * previous iteration finished. 
438 */ 439 bool migr_data_prepared; 440 441 /* Controller is in source VM when doing live migration */ 442 bool in_source_vm; 443 444 struct spdk_thread *thread; 445 struct spdk_poller *vfu_ctx_poller; 446 struct spdk_interrupt *intr; 447 int intr_fd; 448 449 bool queued_quiesce; 450 451 bool reset_shn; 452 bool disconnect; 453 454 uint16_t cntlid; 455 struct spdk_nvmf_ctrlr *ctrlr; 456 457 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 458 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 459 460 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 461 462 volatile uint32_t *bar0_doorbells; 463 struct nvmf_vfio_user_shadow_doorbells *sdbl; 464 /* 465 * Shadow doorbells PRPs to provide during the stop-and-copy state. 466 */ 467 uint64_t shadow_doorbell_buffer; 468 uint64_t eventidx_buffer; 469 470 bool adaptive_irqs_enabled; 471 }; 472 473 /* Endpoint in vfio-user is associated with a socket file, which 474 * is the representative of a PCI endpoint. 475 */ 476 struct nvmf_vfio_user_endpoint { 477 struct nvmf_vfio_user_transport *transport; 478 vfu_ctx_t *vfu_ctx; 479 struct spdk_poller *accept_poller; 480 struct spdk_thread *accept_thread; 481 bool interrupt_mode; 482 struct msixcap *msix; 483 vfu_pci_config_space_t *pci_config_space; 484 int devmem_fd; 485 int accept_intr_fd; 486 struct spdk_interrupt *accept_intr; 487 488 volatile uint32_t *bar0_doorbells; 489 490 int migr_fd; 491 void *migr_data; 492 493 struct spdk_nvme_transport_id trid; 494 struct spdk_nvmf_subsystem *subsystem; 495 496 /* Controller is associated with an active socket connection, 497 * the lifecycle of the controller is same as the VM. 498 * Currently we only support one active connection, as the NVMe 499 * specification defines, we may support multiple controllers in 500 * future, so that it can support e.g: RESERVATION. 501 */ 502 struct nvmf_vfio_user_ctrlr *ctrlr; 503 pthread_mutex_t lock; 504 505 bool need_async_destroy; 506 /* The subsystem is in PAUSED state and need to be resumed, TRUE 507 * only when migration is done successfully and the controller is 508 * in source VM. 509 */ 510 bool need_resume; 511 /* Start the accept poller again after destroying the controller */ 512 bool need_relisten; 513 514 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 515 }; 516 517 struct nvmf_vfio_user_transport_opts { 518 bool disable_mappable_bar0; 519 bool disable_adaptive_irq; 520 bool disable_shadow_doorbells; 521 bool disable_compare; 522 bool enable_intr_mode_sq_spreading; 523 }; 524 525 struct nvmf_vfio_user_transport { 526 struct spdk_nvmf_transport transport; 527 struct nvmf_vfio_user_transport_opts transport_opts; 528 bool intr_mode_supported; 529 pthread_mutex_t lock; 530 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 531 532 pthread_mutex_t pg_lock; 533 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 534 struct nvmf_vfio_user_poll_group *next_pg; 535 }; 536 537 /* 538 * function prototypes 539 */ 540 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 541 542 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 543 544 /* 545 * Local process virtual address of a queue. 
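 */

/*
 * Not used by the transport: a sketch of the standard NVMe doorbell layout
 * that queue_index() below relies on. Within BAR0, the doorbell register for
 * a queue lives at NVME_DOORBELLS_OFFSET + (2 * qid + is_cq) * (4 << CAP.DSTRD);
 * queue_index() assumes the 4-byte spacing implied by a doorbell stride of 0
 * (see doorbell_stride() further down).
 */
static inline size_t
example_bar0_doorbell_offset(uint16_t qid, bool is_cq, uint32_t dstrd)
{
	return NVME_DOORBELLS_OFFSET + (size_t)((2 * qid) + is_cq) * (4u << dstrd);
}

/*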
546 */ 547 static inline void * 548 q_addr(struct nvme_q_mapping *mapping) 549 { 550 return mapping->iov.iov_base; 551 } 552 553 static inline int 554 queue_index(uint16_t qid, bool is_cq) 555 { 556 return (qid * 2) + is_cq; 557 } 558 559 static inline volatile uint32_t * 560 sq_headp(struct nvmf_vfio_user_sq *sq) 561 { 562 assert(sq != NULL); 563 return &sq->head; 564 } 565 566 static inline volatile uint32_t * 567 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 568 { 569 assert(sq != NULL); 570 return sq->dbl_tailp; 571 } 572 573 static inline volatile uint32_t * 574 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 575 { 576 assert(cq != NULL); 577 return cq->dbl_headp; 578 } 579 580 static inline volatile uint32_t * 581 cq_tailp(struct nvmf_vfio_user_cq *cq) 582 { 583 assert(cq != NULL); 584 return &cq->tail; 585 } 586 587 static inline void 588 sq_head_advance(struct nvmf_vfio_user_sq *sq) 589 { 590 assert(sq != NULL); 591 592 assert(*sq_headp(sq) < sq->size); 593 (*sq_headp(sq))++; 594 595 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 596 *sq_headp(sq) = 0; 597 } 598 } 599 600 static inline void 601 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 602 { 603 assert(cq != NULL); 604 605 assert(*cq_tailp(cq) < cq->size); 606 (*cq_tailp(cq))++; 607 608 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 609 *cq_tailp(cq) = 0; 610 cq->phase = !cq->phase; 611 } 612 } 613 614 static bool 615 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 616 { 617 assert(vu_ctrlr != NULL); 618 619 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 620 return false; 621 } 622 623 if (is_cq) { 624 if (vu_ctrlr->cqs[qid] == NULL) { 625 return false; 626 } 627 628 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 629 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 630 } 631 632 if (vu_ctrlr->sqs[qid] == NULL) { 633 return false; 634 } 635 636 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 637 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 638 } 639 640 static char * 641 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 642 { 643 return endpoint->trid.traddr; 644 } 645 646 static char * 647 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 648 { 649 if (!ctrlr || !ctrlr->endpoint) { 650 return "Null Ctrlr"; 651 } 652 653 return endpoint_id(ctrlr->endpoint); 654 } 655 656 /* Return the poll group for the admin queue of the controller. 
*/ 657 static inline struct nvmf_vfio_user_poll_group * 658 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 659 { 660 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 661 struct nvmf_vfio_user_poll_group, 662 group); 663 } 664 665 static inline struct nvmf_vfio_user_poll_group * 666 sq_to_poll_group(struct nvmf_vfio_user_sq *sq) 667 { 668 return SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, 669 group); 670 } 671 672 static inline struct spdk_thread * 673 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 674 { 675 return vu_pg->group.group->thread; 676 } 677 678 static dma_sg_t * 679 index_to_sg_t(void *arr, size_t i) 680 { 681 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 682 } 683 684 static inline size_t 685 vfio_user_migr_data_len(void) 686 { 687 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 688 } 689 690 static inline bool 691 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 692 { 693 return spdk_interrupt_mode_is_enabled() && 694 vu_transport->intr_mode_supported; 695 } 696 697 static int vfio_user_ctrlr_intr(void *ctx); 698 699 static void 700 vfio_user_msg_ctrlr_intr(void *ctx) 701 { 702 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 703 struct nvmf_vfio_user_poll_group *vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 704 705 vu_ctrlr_group->stats.ctrlr_kicks++; 706 707 vfio_user_ctrlr_intr(ctx); 708 } 709 710 /* 711 * Kick (force a wakeup) of all poll groups for this controller. 712 * vfio_user_ctrlr_intr() itself arranges for kicking other poll groups if 713 * needed. 714 */ 715 static void 716 ctrlr_kick(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 717 { 718 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 719 720 SPDK_DEBUGLOG(vfio_user_db, "%s: kicked\n", ctrlr_id(vu_ctrlr)); 721 722 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 723 724 spdk_thread_send_msg(poll_group_to_thread(vu_ctrlr_group), 725 vfio_user_msg_ctrlr_intr, vu_ctrlr); 726 } 727 728 /* 729 * Make the given DMA address and length available (locally mapped) via iov. 
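 */

/*
 * Illustrative sketch, not used by the transport: the PRP accounting that
 * nvme_cmd_map_prps() below performs when turning a guest PRP pair into local
 * iovecs. For example, with mps = 4096, a PRP1 ending in 0x200 and
 * len = 12288, the first entry covers 3584 bytes and 8704 bytes remain, so
 * four PRP entries are needed in total: PRP1 plus a three-entry PRP list
 * addressed by PRP2 (with exactly two entries, PRP2 would instead point at
 * data directly).
 */
static inline uint32_t
example_prp_entry_count(uint64_t prp1, uint32_t len, uint32_t mps)
{
	/* PRP1 may start at an offset into its memory page. */
	uint32_t first = (uint32_t)(mps - (prp1 % mps));

	if (len <= first) {
		return 1;
	}

	/* One entry per remaining (possibly partial) page, plus PRP1 itself. */
	return 1 + ((len - first) + mps - 1) / mps;
}

/*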
730 */ 731 static void * 732 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 733 struct iovec *iov, int32_t flags) 734 { 735 int prot = PROT_READ; 736 int ret; 737 738 if (flags & MAP_RW) { 739 prot |= PROT_WRITE; 740 } 741 742 assert(ctx != NULL); 743 assert(sg != NULL); 744 assert(iov != NULL); 745 746 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 747 if (ret < 0) { 748 if (ret == -1) { 749 if (!(flags & MAP_QUIET)) { 750 SPDK_ERRLOG("failed to translate IOVA [%#lx, %#lx) (prot=%d) to local VA: %m\n", 751 addr, addr + len, prot); 752 } 753 } else { 754 SPDK_ERRLOG("failed to translate IOVA [%#lx, %#lx) (prot=%d) to local VA: %d segments needed\n", 755 addr, addr + len, prot, -(ret + 1)); 756 } 757 return NULL; 758 } 759 760 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 761 if (ret != 0) { 762 SPDK_ERRLOG("failed to get iovec for IOVA [%#lx, %#lx): %m\n", 763 addr, addr + len); 764 return NULL; 765 } 766 767 assert(iov->iov_base != NULL); 768 return iov->iov_base; 769 } 770 771 static int 772 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 773 uint32_t max_iovcnt, uint32_t len, size_t mps, 774 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 775 { 776 uint64_t prp1, prp2; 777 void *vva; 778 uint32_t i; 779 uint32_t residue_len, nents; 780 uint64_t *prp_list; 781 uint32_t iovcnt; 782 783 assert(max_iovcnt > 0); 784 785 prp1 = cmd->dptr.prp.prp1; 786 prp2 = cmd->dptr.prp.prp2; 787 788 /* PRP1 may started with unaligned page address */ 789 residue_len = mps - (prp1 % mps); 790 residue_len = spdk_min(len, residue_len); 791 792 vva = gpa_to_vva(prv, prp1, residue_len, MAP_RW); 793 if (spdk_unlikely(vva == NULL)) { 794 SPDK_ERRLOG("GPA to VVA failed\n"); 795 return -EINVAL; 796 } 797 len -= residue_len; 798 if (len && max_iovcnt < 2) { 799 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 800 return -ERANGE; 801 } 802 iovs[0].iov_base = vva; 803 iovs[0].iov_len = residue_len; 804 805 if (len) { 806 if (spdk_unlikely(prp2 == 0)) { 807 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 808 return -EINVAL; 809 } 810 811 if (len <= mps) { 812 /* 2 PRP used */ 813 iovcnt = 2; 814 vva = gpa_to_vva(prv, prp2, len, MAP_RW); 815 if (spdk_unlikely(vva == NULL)) { 816 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 817 prp2, len); 818 return -EINVAL; 819 } 820 iovs[1].iov_base = vva; 821 iovs[1].iov_len = len; 822 } else { 823 /* PRP list used */ 824 nents = (len + mps - 1) / mps; 825 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 826 SPDK_ERRLOG("Too many page entries\n"); 827 return -ERANGE; 828 } 829 830 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), MAP_R); 831 if (spdk_unlikely(vva == NULL)) { 832 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 833 prp2, nents); 834 return -EINVAL; 835 } 836 prp_list = vva; 837 i = 0; 838 while (len != 0) { 839 residue_len = spdk_min(len, mps); 840 vva = gpa_to_vva(prv, prp_list[i], residue_len, MAP_RW); 841 if (spdk_unlikely(vva == NULL)) { 842 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 843 prp_list[i], residue_len); 844 return -EINVAL; 845 } 846 iovs[i + 1].iov_base = vva; 847 iovs[i + 1].iov_len = residue_len; 848 len -= residue_len; 849 i++; 850 } 851 iovcnt = i + 1; 852 } 853 } else { 854 /* 1 PRP used */ 855 iovcnt = 1; 856 } 857 858 assert(iovcnt <= max_iovcnt); 859 return iovcnt; 860 } 861 862 static int 863 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 864 struct iovec *iovs, 
uint32_t max_iovcnt, 865 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 866 { 867 uint32_t i; 868 void *vva; 869 870 if (spdk_unlikely(max_iovcnt < num_sgls)) { 871 return -ERANGE; 872 } 873 874 for (i = 0; i < num_sgls; i++) { 875 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 876 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 877 return -EINVAL; 878 } 879 vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, MAP_RW); 880 if (spdk_unlikely(vva == NULL)) { 881 SPDK_ERRLOG("GPA to VVA failed\n"); 882 return -EINVAL; 883 } 884 iovs[i].iov_base = vva; 885 iovs[i].iov_len = sgls[i].unkeyed.length; 886 } 887 888 return num_sgls; 889 } 890 891 static int 892 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 893 uint32_t len, size_t mps, 894 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 895 { 896 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 897 uint32_t num_sgls, seg_len; 898 void *vva; 899 int ret; 900 uint32_t total_iovcnt = 0; 901 902 /* SGL cases */ 903 sgl = &cmd->dptr.sgl1; 904 905 /* only one SGL segment */ 906 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 907 assert(max_iovcnt > 0); 908 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, MAP_RW); 909 if (spdk_unlikely(vva == NULL)) { 910 SPDK_ERRLOG("GPA to VVA failed\n"); 911 return -EINVAL; 912 } 913 iovs[0].iov_base = vva; 914 iovs[0].iov_len = sgl->unkeyed.length; 915 assert(sgl->unkeyed.length == len); 916 917 return 1; 918 } 919 920 for (;;) { 921 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 922 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 923 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 924 return -EINVAL; 925 } 926 927 seg_len = sgl->unkeyed.length; 928 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 929 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 930 return -EINVAL; 931 } 932 933 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 934 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, MAP_R); 935 if (spdk_unlikely(vva == NULL)) { 936 SPDK_ERRLOG("GPA to VVA failed\n"); 937 return -EINVAL; 938 } 939 940 /* sgl point to the first segment */ 941 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 942 last_sgl = &sgl[num_sgls - 1]; 943 944 /* we are done */ 945 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 946 /* map whole sgl list */ 947 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 948 max_iovcnt - total_iovcnt, gpa_to_vva); 949 if (spdk_unlikely(ret < 0)) { 950 return ret; 951 } 952 total_iovcnt += ret; 953 954 return total_iovcnt; 955 } 956 957 if (num_sgls > 1) { 958 /* map whole sgl exclude last_sgl */ 959 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 960 max_iovcnt - total_iovcnt, gpa_to_vva); 961 if (spdk_unlikely(ret < 0)) { 962 return ret; 963 } 964 total_iovcnt += ret; 965 } 966 967 /* move to next level's segments */ 968 sgl = last_sgl; 969 } 970 971 return 0; 972 } 973 974 static int 975 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 976 uint32_t len, size_t mps, 977 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 978 { 979 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 980 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 981 } 982 983 return nvme_cmd_map_sgls(prv, cmd, iovs, max_iovcnt, 
len, mps, gpa_to_vva); 984 } 985 986 /* 987 * For each queue, update the location of its doorbell to the correct location: 988 * either our own BAR0, or the guest's configured shadow doorbell area. 989 * 990 * The Admin queue (qid: 0) does not ever use shadow doorbells. 991 */ 992 static void 993 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 994 { 995 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 996 ctrlr->bar0_doorbells; 997 998 assert(doorbells != NULL); 999 1000 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 1001 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 1002 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 1003 1004 if (sq != NULL) { 1005 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 1006 1007 ctrlr->sqs[i]->need_rearm = shadow; 1008 } 1009 1010 if (cq != NULL) { 1011 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 1012 } 1013 } 1014 } 1015 1016 static void 1017 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1018 { 1019 assert(vfu_ctx != NULL); 1020 assert(sdbl != NULL); 1021 1022 /* 1023 * An allocation error would result in only one of the two being 1024 * non-NULL. If that is the case, no memory should have been mapped. 1025 */ 1026 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 1027 return; 1028 } 1029 1030 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 1031 struct iovec *iov; 1032 dma_sg_t *sg; 1033 1034 if (!sdbl->iovs[i].iov_len) { 1035 continue; 1036 } 1037 1038 sg = index_to_sg_t(sdbl->sgs, i); 1039 iov = sdbl->iovs + i; 1040 1041 vfu_sgl_put(vfu_ctx, sg, iov, 1); 1042 } 1043 } 1044 1045 static void 1046 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1047 { 1048 if (sdbl == NULL) { 1049 return; 1050 } 1051 1052 unmap_sdbl(vfu_ctx, sdbl); 1053 1054 /* 1055 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 1056 * not allocated, so don't free() them. 1057 */ 1058 free(sdbl->sgs); 1059 free(sdbl->iovs); 1060 free(sdbl); 1061 } 1062 1063 static struct nvmf_vfio_user_shadow_doorbells * 1064 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 1065 { 1066 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 1067 dma_sg_t *sg2 = NULL; 1068 void *p; 1069 1070 assert(vfu_ctx != NULL); 1071 1072 sdbl = calloc(1, sizeof(*sdbl)); 1073 if (sdbl == NULL) { 1074 goto err; 1075 } 1076 1077 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 1078 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 1079 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 1080 goto err; 1081 } 1082 1083 /* Map shadow doorbell buffer (PRP1). */ 1084 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, MAP_RW); 1085 1086 if (p == NULL) { 1087 goto err; 1088 } 1089 1090 /* 1091 * Map eventidx buffer (PRP2). 1092 * Should only be written to by the controller. 1093 */ 1094 1095 sg2 = index_to_sg_t(sdbl->sgs, 1); 1096 1097 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, MAP_RW); 1098 1099 if (p == NULL) { 1100 goto err; 1101 } 1102 1103 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1104 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1105 1106 return sdbl; 1107 1108 err: 1109 free_sdbl(vfu_ctx, sdbl); 1110 return NULL; 1111 } 1112 1113 /* 1114 * Copy doorbells from one buffer to the other, during switches between BAR0 1115 * doorbells and shadow doorbells. 
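 */

/*
 * For context: the pages handled by map_sdbl()/free_sdbl() above are the ones
 * a guest driver registers with the NVMe Doorbell Buffer Config admin command
 * (opcode 7Ch): PRP1 names the shadow doorbell page written by the host,
 * PRP2 the eventidx page written by the controller. A guest-side sketch, not
 * transport code, with hypothetical IOVAs supplied by the caller:
 */
static inline void
example_fill_dbbuf_cmd(struct spdk_nvme_cmd *cmd, uint64_t sdb_iova, uint64_t ei_iova)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->opc = SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG;
	/* Both buffers are a single page each, page aligned. */
	cmd->dptr.prp.prp1 = sdb_iova;
	cmd->dptr.prp.prp2 = ei_iova;
}

/*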
1116 */ 1117 static void 1118 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1119 const volatile uint32_t *from, volatile uint32_t *to) 1120 { 1121 assert(ctrlr != NULL); 1122 assert(from != NULL); 1123 assert(to != NULL); 1124 1125 SPDK_DEBUGLOG(vfio_user_db, 1126 "%s: migrating shadow doorbells from %p to %p\n", 1127 ctrlr_id(ctrlr), from, to); 1128 1129 /* Can't use memcpy because it doesn't respect volatile semantics. */ 1130 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1131 if (ctrlr->sqs[i] != NULL) { 1132 to[queue_index(i, false)] = from[queue_index(i, false)]; 1133 } 1134 1135 if (ctrlr->cqs[i] != NULL) { 1136 to[queue_index(i, true)] = from[queue_index(i, true)]; 1137 } 1138 } 1139 } 1140 1141 static void 1142 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1143 { 1144 const struct spdk_nvmf_registers *regs; 1145 1146 assert(vu_ctrlr != NULL); 1147 assert(vu_ctrlr->ctrlr != NULL); 1148 1149 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1150 if (regs->csts.bits.cfs == 0) { 1151 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1152 } 1153 1154 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1155 } 1156 1157 static inline bool 1158 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1159 { 1160 assert(vu_ctrlr != NULL); 1161 assert(vu_ctrlr->endpoint != NULL); 1162 1163 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1164 1165 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1166 } 1167 1168 static void 1169 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1170 { 1171 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1172 1173 spdk_interrupt_unregister(&endpoint->accept_intr); 1174 spdk_poller_unregister(&endpoint->accept_poller); 1175 1176 if (endpoint->bar0_doorbells) { 1177 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1178 } 1179 1180 if (endpoint->devmem_fd > 0) { 1181 close(endpoint->devmem_fd); 1182 } 1183 1184 if (endpoint->migr_data) { 1185 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1186 } 1187 1188 if (endpoint->migr_fd > 0) { 1189 close(endpoint->migr_fd); 1190 } 1191 1192 if (endpoint->vfu_ctx) { 1193 vfu_destroy_ctx(endpoint->vfu_ctx); 1194 } 1195 1196 pthread_mutex_destroy(&endpoint->lock); 1197 free(endpoint); 1198 } 1199 1200 /* called when process exits */ 1201 static int 1202 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1203 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1204 { 1205 struct nvmf_vfio_user_transport *vu_transport; 1206 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1207 1208 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1209 1210 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1211 transport); 1212 1213 pthread_mutex_destroy(&vu_transport->lock); 1214 pthread_mutex_destroy(&vu_transport->pg_lock); 1215 1216 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1217 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1218 nvmf_vfio_user_destroy_endpoint(endpoint); 1219 } 1220 1221 free(vu_transport); 1222 1223 if (cb_fn) { 1224 cb_fn(cb_arg); 1225 } 1226 1227 return 0; 1228 } 1229 1230 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1231 { 1232 "disable_mappable_bar0", 1233 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1234 spdk_json_decode_bool, true 1235 }, 1236 { 1237 "disable_adaptive_irq", 1238 offsetof(struct 
nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1239 spdk_json_decode_bool, true 1240 }, 1241 { 1242 "disable_shadow_doorbells", 1243 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1244 spdk_json_decode_bool, true 1245 }, 1246 { 1247 "disable_compare", 1248 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1249 spdk_json_decode_bool, true 1250 }, 1251 { 1252 "enable_intr_mode_sq_spreading", 1253 offsetof(struct nvmf_vfio_user_transport, transport_opts.enable_intr_mode_sq_spreading), 1254 spdk_json_decode_bool, true 1255 }, 1256 }; 1257 1258 static struct spdk_nvmf_transport * 1259 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1260 { 1261 struct nvmf_vfio_user_transport *vu_transport; 1262 int err; 1263 1264 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1265 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1266 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1267 return NULL; 1268 } 1269 1270 vu_transport = calloc(1, sizeof(*vu_transport)); 1271 if (vu_transport == NULL) { 1272 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1273 return NULL; 1274 } 1275 1276 err = pthread_mutex_init(&vu_transport->lock, NULL); 1277 if (err != 0) { 1278 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1279 goto err; 1280 } 1281 TAILQ_INIT(&vu_transport->endpoints); 1282 1283 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1284 if (err != 0) { 1285 pthread_mutex_destroy(&vu_transport->lock); 1286 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1287 goto err; 1288 } 1289 TAILQ_INIT(&vu_transport->poll_groups); 1290 1291 if (opts->transport_specific != NULL && 1292 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1293 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1294 vu_transport)) { 1295 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1296 goto cleanup; 1297 } 1298 1299 /* 1300 * To support interrupt mode, the transport must be configured with 1301 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1302 * when a client writes new doorbell values to BAR0, via the 1303 * libvfio-user socket fd. 1304 */ 1305 vu_transport->intr_mode_supported = 1306 vu_transport->transport_opts.disable_mappable_bar0; 1307 1308 /* 1309 * If BAR0 is mappable, it doesn't make sense to support shadow 1310 * doorbells, so explicitly turn it off. 1311 */ 1312 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1313 vu_transport->transport_opts.disable_shadow_doorbells = true; 1314 } 1315 1316 if (spdk_interrupt_mode_is_enabled()) { 1317 if (!vu_transport->intr_mode_supported) { 1318 SPDK_ERRLOG("interrupt mode not supported\n"); 1319 goto cleanup; 1320 } 1321 1322 /* 1323 * If we are in interrupt mode, we cannot support adaptive IRQs, 1324 * as there is no guarantee the SQ poller will run subsequently 1325 * to send pending IRQs. 
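	 *
	 * For reference, the transport_specific JSON decoded earlier in this
	 * function is a flat object of optional booleans, e.g.:
	 *
	 *   {
	 *     "disable_mappable_bar0": true,
	 *     "disable_adaptive_irq": false,
	 *     "disable_shadow_doorbells": false,
	 *     "disable_compare": false,
	 *     "enable_intr_mode_sq_spreading": false
	 *   }
	 *
	 * Interrupt mode additionally requires disable_mappable_bar0 (see
	 * intr_mode_supported above), and leaving BAR0 mappable forces
	 * shadow doorbells off.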
1326 */ 1327 vu_transport->transport_opts.disable_adaptive_irq = true; 1328 } 1329 1330 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1331 vu_transport->transport_opts.disable_mappable_bar0); 1332 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1333 vu_transport->transport_opts.disable_adaptive_irq); 1334 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1335 vu_transport->transport_opts.disable_shadow_doorbells); 1336 1337 return &vu_transport->transport; 1338 1339 cleanup: 1340 pthread_mutex_destroy(&vu_transport->lock); 1341 pthread_mutex_destroy(&vu_transport->pg_lock); 1342 err: 1343 free(vu_transport); 1344 return NULL; 1345 } 1346 1347 static uint32_t 1348 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1349 { 1350 assert(vu_ctrlr != NULL); 1351 assert(vu_ctrlr->ctrlr != NULL); 1352 1353 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1354 } 1355 1356 static uint32_t 1357 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1358 { 1359 assert(vu_ctrlr != NULL); 1360 assert(vu_ctrlr->ctrlr != NULL); 1361 1362 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1363 } 1364 1365 static uintptr_t 1366 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1367 { 1368 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1369 return 1ul << memory_page_shift; 1370 } 1371 1372 static uintptr_t 1373 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1374 { 1375 return ~(memory_page_size(ctrlr) - 1); 1376 } 1377 1378 static int 1379 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1380 uint32_t flags) 1381 { 1382 void *ret; 1383 1384 assert(mapping->len != 0); 1385 assert(q_addr(mapping) == NULL); 1386 1387 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, mapping->len, 1388 mapping->sg, &mapping->iov, flags); 1389 if (ret == NULL) { 1390 return -EFAULT; 1391 } 1392 1393 if (flags & MAP_INITIALIZE) { 1394 memset(q_addr(mapping), 0, mapping->len); 1395 } 1396 1397 return 0; 1398 } 1399 1400 static inline void 1401 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1402 { 1403 if (q_addr(mapping) != NULL) { 1404 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1405 &mapping->iov, 1); 1406 mapping->iov.iov_base = NULL; 1407 } 1408 } 1409 1410 static int 1411 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1412 { 1413 struct nvmf_vfio_user_sq *sq; 1414 const struct spdk_nvmf_registers *regs; 1415 int ret; 1416 1417 assert(ctrlr != NULL); 1418 1419 sq = ctrlr->sqs[0]; 1420 1421 assert(sq != NULL); 1422 assert(q_addr(&sq->mapping) == NULL); 1423 /* XXX ctrlr->asq == 0 is a valid memory address */ 1424 1425 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1426 sq->qid = 0; 1427 sq->size = regs->aqa.bits.asqs + 1; 1428 sq->mapping.prp1 = regs->asq; 1429 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 1430 *sq_headp(sq) = 0; 1431 sq->cqid = 0; 1432 1433 ret = map_q(ctrlr, &sq->mapping, MAP_INITIALIZE); 1434 if (ret) { 1435 return ret; 1436 } 1437 1438 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1439 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1440 1441 *sq_dbl_tailp(sq) = 0; 1442 1443 return 0; 1444 } 1445 1446 /* 1447 * Updates eventidx to set an SQ into interrupt or polling mode. 
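 *
 * (For context: shadow-doorbell-aware host drivers such as Linux decide
 * whether a BAR0 write is also needed with a check roughly equivalent to
 * (uint16_t)(new_tail - event_idx - 1) < (uint16_t)(new_tail - old_tail), so
 * publishing the current tail as the event index ensures that the very next
 * submission rings the real doorbell and reaches us via the vfio-user
 * socket.)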
1448 * 1449 * Returns false if the current SQ tail does not match the SQ head, as 1450 * this means that the host has submitted more items to the queue while we were 1451 * not looking - or during the event index update. In that case, we must retry, 1452 * or otherwise make sure we are going to wake up again. 1453 */ 1454 static bool 1455 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1456 { 1457 struct nvmf_vfio_user_ctrlr *ctrlr; 1458 volatile uint32_t *sq_tail_eidx; 1459 uint32_t old_tail, new_tail; 1460 1461 assert(sq != NULL); 1462 assert(sq->ctrlr != NULL); 1463 assert(sq->ctrlr->sdbl != NULL); 1464 assert(sq->need_rearm); 1465 assert(sq->qid != 0); 1466 1467 ctrlr = sq->ctrlr; 1468 1469 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1470 ctrlr_id(ctrlr), sq->qid); 1471 1472 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1473 1474 assert(ctrlr->endpoint != NULL); 1475 1476 if (!ctrlr->endpoint->interrupt_mode) { 1477 /* No synchronisation necessary. */ 1478 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1479 return true; 1480 } 1481 1482 old_tail = *sq_dbl_tailp(sq); 1483 *sq_tail_eidx = old_tail; 1484 1485 /* 1486 * Ensure that the event index is updated before re-reading the tail 1487 * doorbell. If it's not, then the host might race us and update the 1488 * tail after the second read but before the event index is written, so 1489 * it won't write to BAR0 and we'll miss the update. 1490 * 1491 * The driver should provide similar ordering with an mb(). 1492 */ 1493 spdk_mb(); 1494 1495 /* 1496 * Check if the host has updated the tail doorbell after we've read it 1497 * for the first time, but before the event index was written. If that's 1498 * the case, then we've lost the race and we need to update the event 1499 * index again (after polling the queue, since the host won't write to 1500 * BAR0). 1501 */ 1502 new_tail = *sq_dbl_tailp(sq); 1503 1504 /* 1505 * We might poll the queue straight after this function returns if the 1506 * tail has been updated, so we need to ensure that any changes to the 1507 * queue will be visible to us if the doorbell has been updated. 1508 * 1509 * The driver should provide similar ordering with a wmb() to ensure 1510 * that the queue is written before it updates the tail doorbell. 1511 */ 1512 spdk_rmb(); 1513 1514 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1515 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1516 new_tail, *sq_headp(sq)); 1517 1518 if (new_tail == *sq_headp(sq)) { 1519 sq->need_rearm = false; 1520 return true; 1521 } 1522 1523 /* 1524 * We've lost the race: the tail was updated since we last polled, 1525 * including if it happened within this routine. 1526 * 1527 * The caller should retry after polling (think of this as a cmpxchg 1528 * loop); if we go to sleep while the SQ is not empty, then we won't 1529 * process the remaining events. 1530 */ 1531 return false; 1532 } 1533 1534 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1535 1536 /* 1537 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1538 * processed some SQ entries. 
1539 */ 1540 static int 1541 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1542 struct nvmf_vfio_user_sq *sq, 1543 struct nvmf_vfio_user_poll_group *vu_group) 1544 { 1545 int count = 0; 1546 size_t i; 1547 1548 assert(sq->need_rearm); 1549 1550 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1551 int ret; 1552 1553 if (set_sq_eventidx(sq)) { 1554 /* We won the race and set eventidx; done. */ 1555 vu_group->stats.won++; 1556 return count; 1557 } 1558 1559 ret = nvmf_vfio_user_sq_poll(sq); 1560 1561 count += (ret < 0) ? 1 : ret; 1562 1563 /* 1564 * set_sq_eventidx() hit the race, so we expected 1565 * to process at least one command from this queue. 1566 * If there were no new commands waiting for us, then 1567 * we must have hit an unexpected race condition. 1568 */ 1569 if (ret == 0) { 1570 SPDK_ERRLOG("%s: unexpected race condition detected " 1571 "while updating the shadow doorbell buffer\n", 1572 ctrlr_id(ctrlr)); 1573 1574 fail_ctrlr(ctrlr); 1575 return count; 1576 } 1577 } 1578 1579 SPDK_DEBUGLOG(vfio_user_db, 1580 "%s: set_sq_eventidx() lost the race %zu times\n", 1581 ctrlr_id(ctrlr), i); 1582 1583 vu_group->stats.lost++; 1584 vu_group->stats.lost_count += count; 1585 1586 /* 1587 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1588 * we raced with the producer too many times; force ourselves to wake up 1589 * instead. We'll process all queues at that point. 1590 */ 1591 ctrlr_kick(ctrlr); 1592 1593 return count; 1594 } 1595 1596 /* 1597 * We're in interrupt mode, and potentially about to go to sleep. We need to 1598 * make sure any further I/O submissions are guaranteed to wake us up: for 1599 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1600 * every SQ that needs re-arming. 1601 * 1602 * Returns non-zero if we processed something. 1603 */ 1604 static int 1605 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1606 { 1607 struct nvmf_vfio_user_sq *sq; 1608 int count = 0; 1609 1610 vu_group->stats.rearms++; 1611 1612 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1613 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1614 continue; 1615 } 1616 1617 if (sq->need_rearm) { 1618 count += vfio_user_sq_rearm(sq->ctrlr, sq, vu_group); 1619 } 1620 } 1621 1622 return count; 1623 } 1624 1625 static int 1626 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1627 { 1628 struct nvmf_vfio_user_cq *cq; 1629 const struct spdk_nvmf_registers *regs; 1630 int ret; 1631 1632 assert(ctrlr != NULL); 1633 1634 cq = ctrlr->cqs[0]; 1635 1636 assert(cq != NULL); 1637 1638 assert(q_addr(&cq->mapping) == NULL); 1639 1640 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1641 assert(regs != NULL); 1642 cq->qid = 0; 1643 cq->size = regs->aqa.bits.acqs + 1; 1644 cq->mapping.prp1 = regs->acq; 1645 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 1646 *cq_tailp(cq) = 0; 1647 cq->ien = true; 1648 cq->phase = true; 1649 1650 ret = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_INITIALIZE); 1651 if (ret) { 1652 return ret; 1653 } 1654 1655 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
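	 *
	 * Admin queue geometry comes from registers the guest programs
	 * before setting CC.EN: AQA carries the zero-based ASQS/ACQS sizes
	 * (hence the +1 above and in asq_setup()), while ASQ/ACQ hold the
	 * base addresses that map_q() just translated.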
*/ 1656 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1657 1658 *cq_dbl_headp(cq) = 0; 1659 1660 return 0; 1661 } 1662 1663 static void * 1664 _map_one(void *prv, uint64_t addr, uint64_t len, uint32_t flags) 1665 { 1666 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1667 struct spdk_nvmf_qpair *qpair; 1668 struct nvmf_vfio_user_req *vu_req; 1669 struct nvmf_vfio_user_sq *sq; 1670 void *ret; 1671 1672 assert(req != NULL); 1673 qpair = req->qpair; 1674 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1675 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1676 1677 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1678 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1679 index_to_sg_t(vu_req->sg, vu_req->iovcnt), 1680 &vu_req->iov[vu_req->iovcnt], flags); 1681 if (spdk_likely(ret != NULL)) { 1682 vu_req->iovcnt++; 1683 } 1684 return ret; 1685 } 1686 1687 static int 1688 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1689 struct iovec *iov, uint32_t length) 1690 { 1691 /* Map PRP list to from Guest physical memory to 1692 * virtual memory address. 1693 */ 1694 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1695 length, 4096, _map_one); 1696 } 1697 1698 static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1699 struct nvmf_vfio_user_sq *sq); 1700 1701 static uint32_t 1702 cq_free_slots(struct nvmf_vfio_user_cq *cq) 1703 { 1704 uint32_t free_slots; 1705 1706 assert(cq != NULL); 1707 1708 if (cq->tail == cq->last_head) { 1709 free_slots = cq->size; 1710 } else if (cq->tail > cq->last_head) { 1711 free_slots = cq->size - (cq->tail - cq->last_head); 1712 } else { 1713 free_slots = cq->last_head - cq->tail; 1714 } 1715 assert(free_slots > 0); 1716 1717 return free_slots - 1; 1718 } 1719 1720 /* 1721 * Since reading the head doorbell is relatively expensive, we use the cached 1722 * value, so we only have to read it for real if it appears that we are full. 1723 */ 1724 static inline bool 1725 cq_is_full(struct nvmf_vfio_user_cq *cq) 1726 { 1727 uint32_t free_cq_slots; 1728 1729 assert(cq != NULL); 1730 1731 free_cq_slots = cq_free_slots(cq); 1732 1733 if (spdk_unlikely(free_cq_slots == 0)) { 1734 cq->last_head = *cq_dbl_headp(cq); 1735 free_cq_slots = cq_free_slots(cq); 1736 } 1737 1738 return free_cq_slots == 0; 1739 } 1740 1741 /* 1742 * Posts a CQE in the completion queue. 1743 * 1744 * @ctrlr: the vfio-user controller 1745 * @cq: the completion queue 1746 * @cdw0: cdw0 as reported by NVMf 1747 * @sqid: submission queue ID 1748 * @cid: command identifier in NVMe command 1749 * @sc: the NVMe CQE status code 1750 * @sct: the NVMe CQE status code type 1751 */ 1752 static int 1753 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1754 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1755 { 1756 struct spdk_nvme_status cpl_status = { 0 }; 1757 struct spdk_nvme_cpl *cpl; 1758 int err; 1759 1760 assert(ctrlr != NULL); 1761 1762 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1763 return 0; 1764 } 1765 1766 if (cq->qid == 0) { 1767 assert(spdk_get_thread() == cq->group->group->thread); 1768 } 1769 1770 /* 1771 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 1772 * control: if there is no space in the CQ, we should wait until there is. 
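	 *
	 * (cq_free_slots() above reserves one slot so that a full ring stays
	 * distinguishable from an empty one: with size=16, tail=5 and a
	 * cached head of 9 it reports 3 usable slots; with tail=9 and head=5
	 * it reports 11.)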
1773 * 1774 * In practice, we just fail the controller instead: as it happens, all host 1775 * implementations we care about right-size the CQ: this is required anyway for 1776 * NVMEoF support (see 3.3.2.8). 1777 */ 1778 if (cq_is_full(cq)) { 1779 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1780 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1781 *cq_dbl_headp(cq)); 1782 return -1; 1783 } 1784 1785 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1786 1787 assert(ctrlr->sqs[sqid] != NULL); 1788 SPDK_DEBUGLOG(nvmf_vfio, 1789 "%s: request complete sqid:%d cid=%d status=%#x " 1790 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1791 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1792 1793 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1794 cpl->sqid = sqid; 1795 cpl->cid = cid; 1796 cpl->cdw0 = cdw0; 1797 1798 /* 1799 * This is a bitfield: instead of setting the individual bits we need 1800 * directly in cpl->status, which would cause a read-modify-write cycle, 1801 * we'll avoid reading from the CPL altogether by filling in a local 1802 * cpl_status variable, then writing the whole thing. 1803 */ 1804 cpl_status.sct = sct; 1805 cpl_status.sc = sc; 1806 cpl_status.p = cq->phase; 1807 cpl->status = cpl_status; 1808 1809 /* Ensure the Completion Queue Entry is visible. */ 1810 spdk_wmb(); 1811 cq_tail_advance(cq); 1812 1813 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1814 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1815 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1816 if (err != 0) { 1817 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1818 ctrlr_id(ctrlr)); 1819 return err; 1820 } 1821 } 1822 1823 return 0; 1824 } 1825 1826 static void 1827 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1828 { 1829 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1830 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1831 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1832 free(vu_req); 1833 } 1834 } 1835 1836 static void 1837 delete_cq_done(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq) 1838 { 1839 assert(cq->cq_ref == 0); 1840 unmap_q(ctrlr, &cq->mapping); 1841 cq->size = 0; 1842 cq->cq_state = VFIO_USER_CQ_DELETED; 1843 cq->group = NULL; 1844 } 1845 1846 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1847 * and the controller is being shut down/reset or vfio-user client disconnects, 1848 * then the CQ is also deleted. 1849 */ 1850 static void 1851 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1852 { 1853 struct nvmf_vfio_user_cq *cq; 1854 uint16_t cqid; 1855 1856 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1857 sq->qid, sq); 1858 1859 /* Free SQ resources */ 1860 unmap_q(vu_ctrlr, &sq->mapping); 1861 1862 free_sq_reqs(sq); 1863 1864 sq->size = 0; 1865 1866 sq->sq_state = VFIO_USER_SQ_DELETED; 1867 1868 /* Controller RESET and SHUTDOWN are special cases, 1869 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1870 * will disconnect IO queue pairs. 
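	 *
	 * Each SQ holds a reference on its CQ (taken when the SQ is created
	 * or re-connected), so the CQ is only unmapped below once its last
	 * referencing SQ has been deleted.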
1871 */ 1872 if (vu_ctrlr->reset_shn || vu_ctrlr->disconnect) { 1873 cqid = sq->cqid; 1874 cq = vu_ctrlr->cqs[cqid]; 1875 1876 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1877 cq->qid, cq); 1878 1879 assert(cq->cq_ref > 0); 1880 if (--cq->cq_ref == 0) { 1881 delete_cq_done(vu_ctrlr, cq); 1882 } 1883 } 1884 } 1885 1886 static void 1887 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1888 { 1889 struct nvmf_vfio_user_sq *sq; 1890 struct nvmf_vfio_user_cq *cq; 1891 1892 if (ctrlr == NULL) { 1893 return; 1894 } 1895 1896 sq = ctrlr->sqs[qid]; 1897 if (sq) { 1898 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free sqid:%u\n", ctrlr_id(ctrlr), qid); 1899 unmap_q(ctrlr, &sq->mapping); 1900 1901 free_sq_reqs(sq); 1902 1903 free(sq->mapping.sg); 1904 free(sq); 1905 ctrlr->sqs[qid] = NULL; 1906 } 1907 1908 cq = ctrlr->cqs[qid]; 1909 if (cq) { 1910 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1911 unmap_q(ctrlr, &cq->mapping); 1912 free(cq->mapping.sg); 1913 free(cq); 1914 ctrlr->cqs[qid] = NULL; 1915 } 1916 } 1917 1918 static int 1919 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1920 const uint16_t id) 1921 { 1922 struct nvmf_vfio_user_sq *sq; 1923 1924 assert(ctrlr != NULL); 1925 assert(transport != NULL); 1926 assert(ctrlr->sqs[id] == NULL); 1927 1928 sq = calloc(1, sizeof(*sq)); 1929 if (sq == NULL) { 1930 return -ENOMEM; 1931 } 1932 sq->mapping.sg = calloc(1, dma_sg_size()); 1933 if (sq->mapping.sg == NULL) { 1934 free(sq); 1935 return -ENOMEM; 1936 } 1937 1938 sq->qid = id; 1939 sq->qpair.qid = id; 1940 sq->qpair.transport = transport; 1941 sq->ctrlr = ctrlr; 1942 ctrlr->sqs[id] = sq; 1943 1944 TAILQ_INIT(&sq->free_reqs); 1945 1946 return 0; 1947 } 1948 1949 static int 1950 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1951 { 1952 struct nvmf_vfio_user_cq *cq; 1953 1954 assert(vu_ctrlr != NULL); 1955 assert(vu_ctrlr->cqs[id] == NULL); 1956 1957 cq = calloc(1, sizeof(*cq)); 1958 if (cq == NULL) { 1959 return -ENOMEM; 1960 } 1961 cq->mapping.sg = calloc(1, dma_sg_size()); 1962 if (cq->mapping.sg == NULL) { 1963 free(cq); 1964 return -ENOMEM; 1965 } 1966 1967 cq->qid = id; 1968 vu_ctrlr->cqs[id] = cq; 1969 1970 return 0; 1971 } 1972 1973 static int 1974 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1975 { 1976 struct nvmf_vfio_user_req *vu_req, *tmp; 1977 size_t req_size; 1978 uint32_t i; 1979 1980 req_size = sizeof(struct nvmf_vfio_user_req) + 1981 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1982 1983 for (i = 0; i < sq->size; i++) { 1984 struct spdk_nvmf_request *req; 1985 1986 vu_req = calloc(1, req_size); 1987 if (vu_req == NULL) { 1988 goto err; 1989 } 1990 1991 req = &vu_req->req; 1992 req->qpair = &sq->qpair; 1993 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 1994 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 1995 req->stripped_data = NULL; 1996 1997 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 1998 } 1999 2000 return 0; 2001 2002 err: 2003 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 2004 free(vu_req); 2005 } 2006 return -ENOMEM; 2007 } 2008 2009 static volatile uint32_t * 2010 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 2011 { 2012 return ctrlr->sdbl != NULL ? 
2013 ctrlr->sdbl->shadow_doorbells : 2014 ctrlr->bar0_doorbells; 2015 } 2016 2017 static uint16_t 2018 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 2019 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2020 { 2021 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2022 struct nvmf_vfio_user_sq *sq; 2023 uint32_t qsize; 2024 uint16_t cqid; 2025 uint16_t qid; 2026 int err; 2027 2028 qid = cmd->cdw10_bits.create_io_q.qid; 2029 cqid = cmd->cdw11_bits.create_io_sq.cqid; 2030 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2031 2032 if (ctrlr->sqs[qid] == NULL) { 2033 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 2034 if (err != 0) { 2035 *sct = SPDK_NVME_SCT_GENERIC; 2036 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2037 } 2038 } 2039 2040 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2041 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 2042 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2043 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2044 } 2045 2046 /* CQ must be created before SQ. */ 2047 if (!io_q_exists(ctrlr, cqid, true)) { 2048 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 2049 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2050 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 2051 } 2052 2053 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 2054 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 2055 *sct = SPDK_NVME_SCT_GENERIC; 2056 return SPDK_NVME_SC_INVALID_FIELD; 2057 } 2058 2059 sq = ctrlr->sqs[qid]; 2060 sq->size = qsize; 2061 2062 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 2063 qid, cqid); 2064 2065 sq->mapping.prp1 = cmd->dptr.prp.prp1; 2066 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 2067 2068 err = map_q(ctrlr, &sq->mapping, MAP_INITIALIZE); 2069 if (err) { 2070 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2071 *sct = SPDK_NVME_SCT_GENERIC; 2072 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2073 } 2074 2075 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 2076 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2077 q_addr(&sq->mapping)); 2078 2079 err = alloc_sq_reqs(ctrlr, sq); 2080 if (err < 0) { 2081 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 2082 *sct = SPDK_NVME_SCT_GENERIC; 2083 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2084 } 2085 2086 sq->cqid = cqid; 2087 ctrlr->cqs[sq->cqid]->cq_ref++; 2088 sq->sq_state = VFIO_USER_SQ_CREATED; 2089 *sq_headp(sq) = 0; 2090 2091 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 2092 2093 /* 2094 * We should always reset the doorbells. 2095 * 2096 * The Specification prohibits the controller from writing to the shadow 2097 * doorbell buffer, however older versions of the Linux NVMe driver 2098 * don't reset the shadow doorbell buffer after a Queue-Level or 2099 * Controller-Level reset, which means that we're left with garbage 2100 * doorbell values. 2101 */ 2102 *sq_dbl_tailp(sq) = 0; 2103 2104 if (ctrlr->sdbl != NULL) { 2105 sq->need_rearm = true; 2106 2107 if (!set_sq_eventidx(sq)) { 2108 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 2109 "sqid:%hu was initialized\n", 2110 ctrlr_id(ctrlr), qid); 2111 fail_ctrlr(ctrlr); 2112 *sct = SPDK_NVME_SCT_GENERIC; 2113 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2114 } 2115 } 2116 2117 /* 2118 * Create our new I/O qpair. 
This asynchronously invokes, on a suitable 2119 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 2120 * call spdk_nvmf_request_exec() with a generated fabrics 2121 * connect command. This command is then eventually completed via 2122 * handle_queue_connect_rsp(). 2123 */ 2124 sq->create_io_sq_cmd = *cmd; 2125 sq->post_create_io_sq_completion = true; 2126 2127 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 2128 &sq->qpair); 2129 2130 *sct = SPDK_NVME_SCT_GENERIC; 2131 return SPDK_NVME_SC_SUCCESS; 2132 } 2133 2134 static uint16_t 2135 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 2136 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2137 { 2138 struct nvmf_vfio_user_cq *cq; 2139 uint32_t qsize; 2140 uint16_t qid; 2141 int err; 2142 2143 qid = cmd->cdw10_bits.create_io_q.qid; 2144 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2145 2146 if (ctrlr->cqs[qid] == NULL) { 2147 err = init_cq(ctrlr, qid); 2148 if (err != 0) { 2149 *sct = SPDK_NVME_SCT_GENERIC; 2150 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2151 } 2152 } 2153 2154 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2155 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2156 *sct = SPDK_NVME_SCT_GENERIC; 2157 return SPDK_NVME_SC_INVALID_FIELD; 2158 } 2159 2160 if (cmd->cdw11_bits.create_io_cq.iv > NVMF_VFIO_USER_MSIX_NUM - 1) { 2161 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2162 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2163 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2164 } 2165 2166 cq = ctrlr->cqs[qid]; 2167 cq->size = qsize; 2168 2169 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2170 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 2171 2172 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2173 2174 err = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_INITIALIZE); 2175 if (err) { 2176 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2177 *sct = SPDK_NVME_SCT_GENERIC; 2178 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2179 } 2180 2181 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2182 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2183 q_addr(&cq->mapping)); 2184 2185 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2186 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2187 cq->phase = true; 2188 cq->cq_state = VFIO_USER_CQ_CREATED; 2189 2190 *cq_tailp(cq) = 0; 2191 2192 /* 2193 * We should always reset the doorbells. 2194 * 2195 * The Specification prohibits the controller from writing to the shadow 2196 * doorbell buffer, however older versions of the Linux NVMe driver 2197 * don't reset the shadow doorbell buffer after a Queue-Level or 2198 * Controller-Level reset, which means that we're left with garbage 2199 * doorbell values. 2200 */ 2201 *cq_dbl_headp(cq) = 0; 2202 2203 *sct = SPDK_NVME_SCT_GENERIC; 2204 return SPDK_NVME_SC_SUCCESS; 2205 } 2206 2207 /* 2208 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2209 * on error. 
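 *
 * As a worked example (illustrative only): the QSIZE field in CDW10 is
 * zero-based, so
 *
 *     qsize = cmd->cdw10_bits.create_io_q.qsize + 1;   // e.g. 127 -> 128 slots
 *
 * and a request for qsize == 1, or for anything larger than
 * max_queue_size(ctrlr), is rejected with SPDK_NVME_SC_INVALID_QUEUE_SIZE.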
2210 */ 2211 static int 2212 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2213 struct spdk_nvme_cmd *cmd, const bool is_cq) 2214 { 2215 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2216 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2217 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2218 uint32_t qsize; 2219 uint16_t qid; 2220 2221 assert(ctrlr != NULL); 2222 assert(cmd != NULL); 2223 2224 qid = cmd->cdw10_bits.create_io_q.qid; 2225 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2226 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2227 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2228 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2229 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2230 goto out; 2231 } 2232 2233 if (io_q_exists(ctrlr, qid, is_cq)) { 2234 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2235 is_cq ? 'c' : 's', qid); 2236 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2237 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2238 goto out; 2239 } 2240 2241 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2242 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2243 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2244 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2245 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2246 goto out; 2247 } 2248 2249 if (is_cq) { 2250 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2251 } else { 2252 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2253 2254 if (sct == SPDK_NVME_SCT_GENERIC && 2255 sc == SPDK_NVME_SC_SUCCESS) { 2256 /* Completion posted asynchronously. */ 2257 return 0; 2258 } 2259 } 2260 2261 out: 2262 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2263 } 2264 2265 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2266 * queue pair, so save the command id and controller in a context. 2267 */ 2268 struct vfio_user_delete_sq_ctx { 2269 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2270 uint16_t cid; 2271 }; 2272 2273 static void 2274 vfio_user_qpair_delete_cb(void *cb_arg) 2275 { 2276 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2277 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2278 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2279 2280 assert(admin_cq != NULL); 2281 assert(admin_cq->group != NULL); 2282 assert(admin_cq->group->group->thread != NULL); 2283 if (admin_cq->group->group->thread != spdk_get_thread()) { 2284 spdk_thread_send_msg(admin_cq->group->group->thread, 2285 vfio_user_qpair_delete_cb, 2286 cb_arg); 2287 } else { 2288 post_completion(vu_ctrlr, admin_cq, 0, 0, 2289 ctx->cid, 2290 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2291 free(ctx); 2292 } 2293 } 2294 2295 /* 2296 * Deletes a completion or submission I/O queue. 2297 */ 2298 static int 2299 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2300 struct spdk_nvme_cmd *cmd, const bool is_cq) 2301 { 2302 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2303 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2304 struct nvmf_vfio_user_sq *sq; 2305 struct nvmf_vfio_user_cq *cq; 2306 2307 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2308 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2309 cmd->cdw10_bits.delete_io_q.qid); 2310 2311 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2312 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2313 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2314 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2315 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2316 goto out; 2317 } 2318 2319 if (is_cq) { 2320 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2321 if (cq->cq_ref) { 2322 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2323 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2324 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2325 goto out; 2326 } 2327 delete_cq_done(ctrlr, cq); 2328 } else { 2329 /* 2330 * Deletion of the CQ is only deferred to delete_sq_done() on 2331 * VM reboot or CC.EN change, so we have to delete it in all 2332 * other cases. 2333 */ 2334 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2335 sq->delete_ctx = calloc(1, sizeof(*sq->delete_ctx)); 2336 if (!sq->delete_ctx) { 2337 sct = SPDK_NVME_SCT_GENERIC; 2338 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2339 goto out; 2340 } 2341 sq->delete_ctx->vu_ctrlr = ctrlr; 2342 sq->delete_ctx->cid = cmd->cid; 2343 sq->sq_state = VFIO_USER_SQ_DELETED; 2344 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2345 ctrlr->cqs[sq->cqid]->cq_ref--; 2346 2347 spdk_nvmf_qpair_disconnect(&sq->qpair); 2348 return 0; 2349 } 2350 2351 out: 2352 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2353 } 2354 2355 /* 2356 * Configures Shadow Doorbells. 2357 */ 2358 static int 2359 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2360 { 2361 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2362 uint32_t dstrd; 2363 uintptr_t page_size, page_mask; 2364 uint64_t prp1, prp2; 2365 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2366 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2367 2368 assert(ctrlr != NULL); 2369 assert(ctrlr->endpoint != NULL); 2370 assert(cmd != NULL); 2371 2372 dstrd = doorbell_stride(ctrlr); 2373 page_size = memory_page_size(ctrlr); 2374 page_mask = memory_page_mask(ctrlr); 2375 2376 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2377 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2378 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2379 ctrlr_id(ctrlr)); 2380 2381 goto out; 2382 } 2383 2384 /* Verify guest physical addresses passed as PRPs. */ 2385 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2386 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2387 ctrlr_id(ctrlr)); 2388 2389 goto out; 2390 } 2391 2392 prp1 = cmd->dptr.prp.prp1; 2393 prp2 = cmd->dptr.prp.prp2; 2394 2395 SPDK_DEBUGLOG(nvmf_vfio, 2396 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2397 ctrlr_id(ctrlr), prp1, prp2); 2398 2399 if (prp1 == prp2 2400 || prp1 != (prp1 & page_mask) 2401 || prp2 != (prp2 & page_mask)) { 2402 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2403 ctrlr_id(ctrlr)); 2404 2405 goto out; 2406 } 2407 2408 /* Map guest physical addresses to our virtual address space. 
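         *
         * Rough sketch of what a successful map_sdbl() gives us (illustrative;
         * both PRPs were checked above to be distinct and page-aligned):
         *
         *     prp1 -> sdbl->iovs[0] -> sdbl->shadow_doorbells  (written by host)
         *     prp2 -> sdbl->iovs[1] -> sdbl->eventidxs         (written by us)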
*/ 2409 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2410 if (sdbl == NULL) { 2411 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2412 ctrlr_id(ctrlr)); 2413 2414 goto out; 2415 } 2416 2417 ctrlr->shadow_doorbell_buffer = prp1; 2418 ctrlr->eventidx_buffer = prp2; 2419 2420 SPDK_DEBUGLOG(nvmf_vfio, 2421 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2422 ctrlr_id(ctrlr), 2423 sdbl->iovs[0].iov_base, 2424 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2425 sdbl->iovs[1].iov_base, 2426 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2427 2428 2429 /* 2430 * Set all possible CQ head doorbells to polling mode now, such that we 2431 * don't have to worry about it later if the host creates more queues. 2432 * 2433 * We only ever want interrupts for writes to the SQ tail doorbells 2434 * (which are initialised in set_ctrlr_intr_mode() below). 2435 */ 2436 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2437 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2438 } 2439 2440 /* Update controller. */ 2441 SWAP(ctrlr->sdbl, sdbl); 2442 2443 /* 2444 * Copy doorbells from either the previous shadow doorbell buffer or the 2445 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2446 * 2447 * This needs to account for older versions of the Linux NVMe driver, 2448 * which don't clear out the buffer after a controller reset. 2449 */ 2450 copy_doorbells(ctrlr, sdbl != NULL ? 2451 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2452 ctrlr->sdbl->shadow_doorbells); 2453 2454 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2455 2456 ctrlr_kick(ctrlr); 2457 2458 sc = SPDK_NVME_SC_SUCCESS; 2459 2460 out: 2461 /* 2462 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2463 * more than once (pointless, but not prohibited by the spec), or 2464 * in case of an error. 2465 * 2466 * If this is the first time Doorbell Buffer Config was processed, 2467 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2468 * free_sdbl() becomes a noop. 2469 */ 2470 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2471 2472 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2473 } 2474 2475 /* Returns 0 on success and -errno on error. */ 2476 static int 2477 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2478 { 2479 assert(ctrlr != NULL); 2480 assert(cmd != NULL); 2481 2482 if (cmd->fuse != 0) { 2483 /* Fused admin commands are not supported. 
 */
                return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid,
                                       SPDK_NVME_SC_INVALID_FIELD,
                                       SPDK_NVME_SCT_GENERIC);
        }

        switch (cmd->opc) {
        case SPDK_NVME_OPC_CREATE_IO_CQ:
        case SPDK_NVME_OPC_CREATE_IO_SQ:
                return handle_create_io_q(ctrlr, cmd,
                                          cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ);
        case SPDK_NVME_OPC_DELETE_IO_SQ:
        case SPDK_NVME_OPC_DELETE_IO_CQ:
                return handle_del_io_q(ctrlr, cmd,
                                       cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ);
        case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG:
                SPDK_NOTICELOG("%s: requested shadow doorbells (supported: %d)\n",
                               ctrlr_id(ctrlr),
                               !ctrlr->transport->transport_opts.disable_shadow_doorbells);
                if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) {
                        return handle_doorbell_buffer_config(ctrlr, cmd);
                }
                /* FALLTHROUGH */
        default:
                return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]);
        }
}

static int
handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg)
{
        struct nvmf_vfio_user_sq *sq = cb_arg;
        struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr;
        uint16_t sqid, cqid;

        assert(sq != NULL);
        assert(vu_req != NULL);
        assert(vu_ctrlr != NULL);

        if (spdk_likely(vu_req->iovcnt)) {
                vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx,
                            index_to_sg_t(vu_req->sg, 0),
                            vu_req->iov, vu_req->iovcnt);
        }
        sqid = sq->qid;
        cqid = sq->cqid;

        return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid],
                               vu_req->req.rsp->nvme_cpl.cdw0,
                               sqid,
                               vu_req->req.cmd->nvme_cmd.cid,
                               vu_req->req.rsp->nvme_cpl.status.sc,
                               vu_req->req.rsp->nvme_cpl.status.sct);
}

static int
consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq,
            struct spdk_nvme_cmd *cmd)
{
        assert(sq != NULL);
        if (spdk_unlikely(nvmf_qpair_is_admin_queue(&sq->qpair))) {
                return consume_admin_cmd(ctrlr, cmd);
        }

        return handle_cmd_req(ctrlr, cmd, sq);
}

/* Returns the number of commands processed, or a negative value on error. */
static int
handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail,
                     struct nvmf_vfio_user_sq *sq)
{
        struct spdk_nvme_cmd *queue;
        struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid];
        int count = 0;
        uint32_t free_cq_slots;

        assert(ctrlr != NULL);
        assert(sq != NULL);

        if (ctrlr->sdbl != NULL && sq->qid != 0) {
                /*
                 * Submission queue index has moved past the event index, so it
                 * needs to be re-armed before we go to sleep.
                 */
                sq->need_rearm = true;
        }

        free_cq_slots = cq_free_slots(cq);
        queue = q_addr(&sq->mapping);
        while (*sq_headp(sq) != new_tail) {
                int err;
                struct spdk_nvme_cmd *cmd;

                /*
                 * The Linux host NVMe driver can submit more commands than there
                 * are free CQ slots available, so only process commands for
                 * which a CQ slot is available.
                 */
                if (free_cq_slots-- == 0) {
                        cq->last_head = *cq_dbl_headp(cq);

                        free_cq_slots = cq_free_slots(cq);
                        if (free_cq_slots > 0) {
                                continue;
                        }

                        /*
                         * If there are still no free CQ slots, kick the interrupt
                         * FD so that we loop again and process the remaining SQ
                         * commands. In polling mode, the remaining SQ commands are
                         * processed during the next polling iteration.
                         * The SQ head is advanced only for consumed commands.
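                         *
                         * For instance (illustrative): if cq_free_slots(cq) reports a
                         * single free slot, only one more SQ entry is consumed in this
                         * pass; the rest are picked up after the host advances the CQ
                         * head doorbell and we end up here again.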
                         */
                        if (in_interrupt_mode(ctrlr->transport)) {
                                struct nvmf_vfio_user_poll_group *vu_group = sq_to_poll_group(sq);
                                eventfd_write(vu_group->intr_fd, 1);
                        }
                        break;
                }

                cmd = &queue[*sq_headp(sq)];
                count++;

                /*
                 * SQHD must contain the new head pointer, so we must increase
                 * it before we generate a completion.
                 */
                sq_head_advance(sq);

                err = consume_cmd(ctrlr, sq, cmd);
                if (spdk_unlikely(err != 0)) {
                        return err;
                }
        }

        return count;
}

/* Checks whether the endpoint is connected from the same process */
static bool
is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint)
{
        struct ucred ucred;
        socklen_t ucredlen = sizeof(ucred);

        if (endpoint == NULL) {
                return false;
        }

        if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred,
                       &ucredlen) < 0) {
                SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno));
                return false;
        }

        return ucred.pid == getpid();
}

static void
memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
        struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
        struct nvmf_vfio_user_ctrlr *ctrlr;
        struct nvmf_vfio_user_sq *sq;
        struct nvmf_vfio_user_cq *cq;
        void *map_start, *map_end;
        int ret;

        /*
         * We're not interested in any DMA regions that aren't mappable (we don't
         * support clients that don't share their memory).
         */
        if (!info->vaddr) {
                return;
        }

        map_start = info->mapping.iov_base;
        map_end = info->mapping.iov_base + info->mapping.iov_len;

        if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
            (info->mapping.iov_len & MASK_2MB)) {
                SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n",
                              info->vaddr, map_start, map_end);
                return;
        }

        assert(endpoint != NULL);
        if (endpoint->ctrlr == NULL) {
                return;
        }
        ctrlr = endpoint->ctrlr;

        SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint),
                      map_start, map_end);

        /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when
         * registering to VFIO, so we also check the protection bits here before
         * registering. When the vfio-user client and server run in the same
         * process, there is no need to register the same memory again.
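         *
         * A sketch of the effect (illustrative): a client that shares guest RAM
         * read-write shows up here with info->prot == (PROT_READ | PROT_WRITE),
         * so the region is handed to spdk_mem_register(); read-only regions and
         * same-process peers are left unregistered.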
2681 */ 2682 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2683 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2684 if (ret) { 2685 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2686 map_start, map_end, ret); 2687 } 2688 } 2689 2690 pthread_mutex_lock(&endpoint->lock); 2691 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2692 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2693 continue; 2694 } 2695 2696 cq = ctrlr->cqs[sq->cqid]; 2697 2698 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2699 if (cq->size && q_addr(&cq->mapping) == NULL) { 2700 ret = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_QUIET); 2701 if (ret) { 2702 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2703 cq->qid, cq->mapping.prp1, 2704 cq->mapping.prp1 + cq->mapping.len); 2705 continue; 2706 } 2707 } 2708 2709 if (sq->size) { 2710 ret = map_q(ctrlr, &sq->mapping, MAP_R | MAP_QUIET); 2711 if (ret) { 2712 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2713 sq->qid, sq->mapping.prp1, 2714 sq->mapping.prp1 + sq->mapping.len); 2715 continue; 2716 } 2717 } 2718 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2719 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2720 } 2721 pthread_mutex_unlock(&endpoint->lock); 2722 } 2723 2724 static void 2725 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2726 { 2727 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2728 struct nvmf_vfio_user_sq *sq; 2729 struct nvmf_vfio_user_cq *cq; 2730 void *map_start, *map_end; 2731 int ret = 0; 2732 2733 if (!info->vaddr) { 2734 return; 2735 } 2736 2737 map_start = info->mapping.iov_base; 2738 map_end = info->mapping.iov_base + info->mapping.iov_len; 2739 2740 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2741 (info->mapping.iov_len & MASK_2MB)) { 2742 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2743 info->vaddr, map_start, map_end); 2744 return; 2745 } 2746 2747 assert(endpoint != NULL); 2748 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2749 map_start, map_end); 2750 2751 if (endpoint->ctrlr != NULL) { 2752 struct nvmf_vfio_user_ctrlr *ctrlr; 2753 ctrlr = endpoint->ctrlr; 2754 2755 pthread_mutex_lock(&endpoint->lock); 2756 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2757 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2758 unmap_q(ctrlr, &sq->mapping); 2759 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2760 } 2761 2762 cq = ctrlr->cqs[sq->cqid]; 2763 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2764 unmap_q(ctrlr, &cq->mapping); 2765 } 2766 } 2767 2768 if (ctrlr->sdbl != NULL) { 2769 size_t i; 2770 2771 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2772 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2773 2774 if (iov_base >= map_start && iov_base < map_end) { 2775 copy_doorbells(ctrlr, 2776 ctrlr->sdbl->shadow_doorbells, 2777 ctrlr->bar0_doorbells); 2778 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2779 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2780 ctrlr->sdbl = NULL; 2781 break; 2782 } 2783 } 2784 } 2785 2786 pthread_mutex_unlock(&endpoint->lock); 2787 } 2788 2789 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2790 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2791 if (ret) { 2792 SPDK_ERRLOG("Memory region unregister %p-%p 
failed, ret=%d\n", 2793 map_start, map_end, ret); 2794 } 2795 } 2796 } 2797 2798 /* Used to initiate a controller-level reset or a controller shutdown. */ 2799 static void 2800 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2801 { 2802 SPDK_NOTICELOG("%s: disabling controller\n", ctrlr_id(vu_ctrlr)); 2803 2804 /* Unmap Admin queue. */ 2805 2806 assert(vu_ctrlr->sqs[0] != NULL); 2807 assert(vu_ctrlr->cqs[0] != NULL); 2808 2809 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2810 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2811 2812 vu_ctrlr->sqs[0]->size = 0; 2813 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2814 2815 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2816 2817 vu_ctrlr->cqs[0]->size = 0; 2818 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2819 2820 /* 2821 * For PCIe controller reset or shutdown, we will drop all AER 2822 * responses. 2823 */ 2824 spdk_nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2825 2826 /* Free the shadow doorbell buffer. */ 2827 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 2828 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2829 vu_ctrlr->sdbl = NULL; 2830 } 2831 2832 /* Used to re-enable the controller after a controller-level reset. */ 2833 static int 2834 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2835 { 2836 int err; 2837 2838 assert(vu_ctrlr != NULL); 2839 2840 SPDK_NOTICELOG("%s: enabling controller\n", ctrlr_id(vu_ctrlr)); 2841 2842 err = acq_setup(vu_ctrlr); 2843 if (err != 0) { 2844 return err; 2845 } 2846 2847 err = asq_setup(vu_ctrlr); 2848 if (err != 0) { 2849 return err; 2850 } 2851 2852 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2853 2854 return 0; 2855 } 2856 2857 static int 2858 nvmf_vfio_user_prop_req_rsp_set(struct nvmf_vfio_user_req *req, 2859 struct nvmf_vfio_user_sq *sq) 2860 { 2861 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2862 union spdk_nvme_cc_register cc, diff; 2863 2864 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2865 assert(sq->ctrlr != NULL); 2866 vu_ctrlr = sq->ctrlr; 2867 2868 if (req->req.cmd->prop_set_cmd.ofst != offsetof(struct spdk_nvme_registers, cc)) { 2869 return 0; 2870 } 2871 2872 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2873 diff.raw = cc.raw ^ req->cc.raw; 2874 2875 if (diff.bits.en) { 2876 if (cc.bits.en) { 2877 int ret = enable_ctrlr(vu_ctrlr); 2878 if (ret) { 2879 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2880 return ret; 2881 } 2882 vu_ctrlr->reset_shn = false; 2883 } else { 2884 vu_ctrlr->reset_shn = true; 2885 } 2886 } 2887 2888 if (diff.bits.shn) { 2889 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2890 vu_ctrlr->reset_shn = true; 2891 } 2892 } 2893 2894 if (vu_ctrlr->reset_shn) { 2895 disable_ctrlr(vu_ctrlr); 2896 } 2897 return 0; 2898 } 2899 2900 static int 2901 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2902 { 2903 struct nvmf_vfio_user_sq *sq = cb_arg; 2904 2905 assert(sq != NULL); 2906 assert(req != NULL); 2907 2908 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2909 assert(sq->ctrlr != NULL); 2910 assert(req != NULL); 2911 2912 memcpy(req->req.iov[0].iov_base, 2913 &req->req.rsp->prop_get_rsp.value.u64, 2914 req->req.length); 2915 return 0; 2916 } 2917 2918 return nvmf_vfio_user_prop_req_rsp_set(req, sq); 2919 } 2920 2921 /* 2922 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2923 * doorbell is written via access_bar0_fn(). 2924 * 2925 * DSTRD is set to fixed value 0 for NVMf. 
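 *
 * Worked example of the index math below (illustrative; DSTRD is 0, so each
 * doorbell is one dword):
 *
 *     write to BAR0 offset 0x1008
 *         pos  = 0x1008 - NVME_DOORBELLS_OFFSET = 0x8
 *         pos >>= 2                              -> index 2
 *         index 2 is even                        -> SQ tail doorbell of sqid 1
 *
 * Even indices are SQ tail doorbells, odd indices are CQ head doorbells.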
2926 * 2927 */ 2928 static int 2929 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2930 const size_t count, loff_t pos, const bool is_write) 2931 { 2932 struct nvmf_vfio_user_poll_group *group; 2933 2934 assert(ctrlr != NULL); 2935 assert(buf != NULL); 2936 2937 if (spdk_unlikely(!is_write)) { 2938 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2939 ctrlr_id(ctrlr), pos); 2940 errno = EPERM; 2941 return -1; 2942 } 2943 2944 if (spdk_unlikely(count != sizeof(uint32_t))) { 2945 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2946 ctrlr_id(ctrlr), count); 2947 errno = EINVAL; 2948 return -1; 2949 } 2950 2951 pos -= NVME_DOORBELLS_OFFSET; 2952 2953 /* pos must be dword aligned */ 2954 if (spdk_unlikely((pos & 0x3) != 0)) { 2955 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2956 errno = EINVAL; 2957 return -1; 2958 } 2959 2960 /* convert byte offset to array index */ 2961 pos >>= 2; 2962 2963 if (spdk_unlikely(pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2)) { 2964 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2965 errno = EINVAL; 2966 return -1; 2967 } 2968 2969 ctrlr->bar0_doorbells[pos] = *buf; 2970 spdk_wmb(); 2971 2972 group = ctrlr_to_poll_group(ctrlr); 2973 if (pos == 1) { 2974 group->stats.cqh_admin_writes++; 2975 } else if (pos & 1) { 2976 group->stats.cqh_io_writes++; 2977 } 2978 2979 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 2980 ctrlr_id(ctrlr), (pos & 1) ? "cqid" : "sqid", 2981 pos / 2, *buf); 2982 2983 2984 return 0; 2985 } 2986 2987 static size_t 2988 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 2989 char *buf, size_t count, loff_t pos, 2990 bool is_write) 2991 { 2992 struct nvmf_vfio_user_req *req; 2993 const struct spdk_nvmf_registers *regs; 2994 2995 if ((count != 4) && (count != 8)) { 2996 errno = EINVAL; 2997 return -1; 2998 } 2999 3000 /* Construct a Fabric Property Get/Set command and send it */ 3001 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 3002 if (req == NULL) { 3003 errno = ENOBUFS; 3004 return -1; 3005 } 3006 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 3007 req->cc.raw = regs->cc.raw; 3008 3009 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 3010 req->cb_arg = vu_ctrlr->sqs[0]; 3011 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 3012 req->req.cmd->prop_set_cmd.cid = 0; 3013 if (count == 4) { 3014 req->req.cmd->prop_set_cmd.attrib.size = 0; 3015 } else { 3016 req->req.cmd->prop_set_cmd.attrib.size = 1; 3017 } 3018 req->req.cmd->prop_set_cmd.ofst = pos; 3019 if (is_write) { 3020 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 3021 if (req->req.cmd->prop_set_cmd.attrib.size) { 3022 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 3023 } else { 3024 req->req.cmd->prop_set_cmd.value.u32.high = 0; 3025 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 3026 } 3027 } else { 3028 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 3029 } 3030 req->req.length = count; 3031 SPDK_IOV_ONE(req->req.iov, &req->req.iovcnt, buf, req->req.length); 3032 3033 spdk_nvmf_request_exec(&req->req); 3034 3035 return count; 3036 } 3037 3038 static ssize_t 3039 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 3040 bool is_write) 3041 { 3042 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3043 struct nvmf_vfio_user_ctrlr *ctrlr; 3044 int ret; 3045 3046 ctrlr = endpoint->ctrlr; 3047 if (spdk_unlikely(endpoint->need_async_destroy || 
!ctrlr)) { 3048 errno = EIO; 3049 return -1; 3050 } 3051 3052 if (pos >= NVME_DOORBELLS_OFFSET) { 3053 /* 3054 * The fact that the doorbells can be memory mapped doesn't mean 3055 * that the client (VFIO in QEMU) is obliged to memory map them, 3056 * it might still elect to access them via regular read/write; 3057 * we might also have had disable_mappable_bar0 set. 3058 */ 3059 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 3060 pos, is_write); 3061 if (ret == 0) { 3062 return count; 3063 } 3064 return ret; 3065 } 3066 3067 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 3068 } 3069 3070 static ssize_t 3071 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 3072 bool is_write) 3073 { 3074 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3075 3076 if (is_write) { 3077 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 3078 endpoint_id(endpoint), offset, offset + count); 3079 errno = EINVAL; 3080 return -1; 3081 } 3082 3083 if (offset + count > NVME_REG_CFG_SIZE) { 3084 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 3085 endpoint_id(endpoint), offset, count, 3086 NVME_REG_CFG_SIZE); 3087 errno = ERANGE; 3088 return -1; 3089 } 3090 3091 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 3092 3093 return count; 3094 } 3095 3096 static void 3097 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 3098 { 3099 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3100 3101 if (level >= LOG_DEBUG) { 3102 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3103 } else if (level >= LOG_INFO) { 3104 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3105 } else if (level >= LOG_NOTICE) { 3106 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 3107 } else if (level >= LOG_WARNING) { 3108 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 3109 } else { 3110 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 3111 } 3112 } 3113 3114 static int 3115 vfio_user_get_log_level(void) 3116 { 3117 int level; 3118 3119 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3120 return LOG_DEBUG; 3121 } 3122 3123 level = spdk_log_to_syslog_level(spdk_log_get_level()); 3124 if (level < 0) { 3125 return LOG_ERR; 3126 } 3127 3128 return level; 3129 } 3130 3131 static void 3132 init_pci_config_space(vfu_pci_config_space_t *p) 3133 { 3134 /* MLBAR */ 3135 p->hdr.bars[0].raw = 0x0; 3136 /* MUBAR */ 3137 p->hdr.bars[1].raw = 0x0; 3138 3139 /* vendor specific, let's set them to zero for now */ 3140 p->hdr.bars[3].raw = 0x0; 3141 p->hdr.bars[4].raw = 0x0; 3142 p->hdr.bars[5].raw = 0x0; 3143 3144 /* enable INTx */ 3145 p->hdr.intr.ipin = 0x1; 3146 } 3147 3148 struct ctrlr_quiesce_ctx { 3149 struct nvmf_vfio_user_endpoint *endpoint; 3150 struct nvmf_vfio_user_poll_group *group; 3151 int status; 3152 }; 3153 3154 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 3155 3156 static void 3157 _vfio_user_endpoint_resume_done_msg(void *ctx) 3158 { 3159 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3160 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3161 3162 endpoint->need_resume = false; 3163 3164 if (!vu_ctrlr) { 3165 return; 3166 } 3167 3168 if (!vu_ctrlr->queued_quiesce) { 3169 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3170 3171 /* 3172 * We might have ignored new SQ entries while we were quiesced: 3173 * kick ourselves so we'll definitely check again while in 3174 * VFIO_USER_CTRLR_RUNNING state. 
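                 *
                 * Sketch of the window this guards against (illustrative):
                 *
                 *     host rings an SQ tail doorbell   (controller still paused)
                 *     subsystem resume completes       (state -> RUNNING here)
                 *     ctrlr_kick(vu_ctrlr)             (force one more poll pass)
                 *
                 * ctrlr_kick() is assumed here to do nothing more than trigger an
                 * extra poll iteration so the doorbell write isn't missed.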
3175 */ 3176 if (in_interrupt_mode(endpoint->transport)) { 3177 ctrlr_kick(vu_ctrlr); 3178 } 3179 return; 3180 } 3181 3182 3183 /* 3184 * Basically, once we call `vfu_device_quiesced` the device is 3185 * unquiesced from libvfio-user's perspective so from the moment 3186 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device 3187 * again. However, because the NVMf subsystem is an asynchronous 3188 * operation, this quiesce might come _before_ the NVMf subsystem has 3189 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 3190 * need to check whether a quiesce was requested. 3191 */ 3192 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 3193 ctrlr_id(vu_ctrlr)); 3194 ctrlr_quiesce(vu_ctrlr); 3195 } 3196 3197 static void 3198 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 3199 void *cb_arg, int status) 3200 { 3201 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 3202 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3203 3204 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status); 3205 3206 if (!vu_ctrlr) { 3207 return; 3208 } 3209 3210 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 3211 } 3212 3213 static void 3214 vfio_user_quiesce_done(void *ctx) 3215 { 3216 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3217 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3218 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3219 int ret; 3220 3221 if (!vu_ctrlr) { 3222 free(quiesce_ctx); 3223 return; 3224 } 3225 3226 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 3227 3228 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 3229 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3230 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 3231 vu_ctrlr->queued_quiesce = false; 3232 free(quiesce_ctx); 3233 3234 /* `vfu_device_quiesced` can change the migration state, 3235 * so we need to re-check `vu_ctrlr->state`. 3236 */ 3237 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3238 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 3239 return; 3240 } 3241 3242 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 3243 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3244 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3245 vfio_user_endpoint_resume_done, endpoint); 3246 if (ret < 0) { 3247 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3248 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3249 } 3250 } 3251 3252 static void 3253 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3254 void *ctx, int status) 3255 { 3256 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3257 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3258 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3259 3260 if (!vu_ctrlr) { 3261 free(quiesce_ctx); 3262 return; 3263 } 3264 3265 quiesce_ctx->status = status; 3266 3267 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3268 ctrlr_id(vu_ctrlr), status); 3269 3270 spdk_thread_send_msg(vu_ctrlr->thread, 3271 vfio_user_quiesce_done, ctx); 3272 } 3273 3274 /* 3275 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3276 * we've already set ctrlr->state, so we won't process new entries, but we need 3277 * to ensure that this PG is quiesced. 
This only works because there's no 3278 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3279 * 3280 * Once we've walked all PGs, we need to pause any submitted I/O via 3281 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3282 */ 3283 static void 3284 vfio_user_quiesce_pg(void *ctx) 3285 { 3286 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3287 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3288 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3289 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3290 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3291 int ret; 3292 3293 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3294 3295 if (!vu_ctrlr) { 3296 free(quiesce_ctx); 3297 return; 3298 } 3299 3300 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3301 if (quiesce_ctx->group != NULL) { 3302 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3303 vfio_user_quiesce_pg, quiesce_ctx); 3304 return; 3305 } 3306 3307 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3308 vfio_user_pause_done, quiesce_ctx); 3309 if (ret < 0) { 3310 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3311 endpoint_id(endpoint), ret); 3312 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3313 fail_ctrlr(vu_ctrlr); 3314 free(quiesce_ctx); 3315 } 3316 } 3317 3318 static void 3319 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3320 { 3321 struct ctrlr_quiesce_ctx *quiesce_ctx; 3322 3323 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3324 3325 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3326 if (!quiesce_ctx) { 3327 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3328 assert(false); 3329 return; 3330 } 3331 3332 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3333 quiesce_ctx->status = 0; 3334 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3335 3336 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3337 vfio_user_quiesce_pg, quiesce_ctx); 3338 } 3339 3340 static int 3341 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3342 { 3343 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3344 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3345 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3346 3347 if (!vu_ctrlr) { 3348 return 0; 3349 } 3350 3351 /* NVMf library will destruct controller when no 3352 * connected queue pairs. 3353 */ 3354 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3355 return 0; 3356 } 3357 3358 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3359 3360 /* There is no race condition here as device quiesce callback 3361 * and nvmf_prop_set_cc() are running in the same thread context. 
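         *
         * The checks below only bother quiescing a live controller; as an
         * illustrative truth table:
         *
         *     CC.EN == 0                    -> nothing to quiesce
         *     CSTS.RDY == 0                 -> not ready yet, nothing to quiesce
         *     CSTS.SHST == SHST_COMPLETE    -> already shut down, nothing to quiesce
         *     otherwise                     -> quiesce now or queue the quiesce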
3362 */ 3363 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3364 return 0; 3365 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3366 return 0; 3367 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3368 return 0; 3369 } 3370 3371 switch (vu_ctrlr->state) { 3372 case VFIO_USER_CTRLR_PAUSED: 3373 case VFIO_USER_CTRLR_MIGRATING: 3374 return 0; 3375 case VFIO_USER_CTRLR_RUNNING: 3376 ctrlr_quiesce(vu_ctrlr); 3377 break; 3378 case VFIO_USER_CTRLR_RESUMING: 3379 vu_ctrlr->queued_quiesce = true; 3380 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3381 vu_ctrlr->state); 3382 break; 3383 default: 3384 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3385 break; 3386 } 3387 3388 errno = EBUSY; 3389 return -1; 3390 } 3391 3392 static void 3393 vfio_user_ctrlr_dump_migr_data(const char *name, 3394 struct vfio_user_nvme_migr_state *migr_data, 3395 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3396 { 3397 struct spdk_nvmf_registers *regs; 3398 struct nvme_migr_sq_state *sq; 3399 struct nvme_migr_cq_state *cq; 3400 uint32_t *doorbell_base; 3401 uint32_t i; 3402 3403 SPDK_NOTICELOG("Dump %s\n", name); 3404 3405 regs = &migr_data->nvmf_data.regs; 3406 doorbell_base = (uint32_t *)&migr_data->doorbells; 3407 3408 SPDK_NOTICELOG("Registers\n"); 3409 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3410 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3411 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3412 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3413 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3414 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3415 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3416 3417 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3418 3419 if (sdbl != NULL) { 3420 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3421 migr_data->ctrlr_header.shadow_doorbell_buffer); 3422 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3423 migr_data->ctrlr_header.eventidx_buffer); 3424 } 3425 3426 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3427 sq = &migr_data->qps[i].sq; 3428 cq = &migr_data->qps[i].cq; 3429 3430 if (sq->size) { 3431 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3432 if (i > 0 && sdbl != NULL) { 3433 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3434 sq->sqid, 3435 sdbl->shadow_doorbells[queue_index(i, false)], 3436 sdbl->eventidxs[queue_index(i, false)]); 3437 } 3438 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3439 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3440 } 3441 3442 if (cq->size) { 3443 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3444 if (i > 0 && sdbl != NULL) { 3445 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3446 cq->cqid, 3447 sdbl->shadow_doorbells[queue_index(i, true)], 3448 sdbl->eventidxs[queue_index(i, true)]); 3449 } 3450 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3451 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3452 } 3453 } 3454 3455 SPDK_NOTICELOG("%s Dump Done\n", name); 3456 } 3457 3458 /* Read region 9 content and restore it to migration data structures */ 3459 static int 3460 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3461 struct vfio_user_nvme_migr_state *migr_state) 3462 { 3463 void *data_ptr = endpoint->migr_data; 3464 3465 /* Load vfio_user_nvme_migr_header first */ 3466 
memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3467 /* TODO: version check */ 3468 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3469 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3470 return -EINVAL; 3471 } 3472 3473 /* Load nvmf controller data */ 3474 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3475 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3476 3477 /* Load queue pairs */ 3478 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3479 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3480 3481 /* Load doorbells */ 3482 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3483 memcpy(&migr_state->doorbells, data_ptr, 3484 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3485 3486 /* Load CFG */ 3487 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3488 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3489 3490 return 0; 3491 } 3492 3493 3494 static void 3495 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3496 { 3497 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3498 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3499 struct nvmf_vfio_user_sq *sq; 3500 struct nvmf_vfio_user_cq *cq; 3501 uint64_t data_offset; 3502 void *data_ptr; 3503 uint32_t *doorbell_base; 3504 uint32_t i = 0; 3505 uint16_t sqid, cqid; 3506 struct vfio_user_nvme_migr_state migr_state = { 3507 .nvmf_data = { 3508 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3509 .regs_size = sizeof(struct spdk_nvmf_registers), 3510 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3511 } 3512 }; 3513 3514 /* Save all data to vfio_user_nvme_migr_state first, then we will 3515 * copy it to device migration region at last. 
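         *
         * The offsets recorded in ctrlr_header are relative to the start of the
         * migration region; an illustrative sketch of the arithmetic performed
         * below:
         *
         *     nvmf_data_offset = sizeof(struct vfio_user_nvme_migr_header)
         *     qp_offset        = nvmf_data_offset + sizeof(struct spdk_nvmf_ctrlr_migr_data)
         *     bar0 (doorbells) = qp_offset + qp_len
         *     cfg              = bar0 offset + NVMF_VFIO_USER_DOORBELLS_SIZE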
3516 */ 3517 3518 /* save magic number */ 3519 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3520 3521 /* save controller data */ 3522 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3523 3524 /* save connected queue pairs */ 3525 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3526 /* save sq */ 3527 sqid = sq->qid; 3528 migr_state.qps[sqid].sq.sqid = sq->qid; 3529 migr_state.qps[sqid].sq.cqid = sq->cqid; 3530 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3531 migr_state.qps[sqid].sq.size = sq->size; 3532 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3533 3534 /* save cq, for shared cq case, cq may be saved multiple times */ 3535 cqid = sq->cqid; 3536 cq = vu_ctrlr->cqs[cqid]; 3537 migr_state.qps[cqid].cq.cqid = cqid; 3538 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3539 migr_state.qps[cqid].cq.ien = cq->ien; 3540 migr_state.qps[cqid].cq.iv = cq->iv; 3541 migr_state.qps[cqid].cq.size = cq->size; 3542 migr_state.qps[cqid].cq.phase = cq->phase; 3543 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3544 i++; 3545 } 3546 3547 assert(i > 0); 3548 migr_state.ctrlr_header.num_io_queues = i - 1; 3549 3550 /* Save doorbells */ 3551 doorbell_base = (uint32_t *)&migr_state.doorbells; 3552 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3553 3554 /* Save PCI configuration space */ 3555 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3556 3557 /* Save all data to device migration region */ 3558 data_ptr = endpoint->migr_data; 3559 3560 /* Copy nvmf controller data */ 3561 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3562 data_ptr += data_offset; 3563 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3564 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3565 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3566 3567 /* Copy queue pairs */ 3568 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3569 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3570 migr_state.ctrlr_header.qp_offset = data_offset; 3571 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3572 struct nvme_migr_cq_state)); 3573 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3574 3575 /* Copy doorbells */ 3576 data_offset += migr_state.ctrlr_header.qp_len; 3577 data_ptr += migr_state.ctrlr_header.qp_len; 3578 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3579 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3580 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3581 3582 /* Copy CFG */ 3583 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3584 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3585 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3586 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3587 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3588 3589 /* copy shadow doorbells */ 3590 if (vu_ctrlr->sdbl != NULL) { 3591 migr_state.ctrlr_header.sdbl = true; 3592 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3593 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3594 } 3595 3596 /* Copy nvme migration header finally */ 3597 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3598 3599 if 
(SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3600 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3601 } 3602 } 3603 3604 /* 3605 * If we are about to close the connection, we need to unregister the interrupt, 3606 * as the library will subsequently close the file descriptor we registered. 3607 */ 3608 static int 3609 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3610 { 3611 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3612 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3613 3614 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3615 3616 if (type == VFU_RESET_LOST_CONN) { 3617 if (ctrlr != NULL) { 3618 spdk_interrupt_unregister(&ctrlr->intr); 3619 ctrlr->intr_fd = -1; 3620 } 3621 return 0; 3622 } 3623 3624 /* FIXME: LOST_CONN case ? */ 3625 if (ctrlr->sdbl != NULL) { 3626 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 3627 free_sdbl(vfu_ctx, ctrlr->sdbl); 3628 ctrlr->sdbl = NULL; 3629 } 3630 3631 /* FIXME: much more needed here. */ 3632 3633 return 0; 3634 } 3635 3636 static int 3637 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3638 struct vfio_user_nvme_migr_state *migr_state) 3639 { 3640 uint32_t i, qsize = 0; 3641 uint16_t sqid, cqid; 3642 struct vfio_user_nvme_migr_qp migr_qp; 3643 void *addr; 3644 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3645 int ret; 3646 3647 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3648 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3649 } 3650 3651 /* restore submission queues */ 3652 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3653 migr_qp = migr_state->qps[i]; 3654 3655 qsize = migr_qp.sq.size; 3656 if (qsize) { 3657 struct nvmf_vfio_user_sq *sq; 3658 3659 sqid = migr_qp.sq.sqid; 3660 if (sqid != i) { 3661 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3662 return -EINVAL; 3663 } 3664 3665 /* allocate sq if necessary */ 3666 if (vu_ctrlr->sqs[sqid] == NULL) { 3667 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3668 if (ret) { 3669 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3670 return -EFAULT; 3671 } 3672 } 3673 3674 sq = vu_ctrlr->sqs[sqid]; 3675 sq->size = qsize; 3676 3677 ret = alloc_sq_reqs(vu_ctrlr, sq); 3678 if (ret) { 3679 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3680 return -EFAULT; 3681 } 3682 3683 /* restore sq */ 3684 sq->sq_state = VFIO_USER_SQ_CREATED; 3685 sq->cqid = migr_qp.sq.cqid; 3686 *sq_headp(sq) = migr_qp.sq.head; 3687 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3688 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 3689 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3690 sq->mapping.prp1, sq->mapping.len, 3691 sq->mapping.sg, &sq->mapping.iov, 3692 PROT_READ); 3693 if (addr == NULL) { 3694 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3695 sqid, sq->mapping.prp1, sq->size); 3696 return -EFAULT; 3697 } 3698 cqs_ref[sq->cqid]++; 3699 } 3700 } 3701 3702 /* restore completion queues */ 3703 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3704 migr_qp = migr_state->qps[i]; 3705 3706 qsize = migr_qp.cq.size; 3707 if (qsize) { 3708 struct nvmf_vfio_user_cq *cq; 3709 3710 /* restore cq */ 3711 cqid = migr_qp.sq.cqid; 3712 assert(cqid == i); 3713 3714 /* allocate cq if necessary */ 3715 if (vu_ctrlr->cqs[cqid] == NULL) { 3716 ret = init_cq(vu_ctrlr, cqid); 3717 if (ret) { 3718 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3719 return -EFAULT; 3720 } 3721 } 3722 3723 
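                        /*
                         * Illustrative note: cqs_ref[] was filled in while restoring the
                         * SQs above, so, for example, two restored SQs that both name
                         * cqid 3 leave cqs_ref[3] == 2, which becomes this CQ's cq_ref.
                         */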
cq = vu_ctrlr->cqs[cqid]; 3724 3725 cq->size = qsize; 3726 3727 cq->cq_state = VFIO_USER_CQ_CREATED; 3728 cq->cq_ref = cqs_ref[cqid]; 3729 *cq_tailp(cq) = migr_qp.cq.tail; 3730 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3731 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 3732 cq->ien = migr_qp.cq.ien; 3733 cq->iv = migr_qp.cq.iv; 3734 cq->phase = migr_qp.cq.phase; 3735 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3736 cq->mapping.prp1, cq->mapping.len, 3737 cq->mapping.sg, &cq->mapping.iov, 3738 PROT_READ | PROT_WRITE); 3739 if (addr == NULL) { 3740 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3741 cqid, cq->mapping.prp1, cq->size); 3742 return -EFAULT; 3743 } 3744 } 3745 } 3746 3747 return 0; 3748 } 3749 3750 static int 3751 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3752 { 3753 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3754 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3755 uint32_t *doorbell_base; 3756 struct spdk_nvme_cmd cmd; 3757 uint16_t i; 3758 int rc = 0; 3759 struct vfio_user_nvme_migr_state migr_state = { 3760 .nvmf_data = { 3761 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3762 .regs_size = sizeof(struct spdk_nvmf_registers), 3763 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3764 } 3765 }; 3766 3767 assert(endpoint->migr_data != NULL); 3768 assert(ctrlr != NULL); 3769 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3770 if (rc) { 3771 return rc; 3772 } 3773 3774 /* restore shadow doorbells */ 3775 if (migr_state.ctrlr_header.sdbl) { 3776 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3777 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3778 migr_state.ctrlr_header.shadow_doorbell_buffer, 3779 migr_state.ctrlr_header.eventidx_buffer, 3780 memory_page_size(vu_ctrlr)); 3781 if (sdbl == NULL) { 3782 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3783 ctrlr_id(vu_ctrlr)); 3784 return -1; 3785 } 3786 3787 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3788 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3789 3790 SWAP(vu_ctrlr->sdbl, sdbl); 3791 } 3792 3793 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3794 if (rc) { 3795 return rc; 3796 } 3797 3798 /* restore PCI configuration space */ 3799 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3800 3801 doorbell_base = (uint32_t *)&migr_state.doorbells; 3802 /* restore doorbells from saved registers */ 3803 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3804 3805 /* restore nvmf controller data */ 3806 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3807 if (rc) { 3808 return rc; 3809 } 3810 3811 /* resubmit pending AERs */ 3812 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) { 3813 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3814 migr_state.nvmf_data.aer_cids[i]); 3815 memset(&cmd, 0, sizeof(cmd)); 3816 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3817 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3818 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3819 if (spdk_unlikely(rc)) { 3820 break; 3821 } 3822 } 3823 3824 return rc; 3825 } 3826 3827 static void 3828 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3829 { 3830 uint32_t i; 3831 struct nvmf_vfio_user_sq *sq; 3832 3833 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
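         *
         * Its doorbells therefore always live in BAR0; with DSTRD == 0 that is
         * (illustrative):
         *
         *     admin SQ tail = bar0_doorbells + queue_index(0, false)   (BAR0 0x1000)
         *     admin CQ head = bar0_doorbells + queue_index(0, true)    (BAR0 0x1004)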
*/ 3834 3835 if (vu_ctrlr->sqs[0] != NULL) { 3836 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3837 queue_index(0, false); 3838 } 3839 3840 if (vu_ctrlr->cqs[0] != NULL) { 3841 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3842 queue_index(0, true); 3843 } 3844 3845 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3846 3847 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3848 sq = vu_ctrlr->sqs[i]; 3849 if (!sq || !sq->size) { 3850 continue; 3851 } 3852 3853 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3854 /* ADMIN queue pair is always in the poll group, just enable it */ 3855 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3856 } else { 3857 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3858 } 3859 } 3860 } 3861 3862 /* 3863 * We are in stop-and-copy state, but still potentially have some current dirty 3864 * sgls: while we're quiesced and thus should have no active requests, we still 3865 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3866 * mapped read only). 3867 * 3868 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3869 * mark them dirty now. 3870 */ 3871 static void 3872 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3873 { 3874 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3875 3876 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3877 3878 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3879 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3880 3881 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3882 continue; 3883 } 3884 3885 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3886 } 3887 3888 if (vu_ctrlr->sdbl != NULL) { 3889 dma_sg_t *sg; 3890 size_t i; 3891 3892 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3893 ++i) { 3894 3895 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3896 continue; 3897 } 3898 3899 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3900 3901 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3902 } 3903 } 3904 } 3905 3906 static int 3907 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3908 { 3909 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3910 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3911 struct nvmf_vfio_user_sq *sq; 3912 int ret = 0; 3913 3914 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3915 vu_ctrlr->state, state); 3916 3917 switch (state) { 3918 case VFU_MIGR_STATE_STOP_AND_COPY: 3919 vu_ctrlr->in_source_vm = true; 3920 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3921 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3922 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3923 break; 3924 case VFU_MIGR_STATE_STOP: 3925 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3926 /* The controller associates with source VM is dead now, we will resume 3927 * the subsystem after destroying the controller data structure, then the 3928 * subsystem can be re-used for another new client. 3929 */ 3930 if (vu_ctrlr->in_source_vm) { 3931 endpoint->need_resume = true; 3932 } 3933 break; 3934 case VFU_MIGR_STATE_PRE_COPY: 3935 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3936 break; 3937 case VFU_MIGR_STATE_RESUME: 3938 /* 3939 * Destination ADMIN queue pair is connected when starting the VM, 3940 * but the ADMIN queue pair isn't enabled in destination VM, the poll 3941 * group will do nothing to ADMIN queue pair for now. 
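 *
 * The admin SQ resources allocated when the destination client first
 * connected are freed below; they are re-allocated during restore,
 * sized according to the state saved by the source VM.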
3942 */ 3943 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3944 break; 3945 } 3946 3947 assert(!vu_ctrlr->in_source_vm); 3948 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3949 3950 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3951 assert(sq != NULL); 3952 assert(sq->qpair.qid == 0); 3953 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3954 3955 /* Free ADMIN SQ resources first, SQ resources will be 3956 * allocated based on queue size from source VM. 3957 */ 3958 free_sq_reqs(sq); 3959 sq->size = 0; 3960 break; 3961 case VFU_MIGR_STATE_RUNNING: 3962 3963 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3964 break; 3965 } 3966 3967 if (!vu_ctrlr->in_source_vm) { 3968 /* Restore destination VM from BAR9 */ 3969 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 3970 if (ret) { 3971 break; 3972 } 3973 3974 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 3975 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 3976 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3977 /* FIXME where do we resume nvmf? */ 3978 } else { 3979 /* Rollback source VM */ 3980 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3981 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3982 vfio_user_endpoint_resume_done, endpoint); 3983 if (ret < 0) { 3984 /* TODO: fail controller with CFS bit set */ 3985 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3986 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3987 } 3988 } 3989 vu_ctrlr->migr_data_prepared = false; 3990 vu_ctrlr->in_source_vm = false; 3991 break; 3992 3993 default: 3994 return -EINVAL; 3995 } 3996 3997 return ret; 3998 } 3999 4000 static uint64_t 4001 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 4002 { 4003 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 4004 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 4005 uint64_t pending_bytes; 4006 4007 if (ctrlr->migr_data_prepared) { 4008 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 4009 pending_bytes = 0; 4010 } else { 4011 pending_bytes = vfio_user_migr_data_len(); 4012 } 4013 4014 SPDK_DEBUGLOG(nvmf_vfio, 4015 "%s current state %u, pending bytes 0x%"PRIx64"\n", 4016 endpoint_id(endpoint), ctrlr->state, pending_bytes); 4017 4018 return pending_bytes; 4019 } 4020 4021 static int 4022 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 4023 { 4024 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 4025 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 4026 4027 /* 4028 * When transitioning to pre-copy state we set pending_bytes to 0, 4029 * so the vfio-user client shouldn't attempt to read any migration 4030 * data. This is not yet guaranteed by libvfio-user. 
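 * Defend against it here by reporting a zero-length data window unless
 * the controller has actually entered the MIGRATING state.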
4031 */ 4032 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 4033 assert(size != NULL); 4034 *offset = 0; 4035 *size = 0; 4036 return 0; 4037 } 4038 4039 if (ctrlr->in_source_vm) { /* migration source */ 4040 assert(size != NULL); 4041 *size = vfio_user_migr_data_len(); 4042 vfio_user_migr_ctrlr_save_data(ctrlr); 4043 } else { /* migration destination */ 4044 assert(size == NULL); 4045 assert(!ctrlr->migr_data_prepared); 4046 } 4047 *offset = 0; 4048 ctrlr->migr_data_prepared = true; 4049 4050 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 4051 4052 return 0; 4053 } 4054 4055 static ssize_t 4056 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4057 void *buf __attribute__((unused)), 4058 uint64_t count __attribute__((unused)), 4059 uint64_t offset __attribute__((unused))) 4060 { 4061 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 4062 endpoint_id(vfu_get_private(vfu_ctx))); 4063 errno = ENOTSUP; 4064 return -1; 4065 } 4066 4067 static ssize_t 4068 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4069 void *buf __attribute__((unused)), 4070 uint64_t count __attribute__((unused)), 4071 uint64_t offset __attribute__((unused))) 4072 { 4073 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 4074 endpoint_id(vfu_get_private(vfu_ctx))); 4075 errno = ENOTSUP; 4076 return -1; 4077 } 4078 4079 static int 4080 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4081 uint64_t count) 4082 { 4083 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 4084 4085 if (count != vfio_user_migr_data_len()) { 4086 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 4087 endpoint_id(vfu_get_private(vfu_ctx)), count); 4088 errno = EINVAL; 4089 return -1; 4090 } 4091 4092 return 0; 4093 } 4094 4095 static int 4096 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 4097 struct nvmf_vfio_user_endpoint *endpoint) 4098 { 4099 int ret; 4100 ssize_t cap_offset; 4101 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 4102 struct iovec migr_sparse_mmap = {}; 4103 4104 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 4105 struct pxcap pxcap = { 4106 .hdr.id = PCI_CAP_ID_EXP, 4107 .pxcaps.ver = 0x2, 4108 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 4109 .pxdcap2.ctds = 0x1 4110 }; 4111 4112 struct msixcap msixcap = { 4113 .hdr.id = PCI_CAP_ID_MSIX, 4114 .mxc.ts = NVMF_VFIO_USER_MSIX_NUM - 1, 4115 .mtab = {.tbir = NVMF_VFIO_USER_MSIX_TABLE_BIR, .to = 0x0}, 4116 .mpba = {.pbir = NVMF_VFIO_USER_MSIX_PBA_BIR, .pbao = 0x0} 4117 }; 4118 4119 struct iovec sparse_mmap[] = { 4120 { 4121 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 4122 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 4123 }, 4124 }; 4125 4126 const vfu_migration_callbacks_t migr_callbacks = { 4127 .version = VFIO_USER_MIGR_CALLBACK_VERS, 4128 .transition = &vfio_user_migration_device_state_transition, 4129 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 4130 .prepare_data = &vfio_user_migration_prepare_data, 4131 .read_data = &vfio_user_migration_read_data, 4132 .data_written = &vfio_user_migration_data_written, 4133 .write_data = &vfio_user_migration_write_data 4134 }; 4135 4136 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 4137 if (ret < 0) { 4138 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 4139 return ret; 4140 } 4141 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 4142 /* 4143 * 0x02, 
controller uses the NVM Express programming interface 4144 * 0x08, non-volatile memory controller 4145 * 0x01, mass storage controller 4146 */ 4147 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 4148 4149 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 4150 if (cap_offset < 0) { 4151 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 4152 return ret; 4153 } 4154 4155 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 4156 if (cap_offset < 0) { 4157 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 4158 return ret; 4159 } 4160 4161 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 4162 if (cap_offset < 0) { 4163 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 4164 return ret; 4165 } 4166 4167 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 4168 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4169 if (ret < 0) { 4170 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 4171 return ret; 4172 } 4173 4174 if (vu_transport->transport_opts.disable_mappable_bar0) { 4175 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4176 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4177 NULL, 0, -1, 0); 4178 } else { 4179 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4180 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4181 sparse_mmap, 1, endpoint->devmem_fd, 0); 4182 } 4183 4184 if (ret < 0) { 4185 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 4186 return ret; 4187 } 4188 4189 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVMF_VFIO_USER_BAR4_SIZE, 4190 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4191 if (ret < 0) { 4192 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 4193 return ret; 4194 } 4195 4196 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVMF_VFIO_USER_BAR5_SIZE, 4197 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4198 if (ret < 0) { 4199 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 4200 return ret; 4201 } 4202 4203 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 4204 if (ret < 0) { 4205 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 4206 return ret; 4207 } 4208 4209 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 4210 if (ret < 0) { 4211 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 4212 return ret; 4213 } 4214 4215 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 4216 if (ret < 0) { 4217 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 4218 return ret; 4219 } 4220 4221 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVMF_VFIO_USER_MSIX_NUM); 4222 if (ret < 0) { 4223 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 4224 return ret; 4225 } 4226 4227 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 4228 4229 migr_sparse_mmap.iov_base = (void *)4096; 4230 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 4231 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 4232 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 4233 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 4234 1, endpoint->migr_fd, 0); 4235 if (ret < 0) { 4236 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 4237 return ret; 4238 } 4239 4240 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 4241 vfu_get_migr_register_area_size()); 4242 if (ret < 0) { 4243 
SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 4244 return ret; 4245 } 4246 4247 ret = vfu_realize_ctx(vfu_ctx); 4248 if (ret < 0) { 4249 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4250 return ret; 4251 } 4252 4253 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4254 assert(endpoint->pci_config_space != NULL); 4255 init_pci_config_space(endpoint->pci_config_space); 4256 4257 assert(cap_offset != 0); 4258 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4259 4260 return 0; 4261 } 4262 4263 static int nvmf_vfio_user_accept(void *ctx); 4264 4265 /* 4266 * Register an "accept" poller: this is polling for incoming vfio-user socket 4267 * connections (on the listening socket). 4268 * 4269 * We need to do this on first listening, and also after destroying a 4270 * controller, so we can accept another connection. 4271 */ 4272 static int 4273 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4274 { 4275 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4276 4277 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4278 4279 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4280 endpoint, poll_rate_us); 4281 4282 if (!endpoint->accept_poller) { 4283 return -1; 4284 } 4285 4286 endpoint->accept_thread = spdk_get_thread(); 4287 endpoint->need_relisten = false; 4288 4289 if (!spdk_interrupt_mode_is_enabled()) { 4290 return 0; 4291 } 4292 4293 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4294 assert(endpoint->accept_intr_fd != -1); 4295 4296 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4297 nvmf_vfio_user_accept, endpoint); 4298 4299 assert(endpoint->accept_intr != NULL); 4300 4301 spdk_poller_register_interrupt(endpoint->accept_poller, NULL, NULL); 4302 return 0; 4303 } 4304 4305 static void 4306 _vfio_user_relisten(void *ctx) 4307 { 4308 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4309 4310 vfio_user_register_accept_poller(endpoint); 4311 } 4312 4313 static void 4314 _free_ctrlr(void *ctx) 4315 { 4316 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4317 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4318 4319 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 4320 4321 spdk_interrupt_unregister(&ctrlr->intr); 4322 ctrlr->intr_fd = -1; 4323 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4324 4325 free(ctrlr); 4326 4327 if (endpoint->need_async_destroy) { 4328 nvmf_vfio_user_destroy_endpoint(endpoint); 4329 } else if (endpoint->need_relisten) { 4330 spdk_thread_send_msg(endpoint->accept_thread, 4331 _vfio_user_relisten, endpoint); 4332 } 4333 } 4334 4335 static void 4336 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4337 { 4338 struct spdk_thread *thread; 4339 int i; 4340 4341 assert(ctrlr != NULL); 4342 thread = ctrlr->thread ? 
ctrlr->thread : spdk_get_thread(); 4343 4344 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4345 4346 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4347 free_qp(ctrlr, i); 4348 } 4349 4350 spdk_thread_exec_msg(thread, _free_ctrlr, ctrlr); 4351 } 4352 4353 static int 4354 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4355 struct nvmf_vfio_user_endpoint *endpoint) 4356 { 4357 struct nvmf_vfio_user_ctrlr *ctrlr; 4358 int err = 0; 4359 4360 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4361 4362 /* First, construct a vfio-user CUSTOM transport controller */ 4363 ctrlr = calloc(1, sizeof(*ctrlr)); 4364 if (ctrlr == NULL) { 4365 err = -ENOMEM; 4366 goto out; 4367 } 4368 /* 4369 * We can only support one connection for now, but generate a unique cntlid in case vfio-user 4370 * transport is used together with RDMA or TCP transports in the same target 4371 */ 4372 ctrlr->cntlid = nvmf_subsystem_gen_cntlid(endpoint->subsystem); 4373 ctrlr->intr_fd = -1; 4374 ctrlr->transport = transport; 4375 ctrlr->endpoint = endpoint; 4376 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4377 TAILQ_INIT(&ctrlr->connected_sqs); 4378 4379 ctrlr->adaptive_irqs_enabled = 4380 !transport->transport_opts.disable_adaptive_irq; 4381 4382 /* Then, construct an admin queue pair */ 4383 err = init_sq(ctrlr, &transport->transport, 0); 4384 if (err != 0) { 4385 free(ctrlr); 4386 goto out; 4387 } 4388 4389 err = init_cq(ctrlr, 0); 4390 if (err != 0) { 4391 free(ctrlr); 4392 goto out; 4393 } 4394 4395 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4396 4397 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4398 if (err != 0) { 4399 free(ctrlr); 4400 goto out; 4401 } 4402 endpoint->ctrlr = ctrlr; 4403 4404 /* Notify the generic layer about the new admin queue pair */ 4405 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4406 4407 out: 4408 if (err != 0) { 4409 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4410 endpoint_id(endpoint), strerror(-err)); 4411 } 4412 4413 return err; 4414 } 4415 4416 static int 4417 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4418 const struct spdk_nvme_transport_id *trid, 4419 struct spdk_nvmf_listen_opts *listen_opts) 4420 { 4421 struct nvmf_vfio_user_transport *vu_transport; 4422 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4423 char path[PATH_MAX] = {}; 4424 char uuid[PATH_MAX] = {}; 4425 int ret; 4426 4427 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4428 transport); 4429 4430 pthread_mutex_lock(&vu_transport->lock); 4431 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4432 /* Only compare traddr */ 4433 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4434 pthread_mutex_unlock(&vu_transport->lock); 4435 return -EEXIST; 4436 } 4437 } 4438 pthread_mutex_unlock(&vu_transport->lock); 4439 4440 endpoint = calloc(1, sizeof(*endpoint)); 4441 if (!endpoint) { 4442 return -ENOMEM; 4443 } 4444 4445 pthread_mutex_init(&endpoint->lock, NULL); 4446 endpoint->devmem_fd = -1; 4447 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4448 endpoint->transport = vu_transport; 4449 4450 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4451 if (ret < 0 || ret >= PATH_MAX) { 4452 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4453 ret = -1; 4454 goto out; 4455 } 4456 4457 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4458 if (ret 
== -1) { 4459 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4460 endpoint_id(endpoint), path, spdk_strerror(errno)); 4461 goto out; 4462 } 4463 unlink(path); 4464 4465 endpoint->devmem_fd = ret; 4466 ret = ftruncate(endpoint->devmem_fd, 4467 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4468 if (ret != 0) { 4469 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4470 spdk_strerror(errno)); 4471 goto out; 4472 } 4473 4474 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4475 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4476 if (endpoint->bar0_doorbells == MAP_FAILED) { 4477 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4478 endpoint->bar0_doorbells = NULL; 4479 ret = -1; 4480 goto out; 4481 } 4482 4483 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4484 if (ret < 0 || ret >= PATH_MAX) { 4485 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 4486 spdk_strerror(errno)); 4487 ret = -1; 4488 goto out; 4489 } 4490 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4491 if (ret == -1) { 4492 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4493 endpoint_id(endpoint), path, spdk_strerror(errno)); 4494 goto out; 4495 } 4496 unlink(path); 4497 4498 endpoint->migr_fd = ret; 4499 ret = ftruncate(endpoint->migr_fd, 4500 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4501 if (ret != 0) { 4502 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4503 spdk_strerror(errno)); 4504 goto out; 4505 } 4506 4507 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4508 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4509 if (endpoint->migr_data == MAP_FAILED) { 4510 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4511 endpoint->migr_data = NULL; 4512 ret = -1; 4513 goto out; 4514 } 4515 4516 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 4517 if (ret < 0 || ret >= PATH_MAX) { 4518 SPDK_ERRLOG("%s: error to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 4519 ret = -1; 4520 goto out; 4521 } 4522 4523 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 4524 endpoint, VFU_DEV_TYPE_PCI); 4525 if (endpoint->vfu_ctx == NULL) { 4526 SPDK_ERRLOG("%s: error creating libmuser context: %m\n", 4527 endpoint_id(endpoint)); 4528 ret = -1; 4529 goto out; 4530 } 4531 4532 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, 4533 vfio_user_get_log_level()); 4534 if (ret < 0) { 4535 goto out; 4536 } 4537 4538 4539 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 4540 if (ret < 0) { 4541 goto out; 4542 } 4543 4544 ret = vfio_user_register_accept_poller(endpoint); 4545 4546 if (ret != 0) { 4547 goto out; 4548 } 4549 4550 pthread_mutex_lock(&vu_transport->lock); 4551 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 4552 pthread_mutex_unlock(&vu_transport->lock); 4553 4554 out: 4555 if (ret != 0) { 4556 nvmf_vfio_user_destroy_endpoint(endpoint); 4557 } 4558 4559 return ret; 4560 } 4561 4562 static void 4563 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 4564 const struct spdk_nvme_transport_id *trid) 4565 { 4566 struct nvmf_vfio_user_transport *vu_transport; 4567 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4568 4569 assert(trid != 
NULL); 4570 assert(trid->traddr != NULL); 4571 4572 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 4573 4574 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4575 transport); 4576 4577 pthread_mutex_lock(&vu_transport->lock); 4578 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4579 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 4580 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 4581 /* Defer to free endpoint resources until the controller 4582 * is freed. There are two cases when running here: 4583 * 1. kill nvmf target while VM is connected 4584 * 2. remove listener via RPC call 4585 * nvmf library will disconnect all queue paris. 4586 */ 4587 if (endpoint->ctrlr) { 4588 assert(!endpoint->need_async_destroy); 4589 endpoint->need_async_destroy = true; 4590 pthread_mutex_unlock(&vu_transport->lock); 4591 return; 4592 } 4593 4594 nvmf_vfio_user_destroy_endpoint(endpoint); 4595 pthread_mutex_unlock(&vu_transport->lock); 4596 return; 4597 } 4598 } 4599 pthread_mutex_unlock(&vu_transport->lock); 4600 4601 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4602 } 4603 4604 static void 4605 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4606 struct spdk_nvmf_subsystem *subsystem, 4607 struct spdk_nvmf_ctrlr_data *cdata) 4608 { 4609 struct nvmf_vfio_user_transport *vu_transport; 4610 4611 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4612 4613 cdata->vid = SPDK_PCI_VID_NUTANIX; 4614 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4615 cdata->ieee[0] = 0x8d; 4616 cdata->ieee[1] = 0x6b; 4617 cdata->ieee[2] = 0x50; 4618 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4619 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4620 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4621 /* libvfio-user can only support 1 connection for now */ 4622 cdata->oncs.reservations = 0; 4623 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4624 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4625 } 4626 4627 static int 4628 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4629 const struct spdk_nvmf_subsystem *subsystem, 4630 const struct spdk_nvme_transport_id *trid) 4631 { 4632 struct nvmf_vfio_user_transport *vu_transport; 4633 struct nvmf_vfio_user_endpoint *endpoint; 4634 4635 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4636 4637 pthread_mutex_lock(&vu_transport->lock); 4638 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4639 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4640 break; 4641 } 4642 } 4643 pthread_mutex_unlock(&vu_transport->lock); 4644 4645 if (endpoint == NULL) { 4646 return -ENOENT; 4647 } 4648 4649 /* Drop const - we will later need to pause/unpause. */ 4650 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4651 4652 return 0; 4653 } 4654 4655 /* 4656 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4657 * frequency. 4658 * 4659 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4660 * if we don't currently have a controller set up, peek to see if the socket is 4661 * able to accept a new connection. 
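 *
 * In interrupt mode the same callback is also registered against the
 * libvfio-user poll fd (see vfio_user_register_accept_poller()), so a
 * new client is handled as soon as it connects rather than on the next
 * poll interval.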
4662 */ 4663 static int 4664 nvmf_vfio_user_accept(void *ctx) 4665 { 4666 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4667 struct nvmf_vfio_user_transport *vu_transport; 4668 int err; 4669 4670 vu_transport = endpoint->transport; 4671 4672 if (endpoint->ctrlr != NULL) { 4673 return SPDK_POLLER_IDLE; 4674 } 4675 4676 /* While we're here, the controller is already destroyed, 4677 * subsystem may still be in RESUMING state, we will wait 4678 * until the subsystem is in RUNNING state. 4679 */ 4680 if (endpoint->need_resume) { 4681 return SPDK_POLLER_IDLE; 4682 } 4683 4684 err = vfu_attach_ctx(endpoint->vfu_ctx); 4685 if (err == 0) { 4686 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4687 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4688 if (err == 0) { 4689 /* 4690 * Unregister ourselves: now we've accepted a 4691 * connection, there is nothing for us to poll for, and 4692 * we will poll the connection via vfu_run_ctx() 4693 * instead. 4694 */ 4695 spdk_interrupt_unregister(&endpoint->accept_intr); 4696 spdk_poller_unregister(&endpoint->accept_poller); 4697 } 4698 return SPDK_POLLER_BUSY; 4699 } 4700 4701 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4702 return SPDK_POLLER_IDLE; 4703 } 4704 4705 return SPDK_POLLER_BUSY; 4706 } 4707 4708 static void 4709 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4710 struct spdk_nvme_transport_id *trid, 4711 struct spdk_nvmf_discovery_log_page_entry *entry) 4712 { } 4713 4714 static int vfio_user_poll_group_intr(void *ctx); 4715 4716 static void 4717 vfio_user_poll_group_add_intr(struct nvmf_vfio_user_poll_group *vu_group, 4718 struct spdk_nvmf_poll_group *group) 4719 { 4720 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK); 4721 assert(vu_group->intr_fd != -1); 4722 4723 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd, 4724 vfio_user_poll_group_intr, vu_group); 4725 assert(vu_group->intr != NULL); 4726 } 4727 4728 static struct spdk_nvmf_transport_poll_group * 4729 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4730 struct spdk_nvmf_poll_group *group) 4731 { 4732 struct nvmf_vfio_user_transport *vu_transport; 4733 struct nvmf_vfio_user_poll_group *vu_group; 4734 4735 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4736 transport); 4737 4738 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4739 4740 vu_group = calloc(1, sizeof(*vu_group)); 4741 if (vu_group == NULL) { 4742 SPDK_ERRLOG("Error allocating poll group: %m"); 4743 return NULL; 4744 } 4745 4746 if (in_interrupt_mode(vu_transport)) { 4747 vfio_user_poll_group_add_intr(vu_group, group); 4748 } 4749 4750 TAILQ_INIT(&vu_group->sqs); 4751 4752 pthread_mutex_lock(&vu_transport->pg_lock); 4753 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4754 if (vu_transport->next_pg == NULL) { 4755 vu_transport->next_pg = vu_group; 4756 } 4757 pthread_mutex_unlock(&vu_transport->pg_lock); 4758 4759 return &vu_group->group; 4760 } 4761 4762 static struct spdk_nvmf_transport_poll_group * 4763 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4764 { 4765 struct nvmf_vfio_user_transport *vu_transport; 4766 struct nvmf_vfio_user_poll_group **vu_group; 4767 struct nvmf_vfio_user_sq *sq; 4768 struct nvmf_vfio_user_cq *cq; 4769 4770 struct spdk_nvmf_transport_poll_group *result = NULL; 4771 4772 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4773 cq = sq->ctrlr->cqs[sq->cqid]; 4774 assert(cq != NULL); 4775 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct 
nvmf_vfio_user_transport, transport); 4776 4777 pthread_mutex_lock(&vu_transport->pg_lock); 4778 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4779 goto out; 4780 } 4781 4782 if (!nvmf_qpair_is_admin_queue(qpair)) { 4783 /* 4784 * If this is shared IO CQ case, just return the used CQ's poll 4785 * group, so I/O completions don't have to use 4786 * spdk_thread_send_msg(). 4787 */ 4788 if (cq->group != NULL) { 4789 result = cq->group; 4790 goto out; 4791 } 4792 4793 /* 4794 * If we're in interrupt mode, align all qpairs for a controller 4795 * on the same poll group by default, unless requested. This can 4796 * be lower in performance than running on a single poll group, 4797 * so we disable spreading by default. 4798 */ 4799 if (in_interrupt_mode(vu_transport) && 4800 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) { 4801 result = sq->ctrlr->sqs[0]->group; 4802 goto out; 4803 } 4804 4805 } 4806 4807 vu_group = &vu_transport->next_pg; 4808 assert(*vu_group != NULL); 4809 4810 result = &(*vu_group)->group; 4811 *vu_group = TAILQ_NEXT(*vu_group, link); 4812 if (*vu_group == NULL) { 4813 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4814 } 4815 4816 out: 4817 if (cq->group == NULL) { 4818 cq->group = result; 4819 } 4820 4821 pthread_mutex_unlock(&vu_transport->pg_lock); 4822 return result; 4823 } 4824 4825 static void 4826 vfio_user_poll_group_del_intr(struct nvmf_vfio_user_poll_group *vu_group) 4827 { 4828 assert(vu_group->intr_fd != -1); 4829 4830 spdk_interrupt_unregister(&vu_group->intr); 4831 4832 close(vu_group->intr_fd); 4833 vu_group->intr_fd = -1; 4834 } 4835 4836 /* called when process exits */ 4837 static void 4838 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4839 { 4840 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4841 struct nvmf_vfio_user_transport *vu_transport; 4842 4843 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4844 4845 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4846 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4847 transport); 4848 4849 if (in_interrupt_mode(vu_transport)) { 4850 vfio_user_poll_group_del_intr(vu_group); 4851 } 4852 4853 pthread_mutex_lock(&vu_transport->pg_lock); 4854 next_tgroup = TAILQ_NEXT(vu_group, link); 4855 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4856 if (next_tgroup == NULL) { 4857 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4858 } 4859 if (vu_transport->next_pg == vu_group) { 4860 vu_transport->next_pg = next_tgroup; 4861 } 4862 pthread_mutex_unlock(&vu_transport->pg_lock); 4863 4864 free(vu_group); 4865 } 4866 4867 static void 4868 _vfio_user_qpair_disconnect(void *ctx) 4869 { 4870 struct nvmf_vfio_user_sq *sq = ctx; 4871 4872 spdk_nvmf_qpair_disconnect(&sq->qpair); 4873 } 4874 4875 /* The function is used when socket connection is destroyed */ 4876 static int 4877 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4878 { 4879 struct nvmf_vfio_user_sq *sq; 4880 struct nvmf_vfio_user_endpoint *endpoint; 4881 4882 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4883 4884 endpoint = ctrlr->endpoint; 4885 assert(endpoint != NULL); 4886 4887 pthread_mutex_lock(&endpoint->lock); 4888 endpoint->need_relisten = true; 4889 ctrlr->disconnect = true; 4890 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4891 endpoint->ctrlr = NULL; 4892 free_ctrlr(ctrlr); 4893 pthread_mutex_unlock(&endpoint->lock); 4894 return 0; 4895 } 4896 4897 TAILQ_FOREACH(sq, 
&ctrlr->connected_sqs, tailq) { 4898 /* add another round thread poll to avoid recursive endpoint lock */ 4899 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4900 } 4901 pthread_mutex_unlock(&endpoint->lock); 4902 4903 return 0; 4904 } 4905 4906 /* 4907 * Poll for and process any incoming vfio-user messages. 4908 */ 4909 static int 4910 vfio_user_poll_vfu_ctx(void *ctx) 4911 { 4912 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4913 int ret; 4914 4915 assert(ctrlr != NULL); 4916 4917 /* This will call access_bar0_fn() if there are any writes 4918 * to the portion of the BAR that is not mmap'd */ 4919 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4920 if (spdk_unlikely(ret == -1)) { 4921 if (errno == EBUSY) { 4922 return SPDK_POLLER_IDLE; 4923 } 4924 4925 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4926 4927 /* 4928 * We lost the client; the reset callback will already have 4929 * unregistered the interrupt. 4930 */ 4931 if (errno == ENOTCONN) { 4932 vfio_user_destroy_ctrlr(ctrlr); 4933 return SPDK_POLLER_BUSY; 4934 } 4935 4936 /* 4937 * We might not have got a reset callback in this case, so 4938 * explicitly unregister the interrupt here. 4939 */ 4940 spdk_interrupt_unregister(&ctrlr->intr); 4941 ctrlr->intr_fd = -1; 4942 fail_ctrlr(ctrlr); 4943 } 4944 4945 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4946 } 4947 4948 struct vfio_user_post_cpl_ctx { 4949 struct nvmf_vfio_user_ctrlr *ctrlr; 4950 struct nvmf_vfio_user_cq *cq; 4951 struct spdk_nvme_cpl cpl; 4952 }; 4953 4954 static void 4955 _post_completion_msg(void *ctx) 4956 { 4957 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4958 4959 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4960 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4961 free(cpl_ctx); 4962 } 4963 4964 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4965 4966 static int 4967 vfio_user_poll_group_process(void *ctx) 4968 { 4969 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4970 int ret = 0; 4971 4972 SPDK_DEBUGLOG(vfio_user_db, "pg:%p got intr\n", vu_group); 4973 4974 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 4975 4976 /* 4977 * Re-arm the event indexes. NB: this also could rearm other 4978 * controller's SQs. 4979 */ 4980 ret |= vfio_user_poll_group_rearm(vu_group); 4981 4982 vu_group->stats.pg_process_count++; 4983 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4984 } 4985 4986 static int 4987 vfio_user_poll_group_intr(void *ctx) 4988 { 4989 struct nvmf_vfio_user_poll_group *vu_group = ctx; 4990 eventfd_t val; 4991 4992 eventfd_read(vu_group->intr_fd, &val); 4993 4994 vu_group->stats.intr++; 4995 4996 return vfio_user_poll_group_process(ctx); 4997 } 4998 4999 /* 5000 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 5001 * the SQs assigned to our own poll group. Other poll groups are handled via 5002 * vfio_user_poll_group_intr(). 5003 */ 5004 static int 5005 vfio_user_ctrlr_intr(void *ctx) 5006 { 5007 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 5008 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 5009 struct nvmf_vfio_user_poll_group *vu_group; 5010 int ret = SPDK_POLLER_IDLE; 5011 5012 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 5013 5014 SPDK_DEBUGLOG(vfio_user_db, "ctrlr pg:%p got intr\n", vu_ctrlr_group); 5015 5016 vu_ctrlr_group->stats.ctrlr_intr++; 5017 5018 /* 5019 * Poll vfio-user for this controller. 
We need to do this before polling 5020 * any SQs, as this is where doorbell writes may be handled. 5021 */ 5022 ret = vfio_user_poll_vfu_ctx(vu_ctrlr); 5023 5024 /* 5025 * `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, 5026 * just return for this case. 5027 */ 5028 if (vu_ctrlr->sqs[0] == NULL) { 5029 return ret; 5030 } 5031 5032 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) { 5033 /* 5034 * We may have just written to a doorbell owned by another 5035 * reactor: we need to prod them to make sure its SQs are polled 5036 * *after* the doorbell value is updated. 5037 */ 5038 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) { 5039 if (vu_group != vu_ctrlr_group) { 5040 SPDK_DEBUGLOG(vfio_user_db, "prodding pg:%p\n", vu_group); 5041 eventfd_write(vu_group->intr_fd, 1); 5042 } 5043 } 5044 } 5045 5046 ret |= vfio_user_poll_group_process(vu_ctrlr_group); 5047 5048 return ret; 5049 } 5050 5051 static void 5052 vfio_user_ctrlr_set_intr_mode(struct spdk_poller *poller, void *ctx, 5053 bool interrupt_mode) 5054 { 5055 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 5056 assert(ctrlr != NULL); 5057 assert(ctrlr->endpoint != NULL); 5058 5059 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 5060 ctrlr_id(ctrlr), interrupt_mode); 5061 5062 /* 5063 * interrupt_mode needs to persist across controller resets, so store 5064 * it in the endpoint instead. 5065 */ 5066 ctrlr->endpoint->interrupt_mode = interrupt_mode; 5067 5068 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 5069 } 5070 5071 /* 5072 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 5073 * set up and we can start operating on this controller. 5074 */ 5075 static void 5076 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 5077 struct spdk_nvmf_ctrlr *ctrlr) 5078 { 5079 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 5080 5081 vu_ctrlr->ctrlr = ctrlr; 5082 vu_ctrlr->cntlid = ctrlr->cntlid; 5083 vu_ctrlr->thread = spdk_get_thread(); 5084 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 5085 5086 if (!in_interrupt_mode(endpoint->transport)) { 5087 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5088 vu_ctrlr, 1000); 5089 return; 5090 } 5091 5092 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5093 vu_ctrlr, 0); 5094 5095 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 5096 assert(vu_ctrlr->intr_fd != -1); 5097 5098 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 5099 vfio_user_ctrlr_intr, vu_ctrlr); 5100 5101 assert(vu_ctrlr->intr != NULL); 5102 5103 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 5104 vfio_user_ctrlr_set_intr_mode, 5105 vu_ctrlr); 5106 } 5107 5108 static int 5109 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 5110 { 5111 struct nvmf_vfio_user_poll_group *vu_group; 5112 struct nvmf_vfio_user_sq *sq = cb_arg; 5113 struct nvmf_vfio_user_cq *admin_cq; 5114 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5115 struct nvmf_vfio_user_endpoint *endpoint; 5116 5117 assert(sq != NULL); 5118 assert(req != NULL); 5119 5120 vu_ctrlr = sq->ctrlr; 5121 assert(vu_ctrlr != NULL); 5122 endpoint = vu_ctrlr->endpoint; 5123 assert(endpoint != NULL); 5124 5125 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 5126 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 5127 endpoint->ctrlr = NULL; 5128 free_ctrlr(vu_ctrlr); 5129 return -1; 5130 } 5131 5132 vu_group = 
SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 5133 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 5134 5135 admin_cq = vu_ctrlr->cqs[0]; 5136 assert(admin_cq != NULL); 5137 assert(admin_cq->group != NULL); 5138 assert(admin_cq->group->group->thread != NULL); 5139 5140 pthread_mutex_lock(&endpoint->lock); 5141 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 5142 assert(admin_cq->group->group->thread == spdk_get_thread()); 5143 /* 5144 * The admin queue is special as SQ0 and CQ0 are created 5145 * together. 5146 */ 5147 admin_cq->cq_ref = 1; 5148 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 5149 } else { 5150 /* For I/O queues this command was generated in response to an 5151 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 5152 * been completed. Complete it now. 5153 */ 5154 if (sq->post_create_io_sq_completion) { 5155 if (admin_cq->group->group->thread != spdk_get_thread()) { 5156 struct vfio_user_post_cpl_ctx *cpl_ctx; 5157 5158 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 5159 if (!cpl_ctx) { 5160 return -ENOMEM; 5161 } 5162 cpl_ctx->ctrlr = vu_ctrlr; 5163 cpl_ctx->cq = admin_cq; 5164 cpl_ctx->cpl.sqid = 0; 5165 cpl_ctx->cpl.cdw0 = 0; 5166 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 5167 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 5168 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5169 5170 spdk_thread_send_msg(admin_cq->group->group->thread, 5171 _post_completion_msg, 5172 cpl_ctx); 5173 } else { 5174 post_completion(vu_ctrlr, admin_cq, 0, 0, 5175 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 5176 } 5177 sq->post_create_io_sq_completion = false; 5178 } else if (in_interrupt_mode(endpoint->transport)) { 5179 /* 5180 * If we're live migrating a guest, there is a window 5181 * where the I/O queues haven't been set up but the 5182 * device is in running state, during which the guest 5183 * might write to a doorbell. This doorbell write will 5184 * go unnoticed, so let's poll the whole controller to 5185 * pick that up. 5186 */ 5187 ctrlr_kick(vu_ctrlr); 5188 } 5189 sq->sq_state = VFIO_USER_SQ_ACTIVE; 5190 } 5191 5192 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 5193 pthread_mutex_unlock(&endpoint->lock); 5194 5195 free(req->req.iov[0].iov_base); 5196 req->req.iov[0].iov_base = NULL; 5197 req->req.iovcnt = 0; 5198 5199 return 0; 5200 } 5201 5202 static void 5203 _nvmf_vfio_user_poll_group_add(void *req) 5204 { 5205 spdk_nvmf_request_exec(req); 5206 } 5207 5208 /* 5209 * Add the given qpair to the given poll group. New qpairs are added via 5210 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 5211 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 5212 * nvmf_transport_poll_group_add(). 
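 *
 * A vfio-user host never sends an NVMe-oF CONNECT command, so one is
 * fabricated here on its behalf and executed; handle_queue_connect_rsp()
 * finishes setting up the queue pair once the NVMf layer completes it.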
5213 */ 5214 static int 5215 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 5216 struct spdk_nvmf_qpair *qpair) 5217 { 5218 struct nvmf_vfio_user_sq *sq; 5219 struct nvmf_vfio_user_req *vu_req; 5220 struct nvmf_vfio_user_ctrlr *ctrlr; 5221 struct spdk_nvmf_request *req; 5222 struct spdk_nvmf_fabric_connect_data *data; 5223 bool admin; 5224 5225 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5226 sq->group = group; 5227 ctrlr = sq->ctrlr; 5228 5229 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 5230 ctrlr_id(ctrlr), sq->qpair.qid, 5231 sq, qpair, group); 5232 5233 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 5234 5235 vu_req = get_nvmf_vfio_user_req(sq); 5236 if (vu_req == NULL) { 5237 return -1; 5238 } 5239 5240 req = &vu_req->req; 5241 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 5242 req->cmd->connect_cmd.cid = 0; 5243 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 5244 req->cmd->connect_cmd.recfmt = 0; 5245 req->cmd->connect_cmd.sqsize = sq->size - 1; 5246 req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid; 5247 5248 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 5249 5250 data = calloc(1, req->length); 5251 if (data == NULL) { 5252 nvmf_vfio_user_req_free(req); 5253 return -ENOMEM; 5254 } 5255 5256 SPDK_IOV_ONE(req->iov, &req->iovcnt, data, req->length); 5257 5258 data->cntlid = ctrlr->cntlid; 5259 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5260 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5261 5262 vu_req->cb_fn = handle_queue_connect_rsp; 5263 vu_req->cb_arg = sq; 5264 5265 SPDK_DEBUGLOG(nvmf_vfio, 5266 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5267 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5268 5269 /* 5270 * By the time transport's poll_group_add() callback is executed, the 5271 * qpair isn't in the ACTIVE state yet, so spdk_nvmf_request_exec() 5272 * would fail. The state changes to ACTIVE immediately after the 5273 * callback finishes, so delay spdk_nvmf_request_exec() by sending a 5274 * message. 
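 * The message is posted to the current thread, so it runs only after
 * this callback has returned and the qpair has become ACTIVE.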
5275 */ 5276 spdk_thread_send_msg(spdk_get_thread(), _nvmf_vfio_user_poll_group_add, req); 5277 return 0; 5278 } 5279 5280 static int 5281 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5282 struct spdk_nvmf_qpair *qpair) 5283 { 5284 struct nvmf_vfio_user_sq *sq; 5285 struct nvmf_vfio_user_poll_group *vu_group; 5286 5287 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5288 5289 SPDK_DEBUGLOG(nvmf_vfio, 5290 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5291 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5292 5293 5294 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5295 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5296 5297 return 0; 5298 } 5299 5300 static void 5301 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5302 { 5303 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5304 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5305 vu_req->iovcnt = 0; 5306 vu_req->req.iovcnt = 0; 5307 vu_req->req.length = 0; 5308 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5309 5310 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5311 } 5312 5313 static int 5314 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5315 { 5316 struct nvmf_vfio_user_sq *sq; 5317 struct nvmf_vfio_user_req *vu_req; 5318 5319 assert(req != NULL); 5320 5321 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5322 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5323 5324 _nvmf_vfio_user_req_free(sq, vu_req); 5325 5326 return 0; 5327 } 5328 5329 static int 5330 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5331 { 5332 struct nvmf_vfio_user_sq *sq; 5333 struct nvmf_vfio_user_req *vu_req; 5334 5335 assert(req != NULL); 5336 5337 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5338 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5339 5340 if (vu_req->cb_fn != NULL) { 5341 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5342 fail_ctrlr(sq->ctrlr); 5343 } 5344 } 5345 5346 _nvmf_vfio_user_req_free(sq, vu_req); 5347 5348 return 0; 5349 } 5350 5351 static void 5352 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5353 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5354 { 5355 struct nvmf_vfio_user_sq *sq; 5356 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5357 struct nvmf_vfio_user_endpoint *endpoint; 5358 struct vfio_user_delete_sq_ctx *del_ctx; 5359 5360 assert(qpair != NULL); 5361 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5362 vu_ctrlr = sq->ctrlr; 5363 endpoint = vu_ctrlr->endpoint; 5364 del_ctx = sq->delete_ctx; 5365 sq->delete_ctx = NULL; 5366 5367 pthread_mutex_lock(&endpoint->lock); 5368 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5369 delete_sq_done(vu_ctrlr, sq); 5370 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5371 endpoint->ctrlr = NULL; 5372 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5373 /* The controller will be freed, we can resume the subsystem 5374 * now so that the endpoint can be ready to accept another 5375 * new connection. 5376 */ 5377 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5378 vfio_user_endpoint_resume_done, endpoint); 5379 } 5380 free_ctrlr(vu_ctrlr); 5381 } 5382 pthread_mutex_unlock(&endpoint->lock); 5383 5384 if (del_ctx) { 5385 vfio_user_qpair_delete_cb(del_ctx); 5386 } 5387 5388 if (cb_fn) { 5389 cb_fn(cb_arg); 5390 } 5391 } 5392 5393 /** 5394 * Returns a preallocated request, or NULL if there isn't one available. 
5395 */ 5396 static struct nvmf_vfio_user_req * 5397 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5398 { 5399 struct nvmf_vfio_user_req *req; 5400 5401 if (sq == NULL) { 5402 return NULL; 5403 } 5404 5405 req = TAILQ_FIRST(&sq->free_reqs); 5406 if (req == NULL) { 5407 return NULL; 5408 } 5409 5410 TAILQ_REMOVE(&sq->free_reqs, req, link); 5411 5412 return req; 5413 } 5414 5415 static int 5416 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5417 { 5418 uint16_t nr; 5419 uint32_t nlb, nsid; 5420 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5421 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5422 struct spdk_nvmf_ns *ns; 5423 5424 nsid = cmd->nsid; 5425 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5426 if (ns == NULL || ns->bdev == NULL) { 5427 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5428 return -EINVAL; 5429 } 5430 5431 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5432 nr = cmd->cdw10_bits.dsm.nr + 1; 5433 return nr * sizeof(struct spdk_nvme_dsm_range); 5434 } 5435 5436 if (cmd->opc == SPDK_NVME_OPC_COPY) { 5437 nr = (cmd->cdw12 & 0x000000ffu) + 1; 5438 return nr * sizeof(struct spdk_nvme_scc_source_range); 5439 } 5440 5441 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5442 return nlb * spdk_bdev_desc_get_block_size(ns->desc); 5443 } 5444 5445 static int 5446 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5447 { 5448 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5449 uint32_t len = 0, numdw = 0; 5450 uint8_t fid; 5451 int iovcnt; 5452 5453 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5454 5455 if (req->xfer == SPDK_NVME_DATA_NONE) { 5456 return 0; 5457 } 5458 5459 switch (cmd->opc) { 5460 case SPDK_NVME_OPC_IDENTIFY: 5461 len = 4096; 5462 break; 5463 case SPDK_NVME_OPC_GET_LOG_PAGE: 5464 numdw = ((((uint32_t)cmd->cdw11_bits.get_log_page.numdu << 16) | 5465 cmd->cdw10_bits.get_log_page.numdl) + 1); 5466 if (numdw > UINT32_MAX / 4) { 5467 return -EINVAL; 5468 } 5469 len = numdw * 4; 5470 break; 5471 case SPDK_NVME_OPC_GET_FEATURES: 5472 case SPDK_NVME_OPC_SET_FEATURES: 5473 fid = cmd->cdw10_bits.set_features.fid; 5474 switch (fid) { 5475 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5476 len = 4096; 5477 break; 5478 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5479 len = 256; 5480 break; 5481 case SPDK_NVME_FEAT_TIMESTAMP: 5482 len = 8; 5483 break; 5484 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5485 len = 512; 5486 break; 5487 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5488 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5489 len = 16; 5490 } else { 5491 len = 8; 5492 } 5493 break; 5494 default: 5495 return 0; 5496 } 5497 break; 5498 case SPDK_NVME_OPC_FABRIC: 5499 return -ENOTSUP; 5500 default: 5501 return 0; 5502 } 5503 5504 /* ADMIN command will not use SGL */ 5505 if (cmd->psdt != 0) { 5506 return -EINVAL; 5507 } 5508 5509 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5510 if (iovcnt < 0) { 5511 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5512 ctrlr_id(ctrlr), cmd->opc); 5513 return -1; 5514 } 5515 req->length = len; 5516 req->iovcnt = iovcnt; 5517 5518 return 0; 5519 } 5520 5521 /* 5522 * Map an I/O command's buffers. 5523 * 5524 * Returns 0 on success and -errno on failure. 
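 *
 * The transfer length is computed from the command fields themselves
 * (see get_nvmf_io_req_length()) and the buffers are then mapped
 * directly from guest memory via vfio_user_map_cmd().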
5525 */ 5526 static int 5527 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5528 { 5529 int len, iovcnt; 5530 struct spdk_nvme_cmd *cmd; 5531 5532 assert(ctrlr != NULL); 5533 assert(req != NULL); 5534 5535 cmd = &req->cmd->nvme_cmd; 5536 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5537 5538 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5539 return 0; 5540 } 5541 5542 len = get_nvmf_io_req_length(req); 5543 if (len < 0) { 5544 return -EINVAL; 5545 } 5546 req->length = len; 5547 5548 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5549 if (iovcnt < 0) { 5550 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5551 return -EFAULT; 5552 } 5553 req->iovcnt = iovcnt; 5554 5555 return 0; 5556 } 5557 5558 static int 5559 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5560 struct nvmf_vfio_user_sq *sq) 5561 { 5562 int err; 5563 struct nvmf_vfio_user_req *vu_req; 5564 struct spdk_nvmf_request *req; 5565 5566 assert(ctrlr != NULL); 5567 assert(cmd != NULL); 5568 5569 vu_req = get_nvmf_vfio_user_req(sq); 5570 if (spdk_unlikely(vu_req == NULL)) { 5571 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5572 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5573 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5574 5575 } 5576 req = &vu_req->req; 5577 5578 assert(req->qpair != NULL); 5579 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5580 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5581 5582 vu_req->cb_fn = handle_cmd_rsp; 5583 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5584 req->cmd->nvme_cmd = *cmd; 5585 5586 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5587 err = map_admin_cmd_req(ctrlr, req); 5588 } else { 5589 switch (cmd->opc) { 5590 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5591 case SPDK_NVME_OPC_RESERVATION_REPORT: 5592 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5593 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5594 case SPDK_NVME_OPC_FABRIC: 5595 err = -ENOTSUP; 5596 break; 5597 default: 5598 err = map_io_cmd_req(ctrlr, req); 5599 break; 5600 } 5601 } 5602 5603 if (spdk_unlikely(err < 0)) { 5604 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5605 ctrlr_id(ctrlr), cmd->opc); 5606 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5607 req->rsp->nvme_cpl.status.sc = err == -ENOTSUP ? 5608 SPDK_NVME_SC_INVALID_OPCODE : 5609 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5610 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5611 _nvmf_vfio_user_req_free(sq, vu_req); 5612 return err; 5613 } 5614 5615 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5616 spdk_nvmf_request_exec(req); 5617 5618 return 0; 5619 } 5620 5621 /* 5622 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5623 * here: if the host isn't up to date, and is apparently not actively processing 5624 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5625 */ 5626 static void 5627 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5628 struct nvmf_vfio_user_sq *sq) 5629 { 5630 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5631 uint32_t cq_head; 5632 uint32_t cq_tail; 5633 5634 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5635 return; 5636 } 5637 5638 cq_tail = *cq_tailp(cq); 5639 5640 /* Already sent? 
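 * (i.e. an IRQ was already triggered for this CQ tail value)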
*/ 5641 if (cq_tail == cq->last_trigger_irq_tail) { 5642 return; 5643 } 5644 5645 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5646 cq_head = *cq_dbl_headp(cq); 5647 5648 if (cq_head != cq_tail && cq_head == cq->last_head) { 5649 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5650 if (err != 0) { 5651 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5652 ctrlr_id(ctrlr)); 5653 } else { 5654 cq->last_trigger_irq_tail = cq_tail; 5655 } 5656 } 5657 5658 cq->last_head = cq_head; 5659 } 5660 5661 /* Returns the number of commands processed, or a negative value on error. */ 5662 static int 5663 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5664 { 5665 struct nvmf_vfio_user_ctrlr *ctrlr; 5666 uint32_t new_tail; 5667 int count = 0; 5668 5669 assert(sq != NULL); 5670 5671 ctrlr = sq->ctrlr; 5672 5673 /* 5674 * A quiesced, or migrating, controller should never process new 5675 * commands. 5676 */ 5677 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 5678 return SPDK_POLLER_IDLE; 5679 } 5680 5681 if (ctrlr->adaptive_irqs_enabled) { 5682 handle_suppressed_irq(ctrlr, sq); 5683 } 5684 5685 /* On aarch64 platforms, doorbells update from guest VM may not be seen 5686 * on SPDK target side. This is because there is memory type mismatch 5687 * situation here. That is on guest VM side, the doorbells are treated as 5688 * device memory while on SPDK target side, it is treated as normal 5689 * memory. And this situation cause problem on ARM platform. 5690 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5691 * Memory-aliasing-and-mismatched-memory-types". Only using spdk_mb() 5692 * cannot fix this. Use "dc civac" to invalidate cache may solve 5693 * this. 5694 */ 5695 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5696 5697 /* Load-Acquire. */ 5698 new_tail = *sq_dbl_tailp(sq); 5699 5700 new_tail = new_tail & 0xffffu; 5701 if (spdk_unlikely(new_tail >= sq->size)) { 5702 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5703 new_tail); 5704 spdk_nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE); 5705 5706 return -1; 5707 } 5708 5709 if (*sq_headp(sq) == new_tail) { 5710 return 0; 5711 } 5712 5713 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5714 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5715 if (ctrlr->sdbl != NULL) { 5716 SPDK_DEBUGLOG(nvmf_vfio, 5717 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5718 ctrlr_id(ctrlr), sq->qid, 5719 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5720 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5721 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5722 } 5723 5724 /* 5725 * Ensure that changes to the queue are visible to us. 5726 * The host driver should write the queue first, do a wmb(), and then 5727 * update the SQ tail doorbell (their Store-Release). 5728 */ 5729 spdk_rmb(); 5730 5731 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5732 if (spdk_unlikely(count < 0)) { 5733 fail_ctrlr(ctrlr); 5734 } 5735 5736 return count; 5737 } 5738 5739 /* 5740 * vfio-user transport poll handler. Note that the library context is polled in 5741 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the 5742 * active SQs. 5743 * 5744 * Returns the number of commands processed, or a negative value on error. 
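 *
 * Per-poll-group statistics (polls, poll_reqs, polls_spurious) are
 * updated here as a side effect.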
static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
	struct nvmf_vfio_user_poll_group *vu_group;
	struct nvmf_vfio_user_sq *sq, *tmp;
	int count = 0;

	assert(group != NULL);

	vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

	SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n");

	TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) {
		int ret;

		if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
			continue;
		}

		ret = nvmf_vfio_user_sq_poll(sq);

		if (spdk_unlikely(ret < 0)) {
			return ret;
		}

		count += ret;
	}

	vu_group->stats.polls++;
	vu_group->stats.poll_reqs += count;
	vu_group->stats.poll_reqs_squared += count * count;
	if (count == 0) {
		vu_group->stats.polls_spurious++;
	}

	return count;
}

static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
				    struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvme_transport_id *trid)
{
	return 0;
}

static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
				     struct spdk_nvme_transport_id *trid)
{
	struct nvmf_vfio_user_sq *sq;
	struct nvmf_vfio_user_ctrlr *ctrlr;

	sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
	ctrlr = sq->ctrlr;

	memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
	return 0;
}

static void
nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
				   struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_request *req_to_abort = NULL;
	struct spdk_nvmf_request *temp_req = NULL;
	uint16_t cid;

	cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;

	TAILQ_FOREACH(temp_req, &qpair->outstanding, link) {
		struct nvmf_vfio_user_req *vu_req;

		vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req);

		if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
			req_to_abort = temp_req;
			break;
		}
	}

	if (req_to_abort == NULL) {
		spdk_nvmf_request_complete(req);
		return;
	}

	req->req_to_abort = req_to_abort;
	nvmf_ctrlr_abort_request(req);
}
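/*
 * The "poll_reqs_variance" value reported below is derived from the running
 * sums kept in nvmf_vfio_user_poll_group_poll(): with n = polls,
 * S1 = poll_reqs and S2 = poll_reqs_squared, the sample variance of requests
 * per poll is (n * S2 - S1 * S1) / (n * (n - 1)); the field reports its
 * square root, i.e. the standard deviation, with the division performed in
 * integer arithmetic before sqrt().
 */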
static void
nvmf_vfio_user_poll_group_dump_stat(struct spdk_nvmf_transport_poll_group *group,
				    struct spdk_json_write_ctx *w)
{
	struct nvmf_vfio_user_poll_group *vu_group = SPDK_CONTAINEROF(group,
			struct nvmf_vfio_user_poll_group, group);
	uint64_t polls_denom;

	spdk_json_write_named_uint64(w, "ctrlr_intr", vu_group->stats.ctrlr_intr);
	spdk_json_write_named_uint64(w, "ctrlr_kicks", vu_group->stats.ctrlr_kicks);
	spdk_json_write_named_uint64(w, "won", vu_group->stats.won);
	spdk_json_write_named_uint64(w, "lost", vu_group->stats.lost);
	spdk_json_write_named_uint64(w, "lost_count", vu_group->stats.lost_count);
	spdk_json_write_named_uint64(w, "rearms", vu_group->stats.rearms);
	spdk_json_write_named_uint64(w, "pg_process_count", vu_group->stats.pg_process_count);
	spdk_json_write_named_uint64(w, "intr", vu_group->stats.intr);
	spdk_json_write_named_uint64(w, "polls", vu_group->stats.polls);
	spdk_json_write_named_uint64(w, "polls_spurious", vu_group->stats.polls_spurious);
	spdk_json_write_named_uint64(w, "poll_reqs", vu_group->stats.poll_reqs);
	polls_denom = vu_group->stats.polls * (vu_group->stats.polls - 1);
	if (polls_denom) {
		uint64_t n = vu_group->stats.polls * vu_group->stats.poll_reqs_squared -
			     vu_group->stats.poll_reqs * vu_group->stats.poll_reqs;
		spdk_json_write_named_double(w, "poll_reqs_variance", sqrt(n / polls_denom));
	}

	spdk_json_write_named_uint64(w, "cqh_admin_writes", vu_group->stats.cqh_admin_writes);
	spdk_json_write_named_uint64(w, "cqh_io_writes", vu_group->stats.cqh_io_writes);
}

static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
	opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
	opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
	opts->in_capsule_data_size = 0;
	opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
	opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
	opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
	opts->num_shared_buffers = 0;
	opts->buf_cache_size = 0;
	opts->association_timeout = 0;
	opts->transport_specific = NULL;
}

const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
	.name = "VFIOUSER",
	.type = SPDK_NVME_TRANSPORT_VFIOUSER,
	.opts_init = nvmf_vfio_user_opts_init,
	.create = nvmf_vfio_user_create,
	.destroy = nvmf_vfio_user_destroy,

	.listen = nvmf_vfio_user_listen,
	.stop_listen = nvmf_vfio_user_stop_listen,
	.cdata_init = nvmf_vfio_user_cdata_init,
	.listen_associate = nvmf_vfio_user_listen_associate,

	.listener_discover = nvmf_vfio_user_discover,

	.poll_group_create = nvmf_vfio_user_poll_group_create,
	.get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group,
	.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
	.poll_group_add = nvmf_vfio_user_poll_group_add,
	.poll_group_remove = nvmf_vfio_user_poll_group_remove,
	.poll_group_poll = nvmf_vfio_user_poll_group_poll,

	.req_free = nvmf_vfio_user_req_free,
	.req_complete = nvmf_vfio_user_req_complete,

	.qpair_fini = nvmf_vfio_user_close_qpair,
	.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
	.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
	.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
	.qpair_abort_request = nvmf_vfio_user_qpair_abort_request,

	.poll_group_dump_stat = nvmf_vfio_user_poll_group_dump_stat,
};

SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
SPDK_LOG_REGISTER_COMPONENT(vfio_user_db)
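/*
 * The transport is exposed to users under the name "VFIOUSER" (from .name in
 * the ops table above). As a rough illustration of how it is typically wired
 * up over RPC -- the subsystem NQN, bdev name and socket directory below are
 * examples only; see SPDK's vfio-user documentation for the authoritative
 * steps:
 *
 *   rpc.py nvmf_create_transport -t VFIOUSER
 *   rpc.py bdev_malloc_create 64 512 -b Malloc0
 *   rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a -s SPDK0
 *   rpc.py nvmf_subsystem_add_ns nqn.2019-07.io.spdk:cnode0 Malloc0
 *   rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 \
 *       -t VFIOUSER -a /var/run -s 0
 *
 * The listener's traddr is a directory in which the endpoint creates the
 * vfio-user socket for the client (e.g. QEMU) to connect to.
 */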