1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2020 Intel Corporation. 3 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved. 4 * Copyright (c) 2022, 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 /* 8 * NVMe over vfio-user transport 9 */ 10 11 #include <sys/param.h> 12 13 #include <vfio-user/libvfio-user.h> 14 #include <vfio-user/pci_defs.h> 15 16 #include "spdk/barrier.h" 17 #include "spdk/stdinc.h" 18 #include "spdk/assert.h" 19 #include "spdk/thread.h" 20 #include "spdk/nvmf_transport.h" 21 #include "spdk/sock.h" 22 #include "spdk/string.h" 23 #include "spdk/util.h" 24 #include "spdk/log.h" 25 26 #include "transport.h" 27 28 #include "nvmf_internal.h" 29 30 #define SWAP(x, y) \ 31 do \ 32 { \ 33 typeof(x) _tmp = x; \ 34 x = y; \ 35 y = _tmp; \ 36 } while (0) 37 38 #define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256 39 #define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32 40 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB) 41 #define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 42 43 #define NVME_DOORBELLS_OFFSET 0x1000 44 #define NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT 2 45 #define NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS 3 46 #define NVMF_VFIO_USER_EVENTIDX_POLL UINT32_MAX 47 48 #define NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR 512 49 #define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR / 4) 50 51 /* NVMe spec 1.4, section 5.21.1.7 */ 52 SPDK_STATIC_ASSERT(NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR >= 2 && 53 NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR <= SPDK_NVME_MAX_IO_QUEUES, 54 "bad number of queues"); 55 56 /* 57 * NVMe driver reads 4096 bytes, which is the extended PCI configuration space 58 * available on PCI-X 2.0 and PCI Express buses 59 */ 60 #define NVME_REG_CFG_SIZE 0x1000 61 62 /* 63 * Doorbells must be page aligned so that they can memory mapped. 64 * 65 * TODO does the NVMe spec also require this? Document it. 66 */ 67 #define NVMF_VFIO_USER_DOORBELLS_SIZE \ 68 SPDK_ALIGN_CEIL( \ 69 (NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2 * SPDK_NVME_DOORBELL_REGISTER_SIZE), \ 70 0x1000) 71 #define NVME_REG_BAR0_SIZE (NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE) 72 73 /* 74 * TODO check the PCI spec whether BAR4 and BAR5 really have to be at least one 75 * page and a multiple of page size (maybe QEMU also needs this?). Document all 76 * this. 77 */ 78 79 #define NVMF_VFIO_USER_MSIX_NUM MAX(CHAR_BIT, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) 80 81 #define NVMF_VFIO_USER_MSIX_TABLE_BIR (4) 82 #define NVMF_VFIO_USER_BAR4_SIZE SPDK_ALIGN_CEIL((NVMF_VFIO_USER_MSIX_NUM * 16), 0x1000) 83 SPDK_STATIC_ASSERT(NVMF_VFIO_USER_BAR4_SIZE > 0, "Incorrect size"); 84 85 /* 86 * TODO according to the PCI spec we need one bit per vector, document the 87 * relevant section. 
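 *
 * As a worked example of the sizing below (not normative, just the
 * arithmetic the macros perform with the defaults above): with
 * NVMF_VFIO_USER_MSIX_NUM == 512 vectors, the PBA needs one bit per vector,
 * i.e. 512 / CHAR_BIT == 64 bytes, and SPDK_ALIGN_CEIL(64, 0x1000) rounds
 * that up to a single 4 KiB page, so BAR5 ends up exactly one page long.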
88 */ 89 #define NVMF_VFIO_USER_MSIX_PBA_BIR (5) 90 #define NVMF_VFIO_USER_BAR5_SIZE SPDK_ALIGN_CEIL((NVMF_VFIO_USER_MSIX_NUM / CHAR_BIT), 0x1000) 91 SPDK_STATIC_ASSERT(NVMF_VFIO_USER_BAR5_SIZE > 0, "Incorrect size"); 92 struct nvmf_vfio_user_req; 93 94 typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg); 95 96 /* 1 more for PRP2 list itself */ 97 #define NVMF_VFIO_USER_MAX_IOVECS (NVMF_REQ_MAX_BUFFERS + 1) 98 99 enum nvmf_vfio_user_req_state { 100 VFIO_USER_REQUEST_STATE_FREE = 0, 101 VFIO_USER_REQUEST_STATE_EXECUTING, 102 }; 103 104 /* 105 * Support for live migration in NVMf/vfio-user: live migration is implemented 106 * by stopping the NVMf subsystem when the device is instructed to enter the 107 * stop-and-copy state and then trivially, and most importantly safely, 108 * collecting migration state and providing it to the vfio-user client. We 109 * don't provide any migration state at the pre-copy state as that's too 110 * complicated to do, we might support this in the future. 111 */ 112 113 114 /* NVMe device state representation */ 115 struct nvme_migr_sq_state { 116 uint16_t sqid; 117 uint16_t cqid; 118 uint32_t head; 119 uint32_t size; 120 uint32_t reserved; 121 uint64_t dma_addr; 122 }; 123 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_sq_state) == 0x18, "Incorrect size"); 124 125 struct nvme_migr_cq_state { 126 uint16_t cqid; 127 uint16_t phase; 128 uint32_t tail; 129 uint32_t size; 130 uint32_t iv; 131 uint32_t ien; 132 uint32_t reserved; 133 uint64_t dma_addr; 134 }; 135 SPDK_STATIC_ASSERT(sizeof(struct nvme_migr_cq_state) == 0x20, "Incorrect size"); 136 137 #define VFIO_USER_MIGR_CALLBACK_VERS 1 138 #define VFIO_USER_NVME_MIGR_MAGIC 0xAFEDBC23 139 140 /* The device state is in VFIO MIGRATION BAR(9) region, keep the device state page aligned. 141 * 142 * NVMe device migration region is defined as below: 143 * ------------------------------------------------------------------------- 144 * | vfio_user_nvme_migr_header | nvmf controller data | queue pairs | BARs | 145 * ------------------------------------------------------------------------- 146 * 147 * Keep vfio_user_nvme_migr_header as a fixed 0x1000 length, all new added fields 148 * can use the reserved space at the end of the data structure. 149 */ 150 struct vfio_user_nvme_migr_header { 151 /* Magic value to validate migration data */ 152 uint32_t magic; 153 /* Version to check the data is same from source to destination */ 154 uint32_t version; 155 156 /* The library uses this field to know how many fields in this 157 * structure are valid, starting at the beginning of this data 158 * structure. New added fields in future use `unused` memory 159 * spaces. 160 */ 161 uint32_t opts_size; 162 uint32_t reserved0; 163 164 /* BARs information */ 165 uint64_t bar_offset[VFU_PCI_DEV_NUM_REGIONS]; 166 uint64_t bar_len[VFU_PCI_DEV_NUM_REGIONS]; 167 168 /* Queue pair start offset, starting at the beginning of this 169 * data structure. 170 */ 171 uint64_t qp_offset; 172 uint64_t qp_len; 173 174 /* Controller data structure */ 175 uint32_t num_io_queues; 176 uint32_t reserved1; 177 178 /* NVMf controller data offset and length if exist, starting at 179 * the beginning of this data structure. 180 */ 181 uint64_t nvmf_data_offset; 182 uint64_t nvmf_data_len; 183 184 /* 185 * Whether or not shadow doorbells are used in the source. 0 is a valid DMA 186 * address. 187 */ 188 uint32_t sdbl; 189 190 /* Shadow doorbell DMA addresses. 
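 *
 * These are the guest-physical PRP1/PRP2 values the host supplied for the
 * shadow doorbell and eventidx arrays. They are carried across so that the
 * destination can re-establish the same mappings on resume, in sketch form
 * (the exact length argument is elided here; hdr stands for the restored
 * migration header):
 *
 *   ctrlr->sdbl = map_sdbl(vfu_ctx, hdr->shadow_doorbell_buffer,
 *                          hdr->eventidx_buffer, len);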
*/ 191 uint64_t shadow_doorbell_buffer; 192 uint64_t eventidx_buffer; 193 194 /* Reserved memory space for new added fields, the 195 * field is always at the end of this data structure. 196 */ 197 uint8_t unused[3856]; 198 }; 199 SPDK_STATIC_ASSERT(sizeof(struct vfio_user_nvme_migr_header) == 0x1000, "Incorrect size"); 200 201 struct vfio_user_nvme_migr_qp { 202 struct nvme_migr_sq_state sq; 203 struct nvme_migr_cq_state cq; 204 }; 205 206 /* NVMe state definition used to load/restore from/to NVMe migration BAR region */ 207 struct vfio_user_nvme_migr_state { 208 struct vfio_user_nvme_migr_header ctrlr_header; 209 struct spdk_nvmf_ctrlr_migr_data nvmf_data; 210 struct vfio_user_nvme_migr_qp qps[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 211 uint8_t doorbells[NVMF_VFIO_USER_DOORBELLS_SIZE]; 212 uint8_t cfg[NVME_REG_CFG_SIZE]; 213 }; 214 215 struct nvmf_vfio_user_req { 216 struct spdk_nvmf_request req; 217 struct spdk_nvme_cpl rsp; 218 struct spdk_nvme_cmd cmd; 219 220 enum nvmf_vfio_user_req_state state; 221 nvmf_vfio_user_req_cb_fn cb_fn; 222 void *cb_arg; 223 224 /* old CC before prop_set_cc fabric command */ 225 union spdk_nvme_cc_register cc; 226 227 TAILQ_ENTRY(nvmf_vfio_user_req) link; 228 229 struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS]; 230 uint8_t iovcnt; 231 232 /* NVMF_VFIO_USER_MAX_IOVECS worth of dma_sg_t. */ 233 uint8_t sg[]; 234 }; 235 236 #define MAP_R (0) 237 #define MAP_RW (1 << 0) 238 #define MAP_INITIALIZE (1 << 1) 239 #define MAP_QUIET (1 << 2) 240 241 /* 242 * Mapping of an NVMe queue. 243 * 244 * This holds the information tracking a local process mapping of an NVMe queue 245 * shared by the client. 246 */ 247 struct nvme_q_mapping { 248 /* iov of local process mapping. */ 249 struct iovec iov; 250 /* Stored sg, needed for unmap. */ 251 dma_sg_t *sg; 252 /* Client PRP of queue. */ 253 uint64_t prp1; 254 /* Total length in bytes. */ 255 uint64_t len; 256 }; 257 258 enum nvmf_vfio_user_sq_state { 259 VFIO_USER_SQ_UNUSED = 0, 260 VFIO_USER_SQ_CREATED, 261 VFIO_USER_SQ_DELETED, 262 VFIO_USER_SQ_ACTIVE, 263 VFIO_USER_SQ_INACTIVE 264 }; 265 266 enum nvmf_vfio_user_cq_state { 267 VFIO_USER_CQ_UNUSED = 0, 268 VFIO_USER_CQ_CREATED, 269 VFIO_USER_CQ_DELETED, 270 }; 271 272 enum nvmf_vfio_user_ctrlr_state { 273 VFIO_USER_CTRLR_CREATING = 0, 274 VFIO_USER_CTRLR_RUNNING, 275 /* Quiesce requested by libvfio-user */ 276 VFIO_USER_CTRLR_PAUSING, 277 /* NVMf subsystem is paused, it's safe to do PCI reset, memory register, 278 * memory unergister, and vfio migration state transition in this state. 279 */ 280 VFIO_USER_CTRLR_PAUSED, 281 /* 282 * Implies that the NVMf subsystem is paused. Device will be unquiesced (PCI 283 * reset, memory register and unregister, controller in destination VM has 284 * been restored). NVMf subsystem resume has been requested. 285 */ 286 VFIO_USER_CTRLR_RESUMING, 287 /* 288 * Implies that the NVMf subsystem is paused. Both controller in source VM and 289 * destinatiom VM is in this state when doing live migration. 290 */ 291 VFIO_USER_CTRLR_MIGRATING 292 }; 293 294 struct nvmf_vfio_user_sq { 295 struct spdk_nvmf_qpair qpair; 296 struct spdk_nvmf_transport_poll_group *group; 297 struct nvmf_vfio_user_ctrlr *ctrlr; 298 299 uint32_t qid; 300 /* Number of entries in queue. */ 301 uint32_t size; 302 struct nvme_q_mapping mapping; 303 enum nvmf_vfio_user_sq_state sq_state; 304 305 uint32_t head; 306 volatile uint32_t *dbl_tailp; 307 308 /* Whether a shadow doorbell eventidx needs setting. 
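 *
 * Set when the SQ is switched onto shadow doorbells (or created while they
 * are active) and cleared by set_sq_eventidx() once an eventidx that is
 * guaranteed to trigger a BAR0 doorbell write has been armed. The
 * interrupt-mode poller consumes it roughly like this (see
 * vfio_user_poll_group_rearm()):
 *
 *   if (sq->need_rearm) {
 *           count += vfio_user_sq_rearm(sq->ctrlr, sq, vu_group);
 *   }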
*/ 309 bool need_rearm; 310 311 /* multiple SQs can be mapped to the same CQ */ 312 uint16_t cqid; 313 314 /* handle_queue_connect_rsp() can be used both for CREATE IO SQ response 315 * and SQ re-connect response in the destination VM, for the prior case, 316 * we will post a NVMe completion to VM, we will not set this flag when 317 * re-connecting SQs in the destination VM. 318 */ 319 bool post_create_io_sq_completion; 320 /* Copy of Create IO SQ command, this field is used together with 321 * `post_create_io_sq_completion` flag. 322 */ 323 struct spdk_nvme_cmd create_io_sq_cmd; 324 325 struct vfio_user_delete_sq_ctx *delete_ctx; 326 327 /* Currently unallocated reqs. */ 328 TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs; 329 /* Poll group entry */ 330 TAILQ_ENTRY(nvmf_vfio_user_sq) link; 331 /* Connected SQ entry */ 332 TAILQ_ENTRY(nvmf_vfio_user_sq) tailq; 333 }; 334 335 struct nvmf_vfio_user_cq { 336 struct spdk_nvmf_transport_poll_group *group; 337 int cq_ref; 338 339 uint32_t qid; 340 /* Number of entries in queue. */ 341 uint32_t size; 342 struct nvme_q_mapping mapping; 343 enum nvmf_vfio_user_cq_state cq_state; 344 345 uint32_t tail; 346 volatile uint32_t *dbl_headp; 347 348 bool phase; 349 350 uint16_t iv; 351 bool ien; 352 353 /* Number of outstanding IOs that will complete in this queue. */ 354 size_t nr_outstanding; 355 356 uint32_t last_head; 357 uint32_t last_trigger_irq_tail; 358 }; 359 360 struct nvmf_vfio_user_poll_group { 361 struct spdk_nvmf_transport_poll_group group; 362 TAILQ_ENTRY(nvmf_vfio_user_poll_group) link; 363 TAILQ_HEAD(, nvmf_vfio_user_sq) sqs; 364 struct spdk_interrupt *intr; 365 int intr_fd; 366 struct { 367 368 /* 369 * ctrlr_intr and ctrlr_kicks will be zero for all other poll 370 * groups. However, they can be zero even for the poll group 371 * the controller belongs are if no vfio-user message has been 372 * received or the controller hasn't been kicked yet. 373 */ 374 375 /* 376 * Number of times vfio_user_ctrlr_intr() has run: 377 * vfio-user file descriptor has been ready or explicitly 378 * kicked (see below). 379 */ 380 uint64_t ctrlr_intr; 381 382 /* 383 * Kicks to the controller by ctrlr_kick(). 384 * ctrlr_intr - ctrlr_kicks is the number of times the 385 * vfio-user poll file descriptor has been ready. 386 */ 387 uint64_t ctrlr_kicks; 388 389 /* 390 * Number of times this poll group was kicked. 391 */ 392 uint64_t pg_kicks; 393 394 /* 395 * How many times we won the race arming an SQ. 396 */ 397 uint64_t won; 398 399 /* 400 * How many times we lost the race arming an SQ 401 */ 402 uint64_t lost; 403 404 /* 405 * How many requests we processed in total each time we lost 406 * the rearm race. 407 */ 408 uint64_t lost_count; 409 410 /* 411 * Number of attempts we attempted to rearm all the SQs in the 412 * poll group. 413 */ 414 uint64_t rearms; 415 416 /* 417 * Number of times we had to apply flow control to this SQ. 418 */ 419 uint64_t cq_full; 420 421 uint64_t pg_process_count; 422 uint64_t intr; 423 uint64_t polls; 424 uint64_t polls_spurious; 425 uint64_t poll_reqs; 426 uint64_t poll_reqs_squared; 427 uint64_t cqh_admin_writes; 428 uint64_t cqh_io_writes; 429 } stats; 430 431 /* Whether this PG needs kicking to wake up again. 
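 *
 * Set by vfio_user_sq_rearm() when it loses the eventidx race
 * NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS times in a row; consumed by
 * poll_group_kick(), which clears the flag and wakes the group's
 * interrupt-mode poller through its eventfd:
 *
 *   vu_group->need_kick = false;
 *   eventfd_write(vu_group->intr_fd, 1);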
*/ 432 bool need_kick; 433 }; 434 435 struct nvmf_vfio_user_shadow_doorbells { 436 volatile uint32_t *shadow_doorbells; 437 volatile uint32_t *eventidxs; 438 dma_sg_t *sgs; 439 struct iovec *iovs; 440 }; 441 442 struct nvmf_vfio_user_ctrlr { 443 struct nvmf_vfio_user_endpoint *endpoint; 444 struct nvmf_vfio_user_transport *transport; 445 446 /* Connected SQs list */ 447 TAILQ_HEAD(, nvmf_vfio_user_sq) connected_sqs; 448 enum nvmf_vfio_user_ctrlr_state state; 449 450 /* 451 * Tells whether live migration data have been prepared. This is used 452 * by the get_pending_bytes callback to tell whether or not the 453 * previous iteration finished. 454 */ 455 bool migr_data_prepared; 456 457 /* Controller is in source VM when doing live migration */ 458 bool in_source_vm; 459 460 struct spdk_thread *thread; 461 struct spdk_poller *vfu_ctx_poller; 462 struct spdk_interrupt *intr; 463 int intr_fd; 464 465 bool queued_quiesce; 466 467 bool reset_shn; 468 bool disconnect; 469 470 uint16_t cntlid; 471 struct spdk_nvmf_ctrlr *ctrlr; 472 473 struct nvmf_vfio_user_sq *sqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 474 struct nvmf_vfio_user_cq *cqs[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR]; 475 476 TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link; 477 478 volatile uint32_t *bar0_doorbells; 479 struct nvmf_vfio_user_shadow_doorbells *sdbl; 480 /* 481 * Shadow doorbells PRPs to provide during the stop-and-copy state. 482 */ 483 uint64_t shadow_doorbell_buffer; 484 uint64_t eventidx_buffer; 485 486 bool adaptive_irqs_enabled; 487 }; 488 489 /* Endpoint in vfio-user is associated with a socket file, which 490 * is the representative of a PCI endpoint. 491 */ 492 struct nvmf_vfio_user_endpoint { 493 struct nvmf_vfio_user_transport *transport; 494 vfu_ctx_t *vfu_ctx; 495 struct spdk_poller *accept_poller; 496 struct spdk_thread *accept_thread; 497 bool interrupt_mode; 498 struct msixcap *msix; 499 vfu_pci_config_space_t *pci_config_space; 500 int devmem_fd; 501 int accept_intr_fd; 502 struct spdk_interrupt *accept_intr; 503 504 volatile uint32_t *bar0_doorbells; 505 506 int migr_fd; 507 void *migr_data; 508 509 struct spdk_nvme_transport_id trid; 510 struct spdk_nvmf_subsystem *subsystem; 511 512 /* Controller is associated with an active socket connection, 513 * the lifecycle of the controller is same as the VM. 514 * Currently we only support one active connection, as the NVMe 515 * specification defines, we may support multiple controllers in 516 * future, so that it can support e.g: RESERVATION. 517 */ 518 struct nvmf_vfio_user_ctrlr *ctrlr; 519 pthread_mutex_t lock; 520 521 bool need_async_destroy; 522 /* The subsystem is in PAUSED state and need to be resumed, TRUE 523 * only when migration is done successfully and the controller is 524 * in source VM. 
525 */ 526 bool need_resume; 527 /* Start the accept poller again after destroying the controller */ 528 bool need_relisten; 529 530 TAILQ_ENTRY(nvmf_vfio_user_endpoint) link; 531 }; 532 533 struct nvmf_vfio_user_transport_opts { 534 bool disable_mappable_bar0; 535 bool disable_adaptive_irq; 536 bool disable_shadow_doorbells; 537 bool disable_compare; 538 bool enable_intr_mode_sq_spreading; 539 }; 540 541 struct nvmf_vfio_user_transport { 542 struct spdk_nvmf_transport transport; 543 struct nvmf_vfio_user_transport_opts transport_opts; 544 bool intr_mode_supported; 545 pthread_mutex_t lock; 546 TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints; 547 548 pthread_mutex_t pg_lock; 549 TAILQ_HEAD(, nvmf_vfio_user_poll_group) poll_groups; 550 struct nvmf_vfio_user_poll_group *next_pg; 551 }; 552 553 /* 554 * function prototypes 555 */ 556 static int nvmf_vfio_user_req_free(struct spdk_nvmf_request *req); 557 558 static struct nvmf_vfio_user_req *get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq); 559 560 /* 561 * Local process virtual address of a queue. 562 */ 563 static inline void * 564 q_addr(struct nvme_q_mapping *mapping) 565 { 566 return mapping->iov.iov_base; 567 } 568 569 static inline int 570 queue_index(uint16_t qid, bool is_cq) 571 { 572 return (qid * 2) + is_cq; 573 } 574 575 static inline volatile uint32_t * 576 sq_headp(struct nvmf_vfio_user_sq *sq) 577 { 578 assert(sq != NULL); 579 return &sq->head; 580 } 581 582 static inline volatile uint32_t * 583 sq_dbl_tailp(struct nvmf_vfio_user_sq *sq) 584 { 585 assert(sq != NULL); 586 return sq->dbl_tailp; 587 } 588 589 static inline volatile uint32_t * 590 cq_dbl_headp(struct nvmf_vfio_user_cq *cq) 591 { 592 assert(cq != NULL); 593 return cq->dbl_headp; 594 } 595 596 static inline volatile uint32_t * 597 cq_tailp(struct nvmf_vfio_user_cq *cq) 598 { 599 assert(cq != NULL); 600 return &cq->tail; 601 } 602 603 static inline void 604 sq_head_advance(struct nvmf_vfio_user_sq *sq) 605 { 606 assert(sq != NULL); 607 608 assert(*sq_headp(sq) < sq->size); 609 (*sq_headp(sq))++; 610 611 if (spdk_unlikely(*sq_headp(sq) == sq->size)) { 612 *sq_headp(sq) = 0; 613 } 614 } 615 616 static inline void 617 cq_tail_advance(struct nvmf_vfio_user_cq *cq) 618 { 619 assert(cq != NULL); 620 621 assert(*cq_tailp(cq) < cq->size); 622 (*cq_tailp(cq))++; 623 624 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) { 625 *cq_tailp(cq) = 0; 626 cq->phase = !cq->phase; 627 } 628 } 629 630 static bool 631 io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq) 632 { 633 assert(vu_ctrlr != NULL); 634 635 if (qid == 0 || qid >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 636 return false; 637 } 638 639 if (is_cq) { 640 if (vu_ctrlr->cqs[qid] == NULL) { 641 return false; 642 } 643 644 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED && 645 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED); 646 } 647 648 if (vu_ctrlr->sqs[qid] == NULL) { 649 return false; 650 } 651 652 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED && 653 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED); 654 } 655 656 static char * 657 endpoint_id(struct nvmf_vfio_user_endpoint *endpoint) 658 { 659 return endpoint->trid.traddr; 660 } 661 662 static char * 663 ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr) 664 { 665 if (!ctrlr || !ctrlr->endpoint) { 666 return "Null Ctrlr"; 667 } 668 669 return endpoint_id(ctrlr->endpoint); 670 } 671 672 /* Return the poll group for the admin queue of the controller. 
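 *
 * This dereferences vu_ctrlr->sqs[0]->group, so it must only be used once
 * the admin queue has been added to a poll group. ctrlr_kick() below relies
 * on it to route a wakeup message to that poll group's thread:
 *
 *   spdk_thread_send_msg(poll_group_to_thread(ctrlr_to_poll_group(vu_ctrlr)),
 *                        vfio_user_ctrlr_intr_msg, vu_ctrlr);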
*/ 673 static inline struct nvmf_vfio_user_poll_group * 674 ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 675 { 676 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group, 677 struct nvmf_vfio_user_poll_group, 678 group); 679 } 680 681 static inline struct nvmf_vfio_user_poll_group * 682 sq_to_poll_group(struct nvmf_vfio_user_sq *sq) 683 { 684 return SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, 685 group); 686 } 687 688 static inline struct spdk_thread * 689 poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg) 690 { 691 return vu_pg->group.group->thread; 692 } 693 694 static dma_sg_t * 695 index_to_sg_t(void *arr, size_t i) 696 { 697 return (dma_sg_t *)((uintptr_t)arr + i * dma_sg_size()); 698 } 699 700 static inline size_t 701 vfio_user_migr_data_len(void) 702 { 703 return SPDK_ALIGN_CEIL(sizeof(struct vfio_user_nvme_migr_state), PAGE_SIZE); 704 } 705 706 static inline bool 707 in_interrupt_mode(struct nvmf_vfio_user_transport *vu_transport) 708 { 709 return spdk_interrupt_mode_is_enabled() && 710 vu_transport->intr_mode_supported; 711 } 712 713 static int vfio_user_ctrlr_intr(void *ctx); 714 715 static void 716 vfio_user_ctrlr_intr_msg(void *ctx) 717 { 718 vfio_user_ctrlr_intr(ctx); 719 } 720 721 /* 722 * Kick (force a wakeup) of all poll groups for this controller. 723 * vfio_user_ctrlr_intr() itself arranges for kicking other poll groups if 724 * needed. 725 */ 726 static void 727 ctrlr_kick(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 728 { 729 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 730 731 SPDK_DEBUGLOG(vfio_user_db, "%s: kicked\n", ctrlr_id(vu_ctrlr)); 732 733 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 734 735 vu_ctrlr_group->stats.ctrlr_kicks++; 736 737 spdk_thread_send_msg(poll_group_to_thread(vu_ctrlr_group), 738 vfio_user_ctrlr_intr_msg, vu_ctrlr); 739 } 740 741 /* 742 * Force a wake-up for this particular poll group and its contained SQs. 743 */ 744 static void 745 poll_group_kick(struct nvmf_vfio_user_poll_group *vu_group) 746 { 747 vu_group->stats.pg_kicks++; 748 assert(vu_group->need_kick); 749 vu_group->need_kick = false; 750 eventfd_write(vu_group->intr_fd, 1); 751 } 752 753 /* 754 * Make the given DMA address and length available (locally mapped) via iov. 
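 *
 * MAP_RW requests PROT_WRITE in addition to PROT_READ, and MAP_QUIET
 * suppresses the error log if the IOVA cannot be translated. Callers keep
 * the dma_sg_t so the mapping can be released later, e.g.:
 *
 *   void *p = map_one(ctx, addr, len, sg, &iov, MAP_RW);
 *   ...
 *   vfu_sgl_put(ctx, sg, &iov, 1);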
755 */ 756 static void * 757 map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, 758 struct iovec *iov, int32_t flags) 759 { 760 int prot = PROT_READ; 761 int ret; 762 763 if (flags & MAP_RW) { 764 prot |= PROT_WRITE; 765 } 766 767 assert(ctx != NULL); 768 assert(sg != NULL); 769 assert(iov != NULL); 770 771 ret = vfu_addr_to_sgl(ctx, (void *)(uintptr_t)addr, len, sg, 1, prot); 772 if (ret < 0) { 773 if (ret == -1) { 774 if (!(flags & MAP_QUIET)) { 775 SPDK_ERRLOG("failed to translate IOVA [%#lx, %#lx) (prot=%d) to local VA: %m\n", 776 addr, addr + len, prot); 777 } 778 } else { 779 SPDK_ERRLOG("failed to translate IOVA [%#lx, %#lx) (prot=%d) to local VA: %d segments needed\n", 780 addr, addr + len, prot, -(ret + 1)); 781 } 782 return NULL; 783 } 784 785 ret = vfu_sgl_get(ctx, sg, iov, 1, 0); 786 if (ret != 0) { 787 SPDK_ERRLOG("failed to get iovec for IOVA [%#lx, %#lx): %m\n", 788 addr, addr + len); 789 return NULL; 790 } 791 792 assert(iov->iov_base != NULL); 793 return iov->iov_base; 794 } 795 796 static int 797 nvme_cmd_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, 798 uint32_t max_iovcnt, uint32_t len, size_t mps, 799 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 800 { 801 uint64_t prp1, prp2; 802 void *vva; 803 uint32_t i; 804 uint32_t residue_len, nents; 805 uint64_t *prp_list; 806 uint32_t iovcnt; 807 808 assert(max_iovcnt > 0); 809 810 prp1 = cmd->dptr.prp.prp1; 811 prp2 = cmd->dptr.prp.prp2; 812 813 /* PRP1 may started with unaligned page address */ 814 residue_len = mps - (prp1 % mps); 815 residue_len = spdk_min(len, residue_len); 816 817 vva = gpa_to_vva(prv, prp1, residue_len, MAP_RW); 818 if (spdk_unlikely(vva == NULL)) { 819 SPDK_ERRLOG("GPA to VVA failed\n"); 820 return -EINVAL; 821 } 822 len -= residue_len; 823 if (len && max_iovcnt < 2) { 824 SPDK_ERRLOG("Too many page entries, at least two iovs are required\n"); 825 return -ERANGE; 826 } 827 iovs[0].iov_base = vva; 828 iovs[0].iov_len = residue_len; 829 830 if (len) { 831 if (spdk_unlikely(prp2 == 0)) { 832 SPDK_ERRLOG("no PRP2, %d remaining\n", len); 833 return -EINVAL; 834 } 835 836 if (len <= mps) { 837 /* 2 PRP used */ 838 iovcnt = 2; 839 vva = gpa_to_vva(prv, prp2, len, MAP_RW); 840 if (spdk_unlikely(vva == NULL)) { 841 SPDK_ERRLOG("no VVA for %#" PRIx64 ", len%#x\n", 842 prp2, len); 843 return -EINVAL; 844 } 845 iovs[1].iov_base = vva; 846 iovs[1].iov_len = len; 847 } else { 848 /* PRP list used */ 849 nents = (len + mps - 1) / mps; 850 if (spdk_unlikely(nents + 1 > max_iovcnt)) { 851 SPDK_ERRLOG("Too many page entries\n"); 852 return -ERANGE; 853 } 854 855 vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list), MAP_R); 856 if (spdk_unlikely(vva == NULL)) { 857 SPDK_ERRLOG("no VVA for %#" PRIx64 ", nents=%#x\n", 858 prp2, nents); 859 return -EINVAL; 860 } 861 prp_list = vva; 862 i = 0; 863 while (len != 0) { 864 residue_len = spdk_min(len, mps); 865 vva = gpa_to_vva(prv, prp_list[i], residue_len, MAP_RW); 866 if (spdk_unlikely(vva == NULL)) { 867 SPDK_ERRLOG("no VVA for %#" PRIx64 ", residue_len=%#x\n", 868 prp_list[i], residue_len); 869 return -EINVAL; 870 } 871 iovs[i + 1].iov_base = vva; 872 iovs[i + 1].iov_len = residue_len; 873 len -= residue_len; 874 i++; 875 } 876 iovcnt = i + 1; 877 } 878 } else { 879 /* 1 PRP used */ 880 iovcnt = 1; 881 } 882 883 assert(iovcnt <= max_iovcnt); 884 return iovcnt; 885 } 886 887 static int 888 nvme_cmd_map_sgls_data(void *prv, struct spdk_nvme_sgl_descriptor *sgls, uint32_t num_sgls, 889 struct iovec *iovs, 
uint32_t max_iovcnt, 890 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 891 { 892 uint32_t i; 893 void *vva; 894 895 if (spdk_unlikely(max_iovcnt < num_sgls)) { 896 return -ERANGE; 897 } 898 899 for (i = 0; i < num_sgls; i++) { 900 if (spdk_unlikely(sgls[i].unkeyed.type != SPDK_NVME_SGL_TYPE_DATA_BLOCK)) { 901 SPDK_ERRLOG("Invalid SGL type %u\n", sgls[i].unkeyed.type); 902 return -EINVAL; 903 } 904 vva = gpa_to_vva(prv, sgls[i].address, sgls[i].unkeyed.length, MAP_RW); 905 if (spdk_unlikely(vva == NULL)) { 906 SPDK_ERRLOG("GPA to VVA failed\n"); 907 return -EINVAL; 908 } 909 iovs[i].iov_base = vva; 910 iovs[i].iov_len = sgls[i].unkeyed.length; 911 } 912 913 return num_sgls; 914 } 915 916 static int 917 nvme_cmd_map_sgls(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 918 uint32_t len, size_t mps, 919 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 920 { 921 struct spdk_nvme_sgl_descriptor *sgl, *last_sgl; 922 uint32_t num_sgls, seg_len; 923 void *vva; 924 int ret; 925 uint32_t total_iovcnt = 0; 926 927 /* SGL cases */ 928 sgl = &cmd->dptr.sgl1; 929 930 /* only one SGL segment */ 931 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 932 assert(max_iovcnt > 0); 933 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, MAP_RW); 934 if (spdk_unlikely(vva == NULL)) { 935 SPDK_ERRLOG("GPA to VVA failed\n"); 936 return -EINVAL; 937 } 938 iovs[0].iov_base = vva; 939 iovs[0].iov_len = sgl->unkeyed.length; 940 assert(sgl->unkeyed.length == len); 941 942 return 1; 943 } 944 945 for (;;) { 946 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) && 947 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) { 948 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type); 949 return -EINVAL; 950 } 951 952 seg_len = sgl->unkeyed.length; 953 if (spdk_unlikely(seg_len % sizeof(struct spdk_nvme_sgl_descriptor))) { 954 SPDK_ERRLOG("Invalid SGL segment len %u\n", seg_len); 955 return -EINVAL; 956 } 957 958 num_sgls = seg_len / sizeof(struct spdk_nvme_sgl_descriptor); 959 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, MAP_R); 960 if (spdk_unlikely(vva == NULL)) { 961 SPDK_ERRLOG("GPA to VVA failed\n"); 962 return -EINVAL; 963 } 964 965 /* sgl point to the first segment */ 966 sgl = (struct spdk_nvme_sgl_descriptor *)vva; 967 last_sgl = &sgl[num_sgls - 1]; 968 969 /* we are done */ 970 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { 971 /* map whole sgl list */ 972 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls, &iovs[total_iovcnt], 973 max_iovcnt - total_iovcnt, gpa_to_vva); 974 if (spdk_unlikely(ret < 0)) { 975 return ret; 976 } 977 total_iovcnt += ret; 978 979 return total_iovcnt; 980 } 981 982 if (num_sgls > 1) { 983 /* map whole sgl exclude last_sgl */ 984 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt], 985 max_iovcnt - total_iovcnt, gpa_to_vva); 986 if (spdk_unlikely(ret < 0)) { 987 return ret; 988 } 989 total_iovcnt += ret; 990 } 991 992 /* move to next level's segments */ 993 sgl = last_sgl; 994 } 995 996 return 0; 997 } 998 999 static int 1000 nvme_map_cmd(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, uint32_t max_iovcnt, 1001 uint32_t len, size_t mps, 1002 void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len, uint32_t flags)) 1003 { 1004 if (cmd->psdt == SPDK_NVME_PSDT_PRP) { 1005 return nvme_cmd_map_prps(prv, cmd, iovs, max_iovcnt, len, mps, gpa_to_vva); 1006 } 1007 1008 return nvme_cmd_map_sgls(prv, cmd, iovs, 
max_iovcnt, len, mps, gpa_to_vva); 1009 } 1010 1011 /* 1012 * For each queue, update the location of its doorbell to the correct location: 1013 * either our own BAR0, or the guest's configured shadow doorbell area. 1014 * 1015 * The Admin queue (qid: 0) does not ever use shadow doorbells. 1016 */ 1017 static void 1018 vfio_user_ctrlr_switch_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, bool shadow) 1019 { 1020 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells : 1021 ctrlr->bar0_doorbells; 1022 1023 assert(doorbells != NULL); 1024 1025 for (size_t i = 1; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) { 1026 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i]; 1027 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i]; 1028 1029 if (sq != NULL) { 1030 sq->dbl_tailp = doorbells + queue_index(sq->qid, false); 1031 1032 ctrlr->sqs[i]->need_rearm = shadow; 1033 } 1034 1035 if (cq != NULL) { 1036 cq->dbl_headp = doorbells + queue_index(cq->qid, true); 1037 } 1038 } 1039 } 1040 1041 static void 1042 unmap_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1043 { 1044 assert(vfu_ctx != NULL); 1045 assert(sdbl != NULL); 1046 1047 /* 1048 * An allocation error would result in only one of the two being 1049 * non-NULL. If that is the case, no memory should have been mapped. 1050 */ 1051 if (sdbl->iovs == NULL || sdbl->sgs == NULL) { 1052 return; 1053 } 1054 1055 for (size_t i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; ++i) { 1056 struct iovec *iov; 1057 dma_sg_t *sg; 1058 1059 if (!sdbl->iovs[i].iov_len) { 1060 continue; 1061 } 1062 1063 sg = index_to_sg_t(sdbl->sgs, i); 1064 iov = sdbl->iovs + i; 1065 1066 vfu_sgl_put(vfu_ctx, sg, iov, 1); 1067 } 1068 } 1069 1070 static void 1071 free_sdbl(vfu_ctx_t *vfu_ctx, struct nvmf_vfio_user_shadow_doorbells *sdbl) 1072 { 1073 if (sdbl == NULL) { 1074 return; 1075 } 1076 1077 unmap_sdbl(vfu_ctx, sdbl); 1078 1079 /* 1080 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped, 1081 * not allocated, so don't free() them. 1082 */ 1083 free(sdbl->sgs); 1084 free(sdbl->iovs); 1085 free(sdbl); 1086 } 1087 1088 static struct nvmf_vfio_user_shadow_doorbells * 1089 map_sdbl(vfu_ctx_t *vfu_ctx, uint64_t prp1, uint64_t prp2, size_t len) 1090 { 1091 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 1092 dma_sg_t *sg2 = NULL; 1093 void *p; 1094 1095 assert(vfu_ctx != NULL); 1096 1097 sdbl = calloc(1, sizeof(*sdbl)); 1098 if (sdbl == NULL) { 1099 goto err; 1100 } 1101 1102 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size()); 1103 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs)); 1104 if (sdbl->sgs == NULL || sdbl->iovs == NULL) { 1105 goto err; 1106 } 1107 1108 /* Map shadow doorbell buffer (PRP1). */ 1109 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, MAP_RW); 1110 1111 if (p == NULL) { 1112 goto err; 1113 } 1114 1115 /* 1116 * Map eventidx buffer (PRP2). 1117 * Should only be written to by the controller. 1118 */ 1119 1120 sg2 = index_to_sg_t(sdbl->sgs, 1); 1121 1122 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, MAP_RW); 1123 1124 if (p == NULL) { 1125 goto err; 1126 } 1127 1128 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base; 1129 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base; 1130 1131 return sdbl; 1132 1133 err: 1134 free_sdbl(vfu_ctx, sdbl); 1135 return NULL; 1136 } 1137 1138 /* 1139 * Copy doorbells from one buffer to the other, during switches between BAR0 1140 * doorbells and shadow doorbells. 
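 *
 * This is used together with vfio_user_ctrlr_switch_doorbells(); a switch
 * onto freshly mapped shadow doorbells looks roughly like this (a sketch of
 * the ordering, not the exact call sites):
 *
 *   copy_doorbells(ctrlr, ctrlr->bar0_doorbells, sdbl->shadow_doorbells);
 *   ctrlr->sdbl = sdbl;
 *   vfio_user_ctrlr_switch_doorbells(ctrlr, true);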
1141 */ 1142 static void 1143 copy_doorbells(struct nvmf_vfio_user_ctrlr *ctrlr, 1144 const volatile uint32_t *from, volatile uint32_t *to) 1145 { 1146 assert(ctrlr != NULL); 1147 assert(from != NULL); 1148 assert(to != NULL); 1149 1150 SPDK_DEBUGLOG(vfio_user_db, 1151 "%s: migrating shadow doorbells from %p to %p\n", 1152 ctrlr_id(ctrlr), from, to); 1153 1154 /* Can't use memcpy because it doesn't respect volatile semantics. */ 1155 for (size_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 1156 if (ctrlr->sqs[i] != NULL) { 1157 to[queue_index(i, false)] = from[queue_index(i, false)]; 1158 } 1159 1160 if (ctrlr->cqs[i] != NULL) { 1161 to[queue_index(i, true)] = from[queue_index(i, true)]; 1162 } 1163 } 1164 } 1165 1166 static void 1167 fail_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1168 { 1169 const struct spdk_nvmf_registers *regs; 1170 1171 assert(vu_ctrlr != NULL); 1172 assert(vu_ctrlr->ctrlr != NULL); 1173 1174 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 1175 if (regs->csts.bits.cfs == 0) { 1176 SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(vu_ctrlr)); 1177 } 1178 1179 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr); 1180 } 1181 1182 static inline bool 1183 ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1184 { 1185 assert(vu_ctrlr != NULL); 1186 assert(vu_ctrlr->endpoint != NULL); 1187 1188 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space; 1189 1190 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe); 1191 } 1192 1193 static void 1194 nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint) 1195 { 1196 SPDK_DEBUGLOG(nvmf_vfio, "destroy endpoint %s\n", endpoint_id(endpoint)); 1197 1198 spdk_interrupt_unregister(&endpoint->accept_intr); 1199 spdk_poller_unregister(&endpoint->accept_poller); 1200 1201 if (endpoint->bar0_doorbells) { 1202 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 1203 } 1204 1205 if (endpoint->devmem_fd > 0) { 1206 close(endpoint->devmem_fd); 1207 } 1208 1209 if (endpoint->migr_data) { 1210 munmap(endpoint->migr_data, vfio_user_migr_data_len()); 1211 } 1212 1213 if (endpoint->migr_fd > 0) { 1214 close(endpoint->migr_fd); 1215 } 1216 1217 if (endpoint->vfu_ctx) { 1218 vfu_destroy_ctx(endpoint->vfu_ctx); 1219 } 1220 1221 pthread_mutex_destroy(&endpoint->lock); 1222 free(endpoint); 1223 } 1224 1225 /* called when process exits */ 1226 static int 1227 nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport, 1228 spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg) 1229 { 1230 struct nvmf_vfio_user_transport *vu_transport; 1231 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 1232 1233 SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n"); 1234 1235 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 1236 transport); 1237 1238 pthread_mutex_destroy(&vu_transport->lock); 1239 pthread_mutex_destroy(&vu_transport->pg_lock); 1240 1241 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 1242 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 1243 nvmf_vfio_user_destroy_endpoint(endpoint); 1244 } 1245 1246 free(vu_transport); 1247 1248 if (cb_fn) { 1249 cb_fn(cb_arg); 1250 } 1251 1252 return 0; 1253 } 1254 1255 static const struct spdk_json_object_decoder vfio_user_transport_opts_decoder[] = { 1256 { 1257 "disable_mappable_bar0", 1258 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_mappable_bar0), 1259 spdk_json_decode_bool, true 1260 }, 1261 { 1262 "disable_adaptive_irq", 1263 offsetof(struct 
nvmf_vfio_user_transport, transport_opts.disable_adaptive_irq), 1264 spdk_json_decode_bool, true 1265 }, 1266 { 1267 "disable_shadow_doorbells", 1268 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_shadow_doorbells), 1269 spdk_json_decode_bool, true 1270 }, 1271 { 1272 "disable_compare", 1273 offsetof(struct nvmf_vfio_user_transport, transport_opts.disable_compare), 1274 spdk_json_decode_bool, true 1275 }, 1276 { 1277 "enable_intr_mode_sq_spreading", 1278 offsetof(struct nvmf_vfio_user_transport, transport_opts.enable_intr_mode_sq_spreading), 1279 spdk_json_decode_bool, true 1280 }, 1281 }; 1282 1283 static struct spdk_nvmf_transport * 1284 nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts) 1285 { 1286 struct nvmf_vfio_user_transport *vu_transport; 1287 int err; 1288 1289 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) { 1290 SPDK_ERRLOG("Invalid max_qpairs_per_ctrlr=%d, supported max_qpairs_per_ctrlr=%d\n", 1291 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR); 1292 return NULL; 1293 } 1294 1295 vu_transport = calloc(1, sizeof(*vu_transport)); 1296 if (vu_transport == NULL) { 1297 SPDK_ERRLOG("Transport alloc fail: %m\n"); 1298 return NULL; 1299 } 1300 1301 err = pthread_mutex_init(&vu_transport->lock, NULL); 1302 if (err != 0) { 1303 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1304 goto err; 1305 } 1306 TAILQ_INIT(&vu_transport->endpoints); 1307 1308 err = pthread_mutex_init(&vu_transport->pg_lock, NULL); 1309 if (err != 0) { 1310 pthread_mutex_destroy(&vu_transport->lock); 1311 SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err); 1312 goto err; 1313 } 1314 TAILQ_INIT(&vu_transport->poll_groups); 1315 1316 if (opts->transport_specific != NULL && 1317 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder, 1318 SPDK_COUNTOF(vfio_user_transport_opts_decoder), 1319 vu_transport)) { 1320 SPDK_ERRLOG("spdk_json_decode_object_relaxed failed\n"); 1321 goto cleanup; 1322 } 1323 1324 /* 1325 * To support interrupt mode, the transport must be configured with 1326 * mappable BAR0 disabled: we need a vfio-user message to wake us up 1327 * when a client writes new doorbell values to BAR0, via the 1328 * libvfio-user socket fd. 1329 */ 1330 vu_transport->intr_mode_supported = 1331 vu_transport->transport_opts.disable_mappable_bar0; 1332 1333 /* 1334 * If BAR0 is mappable, it doesn't make sense to support shadow 1335 * doorbells, so explicitly turn it off. 1336 */ 1337 if (!vu_transport->transport_opts.disable_mappable_bar0) { 1338 vu_transport->transport_opts.disable_shadow_doorbells = true; 1339 } 1340 1341 if (spdk_interrupt_mode_is_enabled()) { 1342 if (!vu_transport->intr_mode_supported) { 1343 SPDK_ERRLOG("interrupt mode not supported\n"); 1344 goto cleanup; 1345 } 1346 1347 /* 1348 * If we are in interrupt mode, we cannot support adaptive IRQs, 1349 * as there is no guarantee the SQ poller will run subsequently 1350 * to send pending IRQs. 
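 *
 * (For reference, the transport-specific options decoded above are supplied
 * as extra keys to the nvmf_create_transport RPC; a sketch of a config that
 * permits interrupt mode, assuming the standard JSON-RPC interface, is:
 *
 *   { "method": "nvmf_create_transport",
 *     "params": { "trtype": "VFIOUSER", "disable_mappable_bar0": true } }
 *
 * Unknown keys are tolerated by the relaxed decode above.)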
1351 */ 1352 vu_transport->transport_opts.disable_adaptive_irq = true; 1353 } 1354 1355 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_mappable_bar0=%d\n", 1356 vu_transport->transport_opts.disable_mappable_bar0); 1357 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_adaptive_irq=%d\n", 1358 vu_transport->transport_opts.disable_adaptive_irq); 1359 SPDK_DEBUGLOG(nvmf_vfio, "vfio_user transport: disable_shadow_doorbells=%d\n", 1360 vu_transport->transport_opts.disable_shadow_doorbells); 1361 1362 return &vu_transport->transport; 1363 1364 cleanup: 1365 pthread_mutex_destroy(&vu_transport->lock); 1366 pthread_mutex_destroy(&vu_transport->pg_lock); 1367 err: 1368 free(vu_transport); 1369 return NULL; 1370 } 1371 1372 static uint32_t 1373 max_queue_size(struct nvmf_vfio_user_ctrlr const *vu_ctrlr) 1374 { 1375 assert(vu_ctrlr != NULL); 1376 assert(vu_ctrlr->ctrlr != NULL); 1377 1378 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1; 1379 } 1380 1381 static uint32_t 1382 doorbell_stride(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1383 { 1384 assert(vu_ctrlr != NULL); 1385 assert(vu_ctrlr->ctrlr != NULL); 1386 1387 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd; 1388 } 1389 1390 static uintptr_t 1391 memory_page_size(const struct nvmf_vfio_user_ctrlr *vu_ctrlr) 1392 { 1393 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12; 1394 return 1ul << memory_page_shift; 1395 } 1396 1397 static uintptr_t 1398 memory_page_mask(const struct nvmf_vfio_user_ctrlr *ctrlr) 1399 { 1400 return ~(memory_page_size(ctrlr) - 1); 1401 } 1402 1403 static int 1404 map_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping, 1405 uint32_t flags) 1406 { 1407 void *ret; 1408 1409 assert(mapping->len != 0); 1410 assert(q_addr(mapping) == NULL); 1411 1412 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, mapping->len, 1413 mapping->sg, &mapping->iov, flags); 1414 if (ret == NULL) { 1415 return -EFAULT; 1416 } 1417 1418 if (flags & MAP_INITIALIZE) { 1419 memset(q_addr(mapping), 0, mapping->len); 1420 } 1421 1422 return 0; 1423 } 1424 1425 static inline void 1426 unmap_q(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvme_q_mapping *mapping) 1427 { 1428 if (q_addr(mapping) != NULL) { 1429 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg, 1430 &mapping->iov, 1); 1431 mapping->iov.iov_base = NULL; 1432 } 1433 } 1434 1435 static int 1436 asq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1437 { 1438 struct nvmf_vfio_user_sq *sq; 1439 const struct spdk_nvmf_registers *regs; 1440 int ret; 1441 1442 assert(ctrlr != NULL); 1443 1444 sq = ctrlr->sqs[0]; 1445 1446 assert(sq != NULL); 1447 assert(q_addr(&sq->mapping) == NULL); 1448 /* XXX ctrlr->asq == 0 is a valid memory address */ 1449 1450 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1451 sq->qid = 0; 1452 sq->size = regs->aqa.bits.asqs + 1; 1453 sq->mapping.prp1 = regs->asq; 1454 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 1455 *sq_headp(sq) = 0; 1456 sq->cqid = 0; 1457 1458 ret = map_q(ctrlr, &sq->mapping, MAP_INITIALIZE); 1459 if (ret) { 1460 return ret; 1461 } 1462 1463 /* The Admin queue (qid: 0) does not ever use shadow doorbells. */ 1464 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false); 1465 1466 *sq_dbl_tailp(sq) = 0; 1467 1468 return 0; 1469 } 1470 1471 /* 1472 * Updates eventidx to set an SQ into interrupt or polling mode. 
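 *
 * In interrupt mode the update follows a write-then-recheck protocol, in
 * sketch form (the real code below adds the required barriers and logging):
 *
 *   *eventidx = old_tail;        // ask the host to write BAR0 past this
 *   spdk_mb();
 *   new_tail = *tail_doorbell;   // did the host move on while we armed?
 *
 * In polling mode the eventidx is simply set to NVMF_VFIO_USER_EVENTIDX_POLL
 * so the host never needs to write BAR0 for this queue.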
1473 * 1474 * Returns false if the current SQ tail does not match the SQ head, as 1475 * this means that the host has submitted more items to the queue while we were 1476 * not looking - or during the event index update. In that case, we must retry, 1477 * or otherwise make sure we are going to wake up again. 1478 */ 1479 static bool 1480 set_sq_eventidx(struct nvmf_vfio_user_sq *sq) 1481 { 1482 struct nvmf_vfio_user_ctrlr *ctrlr; 1483 volatile uint32_t *sq_tail_eidx; 1484 uint32_t old_tail, new_tail; 1485 1486 assert(sq != NULL); 1487 assert(sq->ctrlr != NULL); 1488 assert(sq->ctrlr->sdbl != NULL); 1489 assert(sq->need_rearm); 1490 assert(sq->qid != 0); 1491 1492 ctrlr = sq->ctrlr; 1493 1494 SPDK_DEBUGLOG(vfio_user_db, "%s: updating eventidx of sqid:%u\n", 1495 ctrlr_id(ctrlr), sq->qid); 1496 1497 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false); 1498 1499 assert(ctrlr->endpoint != NULL); 1500 1501 if (!ctrlr->endpoint->interrupt_mode) { 1502 /* No synchronisation necessary. */ 1503 *sq_tail_eidx = NVMF_VFIO_USER_EVENTIDX_POLL; 1504 return true; 1505 } 1506 1507 old_tail = *sq_dbl_tailp(sq); 1508 *sq_tail_eidx = old_tail; 1509 1510 /* 1511 * Ensure that the event index is updated before re-reading the tail 1512 * doorbell. If it's not, then the host might race us and update the 1513 * tail after the second read but before the event index is written, so 1514 * it won't write to BAR0 and we'll miss the update. 1515 * 1516 * The driver should provide similar ordering with an mb(). 1517 */ 1518 spdk_mb(); 1519 1520 /* 1521 * Check if the host has updated the tail doorbell after we've read it 1522 * for the first time, but before the event index was written. If that's 1523 * the case, then we've lost the race and we need to update the event 1524 * index again (after polling the queue, since the host won't write to 1525 * BAR0). 1526 */ 1527 new_tail = *sq_dbl_tailp(sq); 1528 1529 /* 1530 * We might poll the queue straight after this function returns if the 1531 * tail has been updated, so we need to ensure that any changes to the 1532 * queue will be visible to us if the doorbell has been updated. 1533 * 1534 * The driver should provide similar ordering with a wmb() to ensure 1535 * that the queue is written before it updates the tail doorbell. 1536 */ 1537 spdk_rmb(); 1538 1539 SPDK_DEBUGLOG(vfio_user_db, "%s: sqid:%u, old_tail=%u, new_tail=%u, " 1540 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail, 1541 new_tail, *sq_headp(sq)); 1542 1543 if (new_tail == *sq_headp(sq)) { 1544 sq->need_rearm = false; 1545 return true; 1546 } 1547 1548 /* 1549 * We've lost the race: the tail was updated since we last polled, 1550 * including if it happened within this routine. 1551 * 1552 * The caller should retry after polling (think of this as a cmpxchg 1553 * loop); if we go to sleep while the SQ is not empty, then we won't 1554 * process the remaining events. 1555 */ 1556 return false; 1557 } 1558 1559 static int nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq); 1560 1561 /* 1562 * Arrange for an SQ to interrupt us if written. Returns non-zero if we 1563 * processed some SQ entries. 
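 *
 * The arming is retried at most NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS
 * times; each lost race means the host submitted more entries, so we poll
 * them and try again. If we still lose, we fall back to kicking ourselves
 * so the whole poll group runs once more; roughly:
 *
 *   for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) {
 *           if (set_sq_eventidx(sq)) {
 *                   return count;          // armed, safe to sleep
 *           }
 *           count += nvmf_vfio_user_sq_poll(sq);
 *   }
 *   vu_group->need_kick = true;            // give up, force a wakeup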
1564 */ 1565 static int 1566 vfio_user_sq_rearm(struct nvmf_vfio_user_ctrlr *ctrlr, 1567 struct nvmf_vfio_user_sq *sq, 1568 struct nvmf_vfio_user_poll_group *vu_group) 1569 { 1570 int count = 0; 1571 size_t i; 1572 1573 assert(sq->need_rearm); 1574 1575 for (i = 0; i < NVMF_VFIO_USER_SET_EVENTIDX_MAX_ATTEMPTS; i++) { 1576 int ret; 1577 1578 if (set_sq_eventidx(sq)) { 1579 /* We won the race and set eventidx; done. */ 1580 vu_group->stats.won++; 1581 return count; 1582 } 1583 1584 ret = nvmf_vfio_user_sq_poll(sq); 1585 1586 count += (ret < 0) ? 1 : ret; 1587 } 1588 1589 /* 1590 * We couldn't arrange an eventidx guaranteed to cause a BAR0 write, as 1591 * we raced with the producer too many times; force ourselves to wake up 1592 * instead. We'll process all queues at that point. 1593 */ 1594 vu_group->need_kick = true; 1595 1596 SPDK_DEBUGLOG(vfio_user_db, 1597 "%s: set_sq_eventidx() lost the race %zu times\n", 1598 ctrlr_id(ctrlr), i); 1599 1600 vu_group->stats.lost++; 1601 vu_group->stats.lost_count += count; 1602 1603 return count; 1604 } 1605 1606 /* 1607 * We're in interrupt mode, and potentially about to go to sleep. We need to 1608 * make sure any further I/O submissions are guaranteed to wake us up: for 1609 * shadow doorbells that means we may need to go through set_sq_eventidx() for 1610 * every SQ that needs re-arming. 1611 * 1612 * Returns non-zero if we processed something. 1613 */ 1614 static int 1615 vfio_user_poll_group_rearm(struct nvmf_vfio_user_poll_group *vu_group) 1616 { 1617 struct nvmf_vfio_user_sq *sq; 1618 int count = 0; 1619 1620 vu_group->stats.rearms++; 1621 1622 TAILQ_FOREACH(sq, &vu_group->sqs, link) { 1623 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) { 1624 continue; 1625 } 1626 1627 if (sq->need_rearm) { 1628 count += vfio_user_sq_rearm(sq->ctrlr, sq, vu_group); 1629 } 1630 } 1631 1632 if (vu_group->need_kick) { 1633 poll_group_kick(vu_group); 1634 } 1635 1636 return count; 1637 } 1638 1639 static int 1640 acq_setup(struct nvmf_vfio_user_ctrlr *ctrlr) 1641 { 1642 struct nvmf_vfio_user_cq *cq; 1643 const struct spdk_nvmf_registers *regs; 1644 int ret; 1645 1646 assert(ctrlr != NULL); 1647 1648 cq = ctrlr->cqs[0]; 1649 1650 assert(cq != NULL); 1651 1652 assert(q_addr(&cq->mapping) == NULL); 1653 1654 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr); 1655 assert(regs != NULL); 1656 cq->qid = 0; 1657 cq->size = regs->aqa.bits.acqs + 1; 1658 cq->mapping.prp1 = regs->acq; 1659 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 1660 *cq_tailp(cq) = 0; 1661 cq->ien = true; 1662 cq->phase = true; 1663 cq->nr_outstanding = 0; 1664 1665 ret = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_INITIALIZE); 1666 if (ret) { 1667 return ret; 1668 } 1669 1670 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
*/ 1671 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true); 1672 1673 *cq_dbl_headp(cq) = 0; 1674 1675 return 0; 1676 } 1677 1678 static void * 1679 _map_one(void *prv, uint64_t addr, uint64_t len, uint32_t flags) 1680 { 1681 struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv; 1682 struct spdk_nvmf_qpair *qpair; 1683 struct nvmf_vfio_user_req *vu_req; 1684 struct nvmf_vfio_user_sq *sq; 1685 void *ret; 1686 1687 assert(req != NULL); 1688 qpair = req->qpair; 1689 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 1690 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 1691 1692 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS); 1693 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len, 1694 index_to_sg_t(vu_req->sg, vu_req->iovcnt), 1695 &vu_req->iov[vu_req->iovcnt], flags); 1696 if (spdk_likely(ret != NULL)) { 1697 vu_req->iovcnt++; 1698 } 1699 return ret; 1700 } 1701 1702 static int 1703 vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req, 1704 struct iovec *iov, uint32_t length) 1705 { 1706 /* Map PRP list to from Guest physical memory to 1707 * virtual memory address. 1708 */ 1709 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS, 1710 length, 4096, _map_one); 1711 } 1712 1713 static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 1714 struct nvmf_vfio_user_sq *sq); 1715 1716 static uint32_t 1717 cq_free_slots(struct nvmf_vfio_user_cq *cq) 1718 { 1719 uint32_t free_slots; 1720 1721 assert(cq != NULL); 1722 1723 if (cq->tail == cq->last_head) { 1724 free_slots = cq->size; 1725 } else if (cq->tail > cq->last_head) { 1726 free_slots = cq->size - (cq->tail - cq->last_head); 1727 } else { 1728 free_slots = cq->last_head - cq->tail; 1729 } 1730 assert(free_slots > 0); 1731 1732 return free_slots - 1; 1733 } 1734 1735 /* 1736 * Since reading the head doorbell is relatively expensive, we use the cached 1737 * value, so we only have to read it for real if it appears that we are full. 1738 */ 1739 static inline bool 1740 cq_is_full(struct nvmf_vfio_user_cq *cq) 1741 { 1742 uint32_t free_cq_slots; 1743 1744 assert(cq != NULL); 1745 1746 free_cq_slots = cq_free_slots(cq); 1747 1748 if (spdk_unlikely(free_cq_slots == 0)) { 1749 cq->last_head = *cq_dbl_headp(cq); 1750 free_cq_slots = cq_free_slots(cq); 1751 } 1752 1753 return free_cq_slots == 0; 1754 } 1755 1756 /* 1757 * Posts a CQE in the completion queue. 1758 * 1759 * @ctrlr: the vfio-user controller 1760 * @cq: the completion queue 1761 * @cdw0: cdw0 as reported by NVMf 1762 * @sqid: submission queue ID 1763 * @cid: command identifier in NVMe command 1764 * @sc: the NVMe CQE status code 1765 * @sct: the NVMe CQE status code type 1766 */ 1767 static int 1768 post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq, 1769 uint32_t cdw0, uint16_t sqid, uint16_t cid, uint16_t sc, uint16_t sct) 1770 { 1771 struct spdk_nvme_status cpl_status = { 0 }; 1772 struct spdk_nvme_cpl *cpl; 1773 int err; 1774 1775 assert(ctrlr != NULL); 1776 1777 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) { 1778 return 0; 1779 } 1780 1781 if (cq->qid == 0) { 1782 assert(spdk_get_thread() == cq->group->group->thread); 1783 } 1784 1785 /* 1786 * As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow 1787 * control: that is, we should handle running out of free CQ slots. 
1788 * 1789 * Instead, we implement this by applying flow control on the submission 1790 * side: see handle_sq_tdbl_write(). 1791 */ 1792 if (cq_is_full(cq)) { 1793 SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n", 1794 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq), 1795 *cq_dbl_headp(cq)); 1796 return -1; 1797 } 1798 1799 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq); 1800 1801 assert(ctrlr->sqs[sqid] != NULL); 1802 SPDK_DEBUGLOG(nvmf_vfio, 1803 "%s: request complete sqid:%d cid=%d status=%#x " 1804 "sqhead=%d cq tail=%d\n", ctrlr_id(ctrlr), sqid, cid, sc, 1805 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq)); 1806 1807 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]); 1808 cpl->sqid = sqid; 1809 cpl->cid = cid; 1810 cpl->cdw0 = cdw0; 1811 1812 /* 1813 * This is a bitfield: instead of setting the individual bits we need 1814 * directly in cpl->status, which would cause a read-modify-write cycle, 1815 * we'll avoid reading from the CPL altogether by filling in a local 1816 * cpl_status variable, then writing the whole thing. 1817 */ 1818 cpl_status.sct = sct; 1819 cpl_status.sc = sc; 1820 cpl_status.p = cq->phase; 1821 cpl->status = cpl_status; 1822 1823 cq->nr_outstanding--; 1824 1825 /* Ensure the Completion Queue Entry is visible. */ 1826 spdk_wmb(); 1827 cq_tail_advance(cq); 1828 1829 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) && 1830 cq->ien && ctrlr_interrupt_enabled(ctrlr)) { 1831 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 1832 if (err != 0) { 1833 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 1834 ctrlr_id(ctrlr)); 1835 return err; 1836 } 1837 } 1838 1839 return 0; 1840 } 1841 1842 static void 1843 free_sq_reqs(struct nvmf_vfio_user_sq *sq) 1844 { 1845 while (!TAILQ_EMPTY(&sq->free_reqs)) { 1846 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs); 1847 TAILQ_REMOVE(&sq->free_reqs, vu_req, link); 1848 free(vu_req); 1849 } 1850 } 1851 1852 static void 1853 delete_cq_done(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq) 1854 { 1855 assert(cq->cq_ref == 0); 1856 unmap_q(ctrlr, &cq->mapping); 1857 cq->size = 0; 1858 cq->cq_state = VFIO_USER_CQ_DELETED; 1859 cq->group = NULL; 1860 cq->nr_outstanding = 0; 1861 } 1862 1863 /* Deletes a SQ, if this SQ is the last user of the associated CQ 1864 * and the controller is being shut down/reset or vfio-user client disconnects, 1865 * then the CQ is also deleted. 1866 */ 1867 static void 1868 delete_sq_done(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1869 { 1870 struct nvmf_vfio_user_cq *cq; 1871 uint16_t cqid; 1872 1873 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete sqid:%d=%p done\n", ctrlr_id(vu_ctrlr), 1874 sq->qid, sq); 1875 1876 /* Free SQ resources */ 1877 unmap_q(vu_ctrlr, &sq->mapping); 1878 1879 free_sq_reqs(sq); 1880 1881 sq->size = 0; 1882 1883 sq->sq_state = VFIO_USER_SQ_DELETED; 1884 1885 /* Controller RESET and SHUTDOWN are special cases, 1886 * VM may not send DELETE IO SQ/CQ commands, NVMf library 1887 * will disconnect IO queue pairs. 
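 *
 * Each CQ keeps a cq_ref count of the SQs mapped onto it; Create I/O SQ
 * takes a reference (handle_create_io_sq() does, in effect:
 *
 *   sq->cqid = cqid;
 *   ctrlr->cqs[sq->cqid]->cq_ref++;
 *
 * ) and the block below drops it, deleting the CQ once the count reaches
 * zero.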
1888 */ 1889 if (vu_ctrlr->reset_shn || vu_ctrlr->disconnect) { 1890 cqid = sq->cqid; 1891 cq = vu_ctrlr->cqs[cqid]; 1892 1893 SPDK_DEBUGLOG(nvmf_vfio, "%s: try to delete cqid:%u=%p\n", ctrlr_id(vu_ctrlr), 1894 cq->qid, cq); 1895 1896 assert(cq->cq_ref > 0); 1897 if (--cq->cq_ref == 0) { 1898 delete_cq_done(vu_ctrlr, cq); 1899 } 1900 } 1901 } 1902 1903 static void 1904 free_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid) 1905 { 1906 struct nvmf_vfio_user_sq *sq; 1907 struct nvmf_vfio_user_cq *cq; 1908 1909 if (ctrlr == NULL) { 1910 return; 1911 } 1912 1913 sq = ctrlr->sqs[qid]; 1914 if (sq) { 1915 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free sqid:%u\n", ctrlr_id(ctrlr), qid); 1916 unmap_q(ctrlr, &sq->mapping); 1917 1918 free_sq_reqs(sq); 1919 1920 free(sq->mapping.sg); 1921 free(sq); 1922 ctrlr->sqs[qid] = NULL; 1923 } 1924 1925 cq = ctrlr->cqs[qid]; 1926 if (cq) { 1927 SPDK_DEBUGLOG(nvmf_vfio, "%s: Free cqid:%u\n", ctrlr_id(ctrlr), qid); 1928 unmap_q(ctrlr, &cq->mapping); 1929 free(cq->mapping.sg); 1930 free(cq); 1931 ctrlr->cqs[qid] = NULL; 1932 } 1933 } 1934 1935 static int 1936 init_sq(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport, 1937 const uint16_t id) 1938 { 1939 struct nvmf_vfio_user_sq *sq; 1940 1941 assert(ctrlr != NULL); 1942 assert(transport != NULL); 1943 assert(ctrlr->sqs[id] == NULL); 1944 1945 sq = calloc(1, sizeof(*sq)); 1946 if (sq == NULL) { 1947 return -ENOMEM; 1948 } 1949 sq->mapping.sg = calloc(1, dma_sg_size()); 1950 if (sq->mapping.sg == NULL) { 1951 free(sq); 1952 return -ENOMEM; 1953 } 1954 1955 sq->qid = id; 1956 sq->qpair.qid = id; 1957 sq->qpair.transport = transport; 1958 sq->ctrlr = ctrlr; 1959 ctrlr->sqs[id] = sq; 1960 1961 TAILQ_INIT(&sq->free_reqs); 1962 1963 return 0; 1964 } 1965 1966 static int 1967 init_cq(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t id) 1968 { 1969 struct nvmf_vfio_user_cq *cq; 1970 1971 assert(vu_ctrlr != NULL); 1972 assert(vu_ctrlr->cqs[id] == NULL); 1973 1974 cq = calloc(1, sizeof(*cq)); 1975 if (cq == NULL) { 1976 return -ENOMEM; 1977 } 1978 cq->mapping.sg = calloc(1, dma_sg_size()); 1979 if (cq->mapping.sg == NULL) { 1980 free(cq); 1981 return -ENOMEM; 1982 } 1983 1984 cq->qid = id; 1985 vu_ctrlr->cqs[id] = cq; 1986 1987 return 0; 1988 } 1989 1990 static int 1991 alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *sq) 1992 { 1993 struct nvmf_vfio_user_req *vu_req, *tmp; 1994 size_t req_size; 1995 uint32_t i; 1996 1997 req_size = sizeof(struct nvmf_vfio_user_req) + 1998 (dma_sg_size() * NVMF_VFIO_USER_MAX_IOVECS); 1999 2000 for (i = 0; i < sq->size; i++) { 2001 struct spdk_nvmf_request *req; 2002 2003 vu_req = calloc(1, req_size); 2004 if (vu_req == NULL) { 2005 goto err; 2006 } 2007 2008 req = &vu_req->req; 2009 req->qpair = &sq->qpair; 2010 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; 2011 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; 2012 req->stripped_data = NULL; 2013 2014 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 2015 } 2016 2017 return 0; 2018 2019 err: 2020 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) { 2021 free(vu_req); 2022 } 2023 return -ENOMEM; 2024 } 2025 2026 static volatile uint32_t * 2027 ctrlr_doorbell_ptr(struct nvmf_vfio_user_ctrlr *ctrlr) 2028 { 2029 return ctrlr->sdbl != NULL ? 
2030 ctrlr->sdbl->shadow_doorbells : 2031 ctrlr->bar0_doorbells; 2032 } 2033 2034 static uint16_t 2035 handle_create_io_sq(struct nvmf_vfio_user_ctrlr *ctrlr, 2036 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2037 { 2038 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2039 struct nvmf_vfio_user_sq *sq; 2040 uint32_t qsize; 2041 uint16_t cqid; 2042 uint16_t qid; 2043 int err; 2044 2045 qid = cmd->cdw10_bits.create_io_q.qid; 2046 cqid = cmd->cdw11_bits.create_io_sq.cqid; 2047 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2048 2049 if (ctrlr->sqs[qid] == NULL) { 2050 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid); 2051 if (err != 0) { 2052 *sct = SPDK_NVME_SCT_GENERIC; 2053 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2054 } 2055 } 2056 2057 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2058 SPDK_ERRLOG("%s: invalid cqid:%u\n", ctrlr_id(ctrlr), cqid); 2059 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2060 return SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2061 } 2062 2063 /* CQ must be created before SQ. */ 2064 if (!io_q_exists(ctrlr, cqid, true)) { 2065 SPDK_ERRLOG("%s: cqid:%u does not exist\n", ctrlr_id(ctrlr), cqid); 2066 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2067 return SPDK_NVME_SC_COMPLETION_QUEUE_INVALID; 2068 } 2069 2070 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) { 2071 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr)); 2072 *sct = SPDK_NVME_SCT_GENERIC; 2073 return SPDK_NVME_SC_INVALID_FIELD; 2074 } 2075 2076 sq = ctrlr->sqs[qid]; 2077 sq->size = qsize; 2078 2079 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%d cqid:%d\n", ctrlr_id(ctrlr), 2080 qid, cqid); 2081 2082 sq->mapping.prp1 = cmd->dptr.prp.prp1; 2083 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 2084 2085 err = map_q(ctrlr, &sq->mapping, MAP_INITIALIZE); 2086 if (err) { 2087 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2088 *sct = SPDK_NVME_SCT_GENERIC; 2089 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2090 } 2091 2092 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped sqid:%d IOVA=%#lx vaddr=%p\n", 2093 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2094 q_addr(&sq->mapping)); 2095 2096 err = alloc_sq_reqs(ctrlr, sq); 2097 if (err < 0) { 2098 SPDK_ERRLOG("%s: failed to allocate SQ requests: %m\n", ctrlr_id(ctrlr)); 2099 *sct = SPDK_NVME_SCT_GENERIC; 2100 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2101 } 2102 2103 sq->cqid = cqid; 2104 ctrlr->cqs[sq->cqid]->cq_ref++; 2105 sq->sq_state = VFIO_USER_SQ_CREATED; 2106 *sq_headp(sq) = 0; 2107 2108 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false); 2109 2110 /* 2111 * We should always reset the doorbells. 2112 * 2113 * The Specification prohibits the controller from writing to the shadow 2114 * doorbell buffer, however older versions of the Linux NVMe driver 2115 * don't reset the shadow doorbell buffer after a Queue-Level or 2116 * Controller-Level reset, which means that we're left with garbage 2117 * doorbell values. 2118 */ 2119 *sq_dbl_tailp(sq) = 0; 2120 2121 if (ctrlr->sdbl != NULL) { 2122 sq->need_rearm = true; 2123 2124 if (!set_sq_eventidx(sq)) { 2125 SPDK_ERRLOG("%s: host updated SQ tail doorbell before " 2126 "sqid:%hu was initialized\n", 2127 ctrlr_id(ctrlr), qid); 2128 fail_ctrlr(ctrlr); 2129 *sct = SPDK_NVME_SCT_GENERIC; 2130 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2131 } 2132 } 2133 2134 /* 2135 * Create our new I/O qpair. 
This asynchronously invokes, on a suitable 2136 * poll group, the nvmf_vfio_user_poll_group_add() callback, which will 2137 * call spdk_nvmf_request_exec() with a generated fabrics 2138 * connect command. This command is then eventually completed via 2139 * handle_queue_connect_rsp(). 2140 */ 2141 sq->create_io_sq_cmd = *cmd; 2142 sq->post_create_io_sq_completion = true; 2143 2144 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt, 2145 &sq->qpair); 2146 2147 *sct = SPDK_NVME_SCT_GENERIC; 2148 return SPDK_NVME_SC_SUCCESS; 2149 } 2150 2151 static uint16_t 2152 handle_create_io_cq(struct nvmf_vfio_user_ctrlr *ctrlr, 2153 struct spdk_nvme_cmd *cmd, uint16_t *sct) 2154 { 2155 struct nvmf_vfio_user_cq *cq; 2156 uint32_t qsize; 2157 uint16_t qid; 2158 int err; 2159 2160 qid = cmd->cdw10_bits.create_io_q.qid; 2161 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2162 2163 if (ctrlr->cqs[qid] == NULL) { 2164 err = init_cq(ctrlr, qid); 2165 if (err != 0) { 2166 *sct = SPDK_NVME_SCT_GENERIC; 2167 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2168 } 2169 } 2170 2171 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) { 2172 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr)); 2173 *sct = SPDK_NVME_SCT_GENERIC; 2174 return SPDK_NVME_SC_INVALID_FIELD; 2175 } 2176 2177 if (cmd->cdw11_bits.create_io_cq.iv > NVMF_VFIO_USER_MSIX_NUM - 1) { 2178 SPDK_ERRLOG("%s: IV is too big\n", ctrlr_id(ctrlr)); 2179 *sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2180 return SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR; 2181 } 2182 2183 cq = ctrlr->cqs[qid]; 2184 cq->size = qsize; 2185 2186 cq->mapping.prp1 = cmd->dptr.prp.prp1; 2187 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 2188 2189 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true); 2190 2191 err = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_INITIALIZE); 2192 if (err) { 2193 SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr)); 2194 *sct = SPDK_NVME_SCT_GENERIC; 2195 return SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2196 } 2197 2198 SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped cqid:%u IOVA=%#lx vaddr=%p\n", 2199 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1, 2200 q_addr(&cq->mapping)); 2201 2202 cq->ien = cmd->cdw11_bits.create_io_cq.ien; 2203 cq->iv = cmd->cdw11_bits.create_io_cq.iv; 2204 cq->phase = true; 2205 cq->cq_state = VFIO_USER_CQ_CREATED; 2206 2207 *cq_tailp(cq) = 0; 2208 2209 /* 2210 * We should always reset the doorbells. 2211 * 2212 * The Specification prohibits the controller from writing to the shadow 2213 * doorbell buffer, however older versions of the Linux NVMe driver 2214 * don't reset the shadow doorbell buffer after a Queue-Level or 2215 * Controller-Level reset, which means that we're left with garbage 2216 * doorbell values. 2217 */ 2218 *cq_dbl_headp(cq) = 0; 2219 2220 *sct = SPDK_NVME_SCT_GENERIC; 2221 return SPDK_NVME_SC_SUCCESS; 2222 } 2223 2224 /* 2225 * Creates a completion or submission I/O queue. Returns 0 on success, -errno 2226 * on error. 
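 * Note that a non-zero return here only means that posting the completion to
 * the admin CQ failed; NVMe-level errors are reported to the host via the
 * (sct, sc) pair carried in that completion instead.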
2227 */ 2228 static int 2229 handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2230 struct spdk_nvme_cmd *cmd, const bool is_cq) 2231 { 2232 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport; 2233 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2234 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2235 uint32_t qsize; 2236 uint16_t qid; 2237 2238 assert(ctrlr != NULL); 2239 assert(cmd != NULL); 2240 2241 qid = cmd->cdw10_bits.create_io_q.qid; 2242 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) { 2243 SPDK_ERRLOG("%s: invalid qid=%d, max=%d\n", ctrlr_id(ctrlr), 2244 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr); 2245 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2246 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2247 goto out; 2248 } 2249 2250 if (io_q_exists(ctrlr, qid, is_cq)) { 2251 SPDK_ERRLOG("%s: %cqid:%d already exists\n", ctrlr_id(ctrlr), 2252 is_cq ? 'c' : 's', qid); 2253 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2254 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2255 goto out; 2256 } 2257 2258 qsize = cmd->cdw10_bits.create_io_q.qsize + 1; 2259 if (qsize == 1 || qsize > max_queue_size(ctrlr)) { 2260 SPDK_ERRLOG("%s: invalid I/O queue size %u\n", ctrlr_id(ctrlr), qsize); 2261 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2262 sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE; 2263 goto out; 2264 } 2265 2266 if (is_cq) { 2267 sc = handle_create_io_cq(ctrlr, cmd, &sct); 2268 } else { 2269 sc = handle_create_io_sq(ctrlr, cmd, &sct); 2270 2271 if (sct == SPDK_NVME_SCT_GENERIC && 2272 sc == SPDK_NVME_SC_SUCCESS) { 2273 /* Completion posted asynchronously. */ 2274 return 0; 2275 } 2276 } 2277 2278 out: 2279 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2280 } 2281 2282 /* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free 2283 * queue pair, so save the command id and controller in a context. 2284 */ 2285 struct vfio_user_delete_sq_ctx { 2286 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2287 uint16_t cid; 2288 }; 2289 2290 static void 2291 vfio_user_qpair_delete_cb(void *cb_arg) 2292 { 2293 struct vfio_user_delete_sq_ctx *ctx = cb_arg; 2294 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr; 2295 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0]; 2296 2297 assert(admin_cq != NULL); 2298 assert(admin_cq->group != NULL); 2299 assert(admin_cq->group->group->thread != NULL); 2300 if (admin_cq->group->group->thread != spdk_get_thread()) { 2301 spdk_thread_send_msg(admin_cq->group->group->thread, 2302 vfio_user_qpair_delete_cb, 2303 cb_arg); 2304 } else { 2305 post_completion(vu_ctrlr, admin_cq, 0, 0, 2306 ctx->cid, 2307 SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 2308 free(ctx); 2309 } 2310 } 2311 2312 /* 2313 * Deletes a completion or submission I/O queue. 2314 */ 2315 static int 2316 handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, 2317 struct spdk_nvme_cmd *cmd, const bool is_cq) 2318 { 2319 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2320 uint16_t sc = SPDK_NVME_SC_SUCCESS; 2321 struct nvmf_vfio_user_sq *sq; 2322 struct nvmf_vfio_user_cq *cq; 2323 2324 SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n", 2325 ctrlr_id(ctrlr), is_cq ? 'c' : 's', 2326 cmd->cdw10_bits.delete_io_q.qid); 2327 2328 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) { 2329 SPDK_ERRLOG("%s: I/O %cqid:%d does not exist\n", ctrlr_id(ctrlr), 2330 is_cq ? 
'c' : 's', cmd->cdw10_bits.delete_io_q.qid); 2331 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2332 sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; 2333 goto out; 2334 } 2335 2336 if (is_cq) { 2337 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid]; 2338 if (cq->cq_ref) { 2339 SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr)); 2340 sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; 2341 sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION; 2342 goto out; 2343 } 2344 delete_cq_done(ctrlr, cq); 2345 } else { 2346 /* 2347 * Deletion of the CQ is only deferred to delete_sq_done() on 2348 * VM reboot or CC.EN change, so we have to delete it in all 2349 * other cases. 2350 */ 2351 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid]; 2352 sq->delete_ctx = calloc(1, sizeof(*sq->delete_ctx)); 2353 if (!sq->delete_ctx) { 2354 sct = SPDK_NVME_SCT_GENERIC; 2355 sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2356 goto out; 2357 } 2358 sq->delete_ctx->vu_ctrlr = ctrlr; 2359 sq->delete_ctx->cid = cmd->cid; 2360 sq->sq_state = VFIO_USER_SQ_DELETED; 2361 assert(ctrlr->cqs[sq->cqid]->cq_ref); 2362 ctrlr->cqs[sq->cqid]->cq_ref--; 2363 2364 spdk_nvmf_qpair_disconnect(&sq->qpair); 2365 return 0; 2366 } 2367 2368 out: 2369 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2370 } 2371 2372 /* 2373 * Configures Shadow Doorbells. 2374 */ 2375 static int 2376 handle_doorbell_buffer_config(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2377 { 2378 struct nvmf_vfio_user_shadow_doorbells *sdbl = NULL; 2379 uint32_t dstrd; 2380 uintptr_t page_size, page_mask; 2381 uint64_t prp1, prp2; 2382 uint16_t sct = SPDK_NVME_SCT_GENERIC; 2383 uint16_t sc = SPDK_NVME_SC_INVALID_FIELD; 2384 2385 assert(ctrlr != NULL); 2386 assert(ctrlr->endpoint != NULL); 2387 assert(cmd != NULL); 2388 2389 dstrd = doorbell_stride(ctrlr); 2390 page_size = memory_page_size(ctrlr); 2391 page_mask = memory_page_mask(ctrlr); 2392 2393 /* FIXME: we don't check doorbell stride when setting queue doorbells. */ 2394 if ((4u << dstrd) * NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR > page_size) { 2395 SPDK_ERRLOG("%s: doorbells do not fit in a single host page", 2396 ctrlr_id(ctrlr)); 2397 2398 goto out; 2399 } 2400 2401 /* Verify guest physical addresses passed as PRPs. */ 2402 if (cmd->psdt != SPDK_NVME_PSDT_PRP) { 2403 SPDK_ERRLOG("%s: received Doorbell Buffer Config without PRPs", 2404 ctrlr_id(ctrlr)); 2405 2406 goto out; 2407 } 2408 2409 prp1 = cmd->dptr.prp.prp1; 2410 prp2 = cmd->dptr.prp.prp2; 2411 2412 SPDK_DEBUGLOG(nvmf_vfio, 2413 "%s: configuring shadow doorbells with PRP1=%#lx and PRP2=%#lx (GPAs)\n", 2414 ctrlr_id(ctrlr), prp1, prp2); 2415 2416 if (prp1 == prp2 2417 || prp1 != (prp1 & page_mask) 2418 || prp2 != (prp2 & page_mask)) { 2419 SPDK_ERRLOG("%s: invalid shadow doorbell GPAs\n", 2420 ctrlr_id(ctrlr)); 2421 2422 goto out; 2423 } 2424 2425 /* Map guest physical addresses to our virtual address space. 
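 * Each PRP covers one host page: PRP1 is the shadow doorbell buffer that the
 * host writes, PRP2 is the EventIdx buffer that we write.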
*/ 2426 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size); 2427 if (sdbl == NULL) { 2428 SPDK_ERRLOG("%s: failed to map shadow doorbell buffers\n", 2429 ctrlr_id(ctrlr)); 2430 2431 goto out; 2432 } 2433 2434 ctrlr->shadow_doorbell_buffer = prp1; 2435 ctrlr->eventidx_buffer = prp2; 2436 2437 SPDK_DEBUGLOG(nvmf_vfio, 2438 "%s: mapped shadow doorbell buffers [%p, %p) and [%p, %p)\n", 2439 ctrlr_id(ctrlr), 2440 sdbl->iovs[0].iov_base, 2441 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len, 2442 sdbl->iovs[1].iov_base, 2443 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len); 2444 2445 2446 /* 2447 * Set all possible CQ head doorbells to polling mode now, such that we 2448 * don't have to worry about it later if the host creates more queues. 2449 * 2450 * We only ever want interrupts for writes to the SQ tail doorbells 2451 * (which are initialised in set_ctrlr_intr_mode() below). 2452 */ 2453 for (uint16_t i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; ++i) { 2454 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL; 2455 } 2456 2457 /* Update controller. */ 2458 SWAP(ctrlr->sdbl, sdbl); 2459 2460 /* 2461 * Copy doorbells from either the previous shadow doorbell buffer or the 2462 * BAR0 doorbells and make I/O queue doorbells point to the new buffer. 2463 * 2464 * This needs to account for older versions of the Linux NVMe driver, 2465 * which don't clear out the buffer after a controller reset. 2466 */ 2467 copy_doorbells(ctrlr, sdbl != NULL ? 2468 sdbl->shadow_doorbells : ctrlr->bar0_doorbells, 2469 ctrlr->sdbl->shadow_doorbells); 2470 2471 vfio_user_ctrlr_switch_doorbells(ctrlr, true); 2472 2473 ctrlr_kick(ctrlr); 2474 2475 sc = SPDK_NVME_SC_SUCCESS; 2476 2477 out: 2478 /* 2479 * Unmap existing buffers, in case Doorbell Buffer Config was sent 2480 * more than once (pointless, but not prohibited by the spec), or 2481 * in case of an error. 2482 * 2483 * If this is the first time Doorbell Buffer Config was processed, 2484 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so 2485 * free_sdbl() becomes a noop. 2486 */ 2487 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl); 2488 2489 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct); 2490 } 2491 2492 /* Returns 0 on success and -errno on error. */ 2493 static int 2494 consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd) 2495 { 2496 assert(ctrlr != NULL); 2497 assert(cmd != NULL); 2498 2499 if (cmd->fuse != 0) { 2500 /* Fused admin commands are not supported. 
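 * A non-zero FUSE field marks the first or second command of a fused pair;
 * we simply fail it with Invalid Field in Command.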
*/ 2501 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, 2502 SPDK_NVME_SC_INVALID_FIELD, 2503 SPDK_NVME_SCT_GENERIC); 2504 } 2505 2506 switch (cmd->opc) { 2507 case SPDK_NVME_OPC_CREATE_IO_CQ: 2508 case SPDK_NVME_OPC_CREATE_IO_SQ: 2509 return handle_create_io_q(ctrlr, cmd, 2510 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ); 2511 case SPDK_NVME_OPC_DELETE_IO_SQ: 2512 case SPDK_NVME_OPC_DELETE_IO_CQ: 2513 return handle_del_io_q(ctrlr, cmd, 2514 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ); 2515 case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: 2516 SPDK_NOTICELOG("%s: requested shadow doorbells (supported: %d)\n", 2517 ctrlr_id(ctrlr), 2518 !ctrlr->transport->transport_opts.disable_shadow_doorbells); 2519 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) { 2520 return handle_doorbell_buffer_config(ctrlr, cmd); 2521 } 2522 /* FALLTHROUGH */ 2523 default: 2524 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]); 2525 } 2526 } 2527 2528 static int 2529 handle_cmd_rsp(struct nvmf_vfio_user_req *vu_req, void *cb_arg) 2530 { 2531 struct nvmf_vfio_user_sq *sq = cb_arg; 2532 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr; 2533 uint16_t sqid, cqid; 2534 2535 assert(sq != NULL); 2536 assert(vu_req != NULL); 2537 assert(vu_ctrlr != NULL); 2538 2539 if (spdk_likely(vu_req->iovcnt)) { 2540 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, 2541 index_to_sg_t(vu_req->sg, 0), 2542 vu_req->iov, vu_req->iovcnt); 2543 } 2544 sqid = sq->qid; 2545 cqid = sq->cqid; 2546 2547 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid], 2548 vu_req->req.rsp->nvme_cpl.cdw0, 2549 sqid, 2550 vu_req->req.cmd->nvme_cmd.cid, 2551 vu_req->req.rsp->nvme_cpl.status.sc, 2552 vu_req->req.rsp->nvme_cpl.status.sct); 2553 } 2554 2555 static int 2556 consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_sq *sq, 2557 struct spdk_nvme_cmd *cmd) 2558 { 2559 assert(sq != NULL); 2560 if (spdk_unlikely(nvmf_qpair_is_admin_queue(&sq->qpair))) { 2561 return consume_admin_cmd(ctrlr, cmd); 2562 } 2563 2564 return handle_cmd_req(ctrlr, cmd, sq); 2565 } 2566 2567 /* Returns the number of commands processed, or a negative value on error. */ 2568 static int 2569 handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail, 2570 struct nvmf_vfio_user_sq *sq) 2571 { 2572 struct spdk_nvme_cmd *queue; 2573 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 2574 int count = 0; 2575 uint32_t free_cq_slots; 2576 2577 assert(ctrlr != NULL); 2578 assert(sq != NULL); 2579 2580 if (ctrlr->sdbl != NULL && sq->qid != 0) { 2581 /* 2582 * Submission queue index has moved past the event index, so it 2583 * needs to be re-armed before we go to sleep. 2584 */ 2585 sq->need_rearm = true; 2586 } 2587 2588 free_cq_slots = cq_free_slots(cq); 2589 queue = q_addr(&sq->mapping); 2590 while (*sq_headp(sq) != new_tail) { 2591 int err; 2592 struct spdk_nvme_cmd *cmd; 2593 2594 /* 2595 * At least the Linux nvme driver can submit more requests than 2596 * our current view of the available free CQ slots, although it 2597 * is not clear exactly why or how; it is relatively rare even 2598 * under high load. 2599 * 2600 * As we need to make sure we have free CQ slots (see 2601 * post_completion()), we implement flow control here: if the 2602 * number of currently outstanding requests for this SQ would 2603 * use all the available CQ slots, then we cannot submit this 2604 * new request. 2605 * 2606 * Instead we back off until the driver has informed us that CQ 2607 * slots are available. 
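 *
 * As a rough illustration (numbers are examples only): with a CQ of size 8,
 * at most 7 entries are usable before the queue is considered full. If 7
 * commands consumed from SQs sharing this CQ are still outstanding, the
 * check below re-reads the CQ head doorbell and, if no entries have been
 * freed in the meantime, stops consuming this SQ and records a cq_full
 * event.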
2608 */ 2609 if ((free_cq_slots-- <= cq->nr_outstanding)) { 2610 struct nvmf_vfio_user_poll_group *vu_group; 2611 cq->last_head = *cq_dbl_headp(cq); 2612 2613 free_cq_slots = cq_free_slots(cq); 2614 if (free_cq_slots > cq->nr_outstanding) { 2615 continue; 2616 } 2617 2618 vu_group = sq_to_poll_group(sq); 2619 2620 vu_group->stats.cq_full++; 2621 2622 /* 2623 * There are no free CQ slots, so stop processing 2624 * submissions for this SQ until "a later time". In 2625 * interrupt mode, we need to kick ourselves, so that we 2626 * are guaranteed to wake up and come back here. 2627 */ 2628 if (in_interrupt_mode(ctrlr->transport)) { 2629 vu_group->need_kick = true; 2630 } 2631 break; 2632 } 2633 2634 cmd = &queue[*sq_headp(sq)]; 2635 count++; 2636 2637 cq->nr_outstanding++; 2638 2639 /* 2640 * SQHD must contain the new head pointer, so we must increase 2641 * it before we generate a completion. 2642 */ 2643 sq_head_advance(sq); 2644 2645 err = consume_cmd(ctrlr, sq, cmd); 2646 if (spdk_unlikely(err != 0)) { 2647 return err; 2648 } 2649 } 2650 2651 return count; 2652 } 2653 2654 /* Checks whether endpoint is connected from the same process */ 2655 static bool 2656 is_peer_same_process(struct nvmf_vfio_user_endpoint *endpoint) 2657 { 2658 struct ucred ucred; 2659 socklen_t ucredlen = sizeof(ucred); 2660 2661 if (endpoint == NULL) { 2662 return false; 2663 } 2664 2665 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred, 2666 &ucredlen) < 0) { 2667 SPDK_ERRLOG("getsockopt(SO_PEERCRED): %s\n", strerror(errno)); 2668 return false; 2669 } 2670 2671 return ucred.pid == getpid(); 2672 } 2673 2674 static void 2675 memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2676 { 2677 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2678 struct nvmf_vfio_user_ctrlr *ctrlr; 2679 struct nvmf_vfio_user_sq *sq; 2680 struct nvmf_vfio_user_cq *cq; 2681 void *map_start, *map_end; 2682 int ret; 2683 2684 /* 2685 * We're not interested in any DMA regions that aren't mappable (we don't 2686 * support clients that don't share their memory). 2687 */ 2688 if (!info->vaddr) { 2689 return; 2690 } 2691 2692 map_start = info->mapping.iov_base; 2693 map_end = info->mapping.iov_base + info->mapping.iov_len; 2694 2695 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2696 (info->mapping.iov_len & MASK_2MB)) { 2697 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2698 info->vaddr, map_start, map_end); 2699 return; 2700 } 2701 2702 assert(endpoint != NULL); 2703 if (endpoint->ctrlr == NULL) { 2704 return; 2705 } 2706 ctrlr = endpoint->ctrlr; 2707 2708 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint), 2709 map_start, map_end); 2710 2711 /* VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE are enabled when registering to VFIO, here we also 2712 * check the protection bits before registering. When vfio client and server are run in same process 2713 * there is no need to register the same memory again. 
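 * When they run in the same process, the client's memory is the server's own
 * memory, which SPDK is assumed to know about already, so registering it
 * again would be redundant (and could be rejected).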
2714 */ 2715 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2716 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len); 2717 if (ret) { 2718 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n", 2719 map_start, map_end, ret); 2720 } 2721 } 2722 2723 pthread_mutex_lock(&endpoint->lock); 2724 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2725 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) { 2726 continue; 2727 } 2728 2729 cq = ctrlr->cqs[sq->cqid]; 2730 2731 /* For shared CQ case, we will use q_addr() to avoid mapping CQ multiple times */ 2732 if (cq->size && q_addr(&cq->mapping) == NULL) { 2733 ret = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_QUIET); 2734 if (ret) { 2735 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n", 2736 cq->qid, cq->mapping.prp1, 2737 cq->mapping.prp1 + cq->mapping.len); 2738 continue; 2739 } 2740 } 2741 2742 if (sq->size) { 2743 ret = map_q(ctrlr, &sq->mapping, MAP_R | MAP_QUIET); 2744 if (ret) { 2745 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n", 2746 sq->qid, sq->mapping.prp1, 2747 sq->mapping.prp1 + sq->mapping.len); 2748 continue; 2749 } 2750 } 2751 sq->sq_state = VFIO_USER_SQ_ACTIVE; 2752 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid); 2753 } 2754 pthread_mutex_unlock(&endpoint->lock); 2755 } 2756 2757 static void 2758 memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) 2759 { 2760 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 2761 struct nvmf_vfio_user_sq *sq; 2762 struct nvmf_vfio_user_cq *cq; 2763 void *map_start, *map_end; 2764 int ret = 0; 2765 2766 if (!info->vaddr) { 2767 return; 2768 } 2769 2770 map_start = info->mapping.iov_base; 2771 map_end = info->mapping.iov_base + info->mapping.iov_len; 2772 2773 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || 2774 (info->mapping.iov_len & MASK_2MB)) { 2775 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n", 2776 info->vaddr, map_start, map_end); 2777 return; 2778 } 2779 2780 assert(endpoint != NULL); 2781 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint), 2782 map_start, map_end); 2783 2784 if (endpoint->ctrlr != NULL) { 2785 struct nvmf_vfio_user_ctrlr *ctrlr; 2786 ctrlr = endpoint->ctrlr; 2787 2788 pthread_mutex_lock(&endpoint->lock); 2789 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) { 2790 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) { 2791 unmap_q(ctrlr, &sq->mapping); 2792 sq->sq_state = VFIO_USER_SQ_INACTIVE; 2793 } 2794 2795 cq = ctrlr->cqs[sq->cqid]; 2796 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) { 2797 unmap_q(ctrlr, &cq->mapping); 2798 } 2799 } 2800 2801 if (ctrlr->sdbl != NULL) { 2802 size_t i; 2803 2804 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; i++) { 2805 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base; 2806 2807 if (iov_base >= map_start && iov_base < map_end) { 2808 copy_doorbells(ctrlr, 2809 ctrlr->sdbl->shadow_doorbells, 2810 ctrlr->bar0_doorbells); 2811 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 2812 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 2813 ctrlr->sdbl = NULL; 2814 break; 2815 } 2816 } 2817 } 2818 2819 pthread_mutex_unlock(&endpoint->lock); 2820 } 2821 2822 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) { 2823 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len); 2824 if (ret) { 2825 SPDK_ERRLOG("Memory region unregister %p-%p 
failed, ret=%d\n", 2826 map_start, map_end, ret); 2827 } 2828 } 2829 } 2830 2831 /* Used to initiate a controller-level reset or a controller shutdown. */ 2832 static void 2833 disable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2834 { 2835 SPDK_NOTICELOG("%s: disabling controller\n", ctrlr_id(vu_ctrlr)); 2836 2837 /* Unmap Admin queue. */ 2838 2839 assert(vu_ctrlr->sqs[0] != NULL); 2840 assert(vu_ctrlr->cqs[0] != NULL); 2841 2842 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping); 2843 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping); 2844 2845 vu_ctrlr->sqs[0]->size = 0; 2846 *sq_headp(vu_ctrlr->sqs[0]) = 0; 2847 2848 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE; 2849 2850 vu_ctrlr->cqs[0]->size = 0; 2851 *cq_tailp(vu_ctrlr->cqs[0]) = 0; 2852 2853 /* 2854 * For PCIe controller reset or shutdown, we will drop all AER 2855 * responses. 2856 */ 2857 spdk_nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr); 2858 2859 /* Free the shadow doorbell buffer. */ 2860 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 2861 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl); 2862 vu_ctrlr->sdbl = NULL; 2863 } 2864 2865 /* Used to re-enable the controller after a controller-level reset. */ 2866 static int 2867 enable_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 2868 { 2869 int err; 2870 2871 assert(vu_ctrlr != NULL); 2872 2873 SPDK_NOTICELOG("%s: enabling controller\n", ctrlr_id(vu_ctrlr)); 2874 2875 err = acq_setup(vu_ctrlr); 2876 if (err != 0) { 2877 return err; 2878 } 2879 2880 err = asq_setup(vu_ctrlr); 2881 if (err != 0) { 2882 return err; 2883 } 2884 2885 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE; 2886 2887 return 0; 2888 } 2889 2890 static int 2891 nvmf_vfio_user_prop_req_rsp_set(struct nvmf_vfio_user_req *req, 2892 struct nvmf_vfio_user_sq *sq) 2893 { 2894 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 2895 union spdk_nvme_cc_register cc, diff; 2896 2897 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET); 2898 assert(sq->ctrlr != NULL); 2899 vu_ctrlr = sq->ctrlr; 2900 2901 if (req->req.cmd->prop_set_cmd.ofst != offsetof(struct spdk_nvme_registers, cc)) { 2902 return 0; 2903 } 2904 2905 cc.raw = req->req.cmd->prop_set_cmd.value.u64; 2906 diff.raw = cc.raw ^ req->cc.raw; 2907 2908 if (diff.bits.en) { 2909 if (cc.bits.en) { 2910 int ret = enable_ctrlr(vu_ctrlr); 2911 if (ret) { 2912 SPDK_ERRLOG("%s: failed to enable ctrlr\n", ctrlr_id(vu_ctrlr)); 2913 return ret; 2914 } 2915 vu_ctrlr->reset_shn = false; 2916 } else { 2917 vu_ctrlr->reset_shn = true; 2918 } 2919 } 2920 2921 if (diff.bits.shn) { 2922 if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { 2923 vu_ctrlr->reset_shn = true; 2924 } 2925 } 2926 2927 if (vu_ctrlr->reset_shn) { 2928 disable_ctrlr(vu_ctrlr); 2929 } 2930 return 0; 2931 } 2932 2933 static int 2934 nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 2935 { 2936 struct nvmf_vfio_user_sq *sq = cb_arg; 2937 2938 assert(sq != NULL); 2939 assert(req != NULL); 2940 2941 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) { 2942 assert(sq->ctrlr != NULL); 2943 assert(req != NULL); 2944 2945 memcpy(req->req.iov[0].iov_base, 2946 &req->req.rsp->prop_get_rsp.value.u64, 2947 req->req.length); 2948 return 0; 2949 } 2950 2951 return nvmf_vfio_user_prop_req_rsp_set(req, sq); 2952 } 2953 2954 /* 2955 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a 2956 * doorbell is written via access_bar0_fn(). 2957 * 2958 * DSTRD is set to fixed value 0 for NVMf. 
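 * With DSTRD fixed at 0, the decode below is simply
 * index = (pos - 0x1000) / 4; even indexes are SQ tail doorbells and odd
 * ones are CQ head doorbells for qid = index / 2. For example, a 4-byte
 * write at BAR0 offset 0x1008 is the SQ tail doorbell of qid 1, and 0x100C
 * is the matching CQ head doorbell (offsets here are illustrative).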
2959 * 2960 */ 2961 static int 2962 handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf, 2963 const size_t count, loff_t pos, const bool is_write) 2964 { 2965 struct nvmf_vfio_user_poll_group *group; 2966 2967 assert(ctrlr != NULL); 2968 assert(buf != NULL); 2969 2970 if (spdk_unlikely(!is_write)) { 2971 SPDK_WARNLOG("%s: host tried to read BAR0 doorbell %#lx\n", 2972 ctrlr_id(ctrlr), pos); 2973 errno = EPERM; 2974 return -1; 2975 } 2976 2977 if (spdk_unlikely(count != sizeof(uint32_t))) { 2978 SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n", 2979 ctrlr_id(ctrlr), count); 2980 errno = EINVAL; 2981 return -1; 2982 } 2983 2984 pos -= NVME_DOORBELLS_OFFSET; 2985 2986 /* pos must be dword aligned */ 2987 if (spdk_unlikely((pos & 0x3) != 0)) { 2988 SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos); 2989 errno = EINVAL; 2990 return -1; 2991 } 2992 2993 /* convert byte offset to array index */ 2994 pos >>= 2; 2995 2996 if (spdk_unlikely(pos >= NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR * 2)) { 2997 SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos); 2998 errno = EINVAL; 2999 return -1; 3000 } 3001 3002 ctrlr->bar0_doorbells[pos] = *buf; 3003 spdk_wmb(); 3004 3005 group = ctrlr_to_poll_group(ctrlr); 3006 if (pos == 1) { 3007 group->stats.cqh_admin_writes++; 3008 } else if (pos & 1) { 3009 group->stats.cqh_io_writes++; 3010 } 3011 3012 SPDK_DEBUGLOG(vfio_user_db, "%s: updating BAR0 doorbell %s:%ld to %u\n", 3013 ctrlr_id(ctrlr), (pos & 1) ? "cqid" : "sqid", 3014 pos / 2, *buf); 3015 3016 3017 return 0; 3018 } 3019 3020 static size_t 3021 vfio_user_property_access(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3022 char *buf, size_t count, loff_t pos, 3023 bool is_write) 3024 { 3025 struct nvmf_vfio_user_req *req; 3026 const struct spdk_nvmf_registers *regs; 3027 3028 if ((count != 4) && (count != 8)) { 3029 errno = EINVAL; 3030 return -1; 3031 } 3032 3033 /* Construct a Fabric Property Get/Set command and send it */ 3034 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]); 3035 if (req == NULL) { 3036 errno = ENOBUFS; 3037 return -1; 3038 } 3039 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr); 3040 req->cc.raw = regs->cc.raw; 3041 3042 req->cb_fn = nvmf_vfio_user_prop_req_rsp; 3043 req->cb_arg = vu_ctrlr->sqs[0]; 3044 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC; 3045 req->req.cmd->prop_set_cmd.cid = 0; 3046 if (count == 4) { 3047 req->req.cmd->prop_set_cmd.attrib.size = 0; 3048 } else { 3049 req->req.cmd->prop_set_cmd.attrib.size = 1; 3050 } 3051 req->req.cmd->prop_set_cmd.ofst = pos; 3052 if (is_write) { 3053 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; 3054 if (req->req.cmd->prop_set_cmd.attrib.size) { 3055 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf; 3056 } else { 3057 req->req.cmd->prop_set_cmd.value.u32.high = 0; 3058 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf; 3059 } 3060 } else { 3061 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; 3062 } 3063 req->req.length = count; 3064 SPDK_IOV_ONE(req->req.iov, &req->req.iovcnt, buf, req->req.length); 3065 3066 spdk_nvmf_request_exec(&req->req); 3067 3068 return count; 3069 } 3070 3071 static ssize_t 3072 access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos, 3073 bool is_write) 3074 { 3075 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3076 struct nvmf_vfio_user_ctrlr *ctrlr; 3077 int ret; 3078 3079 ctrlr = endpoint->ctrlr; 3080 if (spdk_unlikely(endpoint->need_async_destroy || 
!ctrlr)) { 3081 errno = EIO; 3082 return -1; 3083 } 3084 3085 if (pos >= NVME_DOORBELLS_OFFSET) { 3086 /* 3087 * The fact that the doorbells can be memory mapped doesn't mean 3088 * that the client (VFIO in QEMU) is obliged to memory map them, 3089 * it might still elect to access them via regular read/write; 3090 * we might also have had disable_mappable_bar0 set. 3091 */ 3092 ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count, 3093 pos, is_write); 3094 if (ret == 0) { 3095 return count; 3096 } 3097 return ret; 3098 } 3099 3100 return vfio_user_property_access(ctrlr, buf, count, pos, is_write); 3101 } 3102 3103 static ssize_t 3104 access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, 3105 bool is_write) 3106 { 3107 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3108 3109 if (is_write) { 3110 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n", 3111 endpoint_id(endpoint), offset, offset + count); 3112 errno = EINVAL; 3113 return -1; 3114 } 3115 3116 if (offset + count > NVME_REG_CFG_SIZE) { 3117 SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", 3118 endpoint_id(endpoint), offset, count, 3119 NVME_REG_CFG_SIZE); 3120 errno = ERANGE; 3121 return -1; 3122 } 3123 3124 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count); 3125 3126 return count; 3127 } 3128 3129 static void 3130 vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg) 3131 { 3132 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3133 3134 if (level >= LOG_DEBUG) { 3135 SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3136 } else if (level >= LOG_INFO) { 3137 SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg); 3138 } else if (level >= LOG_NOTICE) { 3139 SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg); 3140 } else if (level >= LOG_WARNING) { 3141 SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg); 3142 } else { 3143 SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg); 3144 } 3145 } 3146 3147 static int 3148 vfio_user_get_log_level(void) 3149 { 3150 int level; 3151 3152 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3153 return LOG_DEBUG; 3154 } 3155 3156 level = spdk_log_to_syslog_level(spdk_log_get_level()); 3157 if (level < 0) { 3158 return LOG_ERR; 3159 } 3160 3161 return level; 3162 } 3163 3164 static void 3165 init_pci_config_space(vfu_pci_config_space_t *p) 3166 { 3167 /* MLBAR */ 3168 p->hdr.bars[0].raw = 0x0; 3169 /* MUBAR */ 3170 p->hdr.bars[1].raw = 0x0; 3171 3172 /* vendor specific, let's set them to zero for now */ 3173 p->hdr.bars[3].raw = 0x0; 3174 p->hdr.bars[4].raw = 0x0; 3175 p->hdr.bars[5].raw = 0x0; 3176 3177 /* enable INTx */ 3178 p->hdr.intr.ipin = 0x1; 3179 } 3180 3181 struct ctrlr_quiesce_ctx { 3182 struct nvmf_vfio_user_endpoint *endpoint; 3183 struct nvmf_vfio_user_poll_group *group; 3184 int status; 3185 }; 3186 3187 static void ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr); 3188 3189 static void 3190 _vfio_user_endpoint_resume_done_msg(void *ctx) 3191 { 3192 struct nvmf_vfio_user_endpoint *endpoint = ctx; 3193 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3194 3195 endpoint->need_resume = false; 3196 3197 if (!vu_ctrlr) { 3198 return; 3199 } 3200 3201 if (!vu_ctrlr->queued_quiesce) { 3202 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3203 3204 /* 3205 * We might have ignored new SQ entries while we were quiesced: 3206 * kick ourselves so we'll definitely check again while in 3207 * VFIO_USER_CTRLR_RUNNING state. 
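 * (In interrupt mode the SQ poller only runs when an eventfd fires, so
 * without an explicit kick, entries written while we were paused might not
 * be noticed until the next doorbell write.)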
3208 */ 3209 if (in_interrupt_mode(endpoint->transport)) { 3210 ctrlr_kick(vu_ctrlr); 3211 } 3212 return; 3213 } 3214 3215 3216 /* 3217 * Basically, once we call `vfu_device_quiesced` the device is 3218 * unquiesced from libvfio-user's perspective so from the moment 3219 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device 3220 * again. However, because the NVMf subsystem is an asynchronous 3221 * operation, this quiesce might come _before_ the NVMf subsystem has 3222 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we 3223 * need to check whether a quiesce was requested. 3224 */ 3225 SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n", 3226 ctrlr_id(vu_ctrlr)); 3227 ctrlr_quiesce(vu_ctrlr); 3228 } 3229 3230 static void 3231 vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem, 3232 void *cb_arg, int status) 3233 { 3234 struct nvmf_vfio_user_endpoint *endpoint = cb_arg; 3235 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3236 3237 SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", endpoint_id(endpoint), status); 3238 3239 if (!vu_ctrlr) { 3240 return; 3241 } 3242 3243 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint); 3244 } 3245 3246 static void 3247 vfio_user_quiesce_done(void *ctx) 3248 { 3249 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3250 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3251 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3252 int ret; 3253 3254 if (!vu_ctrlr) { 3255 free(quiesce_ctx); 3256 return; 3257 } 3258 3259 SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr)); 3260 3261 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); 3262 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3263 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status); 3264 vu_ctrlr->queued_quiesce = false; 3265 free(quiesce_ctx); 3266 3267 /* `vfu_device_quiesced` can change the migration state, 3268 * so we need to re-check `vu_ctrlr->state`. 3269 */ 3270 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) { 3271 SPDK_DEBUGLOG(nvmf_vfio, "%s is in MIGRATION state\n", ctrlr_id(vu_ctrlr)); 3272 return; 3273 } 3274 3275 SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); 3276 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 3277 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 3278 vfio_user_endpoint_resume_done, endpoint); 3279 if (ret < 0) { 3280 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 3281 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 3282 } 3283 } 3284 3285 static void 3286 vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem, 3287 void *ctx, int status) 3288 { 3289 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3290 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3291 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3292 3293 if (!vu_ctrlr) { 3294 free(quiesce_ctx); 3295 return; 3296 } 3297 3298 quiesce_ctx->status = status; 3299 3300 SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n", 3301 ctrlr_id(vu_ctrlr), status); 3302 3303 spdk_thread_send_msg(vu_ctrlr->thread, 3304 vfio_user_quiesce_done, ctx); 3305 } 3306 3307 /* 3308 * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll(); 3309 * we've already set ctrlr->state, so we won't process new entries, but we need 3310 * to ensure that this PG is quiesced. 
This only works because there's no 3311 * callback context set up between polling the SQ and spdk_nvmf_request_exec(). 3312 * 3313 * Once we've walked all PGs, we need to pause any submitted I/O via 3314 * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG). 3315 */ 3316 static void 3317 vfio_user_quiesce_pg(void *ctx) 3318 { 3319 struct ctrlr_quiesce_ctx *quiesce_ctx = ctx; 3320 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint; 3321 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3322 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group; 3323 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3324 int ret; 3325 3326 SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group); 3327 3328 if (!vu_ctrlr) { 3329 free(quiesce_ctx); 3330 return; 3331 } 3332 3333 quiesce_ctx->group = TAILQ_NEXT(vu_group, link); 3334 if (quiesce_ctx->group != NULL) { 3335 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3336 vfio_user_quiesce_pg, quiesce_ctx); 3337 return; 3338 } 3339 3340 ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG, 3341 vfio_user_pause_done, quiesce_ctx); 3342 if (ret < 0) { 3343 SPDK_ERRLOG("%s: failed to pause, ret=%d\n", 3344 endpoint_id(endpoint), ret); 3345 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 3346 fail_ctrlr(vu_ctrlr); 3347 free(quiesce_ctx); 3348 } 3349 } 3350 3351 static void 3352 ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3353 { 3354 struct ctrlr_quiesce_ctx *quiesce_ctx; 3355 3356 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; 3357 3358 quiesce_ctx = calloc(1, sizeof(*quiesce_ctx)); 3359 if (!quiesce_ctx) { 3360 SPDK_ERRLOG("Failed to allocate subsystem pause context\n"); 3361 assert(false); 3362 return; 3363 } 3364 3365 quiesce_ctx->endpoint = vu_ctrlr->endpoint; 3366 quiesce_ctx->status = 0; 3367 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups); 3368 3369 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group), 3370 vfio_user_quiesce_pg, quiesce_ctx); 3371 } 3372 3373 static int 3374 vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) 3375 { 3376 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3377 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem; 3378 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3379 3380 if (!vu_ctrlr) { 3381 return 0; 3382 } 3383 3384 /* NVMf library will destruct controller when no 3385 * connected queue pairs. 3386 */ 3387 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) { 3388 return 0; 3389 } 3390 3391 SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); 3392 3393 /* There is no race condition here as device quiesce callback 3394 * and nvmf_prop_set_cc() are running in the same thread context. 
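 *
 * The checks below skip quiescing when the controller is not yet enabled,
 * not ready, or already shut down, since there is nothing to pause in those
 * states.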
3395 */ 3396 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { 3397 return 0; 3398 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { 3399 return 0; 3400 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { 3401 return 0; 3402 } 3403 3404 switch (vu_ctrlr->state) { 3405 case VFIO_USER_CTRLR_PAUSED: 3406 case VFIO_USER_CTRLR_MIGRATING: 3407 return 0; 3408 case VFIO_USER_CTRLR_RUNNING: 3409 ctrlr_quiesce(vu_ctrlr); 3410 break; 3411 case VFIO_USER_CTRLR_RESUMING: 3412 vu_ctrlr->queued_quiesce = true; 3413 SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), 3414 vu_ctrlr->state); 3415 break; 3416 default: 3417 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); 3418 break; 3419 } 3420 3421 errno = EBUSY; 3422 return -1; 3423 } 3424 3425 static void 3426 vfio_user_ctrlr_dump_migr_data(const char *name, 3427 struct vfio_user_nvme_migr_state *migr_data, 3428 struct nvmf_vfio_user_shadow_doorbells *sdbl) 3429 { 3430 struct spdk_nvmf_registers *regs; 3431 struct nvme_migr_sq_state *sq; 3432 struct nvme_migr_cq_state *cq; 3433 uint32_t *doorbell_base; 3434 uint32_t i; 3435 3436 SPDK_NOTICELOG("Dump %s\n", name); 3437 3438 regs = &migr_data->nvmf_data.regs; 3439 doorbell_base = (uint32_t *)&migr_data->doorbells; 3440 3441 SPDK_NOTICELOG("Registers\n"); 3442 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw); 3443 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw); 3444 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw); 3445 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw); 3446 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw); 3447 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq); 3448 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq); 3449 3450 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues); 3451 3452 if (sdbl != NULL) { 3453 SPDK_NOTICELOG("shadow doorbell buffer=%#lx\n", 3454 migr_data->ctrlr_header.shadow_doorbell_buffer); 3455 SPDK_NOTICELOG("eventidx buffer=%#lx\n", 3456 migr_data->ctrlr_header.eventidx_buffer); 3457 } 3458 3459 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3460 sq = &migr_data->qps[i].sq; 3461 cq = &migr_data->qps[i].cq; 3462 3463 if (sq->size) { 3464 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]); 3465 if (i > 0 && sdbl != NULL) { 3466 SPDK_NOTICELOG("sqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3467 sq->sqid, 3468 sdbl->shadow_doorbells[queue_index(i, false)], 3469 sdbl->eventidxs[queue_index(i, false)]); 3470 } 3471 SPDK_NOTICELOG("SQ sqid:%u, cqid:%u, sqhead:%u, size:%u, dma_addr:0x%"PRIx64"\n", 3472 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr); 3473 } 3474 3475 if (cq->size) { 3476 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]); 3477 if (i > 0 && sdbl != NULL) { 3478 SPDK_NOTICELOG("cqid:%u, shadow_doorbell:%u, eventidx:%u\n", 3479 cq->cqid, 3480 sdbl->shadow_doorbells[queue_index(i, true)], 3481 sdbl->eventidxs[queue_index(i, true)]); 3482 } 3483 SPDK_NOTICELOG("CQ cqid:%u, phase:%u, cqtail:%u, size:%u, iv:%u, ien:%u, dma_addr:0x%"PRIx64"\n", 3484 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr); 3485 } 3486 } 3487 3488 SPDK_NOTICELOG("%s Dump Done\n", name); 3489 } 3490 3491 /* Read region 9 content and restore it to migration data structures */ 3492 static int 3493 vfio_user_migr_stream_to_data(struct nvmf_vfio_user_endpoint *endpoint, 3494 struct vfio_user_nvme_migr_state *migr_state) 3495 { 3496 void *data_ptr = endpoint->migr_data; 3497 3498 /* Load vfio_user_nvme_migr_header first */ 3499 
memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header)); 3500 /* TODO: version check */ 3501 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) { 3502 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic); 3503 return -EINVAL; 3504 } 3505 3506 /* Load nvmf controller data */ 3507 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset; 3508 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len); 3509 3510 /* Load queue pairs */ 3511 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset; 3512 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len); 3513 3514 /* Load doorbells */ 3515 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX]; 3516 memcpy(&migr_state->doorbells, data_ptr, 3517 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]); 3518 3519 /* Load CFG */ 3520 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX]; 3521 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]); 3522 3523 return 0; 3524 } 3525 3526 3527 static void 3528 vfio_user_migr_ctrlr_save_data(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3529 { 3530 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3531 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3532 struct nvmf_vfio_user_sq *sq; 3533 struct nvmf_vfio_user_cq *cq; 3534 uint64_t data_offset; 3535 void *data_ptr; 3536 uint32_t *doorbell_base; 3537 uint32_t i = 0; 3538 uint16_t sqid, cqid; 3539 struct vfio_user_nvme_migr_state migr_state = { 3540 .nvmf_data = { 3541 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3542 .regs_size = sizeof(struct spdk_nvmf_registers), 3543 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3544 } 3545 }; 3546 3547 /* Save all data to vfio_user_nvme_migr_state first, then we will 3548 * copy it to device migration region at last. 
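 * The migration header itself is written last, once the offsets and lengths
 * of the individual sections have been computed while packing.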
3549 */ 3550 3551 /* save magic number */ 3552 migr_state.ctrlr_header.magic = VFIO_USER_NVME_MIGR_MAGIC; 3553 3554 /* save controller data */ 3555 spdk_nvmf_ctrlr_save_migr_data(ctrlr, &migr_state.nvmf_data); 3556 3557 /* save connected queue pairs */ 3558 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) { 3559 /* save sq */ 3560 sqid = sq->qid; 3561 migr_state.qps[sqid].sq.sqid = sq->qid; 3562 migr_state.qps[sqid].sq.cqid = sq->cqid; 3563 migr_state.qps[sqid].sq.head = *sq_headp(sq); 3564 migr_state.qps[sqid].sq.size = sq->size; 3565 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1; 3566 3567 /* save cq, for shared cq case, cq may be saved multiple times */ 3568 cqid = sq->cqid; 3569 cq = vu_ctrlr->cqs[cqid]; 3570 migr_state.qps[cqid].cq.cqid = cqid; 3571 migr_state.qps[cqid].cq.tail = *cq_tailp(cq); 3572 migr_state.qps[cqid].cq.ien = cq->ien; 3573 migr_state.qps[cqid].cq.iv = cq->iv; 3574 migr_state.qps[cqid].cq.size = cq->size; 3575 migr_state.qps[cqid].cq.phase = cq->phase; 3576 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1; 3577 i++; 3578 } 3579 3580 assert(i > 0); 3581 migr_state.ctrlr_header.num_io_queues = i - 1; 3582 3583 /* Save doorbells */ 3584 doorbell_base = (uint32_t *)&migr_state.doorbells; 3585 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3586 3587 /* Save PCI configuration space */ 3588 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE); 3589 3590 /* Save all data to device migration region */ 3591 data_ptr = endpoint->migr_data; 3592 3593 /* Copy nvmf controller data */ 3594 data_offset = sizeof(struct vfio_user_nvme_migr_header); 3595 data_ptr += data_offset; 3596 migr_state.ctrlr_header.nvmf_data_offset = data_offset; 3597 migr_state.ctrlr_header.nvmf_data_len = sizeof(struct spdk_nvmf_ctrlr_migr_data); 3598 memcpy(data_ptr, &migr_state.nvmf_data, sizeof(struct spdk_nvmf_ctrlr_migr_data)); 3599 3600 /* Copy queue pairs */ 3601 data_offset += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3602 data_ptr += sizeof(struct spdk_nvmf_ctrlr_migr_data); 3603 migr_state.ctrlr_header.qp_offset = data_offset; 3604 migr_state.ctrlr_header.qp_len = i * (sizeof(struct nvme_migr_sq_state) + sizeof( 3605 struct nvme_migr_cq_state)); 3606 memcpy(data_ptr, &migr_state.qps, migr_state.ctrlr_header.qp_len); 3607 3608 /* Copy doorbells */ 3609 data_offset += migr_state.ctrlr_header.qp_len; 3610 data_ptr += migr_state.ctrlr_header.qp_len; 3611 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX] = data_offset; 3612 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX] = NVMF_VFIO_USER_DOORBELLS_SIZE; 3613 memcpy(data_ptr, &migr_state.doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE); 3614 3615 /* Copy CFG */ 3616 data_offset += NVMF_VFIO_USER_DOORBELLS_SIZE; 3617 data_ptr += NVMF_VFIO_USER_DOORBELLS_SIZE; 3618 migr_state.ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX] = data_offset; 3619 migr_state.ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX] = NVME_REG_CFG_SIZE; 3620 memcpy(data_ptr, &migr_state.cfg, NVME_REG_CFG_SIZE); 3621 3622 /* copy shadow doorbells */ 3623 if (vu_ctrlr->sdbl != NULL) { 3624 migr_state.ctrlr_header.sdbl = true; 3625 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer; 3626 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer; 3627 } 3628 3629 /* Copy nvme migration header finally */ 3630 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header)); 3631 3632 if 
(SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3633 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl); 3634 } 3635 } 3636 3637 /* 3638 * If we are about to close the connection, we need to unregister the interrupt, 3639 * as the library will subsequently close the file descriptor we registered. 3640 */ 3641 static int 3642 vfio_user_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) 3643 { 3644 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3645 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 3646 3647 SPDK_DEBUGLOG(nvmf_vfio, "Device reset type %u\n", type); 3648 3649 if (type == VFU_RESET_LOST_CONN) { 3650 if (ctrlr != NULL) { 3651 spdk_interrupt_unregister(&ctrlr->intr); 3652 ctrlr->intr_fd = -1; 3653 } 3654 return 0; 3655 } 3656 3657 /* FIXME: LOST_CONN case ? */ 3658 if (ctrlr->sdbl != NULL) { 3659 vfio_user_ctrlr_switch_doorbells(ctrlr, false); 3660 free_sdbl(vfu_ctx, ctrlr->sdbl); 3661 ctrlr->sdbl = NULL; 3662 } 3663 3664 /* FIXME: much more needed here. */ 3665 3666 return 0; 3667 } 3668 3669 static int 3670 vfio_user_migr_ctrlr_construct_qps(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 3671 struct vfio_user_nvme_migr_state *migr_state) 3672 { 3673 uint32_t i, qsize = 0; 3674 uint16_t sqid, cqid; 3675 struct vfio_user_nvme_migr_qp migr_qp; 3676 void *addr; 3677 uint32_t cqs_ref[NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR] = {}; 3678 int ret; 3679 3680 if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio")) { 3681 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl); 3682 } 3683 3684 /* restore submission queues */ 3685 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3686 migr_qp = migr_state->qps[i]; 3687 3688 qsize = migr_qp.sq.size; 3689 if (qsize) { 3690 struct nvmf_vfio_user_sq *sq; 3691 3692 sqid = migr_qp.sq.sqid; 3693 if (sqid != i) { 3694 SPDK_ERRLOG("Expected sqid %u while got %u", i, sqid); 3695 return -EINVAL; 3696 } 3697 3698 /* allocate sq if necessary */ 3699 if (vu_ctrlr->sqs[sqid] == NULL) { 3700 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid); 3701 if (ret) { 3702 SPDK_ERRLOG("Construct qpair with qid %u failed\n", sqid); 3703 return -EFAULT; 3704 } 3705 } 3706 3707 sq = vu_ctrlr->sqs[sqid]; 3708 sq->size = qsize; 3709 3710 ret = alloc_sq_reqs(vu_ctrlr, sq); 3711 if (ret) { 3712 SPDK_ERRLOG("Construct sq with qid %u failed\n", sqid); 3713 return -EFAULT; 3714 } 3715 3716 /* restore sq */ 3717 sq->sq_state = VFIO_USER_SQ_CREATED; 3718 sq->cqid = migr_qp.sq.cqid; 3719 *sq_headp(sq) = migr_qp.sq.head; 3720 sq->mapping.prp1 = migr_qp.sq.dma_addr; 3721 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd); 3722 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3723 sq->mapping.prp1, sq->mapping.len, 3724 sq->mapping.sg, &sq->mapping.iov, 3725 PROT_READ); 3726 if (addr == NULL) { 3727 SPDK_ERRLOG("Restore sq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3728 sqid, sq->mapping.prp1, sq->size); 3729 return -EFAULT; 3730 } 3731 cqs_ref[sq->cqid]++; 3732 } 3733 } 3734 3735 /* restore completion queues */ 3736 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3737 migr_qp = migr_state->qps[i]; 3738 3739 qsize = migr_qp.cq.size; 3740 if (qsize) { 3741 struct nvmf_vfio_user_cq *cq; 3742 3743 /* restore cq */ 3744 cqid = migr_qp.sq.cqid; 3745 assert(cqid == i); 3746 3747 /* allocate cq if necessary */ 3748 if (vu_ctrlr->cqs[cqid] == NULL) { 3749 ret = init_cq(vu_ctrlr, cqid); 3750 if (ret) { 3751 SPDK_ERRLOG("Construct qpair with qid %u failed\n", cqid); 3752 return -EFAULT; 3753 } 3754 } 3755 3756 
cq = vu_ctrlr->cqs[cqid]; 3757 3758 cq->size = qsize; 3759 3760 cq->cq_state = VFIO_USER_CQ_CREATED; 3761 cq->cq_ref = cqs_ref[cqid]; 3762 *cq_tailp(cq) = migr_qp.cq.tail; 3763 cq->mapping.prp1 = migr_qp.cq.dma_addr; 3764 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl); 3765 cq->ien = migr_qp.cq.ien; 3766 cq->iv = migr_qp.cq.iv; 3767 cq->phase = migr_qp.cq.phase; 3768 addr = map_one(vu_ctrlr->endpoint->vfu_ctx, 3769 cq->mapping.prp1, cq->mapping.len, 3770 cq->mapping.sg, &cq->mapping.iov, 3771 PROT_READ | PROT_WRITE); 3772 if (addr == NULL) { 3773 SPDK_ERRLOG("Restore cq with qid %u PRP1 0x%"PRIx64" with size %u failed\n", 3774 cqid, cq->mapping.prp1, cq->size); 3775 return -EFAULT; 3776 } 3777 } 3778 } 3779 3780 return 0; 3781 } 3782 3783 static int 3784 vfio_user_migr_ctrlr_restore(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3785 { 3786 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3787 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr; 3788 uint32_t *doorbell_base; 3789 struct spdk_nvme_cmd cmd; 3790 uint16_t i; 3791 int rc = 0; 3792 struct vfio_user_nvme_migr_state migr_state = { 3793 .nvmf_data = { 3794 .data_size = offsetof(struct spdk_nvmf_ctrlr_migr_data, unused), 3795 .regs_size = sizeof(struct spdk_nvmf_registers), 3796 .feat_size = sizeof(struct spdk_nvmf_ctrlr_feat) 3797 } 3798 }; 3799 3800 assert(endpoint->migr_data != NULL); 3801 assert(ctrlr != NULL); 3802 rc = vfio_user_migr_stream_to_data(endpoint, &migr_state); 3803 if (rc) { 3804 return rc; 3805 } 3806 3807 /* restore shadow doorbells */ 3808 if (migr_state.ctrlr_header.sdbl) { 3809 struct nvmf_vfio_user_shadow_doorbells *sdbl; 3810 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx, 3811 migr_state.ctrlr_header.shadow_doorbell_buffer, 3812 migr_state.ctrlr_header.eventidx_buffer, 3813 memory_page_size(vu_ctrlr)); 3814 if (sdbl == NULL) { 3815 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n", 3816 ctrlr_id(vu_ctrlr)); 3817 return -1; 3818 } 3819 3820 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer; 3821 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer; 3822 3823 SWAP(vu_ctrlr->sdbl, sdbl); 3824 } 3825 3826 rc = vfio_user_migr_ctrlr_construct_qps(vu_ctrlr, &migr_state); 3827 if (rc) { 3828 return rc; 3829 } 3830 3831 /* restore PCI configuration space */ 3832 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE); 3833 3834 doorbell_base = (uint32_t *)&migr_state.doorbells; 3835 /* restore doorbells from saved registers */ 3836 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE); 3837 3838 /* restore nvmf controller data */ 3839 rc = spdk_nvmf_ctrlr_restore_migr_data(ctrlr, &migr_state.nvmf_data); 3840 if (rc) { 3841 return rc; 3842 } 3843 3844 /* resubmit pending AERs */ 3845 for (i = 0; i < migr_state.nvmf_data.num_aer_cids; i++) { 3846 SPDK_DEBUGLOG(nvmf_vfio, "%s AER resubmit, CID %u\n", ctrlr_id(vu_ctrlr), 3847 migr_state.nvmf_data.aer_cids[i]); 3848 memset(&cmd, 0, sizeof(cmd)); 3849 cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; 3850 cmd.cid = migr_state.nvmf_data.aer_cids[i]; 3851 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]); 3852 if (spdk_unlikely(rc)) { 3853 break; 3854 } 3855 } 3856 3857 return rc; 3858 } 3859 3860 static void 3861 vfio_user_migr_ctrlr_enable_sqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3862 { 3863 uint32_t i; 3864 struct nvmf_vfio_user_sq *sq; 3865 3866 /* The Admin queue (qid: 0) does not ever use shadow doorbells. 
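 * The Doorbell Buffer Config mechanism only applies to I/O queues, so qid 0
 * is always pointed back at the BAR0 doorbells here.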
*/ 3867 3868 if (vu_ctrlr->sqs[0] != NULL) { 3869 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells + 3870 queue_index(0, false); 3871 } 3872 3873 if (vu_ctrlr->cqs[0] != NULL) { 3874 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells + 3875 queue_index(0, true); 3876 } 3877 3878 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL); 3879 3880 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3881 sq = vu_ctrlr->sqs[i]; 3882 if (!sq || !sq->size) { 3883 continue; 3884 } 3885 3886 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 3887 /* ADMIN queue pair is always in the poll group, just enable it */ 3888 sq->sq_state = VFIO_USER_SQ_ACTIVE; 3889 } else { 3890 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair); 3891 } 3892 } 3893 } 3894 3895 /* 3896 * We are in stop-and-copy state, but still potentially have some current dirty 3897 * sgls: while we're quiesced and thus should have no active requests, we still 3898 * have potentially dirty maps of the shadow doorbells and the CQs (SQs are 3899 * mapped read only). 3900 * 3901 * Since we won't be calling vfu_sgl_put() for them, we need to explicitly 3902 * mark them dirty now. 3903 */ 3904 static void 3905 vfio_user_migr_ctrlr_mark_dirty(struct nvmf_vfio_user_ctrlr *vu_ctrlr) 3906 { 3907 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 3908 3909 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 3910 3911 for (size_t i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 3912 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i]; 3913 3914 if (cq == NULL || q_addr(&cq->mapping) == NULL) { 3915 continue; 3916 } 3917 3918 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1); 3919 } 3920 3921 if (vu_ctrlr->sdbl != NULL) { 3922 dma_sg_t *sg; 3923 size_t i; 3924 3925 for (i = 0; i < NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT; 3926 ++i) { 3927 3928 if (!vu_ctrlr->sdbl->iovs[i].iov_len) { 3929 continue; 3930 } 3931 3932 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i); 3933 3934 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1); 3935 } 3936 } 3937 } 3938 3939 static int 3940 vfio_user_migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) 3941 { 3942 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 3943 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; 3944 struct nvmf_vfio_user_sq *sq; 3945 int ret = 0; 3946 3947 SPDK_DEBUGLOG(nvmf_vfio, "%s controller state %u, migration state %u\n", endpoint_id(endpoint), 3948 vu_ctrlr->state, state); 3949 3950 switch (state) { 3951 case VFU_MIGR_STATE_STOP_AND_COPY: 3952 vu_ctrlr->in_source_vm = true; 3953 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3954 vfio_user_migr_ctrlr_mark_dirty(vu_ctrlr); 3955 vfio_user_migr_ctrlr_save_data(vu_ctrlr); 3956 break; 3957 case VFU_MIGR_STATE_STOP: 3958 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3959 /* The controller associates with source VM is dead now, we will resume 3960 * the subsystem after destroying the controller data structure, then the 3961 * subsystem can be re-used for another new client. 3962 */ 3963 if (vu_ctrlr->in_source_vm) { 3964 endpoint->need_resume = true; 3965 } 3966 break; 3967 case VFU_MIGR_STATE_PRE_COPY: 3968 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED); 3969 break; 3970 case VFU_MIGR_STATE_RESUME: 3971 /* 3972 * Destination ADMIN queue pair is connected when starting the VM, 3973 * but the ADMIN queue pair isn't enabled in destination VM, the poll 3974 * group will do nothing to ADMIN queue pair for now. 
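		 *
		 * Mark the already-connected admin SQ inactive and drop its
		 * request pool below; both are re-created from the migration
		 * data once the device moves to RUNNING and
		 * vfio_user_migr_ctrlr_restore() runs.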
3975 */ 3976 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 3977 break; 3978 } 3979 3980 assert(!vu_ctrlr->in_source_vm); 3981 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING; 3982 3983 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs); 3984 assert(sq != NULL); 3985 assert(sq->qpair.qid == 0); 3986 sq->sq_state = VFIO_USER_SQ_INACTIVE; 3987 3988 /* Free ADMIN SQ resources first, SQ resources will be 3989 * allocated based on queue size from source VM. 3990 */ 3991 free_sq_reqs(sq); 3992 sq->size = 0; 3993 break; 3994 case VFU_MIGR_STATE_RUNNING: 3995 3996 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 3997 break; 3998 } 3999 4000 if (!vu_ctrlr->in_source_vm) { 4001 /* Restore destination VM from BAR9 */ 4002 ret = vfio_user_migr_ctrlr_restore(vu_ctrlr); 4003 if (ret) { 4004 break; 4005 } 4006 4007 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, false); 4008 vfio_user_migr_ctrlr_enable_sqs(vu_ctrlr); 4009 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 4010 /* FIXME where do we resume nvmf? */ 4011 } else { 4012 /* Rollback source VM */ 4013 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; 4014 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 4015 vfio_user_endpoint_resume_done, endpoint); 4016 if (ret < 0) { 4017 /* TODO: fail controller with CFS bit set */ 4018 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; 4019 SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); 4020 } 4021 } 4022 vu_ctrlr->migr_data_prepared = false; 4023 vu_ctrlr->in_source_vm = false; 4024 break; 4025 4026 default: 4027 return -EINVAL; 4028 } 4029 4030 return ret; 4031 } 4032 4033 static uint64_t 4034 vfio_user_migration_get_pending_bytes(vfu_ctx_t *vfu_ctx) 4035 { 4036 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 4037 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 4038 uint64_t pending_bytes; 4039 4040 if (ctrlr->migr_data_prepared) { 4041 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING); 4042 pending_bytes = 0; 4043 } else { 4044 pending_bytes = vfio_user_migr_data_len(); 4045 } 4046 4047 SPDK_DEBUGLOG(nvmf_vfio, 4048 "%s current state %u, pending bytes 0x%"PRIx64"\n", 4049 endpoint_id(endpoint), ctrlr->state, pending_bytes); 4050 4051 return pending_bytes; 4052 } 4053 4054 static int 4055 vfio_user_migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size) 4056 { 4057 struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); 4058 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr; 4059 4060 /* 4061 * When transitioning to pre-copy state we set pending_bytes to 0, 4062 * so the vfio-user client shouldn't attempt to read any migration 4063 * data. This is not yet guaranteed by libvfio-user. 
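	 *
	 * Be defensive about it: if this is called while the controller is
	 * not yet MIGRATING, report an empty window (offset 0, size 0) and
	 * succeed rather than failing the client's request.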
4064 */ 4065 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) { 4066 assert(size != NULL); 4067 *offset = 0; 4068 *size = 0; 4069 return 0; 4070 } 4071 4072 if (ctrlr->in_source_vm) { /* migration source */ 4073 assert(size != NULL); 4074 *size = vfio_user_migr_data_len(); 4075 vfio_user_migr_ctrlr_save_data(ctrlr); 4076 } else { /* migration destination */ 4077 assert(size == NULL); 4078 assert(!ctrlr->migr_data_prepared); 4079 } 4080 *offset = 0; 4081 ctrlr->migr_data_prepared = true; 4082 4083 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state); 4084 4085 return 0; 4086 } 4087 4088 static ssize_t 4089 vfio_user_migration_read_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4090 void *buf __attribute__((unused)), 4091 uint64_t count __attribute__((unused)), 4092 uint64_t offset __attribute__((unused))) 4093 { 4094 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration read data not supported\n", 4095 endpoint_id(vfu_get_private(vfu_ctx))); 4096 errno = ENOTSUP; 4097 return -1; 4098 } 4099 4100 static ssize_t 4101 vfio_user_migration_write_data(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4102 void *buf __attribute__((unused)), 4103 uint64_t count __attribute__((unused)), 4104 uint64_t offset __attribute__((unused))) 4105 { 4106 SPDK_DEBUGLOG(nvmf_vfio, "%s: migration write data not supported\n", 4107 endpoint_id(vfu_get_private(vfu_ctx))); 4108 errno = ENOTSUP; 4109 return -1; 4110 } 4111 4112 static int 4113 vfio_user_migration_data_written(vfu_ctx_t *vfu_ctx __attribute__((unused)), 4114 uint64_t count) 4115 { 4116 SPDK_DEBUGLOG(nvmf_vfio, "write 0x%"PRIx64"\n", (uint64_t)count); 4117 4118 if (count != vfio_user_migr_data_len()) { 4119 SPDK_DEBUGLOG(nvmf_vfio, "%s bad count %#lx\n", 4120 endpoint_id(vfu_get_private(vfu_ctx)), count); 4121 errno = EINVAL; 4122 return -1; 4123 } 4124 4125 return 0; 4126 } 4127 4128 static int 4129 vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, 4130 struct nvmf_vfio_user_endpoint *endpoint) 4131 { 4132 int ret; 4133 ssize_t cap_offset; 4134 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx; 4135 struct iovec migr_sparse_mmap = {}; 4136 4137 struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 }; 4138 struct pxcap pxcap = { 4139 .hdr.id = PCI_CAP_ID_EXP, 4140 .pxcaps.ver = 0x2, 4141 .pxdcap = {.rer = 0x1, .flrc = 0x1}, 4142 .pxdcap2.ctds = 0x1 4143 }; 4144 4145 struct msixcap msixcap = { 4146 .hdr.id = PCI_CAP_ID_MSIX, 4147 .mxc.ts = NVMF_VFIO_USER_MSIX_NUM - 1, 4148 .mtab = {.tbir = NVMF_VFIO_USER_MSIX_TABLE_BIR, .to = 0x0}, 4149 .mpba = {.pbir = NVMF_VFIO_USER_MSIX_PBA_BIR, .pbao = 0x0} 4150 }; 4151 4152 struct iovec sparse_mmap[] = { 4153 { 4154 .iov_base = (void *)NVME_DOORBELLS_OFFSET, 4155 .iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE, 4156 }, 4157 }; 4158 4159 const vfu_migration_callbacks_t migr_callbacks = { 4160 .version = VFIO_USER_MIGR_CALLBACK_VERS, 4161 .transition = &vfio_user_migration_device_state_transition, 4162 .get_pending_bytes = &vfio_user_migration_get_pending_bytes, 4163 .prepare_data = &vfio_user_migration_prepare_data, 4164 .read_data = &vfio_user_migration_read_data, 4165 .data_written = &vfio_user_migration_data_written, 4166 .write_data = &vfio_user_migration_write_data 4167 }; 4168 4169 ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0); 4170 if (ret < 0) { 4171 SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx); 4172 return ret; 4173 } 4174 vfu_pci_set_id(vfu_ctx, SPDK_PCI_VID_NUTANIX, 0x0001, SPDK_PCI_VID_NUTANIX, 0); 4175 /* 4176 * 0x02, 
controller uses the NVM Express programming interface 4177 * 0x08, non-volatile memory controller 4178 * 0x01, mass storage controller 4179 */ 4180 vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02); 4181 4182 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap); 4183 if (cap_offset < 0) { 4184 SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx); 4185 return ret; 4186 } 4187 4188 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap); 4189 if (cap_offset < 0) { 4190 SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx); 4191 return ret; 4192 } 4193 4194 cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap); 4195 if (cap_offset < 0) { 4196 SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx); 4197 return ret; 4198 } 4199 4200 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE, 4201 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4202 if (ret < 0) { 4203 SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx); 4204 return ret; 4205 } 4206 4207 if (vu_transport->transport_opts.disable_mappable_bar0) { 4208 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4209 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4210 NULL, 0, -1, 0); 4211 } else { 4212 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE, 4213 access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, 4214 sparse_mmap, 1, endpoint->devmem_fd, 0); 4215 } 4216 4217 if (ret < 0) { 4218 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx); 4219 return ret; 4220 } 4221 4222 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, NVMF_VFIO_USER_BAR4_SIZE, 4223 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4224 if (ret < 0) { 4225 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx); 4226 return ret; 4227 } 4228 4229 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, NVMF_VFIO_USER_BAR5_SIZE, 4230 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); 4231 if (ret < 0) { 4232 SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx); 4233 return ret; 4234 } 4235 4236 ret = vfu_setup_device_dma(vfu_ctx, memory_region_add_cb, memory_region_remove_cb); 4237 if (ret < 0) { 4238 SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx); 4239 return ret; 4240 } 4241 4242 ret = vfu_setup_device_reset_cb(vfu_ctx, vfio_user_device_reset); 4243 if (ret < 0) { 4244 SPDK_ERRLOG("vfu_ctx %p failed to setup reset callback\n", vfu_ctx); 4245 return ret; 4246 } 4247 4248 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); 4249 if (ret < 0) { 4250 SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx); 4251 return ret; 4252 } 4253 4254 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVMF_VFIO_USER_MSIX_NUM); 4255 if (ret < 0) { 4256 SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx); 4257 return ret; 4258 } 4259 4260 vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); 4261 4262 migr_sparse_mmap.iov_base = (void *)4096; 4263 migr_sparse_mmap.iov_len = vfio_user_migr_data_len(); 4264 ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, 4265 vfu_get_migr_register_area_size() + vfio_user_migr_data_len(), 4266 NULL, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, &migr_sparse_mmap, 4267 1, endpoint->migr_fd, 0); 4268 if (ret < 0) { 4269 SPDK_ERRLOG("vfu_ctx %p failed to setup migration region\n", vfu_ctx); 4270 return ret; 4271 } 4272 4273 ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, 4274 vfu_get_migr_register_area_size()); 4275 if (ret < 0) { 4276 
SPDK_ERRLOG("vfu_ctx %p failed to setup migration callbacks\n", vfu_ctx); 4277 return ret; 4278 } 4279 4280 ret = vfu_realize_ctx(vfu_ctx); 4281 if (ret < 0) { 4282 SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); 4283 return ret; 4284 } 4285 4286 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx); 4287 assert(endpoint->pci_config_space != NULL); 4288 init_pci_config_space(endpoint->pci_config_space); 4289 4290 assert(cap_offset != 0); 4291 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset); 4292 4293 return 0; 4294 } 4295 4296 static int nvmf_vfio_user_accept(void *ctx); 4297 4298 /* 4299 * Register an "accept" poller: this is polling for incoming vfio-user socket 4300 * connections (on the listening socket). 4301 * 4302 * We need to do this on first listening, and also after destroying a 4303 * controller, so we can accept another connection. 4304 */ 4305 static int 4306 vfio_user_register_accept_poller(struct nvmf_vfio_user_endpoint *endpoint) 4307 { 4308 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate; 4309 4310 SPDK_DEBUGLOG(nvmf_vfio, "registering accept poller\n"); 4311 4312 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept, 4313 endpoint, poll_rate_us); 4314 4315 if (!endpoint->accept_poller) { 4316 return -1; 4317 } 4318 4319 endpoint->accept_thread = spdk_get_thread(); 4320 endpoint->need_relisten = false; 4321 4322 if (!spdk_interrupt_mode_is_enabled()) { 4323 return 0; 4324 } 4325 4326 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx); 4327 assert(endpoint->accept_intr_fd != -1); 4328 4329 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd, 4330 nvmf_vfio_user_accept, endpoint); 4331 4332 assert(endpoint->accept_intr != NULL); 4333 4334 spdk_poller_register_interrupt(endpoint->accept_poller, NULL, NULL); 4335 return 0; 4336 } 4337 4338 static void 4339 _vfio_user_relisten(void *ctx) 4340 { 4341 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4342 4343 vfio_user_register_accept_poller(endpoint); 4344 } 4345 4346 static void 4347 _free_ctrlr(void *ctx) 4348 { 4349 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4350 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint; 4351 4352 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl); 4353 4354 spdk_interrupt_unregister(&ctrlr->intr); 4355 ctrlr->intr_fd = -1; 4356 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4357 4358 free(ctrlr); 4359 4360 if (endpoint->need_async_destroy) { 4361 nvmf_vfio_user_destroy_endpoint(endpoint); 4362 } else if (endpoint->need_relisten) { 4363 spdk_thread_send_msg(endpoint->accept_thread, 4364 _vfio_user_relisten, endpoint); 4365 } 4366 } 4367 4368 static void 4369 free_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4370 { 4371 struct spdk_thread *thread; 4372 int i; 4373 4374 assert(ctrlr != NULL); 4375 thread = ctrlr->thread ? 
ctrlr->thread : spdk_get_thread(); 4376 4377 SPDK_DEBUGLOG(nvmf_vfio, "free %s\n", ctrlr_id(ctrlr)); 4378 4379 for (i = 0; i < NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR; i++) { 4380 free_qp(ctrlr, i); 4381 } 4382 4383 spdk_thread_exec_msg(thread, _free_ctrlr, ctrlr); 4384 } 4385 4386 static int 4387 nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport, 4388 struct nvmf_vfio_user_endpoint *endpoint) 4389 { 4390 struct nvmf_vfio_user_ctrlr *ctrlr; 4391 int err = 0; 4392 4393 SPDK_DEBUGLOG(nvmf_vfio, "%s\n", endpoint_id(endpoint)); 4394 4395 /* First, construct a vfio-user CUSTOM transport controller */ 4396 ctrlr = calloc(1, sizeof(*ctrlr)); 4397 if (ctrlr == NULL) { 4398 err = -ENOMEM; 4399 goto out; 4400 } 4401 /* 4402 * We can only support one connection for now, but generate a unique cntlid in case vfio-user 4403 * transport is used together with RDMA or TCP transports in the same target 4404 */ 4405 ctrlr->cntlid = nvmf_subsystem_gen_cntlid(endpoint->subsystem); 4406 ctrlr->intr_fd = -1; 4407 ctrlr->transport = transport; 4408 ctrlr->endpoint = endpoint; 4409 ctrlr->bar0_doorbells = endpoint->bar0_doorbells; 4410 TAILQ_INIT(&ctrlr->connected_sqs); 4411 4412 ctrlr->adaptive_irqs_enabled = 4413 !transport->transport_opts.disable_adaptive_irq; 4414 4415 /* Then, construct an admin queue pair */ 4416 err = init_sq(ctrlr, &transport->transport, 0); 4417 if (err != 0) { 4418 free(ctrlr); 4419 goto out; 4420 } 4421 4422 err = init_cq(ctrlr, 0); 4423 if (err != 0) { 4424 free(ctrlr); 4425 goto out; 4426 } 4427 4428 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH; 4429 4430 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]); 4431 if (err != 0) { 4432 free(ctrlr); 4433 goto out; 4434 } 4435 endpoint->ctrlr = ctrlr; 4436 4437 /* Notify the generic layer about the new admin queue pair */ 4438 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair); 4439 4440 out: 4441 if (err != 0) { 4442 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n", 4443 endpoint_id(endpoint), strerror(-err)); 4444 } 4445 4446 return err; 4447 } 4448 4449 static int 4450 nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport, 4451 const struct spdk_nvme_transport_id *trid, 4452 struct spdk_nvmf_listen_opts *listen_opts) 4453 { 4454 struct nvmf_vfio_user_transport *vu_transport; 4455 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4456 char path[PATH_MAX] = {}; 4457 char uuid[PATH_MAX] = {}; 4458 int ret; 4459 4460 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4461 transport); 4462 4463 pthread_mutex_lock(&vu_transport->lock); 4464 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4465 /* Only compare traddr */ 4466 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4467 pthread_mutex_unlock(&vu_transport->lock); 4468 return -EEXIST; 4469 } 4470 } 4471 pthread_mutex_unlock(&vu_transport->lock); 4472 4473 endpoint = calloc(1, sizeof(*endpoint)); 4474 if (!endpoint) { 4475 return -ENOMEM; 4476 } 4477 4478 pthread_mutex_init(&endpoint->lock, NULL); 4479 endpoint->devmem_fd = -1; 4480 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid)); 4481 endpoint->transport = vu_transport; 4482 4483 ret = snprintf(path, PATH_MAX, "%s/bar0", endpoint_id(endpoint)); 4484 if (ret < 0 || ret >= PATH_MAX) { 4485 SPDK_ERRLOG("%s: error to get socket path: %s.\n", endpoint_id(endpoint), spdk_strerror(errno)); 4486 ret = -1; 4487 goto out; 4488 } 4489 4490 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4491 if (ret 
== -1) { 4492 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4493 endpoint_id(endpoint), path, spdk_strerror(errno)); 4494 goto out; 4495 } 4496 unlink(path); 4497 4498 endpoint->devmem_fd = ret; 4499 ret = ftruncate(endpoint->devmem_fd, 4500 NVME_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE); 4501 if (ret != 0) { 4502 SPDK_ERRLOG("%s: error to ftruncate file %s: %s.\n", endpoint_id(endpoint), path, 4503 spdk_strerror(errno)); 4504 goto out; 4505 } 4506 4507 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE, 4508 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET); 4509 if (endpoint->bar0_doorbells == MAP_FAILED) { 4510 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4511 endpoint->bar0_doorbells = NULL; 4512 ret = -1; 4513 goto out; 4514 } 4515 4516 ret = snprintf(path, PATH_MAX, "%s/migr", endpoint_id(endpoint)); 4517 if (ret < 0 || ret >= PATH_MAX) { 4518 SPDK_ERRLOG("%s: error to get migration file path: %s.\n", endpoint_id(endpoint), 4519 spdk_strerror(errno)); 4520 ret = -1; 4521 goto out; 4522 } 4523 ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 4524 if (ret == -1) { 4525 SPDK_ERRLOG("%s: failed to open device memory at %s: %s.\n", 4526 endpoint_id(endpoint), path, spdk_strerror(errno)); 4527 goto out; 4528 } 4529 unlink(path); 4530 4531 endpoint->migr_fd = ret; 4532 ret = ftruncate(endpoint->migr_fd, 4533 vfu_get_migr_register_area_size() + vfio_user_migr_data_len()); 4534 if (ret != 0) { 4535 SPDK_ERRLOG("%s: error to ftruncate migration file %s: %s.\n", endpoint_id(endpoint), path, 4536 spdk_strerror(errno)); 4537 goto out; 4538 } 4539 4540 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(), 4541 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size()); 4542 if (endpoint->migr_data == MAP_FAILED) { 4543 SPDK_ERRLOG("%s: error to mmap file %s: %s.\n", endpoint_id(endpoint), path, spdk_strerror(errno)); 4544 endpoint->migr_data = NULL; 4545 ret = -1; 4546 goto out; 4547 } 4548 4549 ret = snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint)); 4550 if (ret < 0 || ret >= PATH_MAX) { 4551 SPDK_ERRLOG("%s: error to get ctrlr file path: %s\n", endpoint_id(endpoint), spdk_strerror(errno)); 4552 ret = -1; 4553 goto out; 4554 } 4555 4556 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB, 4557 endpoint, VFU_DEV_TYPE_PCI); 4558 if (endpoint->vfu_ctx == NULL) { 4559 SPDK_ERRLOG("%s: error creating libmuser context: %m\n", 4560 endpoint_id(endpoint)); 4561 ret = -1; 4562 goto out; 4563 } 4564 4565 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log, 4566 vfio_user_get_log_level()); 4567 if (ret < 0) { 4568 goto out; 4569 } 4570 4571 4572 ret = vfio_user_dev_info_fill(vu_transport, endpoint); 4573 if (ret < 0) { 4574 goto out; 4575 } 4576 4577 ret = vfio_user_register_accept_poller(endpoint); 4578 4579 if (ret != 0) { 4580 goto out; 4581 } 4582 4583 pthread_mutex_lock(&vu_transport->lock); 4584 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link); 4585 pthread_mutex_unlock(&vu_transport->lock); 4586 4587 out: 4588 if (ret != 0) { 4589 nvmf_vfio_user_destroy_endpoint(endpoint); 4590 } 4591 4592 return ret; 4593 } 4594 4595 static void 4596 nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport, 4597 const struct spdk_nvme_transport_id *trid) 4598 { 4599 struct nvmf_vfio_user_transport *vu_transport; 4600 struct nvmf_vfio_user_endpoint *endpoint, *tmp; 4601 4602 assert(trid != 
NULL); 4603 assert(trid->traddr != NULL); 4604 4605 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr); 4606 4607 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4608 transport); 4609 4610 pthread_mutex_lock(&vu_transport->lock); 4611 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) { 4612 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) { 4613 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link); 4614 /* Defer to free endpoint resources until the controller 4615 * is freed. There are two cases when running here: 4616 * 1. kill nvmf target while VM is connected 4617 * 2. remove listener via RPC call 4618 * nvmf library will disconnect all queue paris. 4619 */ 4620 if (endpoint->ctrlr) { 4621 assert(!endpoint->need_async_destroy); 4622 endpoint->need_async_destroy = true; 4623 pthread_mutex_unlock(&vu_transport->lock); 4624 return; 4625 } 4626 4627 nvmf_vfio_user_destroy_endpoint(endpoint); 4628 pthread_mutex_unlock(&vu_transport->lock); 4629 return; 4630 } 4631 } 4632 pthread_mutex_unlock(&vu_transport->lock); 4633 4634 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr); 4635 } 4636 4637 static void 4638 nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport, 4639 struct spdk_nvmf_subsystem *subsystem, 4640 struct spdk_nvmf_ctrlr_data *cdata) 4641 { 4642 struct nvmf_vfio_user_transport *vu_transport; 4643 4644 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4645 4646 cdata->vid = SPDK_PCI_VID_NUTANIX; 4647 cdata->ssvid = SPDK_PCI_VID_NUTANIX; 4648 cdata->ieee[0] = 0x8d; 4649 cdata->ieee[1] = 0x6b; 4650 cdata->ieee[2] = 0x50; 4651 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls)); 4652 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED; 4653 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare; 4654 /* libvfio-user can only support 1 connection for now */ 4655 cdata->oncs.reservations = 0; 4656 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells; 4657 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare; 4658 } 4659 4660 static int 4661 nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport, 4662 const struct spdk_nvmf_subsystem *subsystem, 4663 const struct spdk_nvme_transport_id *trid) 4664 { 4665 struct nvmf_vfio_user_transport *vu_transport; 4666 struct nvmf_vfio_user_endpoint *endpoint; 4667 4668 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport); 4669 4670 pthread_mutex_lock(&vu_transport->lock); 4671 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) { 4672 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) { 4673 break; 4674 } 4675 } 4676 pthread_mutex_unlock(&vu_transport->lock); 4677 4678 if (endpoint == NULL) { 4679 return -ENOENT; 4680 } 4681 4682 /* Drop const - we will later need to pause/unpause. */ 4683 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem; 4684 4685 return 0; 4686 } 4687 4688 /* 4689 * Executed periodically at a default SPDK_NVMF_DEFAULT_ACCEPT_POLL_RATE_US 4690 * frequency. 4691 * 4692 * For this endpoint (which at the libvfio-user level corresponds to a socket), 4693 * if we don't currently have a controller set up, peek to see if the socket is 4694 * able to accept a new connection. 
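 *
 * In interrupt mode the same callback is additionally driven by the
 * libvfio-user poll fd registered in vfio_user_register_accept_poller().
 * Once a connection is accepted and a controller constructed, both the
 * poller and the interrupt are unregistered; from then on the socket is
 * serviced through vfu_run_ctx() by the controller's own context poller.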
4695 */ 4696 static int 4697 nvmf_vfio_user_accept(void *ctx) 4698 { 4699 struct nvmf_vfio_user_endpoint *endpoint = ctx; 4700 struct nvmf_vfio_user_transport *vu_transport; 4701 int err; 4702 4703 vu_transport = endpoint->transport; 4704 4705 if (endpoint->ctrlr != NULL) { 4706 return SPDK_POLLER_IDLE; 4707 } 4708 4709 /* While we're here, the controller is already destroyed, 4710 * subsystem may still be in RESUMING state, we will wait 4711 * until the subsystem is in RUNNING state. 4712 */ 4713 if (endpoint->need_resume) { 4714 return SPDK_POLLER_IDLE; 4715 } 4716 4717 err = vfu_attach_ctx(endpoint->vfu_ctx); 4718 if (err == 0) { 4719 SPDK_DEBUGLOG(nvmf_vfio, "attach succeeded\n"); 4720 err = nvmf_vfio_user_create_ctrlr(vu_transport, endpoint); 4721 if (err == 0) { 4722 /* 4723 * Unregister ourselves: now we've accepted a 4724 * connection, there is nothing for us to poll for, and 4725 * we will poll the connection via vfu_run_ctx() 4726 * instead. 4727 */ 4728 spdk_interrupt_unregister(&endpoint->accept_intr); 4729 spdk_poller_unregister(&endpoint->accept_poller); 4730 } 4731 return SPDK_POLLER_BUSY; 4732 } 4733 4734 if (errno == EAGAIN || errno == EWOULDBLOCK) { 4735 return SPDK_POLLER_IDLE; 4736 } 4737 4738 return SPDK_POLLER_BUSY; 4739 } 4740 4741 static void 4742 nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport, 4743 struct spdk_nvme_transport_id *trid, 4744 struct spdk_nvmf_discovery_log_page_entry *entry) 4745 { } 4746 4747 static int vfio_user_poll_group_intr(void *ctx); 4748 4749 static void 4750 vfio_user_poll_group_add_intr(struct nvmf_vfio_user_poll_group *vu_group, 4751 struct spdk_nvmf_poll_group *group) 4752 { 4753 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK); 4754 assert(vu_group->intr_fd != -1); 4755 4756 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd, 4757 vfio_user_poll_group_intr, vu_group); 4758 assert(vu_group->intr != NULL); 4759 } 4760 4761 static struct spdk_nvmf_transport_poll_group * 4762 nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport, 4763 struct spdk_nvmf_poll_group *group) 4764 { 4765 struct nvmf_vfio_user_transport *vu_transport; 4766 struct nvmf_vfio_user_poll_group *vu_group; 4767 4768 vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, 4769 transport); 4770 4771 SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n"); 4772 4773 vu_group = calloc(1, sizeof(*vu_group)); 4774 if (vu_group == NULL) { 4775 SPDK_ERRLOG("Error allocating poll group: %m"); 4776 return NULL; 4777 } 4778 4779 if (in_interrupt_mode(vu_transport)) { 4780 vfio_user_poll_group_add_intr(vu_group, group); 4781 } 4782 4783 TAILQ_INIT(&vu_group->sqs); 4784 4785 pthread_mutex_lock(&vu_transport->pg_lock); 4786 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link); 4787 if (vu_transport->next_pg == NULL) { 4788 vu_transport->next_pg = vu_group; 4789 } 4790 pthread_mutex_unlock(&vu_transport->pg_lock); 4791 4792 return &vu_group->group; 4793 } 4794 4795 static struct spdk_nvmf_transport_poll_group * 4796 nvmf_vfio_user_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) 4797 { 4798 struct nvmf_vfio_user_transport *vu_transport; 4799 struct nvmf_vfio_user_poll_group **vu_group; 4800 struct nvmf_vfio_user_sq *sq; 4801 struct nvmf_vfio_user_cq *cq; 4802 4803 struct spdk_nvmf_transport_poll_group *result = NULL; 4804 4805 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 4806 cq = sq->ctrlr->cqs[sq->cqid]; 4807 assert(cq != NULL); 4808 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct 
nvmf_vfio_user_transport, transport); 4809 4810 pthread_mutex_lock(&vu_transport->pg_lock); 4811 if (TAILQ_EMPTY(&vu_transport->poll_groups)) { 4812 goto out; 4813 } 4814 4815 if (!nvmf_qpair_is_admin_queue(qpair)) { 4816 /* 4817 * If this is shared IO CQ case, just return the used CQ's poll 4818 * group, so I/O completions don't have to use 4819 * spdk_thread_send_msg(). 4820 */ 4821 if (cq->group != NULL) { 4822 result = cq->group; 4823 goto out; 4824 } 4825 4826 /* 4827 * If we're in interrupt mode, align all qpairs for a controller 4828 * on the same poll group by default, unless requested. This can 4829 * be lower in performance than running on a single poll group, 4830 * so we disable spreading by default. 4831 */ 4832 if (in_interrupt_mode(vu_transport) && 4833 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) { 4834 result = sq->ctrlr->sqs[0]->group; 4835 goto out; 4836 } 4837 4838 } 4839 4840 vu_group = &vu_transport->next_pg; 4841 assert(*vu_group != NULL); 4842 4843 result = &(*vu_group)->group; 4844 *vu_group = TAILQ_NEXT(*vu_group, link); 4845 if (*vu_group == NULL) { 4846 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups); 4847 } 4848 4849 out: 4850 if (cq->group == NULL) { 4851 cq->group = result; 4852 } 4853 4854 pthread_mutex_unlock(&vu_transport->pg_lock); 4855 return result; 4856 } 4857 4858 static void 4859 vfio_user_poll_group_del_intr(struct nvmf_vfio_user_poll_group *vu_group) 4860 { 4861 assert(vu_group->intr_fd != -1); 4862 4863 spdk_interrupt_unregister(&vu_group->intr); 4864 4865 close(vu_group->intr_fd); 4866 vu_group->intr_fd = -1; 4867 } 4868 4869 /* called when process exits */ 4870 static void 4871 nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) 4872 { 4873 struct nvmf_vfio_user_poll_group *vu_group, *next_tgroup; 4874 struct nvmf_vfio_user_transport *vu_transport; 4875 4876 SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n"); 4877 4878 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 4879 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport, 4880 transport); 4881 4882 if (in_interrupt_mode(vu_transport)) { 4883 vfio_user_poll_group_del_intr(vu_group); 4884 } 4885 4886 pthread_mutex_lock(&vu_transport->pg_lock); 4887 next_tgroup = TAILQ_NEXT(vu_group, link); 4888 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link); 4889 if (next_tgroup == NULL) { 4890 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups); 4891 } 4892 if (vu_transport->next_pg == vu_group) { 4893 vu_transport->next_pg = next_tgroup; 4894 } 4895 pthread_mutex_unlock(&vu_transport->pg_lock); 4896 4897 free(vu_group); 4898 } 4899 4900 static void 4901 _vfio_user_qpair_disconnect(void *ctx) 4902 { 4903 struct nvmf_vfio_user_sq *sq = ctx; 4904 4905 spdk_nvmf_qpair_disconnect(&sq->qpair); 4906 } 4907 4908 /* The function is used when socket connection is destroyed */ 4909 static int 4910 vfio_user_destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr) 4911 { 4912 struct nvmf_vfio_user_sq *sq; 4913 struct nvmf_vfio_user_endpoint *endpoint; 4914 4915 SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr)); 4916 4917 endpoint = ctrlr->endpoint; 4918 assert(endpoint != NULL); 4919 4920 pthread_mutex_lock(&endpoint->lock); 4921 endpoint->need_relisten = true; 4922 ctrlr->disconnect = true; 4923 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) { 4924 endpoint->ctrlr = NULL; 4925 free_ctrlr(ctrlr); 4926 pthread_mutex_unlock(&endpoint->lock); 4927 return 0; 4928 } 4929 4930 TAILQ_FOREACH(sq, 
&ctrlr->connected_sqs, tailq) { 4931 /* add another round thread poll to avoid recursive endpoint lock */ 4932 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq); 4933 } 4934 pthread_mutex_unlock(&endpoint->lock); 4935 4936 return 0; 4937 } 4938 4939 /* 4940 * Poll for and process any incoming vfio-user messages. 4941 */ 4942 static int 4943 vfio_user_poll_vfu_ctx(void *ctx) 4944 { 4945 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 4946 int ret; 4947 4948 assert(ctrlr != NULL); 4949 4950 /* This will call access_bar0_fn() if there are any writes 4951 * to the portion of the BAR that is not mmap'd */ 4952 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx); 4953 if (spdk_unlikely(ret == -1)) { 4954 if (errno == EBUSY) { 4955 return SPDK_POLLER_IDLE; 4956 } 4957 4958 spdk_poller_unregister(&ctrlr->vfu_ctx_poller); 4959 4960 /* 4961 * We lost the client; the reset callback will already have 4962 * unregistered the interrupt. 4963 */ 4964 if (errno == ENOTCONN) { 4965 vfio_user_destroy_ctrlr(ctrlr); 4966 return SPDK_POLLER_BUSY; 4967 } 4968 4969 /* 4970 * We might not have got a reset callback in this case, so 4971 * explicitly unregister the interrupt here. 4972 */ 4973 spdk_interrupt_unregister(&ctrlr->intr); 4974 ctrlr->intr_fd = -1; 4975 fail_ctrlr(ctrlr); 4976 } 4977 4978 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 4979 } 4980 4981 struct vfio_user_post_cpl_ctx { 4982 struct nvmf_vfio_user_ctrlr *ctrlr; 4983 struct nvmf_vfio_user_cq *cq; 4984 struct spdk_nvme_cpl cpl; 4985 }; 4986 4987 static void 4988 _post_completion_msg(void *ctx) 4989 { 4990 struct vfio_user_post_cpl_ctx *cpl_ctx = ctx; 4991 4992 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid, 4993 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct); 4994 free(cpl_ctx); 4995 } 4996 4997 static int nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); 4998 4999 static int 5000 vfio_user_poll_group_process(void *ctx) 5001 { 5002 struct nvmf_vfio_user_poll_group *vu_group = ctx; 5003 int ret = 0; 5004 5005 SPDK_DEBUGLOG(vfio_user_db, "pg:%p got intr\n", vu_group); 5006 5007 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group); 5008 5009 /* 5010 * Re-arm the event indexes. NB: this also could rearm other 5011 * controller's SQs. 5012 */ 5013 ret |= vfio_user_poll_group_rearm(vu_group); 5014 5015 vu_group->stats.pg_process_count++; 5016 return ret != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 5017 } 5018 5019 static int 5020 vfio_user_poll_group_intr(void *ctx) 5021 { 5022 struct nvmf_vfio_user_poll_group *vu_group = ctx; 5023 eventfd_t val; 5024 5025 eventfd_read(vu_group->intr_fd, &val); 5026 5027 vu_group->stats.intr++; 5028 5029 return vfio_user_poll_group_process(ctx); 5030 } 5031 5032 /* 5033 * Handle an interrupt for the given controller: we must poll the vfu_ctx, and 5034 * the SQs assigned to our own poll group. Other poll groups are handled via 5035 * vfio_user_poll_group_intr(). 5036 */ 5037 static int 5038 vfio_user_ctrlr_intr(void *ctx) 5039 { 5040 struct nvmf_vfio_user_poll_group *vu_ctrlr_group; 5041 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx; 5042 struct nvmf_vfio_user_poll_group *vu_group; 5043 int ret = SPDK_POLLER_IDLE; 5044 5045 vu_ctrlr_group = ctrlr_to_poll_group(vu_ctrlr); 5046 5047 SPDK_DEBUGLOG(vfio_user_db, "ctrlr pg:%p got intr\n", vu_ctrlr_group); 5048 5049 vu_ctrlr_group->stats.ctrlr_intr++; 5050 5051 /* 5052 * Poll vfio-user for this controller. 
We need to do this before polling 5053 * any SQs, as this is where doorbell writes may be handled. 5054 */ 5055 ret = vfio_user_poll_vfu_ctx(vu_ctrlr); 5056 5057 /* 5058 * `sqs[0]` could be set to NULL in vfio_user_poll_vfu_ctx() context, 5059 * just return for this case. 5060 */ 5061 if (vu_ctrlr->sqs[0] == NULL) { 5062 return ret; 5063 } 5064 5065 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) { 5066 /* 5067 * We may have just written to a doorbell owned by another 5068 * reactor: we need to prod them to make sure its SQs are polled 5069 * *after* the doorbell value is updated. 5070 */ 5071 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) { 5072 if (vu_group != vu_ctrlr_group) { 5073 SPDK_DEBUGLOG(vfio_user_db, "prodding pg:%p\n", vu_group); 5074 eventfd_write(vu_group->intr_fd, 1); 5075 } 5076 } 5077 } 5078 5079 ret |= vfio_user_poll_group_process(vu_ctrlr_group); 5080 5081 return ret; 5082 } 5083 5084 static void 5085 vfio_user_ctrlr_set_intr_mode(struct spdk_poller *poller, void *ctx, 5086 bool interrupt_mode) 5087 { 5088 struct nvmf_vfio_user_ctrlr *ctrlr = ctx; 5089 assert(ctrlr != NULL); 5090 assert(ctrlr->endpoint != NULL); 5091 5092 SPDK_DEBUGLOG(nvmf_vfio, "%s: setting interrupt mode to %d\n", 5093 ctrlr_id(ctrlr), interrupt_mode); 5094 5095 /* 5096 * interrupt_mode needs to persist across controller resets, so store 5097 * it in the endpoint instead. 5098 */ 5099 ctrlr->endpoint->interrupt_mode = interrupt_mode; 5100 5101 vfio_user_poll_group_rearm(ctrlr_to_poll_group(ctrlr)); 5102 } 5103 5104 /* 5105 * In response to the nvmf_vfio_user_create_ctrlr() path, the admin queue is now 5106 * set up and we can start operating on this controller. 5107 */ 5108 static void 5109 start_ctrlr(struct nvmf_vfio_user_ctrlr *vu_ctrlr, 5110 struct spdk_nvmf_ctrlr *ctrlr) 5111 { 5112 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; 5113 5114 vu_ctrlr->ctrlr = ctrlr; 5115 vu_ctrlr->cntlid = ctrlr->cntlid; 5116 vu_ctrlr->thread = spdk_get_thread(); 5117 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; 5118 5119 if (!in_interrupt_mode(endpoint->transport)) { 5120 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5121 vu_ctrlr, 1000); 5122 return; 5123 } 5124 5125 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx, 5126 vu_ctrlr, 0); 5127 5128 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx); 5129 assert(vu_ctrlr->intr_fd != -1); 5130 5131 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd, 5132 vfio_user_ctrlr_intr, vu_ctrlr); 5133 5134 assert(vu_ctrlr->intr != NULL); 5135 5136 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller, 5137 vfio_user_ctrlr_set_intr_mode, 5138 vu_ctrlr); 5139 } 5140 5141 static int 5142 handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg) 5143 { 5144 struct nvmf_vfio_user_poll_group *vu_group; 5145 struct nvmf_vfio_user_sq *sq = cb_arg; 5146 struct nvmf_vfio_user_cq *admin_cq; 5147 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5148 struct nvmf_vfio_user_endpoint *endpoint; 5149 5150 assert(sq != NULL); 5151 assert(req != NULL); 5152 5153 vu_ctrlr = sq->ctrlr; 5154 assert(vu_ctrlr != NULL); 5155 endpoint = vu_ctrlr->endpoint; 5156 assert(endpoint != NULL); 5157 5158 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) { 5159 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct); 5160 endpoint->ctrlr = NULL; 5161 free_ctrlr(vu_ctrlr); 5162 return -1; 5163 } 5164 5165 vu_group = 
SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group); 5166 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link); 5167 5168 admin_cq = vu_ctrlr->cqs[0]; 5169 assert(admin_cq != NULL); 5170 assert(admin_cq->group != NULL); 5171 assert(admin_cq->group->group->thread != NULL); 5172 5173 pthread_mutex_lock(&endpoint->lock); 5174 if (nvmf_qpair_is_admin_queue(&sq->qpair)) { 5175 assert(admin_cq->group->group->thread == spdk_get_thread()); 5176 /* 5177 * The admin queue is special as SQ0 and CQ0 are created 5178 * together. 5179 */ 5180 admin_cq->cq_ref = 1; 5181 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr); 5182 } else { 5183 /* For I/O queues this command was generated in response to an 5184 * ADMIN I/O CREATE SUBMISSION QUEUE command which has not yet 5185 * been completed. Complete it now. 5186 */ 5187 if (sq->post_create_io_sq_completion) { 5188 if (admin_cq->group->group->thread != spdk_get_thread()) { 5189 struct vfio_user_post_cpl_ctx *cpl_ctx; 5190 5191 cpl_ctx = calloc(1, sizeof(*cpl_ctx)); 5192 if (!cpl_ctx) { 5193 return -ENOMEM; 5194 } 5195 cpl_ctx->ctrlr = vu_ctrlr; 5196 cpl_ctx->cq = admin_cq; 5197 cpl_ctx->cpl.sqid = 0; 5198 cpl_ctx->cpl.cdw0 = 0; 5199 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid; 5200 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 5201 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5202 5203 spdk_thread_send_msg(admin_cq->group->group->thread, 5204 _post_completion_msg, 5205 cpl_ctx); 5206 } else { 5207 post_completion(vu_ctrlr, admin_cq, 0, 0, 5208 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC); 5209 } 5210 sq->post_create_io_sq_completion = false; 5211 } else if (in_interrupt_mode(endpoint->transport)) { 5212 /* 5213 * If we're live migrating a guest, there is a window 5214 * where the I/O queues haven't been set up but the 5215 * device is in running state, during which the guest 5216 * might write to a doorbell. This doorbell write will 5217 * go unnoticed, so let's poll the whole controller to 5218 * pick that up. 5219 */ 5220 ctrlr_kick(vu_ctrlr); 5221 } 5222 sq->sq_state = VFIO_USER_SQ_ACTIVE; 5223 } 5224 5225 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq); 5226 pthread_mutex_unlock(&endpoint->lock); 5227 5228 free(req->req.iov[0].iov_base); 5229 req->req.iov[0].iov_base = NULL; 5230 req->req.iovcnt = 0; 5231 5232 return 0; 5233 } 5234 5235 static void 5236 _nvmf_vfio_user_poll_group_add(void *req) 5237 { 5238 spdk_nvmf_request_exec(req); 5239 } 5240 5241 /* 5242 * Add the given qpair to the given poll group. New qpairs are added via 5243 * spdk_nvmf_tgt_new_qpair(), which picks a poll group via 5244 * nvmf_vfio_user_get_optimal_poll_group(), then calls back here via 5245 * nvmf_transport_poll_group_add(). 
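 *
 * A vfio-user host is a PCI(e)-attached NVMe host, so it never issues an
 * NVMe-oF Fabrics CONNECT itself; we fabricate one here on its behalf and
 * hand it to the generic NVMf layer, and handle_queue_connect_rsp()
 * finishes the wiring when that command completes.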
5246 */ 5247 static int 5248 nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group, 5249 struct spdk_nvmf_qpair *qpair) 5250 { 5251 struct nvmf_vfio_user_sq *sq; 5252 struct nvmf_vfio_user_req *vu_req; 5253 struct nvmf_vfio_user_ctrlr *ctrlr; 5254 struct spdk_nvmf_request *req; 5255 struct spdk_nvmf_fabric_connect_data *data; 5256 bool admin; 5257 5258 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5259 sq->group = group; 5260 ctrlr = sq->ctrlr; 5261 5262 SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n", 5263 ctrlr_id(ctrlr), sq->qpair.qid, 5264 sq, qpair, group); 5265 5266 admin = nvmf_qpair_is_admin_queue(&sq->qpair); 5267 5268 vu_req = get_nvmf_vfio_user_req(sq); 5269 if (vu_req == NULL) { 5270 return -1; 5271 } 5272 5273 req = &vu_req->req; 5274 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC; 5275 req->cmd->connect_cmd.cid = 0; 5276 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; 5277 req->cmd->connect_cmd.recfmt = 0; 5278 req->cmd->connect_cmd.sqsize = sq->size - 1; 5279 req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid; 5280 5281 req->length = sizeof(struct spdk_nvmf_fabric_connect_data); 5282 5283 data = calloc(1, req->length); 5284 if (data == NULL) { 5285 nvmf_vfio_user_req_free(req); 5286 return -ENOMEM; 5287 } 5288 5289 SPDK_IOV_ONE(req->iov, &req->iovcnt, data, req->length); 5290 5291 data->cntlid = ctrlr->cntlid; 5292 snprintf(data->subnqn, sizeof(data->subnqn), "%s", 5293 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem)); 5294 5295 vu_req->cb_fn = handle_queue_connect_rsp; 5296 vu_req->cb_arg = sq; 5297 5298 SPDK_DEBUGLOG(nvmf_vfio, 5299 "%s: sending connect fabrics command for qid:%#x cntlid=%#x\n", 5300 ctrlr_id(ctrlr), qpair->qid, data->cntlid); 5301 5302 /* 5303 * By the time transport's poll_group_add() callback is executed, the 5304 * qpair isn't in the ACTIVE state yet, so spdk_nvmf_request_exec() 5305 * would fail. The state changes to ACTIVE immediately after the 5306 * callback finishes, so delay spdk_nvmf_request_exec() by sending a 5307 * message. 
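	 *
	 * The message runs on this same SPDK thread, but only after the
	 * current callback has returned (and the qpair has become ACTIVE),
	 * so executing the fabricated CONNECT there is safe.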
5308 */ 5309 spdk_thread_send_msg(spdk_get_thread(), _nvmf_vfio_user_poll_group_add, req); 5310 return 0; 5311 } 5312 5313 static int 5314 nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, 5315 struct spdk_nvmf_qpair *qpair) 5316 { 5317 struct nvmf_vfio_user_sq *sq; 5318 struct nvmf_vfio_user_poll_group *vu_group; 5319 5320 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5321 5322 SPDK_DEBUGLOG(nvmf_vfio, 5323 "%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n", 5324 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group); 5325 5326 5327 vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group); 5328 TAILQ_REMOVE(&vu_group->sqs, sq, link); 5329 5330 return 0; 5331 } 5332 5333 static void 5334 _nvmf_vfio_user_req_free(struct nvmf_vfio_user_sq *sq, struct nvmf_vfio_user_req *vu_req) 5335 { 5336 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd)); 5337 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp)); 5338 vu_req->iovcnt = 0; 5339 vu_req->req.iovcnt = 0; 5340 vu_req->req.length = 0; 5341 vu_req->state = VFIO_USER_REQUEST_STATE_FREE; 5342 5343 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); 5344 } 5345 5346 static int 5347 nvmf_vfio_user_req_free(struct spdk_nvmf_request *req) 5348 { 5349 struct nvmf_vfio_user_sq *sq; 5350 struct nvmf_vfio_user_req *vu_req; 5351 5352 assert(req != NULL); 5353 5354 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5355 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5356 5357 _nvmf_vfio_user_req_free(sq, vu_req); 5358 5359 return 0; 5360 } 5361 5362 static int 5363 nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req) 5364 { 5365 struct nvmf_vfio_user_sq *sq; 5366 struct nvmf_vfio_user_req *vu_req; 5367 5368 assert(req != NULL); 5369 5370 vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req); 5371 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5372 5373 if (vu_req->cb_fn != NULL) { 5374 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) { 5375 fail_ctrlr(sq->ctrlr); 5376 } 5377 } 5378 5379 _nvmf_vfio_user_req_free(sq, vu_req); 5380 5381 return 0; 5382 } 5383 5384 static void 5385 nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair, 5386 spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg) 5387 { 5388 struct nvmf_vfio_user_sq *sq; 5389 struct nvmf_vfio_user_ctrlr *vu_ctrlr; 5390 struct nvmf_vfio_user_endpoint *endpoint; 5391 struct vfio_user_delete_sq_ctx *del_ctx; 5392 5393 assert(qpair != NULL); 5394 sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair); 5395 vu_ctrlr = sq->ctrlr; 5396 endpoint = vu_ctrlr->endpoint; 5397 del_ctx = sq->delete_ctx; 5398 sq->delete_ctx = NULL; 5399 5400 pthread_mutex_lock(&endpoint->lock); 5401 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq); 5402 delete_sq_done(vu_ctrlr, sq); 5403 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) { 5404 endpoint->ctrlr = NULL; 5405 if (vu_ctrlr->in_source_vm && endpoint->need_resume) { 5406 /* The controller will be freed, we can resume the subsystem 5407 * now so that the endpoint can be ready to accept another 5408 * new connection. 5409 */ 5410 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, 5411 vfio_user_endpoint_resume_done, endpoint); 5412 } 5413 free_ctrlr(vu_ctrlr); 5414 } 5415 pthread_mutex_unlock(&endpoint->lock); 5416 5417 if (del_ctx) { 5418 vfio_user_qpair_delete_cb(del_ctx); 5419 } 5420 5421 if (cb_fn) { 5422 cb_fn(cb_arg); 5423 } 5424 } 5425 5426 /** 5427 * Returns a preallocated request, or NULL if there isn't one available. 
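 *
 * Requests are taken from the SQ's free_reqs list (populated by
 * alloc_sq_reqs() when the SQ is set up). Callers such as handle_cmd_req()
 * treat a NULL return as a transient internal error and post an Internal
 * Device Error completion rather than silently dropping the command.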
5428 */ 5429 static struct nvmf_vfio_user_req * 5430 get_nvmf_vfio_user_req(struct nvmf_vfio_user_sq *sq) 5431 { 5432 struct nvmf_vfio_user_req *req; 5433 5434 if (sq == NULL) { 5435 return NULL; 5436 } 5437 5438 req = TAILQ_FIRST(&sq->free_reqs); 5439 if (req == NULL) { 5440 return NULL; 5441 } 5442 5443 TAILQ_REMOVE(&sq->free_reqs, req, link); 5444 5445 return req; 5446 } 5447 5448 static int 5449 get_nvmf_io_req_length(struct spdk_nvmf_request *req) 5450 { 5451 uint16_t nr; 5452 uint32_t nlb, nsid; 5453 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5454 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; 5455 struct spdk_nvmf_ns *ns; 5456 5457 nsid = cmd->nsid; 5458 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); 5459 if (ns == NULL || ns->bdev == NULL) { 5460 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid); 5461 return -EINVAL; 5462 } 5463 5464 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { 5465 nr = cmd->cdw10_bits.dsm.nr + 1; 5466 return nr * sizeof(struct spdk_nvme_dsm_range); 5467 } 5468 5469 if (cmd->opc == SPDK_NVME_OPC_COPY) { 5470 nr = (cmd->cdw12 & 0x000000ffu) + 1; 5471 return nr * sizeof(struct spdk_nvme_scc_source_range); 5472 } 5473 5474 nlb = (cmd->cdw12 & 0x0000ffffu) + 1; 5475 return nlb * spdk_bdev_desc_get_block_size(ns->desc); 5476 } 5477 5478 static int 5479 map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5480 { 5481 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; 5482 uint32_t len = 0, numdw = 0; 5483 uint8_t fid; 5484 int iovcnt; 5485 5486 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5487 5488 if (req->xfer == SPDK_NVME_DATA_NONE) { 5489 return 0; 5490 } 5491 5492 switch (cmd->opc) { 5493 case SPDK_NVME_OPC_IDENTIFY: 5494 len = 4096; 5495 break; 5496 case SPDK_NVME_OPC_GET_LOG_PAGE: 5497 numdw = ((((uint32_t)cmd->cdw11_bits.get_log_page.numdu << 16) | 5498 cmd->cdw10_bits.get_log_page.numdl) + 1); 5499 if (numdw > UINT32_MAX / 4) { 5500 return -EINVAL; 5501 } 5502 len = numdw * 4; 5503 break; 5504 case SPDK_NVME_OPC_GET_FEATURES: 5505 case SPDK_NVME_OPC_SET_FEATURES: 5506 fid = cmd->cdw10_bits.set_features.fid; 5507 switch (fid) { 5508 case SPDK_NVME_FEAT_LBA_RANGE_TYPE: 5509 len = 4096; 5510 break; 5511 case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 5512 len = 256; 5513 break; 5514 case SPDK_NVME_FEAT_TIMESTAMP: 5515 len = 8; 5516 break; 5517 case SPDK_NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 5518 len = 512; 5519 break; 5520 case SPDK_NVME_FEAT_HOST_IDENTIFIER: 5521 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) { 5522 len = 16; 5523 } else { 5524 len = 8; 5525 } 5526 break; 5527 default: 5528 return 0; 5529 } 5530 break; 5531 case SPDK_NVME_OPC_FABRIC: 5532 return -ENOTSUP; 5533 default: 5534 return 0; 5535 } 5536 5537 /* ADMIN command will not use SGL */ 5538 if (cmd->psdt != 0) { 5539 return -EINVAL; 5540 } 5541 5542 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len); 5543 if (iovcnt < 0) { 5544 SPDK_ERRLOG("%s: map Admin Opc %x failed\n", 5545 ctrlr_id(ctrlr), cmd->opc); 5546 return -1; 5547 } 5548 req->length = len; 5549 req->iovcnt = iovcnt; 5550 5551 return 0; 5552 } 5553 5554 /* 5555 * Map an I/O command's buffers. 5556 * 5557 * Returns 0 on success and -errno on failure. 
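 *
 * The transfer length is derived from the command itself by
 * get_nvmf_io_req_length() (DSM range count, Copy range count, or NLB
 * times the namespace block size), and vfio_user_map_cmd() then translates
 * the guest buffers described by the command (PRPs) into local iovecs for
 * the bdev layer. As a worked example, a read of 8 logical blocks on a
 * 4096-byte-block namespace carries cdw12.nlb == 7 and maps
 * (7 + 1) * 4096 = 32768 bytes.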
5558 */ 5559 static int 5560 map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req) 5561 { 5562 int len, iovcnt; 5563 struct spdk_nvme_cmd *cmd; 5564 5565 assert(ctrlr != NULL); 5566 assert(req != NULL); 5567 5568 cmd = &req->cmd->nvme_cmd; 5569 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); 5570 5571 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) { 5572 return 0; 5573 } 5574 5575 len = get_nvmf_io_req_length(req); 5576 if (len < 0) { 5577 return -EINVAL; 5578 } 5579 req->length = len; 5580 5581 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length); 5582 if (iovcnt < 0) { 5583 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc); 5584 return -EFAULT; 5585 } 5586 req->iovcnt = iovcnt; 5587 5588 return 0; 5589 } 5590 5591 static int 5592 handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, 5593 struct nvmf_vfio_user_sq *sq) 5594 { 5595 int err; 5596 struct nvmf_vfio_user_req *vu_req; 5597 struct spdk_nvmf_request *req; 5598 5599 assert(ctrlr != NULL); 5600 assert(cmd != NULL); 5601 5602 vu_req = get_nvmf_vfio_user_req(sq); 5603 if (spdk_unlikely(vu_req == NULL)) { 5604 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc); 5605 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid, 5606 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, SPDK_NVME_SCT_GENERIC); 5607 5608 } 5609 req = &vu_req->req; 5610 5611 assert(req->qpair != NULL); 5612 SPDK_DEBUGLOG(nvmf_vfio, "%s: handle sqid:%u, req opc=%#x cid=%d\n", 5613 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid); 5614 5615 vu_req->cb_fn = handle_cmd_rsp; 5616 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair); 5617 req->cmd->nvme_cmd = *cmd; 5618 5619 if (nvmf_qpair_is_admin_queue(req->qpair)) { 5620 err = map_admin_cmd_req(ctrlr, req); 5621 } else { 5622 switch (cmd->opc) { 5623 case SPDK_NVME_OPC_RESERVATION_REGISTER: 5624 case SPDK_NVME_OPC_RESERVATION_REPORT: 5625 case SPDK_NVME_OPC_RESERVATION_ACQUIRE: 5626 case SPDK_NVME_OPC_RESERVATION_RELEASE: 5627 case SPDK_NVME_OPC_FABRIC: 5628 err = -ENOTSUP; 5629 break; 5630 default: 5631 err = map_io_cmd_req(ctrlr, req); 5632 break; 5633 } 5634 } 5635 5636 if (spdk_unlikely(err < 0)) { 5637 SPDK_ERRLOG("%s: process NVMe command opc 0x%x failed\n", 5638 ctrlr_id(ctrlr), cmd->opc); 5639 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; 5640 req->rsp->nvme_cpl.status.sc = err == -ENOTSUP ? 5641 SPDK_NVME_SC_INVALID_OPCODE : 5642 SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5643 err = handle_cmd_rsp(vu_req, vu_req->cb_arg); 5644 _nvmf_vfio_user_req_free(sq, vu_req); 5645 return err; 5646 } 5647 5648 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING; 5649 spdk_nvmf_request_exec(req); 5650 5651 return 0; 5652 } 5653 5654 /* 5655 * If we suppressed an IRQ in post_completion(), check if it needs to be fired 5656 * here: if the host isn't up to date, and is apparently not actively processing 5657 * the queue (i.e. ->last_head isn't changing), we need an IRQ. 5658 */ 5659 static void 5660 handle_suppressed_irq(struct nvmf_vfio_user_ctrlr *ctrlr, 5661 struct nvmf_vfio_user_sq *sq) 5662 { 5663 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid]; 5664 uint32_t cq_head; 5665 uint32_t cq_tail; 5666 5667 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) { 5668 return; 5669 } 5670 5671 cq_tail = *cq_tailp(cq); 5672 5673 /* Already sent? 
*/ 5674 if (cq_tail == cq->last_trigger_irq_tail) { 5675 return; 5676 } 5677 5678 spdk_ivdt_dcache(cq_dbl_headp(cq)); 5679 cq_head = *cq_dbl_headp(cq); 5680 5681 if (cq_head != cq_tail && cq_head == cq->last_head) { 5682 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv); 5683 if (err != 0) { 5684 SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n", 5685 ctrlr_id(ctrlr)); 5686 } else { 5687 cq->last_trigger_irq_tail = cq_tail; 5688 } 5689 } 5690 5691 cq->last_head = cq_head; 5692 } 5693 5694 /* Returns the number of commands processed, or a negative value on error. */ 5695 static int 5696 nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq) 5697 { 5698 struct nvmf_vfio_user_ctrlr *ctrlr; 5699 uint32_t new_tail; 5700 int count = 0; 5701 5702 assert(sq != NULL); 5703 5704 ctrlr = sq->ctrlr; 5705 5706 /* 5707 * A quiesced, or migrating, controller should never process new 5708 * commands. 5709 */ 5710 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) { 5711 return SPDK_POLLER_IDLE; 5712 } 5713 5714 if (ctrlr->adaptive_irqs_enabled) { 5715 handle_suppressed_irq(ctrlr, sq); 5716 } 5717 5718 /* On aarch64 platforms, doorbells update from guest VM may not be seen 5719 * on SPDK target side. This is because there is memory type mismatch 5720 * situation here. That is on guest VM side, the doorbells are treated as 5721 * device memory while on SPDK target side, it is treated as normal 5722 * memory. And this situation cause problem on ARM platform. 5723 * Refer to "https://developer.arm.com/documentation/102376/0100/ 5724 * Memory-aliasing-and-mismatched-memory-types". Only using spdk_mb() 5725 * cannot fix this. Use "dc civac" to invalidate cache may solve 5726 * this. 5727 */ 5728 spdk_ivdt_dcache(sq_dbl_tailp(sq)); 5729 5730 /* Load-Acquire. */ 5731 new_tail = *sq_dbl_tailp(sq); 5732 5733 new_tail = new_tail & 0xffffu; 5734 if (spdk_unlikely(new_tail >= sq->size)) { 5735 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid, 5736 new_tail); 5737 spdk_nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE); 5738 5739 return -1; 5740 } 5741 5742 if (*sq_headp(sq) == new_tail) { 5743 return 0; 5744 } 5745 5746 SPDK_DEBUGLOG(nvmf_vfio, "%s: sqid:%u doorbell old=%u new=%u\n", 5747 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail); 5748 if (ctrlr->sdbl != NULL) { 5749 SPDK_DEBUGLOG(nvmf_vfio, 5750 "%s: sqid:%u bar0_doorbell=%u shadow_doorbell=%u eventidx=%u\n", 5751 ctrlr_id(ctrlr), sq->qid, 5752 ctrlr->bar0_doorbells[queue_index(sq->qid, false)], 5753 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)], 5754 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]); 5755 } 5756 5757 /* 5758 * Ensure that changes to the queue are visible to us. 5759 * The host driver should write the queue first, do a wmb(), and then 5760 * update the SQ tail doorbell (their Store-Release). 5761 */ 5762 spdk_rmb(); 5763 5764 count = handle_sq_tdbl_write(ctrlr, new_tail, sq); 5765 if (spdk_unlikely(count < 0)) { 5766 fail_ctrlr(ctrlr); 5767 } 5768 5769 return count; 5770 } 5771 5772 /* 5773 * vfio-user transport poll handler. Note that the library context is polled in 5774 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the 5775 * active SQs. 5776 * 5777 * Returns the number of commands processed, or a negative value on error. 
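 *
 * A negative return from any individual SQ poll aborts the loop and is
 * propagated to the caller; otherwise the per-group statistics (polls,
 * poll_reqs and poll_reqs_squared, which feed the variance estimate in
 * nvmf_vfio_user_poll_group_dump_stat()) are updated before returning the
 * total.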
/*
 * vfio-user transport poll handler. Note that the library context is polled in
 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
 * active SQs.
 *
 * Returns the number of commands processed, or a negative value on error.
 */
static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
        struct nvmf_vfio_user_poll_group *vu_group;
        struct nvmf_vfio_user_sq *sq, *tmp;
        int count = 0;

        assert(group != NULL);

        vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);

        SPDK_DEBUGLOG(vfio_user_db, "polling all SQs\n");

        TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) {
                int ret;

                if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
                        continue;
                }

                ret = nvmf_vfio_user_sq_poll(sq);

                if (spdk_unlikely(ret < 0)) {
                        return ret;
                }

                count += ret;
        }

        vu_group->stats.polls++;
        vu_group->stats.poll_reqs += count;
        vu_group->stats.poll_reqs_squared += count * count;
        if (count == 0) {
                vu_group->stats.polls_spurious++;
        }

        if (vu_group->need_kick) {
                poll_group_kick(vu_group);
        }

        return count;
}

static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
                                    struct spdk_nvme_transport_id *trid)
{
        struct nvmf_vfio_user_sq *sq;
        struct nvmf_vfio_user_ctrlr *ctrlr;

        sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
        ctrlr = sq->ctrlr;

        memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
        return 0;
}

static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
                                   struct spdk_nvme_transport_id *trid)
{
        return 0;
}

static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
                                     struct spdk_nvme_transport_id *trid)
{
        struct nvmf_vfio_user_sq *sq;
        struct nvmf_vfio_user_ctrlr *ctrlr;

        sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
        ctrlr = sq->ctrlr;

        memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
        return 0;
}

static void
nvmf_vfio_user_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
                                   struct spdk_nvmf_request *req)
{
        struct spdk_nvmf_request *req_to_abort = NULL;
        struct spdk_nvmf_request *temp_req = NULL;
        uint16_t cid;

        cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;

        TAILQ_FOREACH(temp_req, &qpair->outstanding, link) {
                struct nvmf_vfio_user_req *vu_req;

                vu_req = SPDK_CONTAINEROF(temp_req, struct nvmf_vfio_user_req, req);

                if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
                        req_to_abort = temp_req;
                        break;
                }
        }

        if (req_to_abort == NULL) {
                spdk_nvmf_request_complete(req);
                return;
        }

        req->req_to_abort = req_to_abort;
        nvmf_ctrlr_abort_request(req);
}
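/*
 * Statistics note: the "poll_reqs_variance" value emitted below is derived
 * from the running sums maintained in nvmf_vfio_user_poll_group_poll(): with
 * N = polls, S1 = poll_reqs (sum of requests per poll) and
 * S2 = poll_reqs_squared, the sample variance is
 * (N * S2 - S1 * S1) / (N * (N - 1)); what is actually written is the sqrt()
 * of that quantity (i.e. a standard deviation), computed with integer
 * division before the square root.
 */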
static void
nvmf_vfio_user_poll_group_dump_stat(struct spdk_nvmf_transport_poll_group *group,
                                    struct spdk_json_write_ctx *w)
{
        struct nvmf_vfio_user_poll_group *vu_group = SPDK_CONTAINEROF(group,
                        struct nvmf_vfio_user_poll_group, group);
        uint64_t polls_denom;

        spdk_json_write_named_uint64(w, "ctrlr_intr", vu_group->stats.ctrlr_intr);
        spdk_json_write_named_uint64(w, "ctrlr_kicks", vu_group->stats.ctrlr_kicks);
        spdk_json_write_named_uint64(w, "pg_kicks", vu_group->stats.pg_kicks);
        spdk_json_write_named_uint64(w, "won", vu_group->stats.won);
        spdk_json_write_named_uint64(w, "lost", vu_group->stats.lost);
        spdk_json_write_named_uint64(w, "lost_count", vu_group->stats.lost_count);
        spdk_json_write_named_uint64(w, "rearms", vu_group->stats.rearms);
        spdk_json_write_named_uint64(w, "cq_full", vu_group->stats.cq_full);
        spdk_json_write_named_uint64(w, "pg_process_count", vu_group->stats.pg_process_count);
        spdk_json_write_named_uint64(w, "intr", vu_group->stats.intr);
        spdk_json_write_named_uint64(w, "polls", vu_group->stats.polls);
        spdk_json_write_named_uint64(w, "polls_spurious", vu_group->stats.polls_spurious);
        spdk_json_write_named_uint64(w, "poll_reqs", vu_group->stats.poll_reqs);
        polls_denom = vu_group->stats.polls * (vu_group->stats.polls - 1);
        if (polls_denom) {
                uint64_t n = vu_group->stats.polls * vu_group->stats.poll_reqs_squared - vu_group->stats.poll_reqs *
                             vu_group->stats.poll_reqs;
                spdk_json_write_named_double(w, "poll_reqs_variance", sqrt(n / polls_denom));
        }

        spdk_json_write_named_uint64(w, "cqh_admin_writes", vu_group->stats.cqh_admin_writes);
        spdk_json_write_named_uint64(w, "cqh_io_writes", vu_group->stats.cqh_io_writes);
}

static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
        opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
        opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
        opts->in_capsule_data_size = 0;
        opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
        opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
        opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
        opts->num_shared_buffers = 0;
        opts->buf_cache_size = 0;
        opts->association_timeout = 0;
        opts->transport_specific = NULL;
}

const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
        .name = "VFIOUSER",
        .type = SPDK_NVME_TRANSPORT_VFIOUSER,
        .opts_init = nvmf_vfio_user_opts_init,
        .create = nvmf_vfio_user_create,
        .destroy = nvmf_vfio_user_destroy,

        .listen = nvmf_vfio_user_listen,
        .stop_listen = nvmf_vfio_user_stop_listen,
        .cdata_init = nvmf_vfio_user_cdata_init,
        .listen_associate = nvmf_vfio_user_listen_associate,

        .listener_discover = nvmf_vfio_user_discover,

        .poll_group_create = nvmf_vfio_user_poll_group_create,
        .get_optimal_poll_group = nvmf_vfio_user_get_optimal_poll_group,
        .poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
        .poll_group_add = nvmf_vfio_user_poll_group_add,
        .poll_group_remove = nvmf_vfio_user_poll_group_remove,
        .poll_group_poll = nvmf_vfio_user_poll_group_poll,

        .req_free = nvmf_vfio_user_req_free,
        .req_complete = nvmf_vfio_user_req_complete,

        .qpair_fini = nvmf_vfio_user_close_qpair,
        .qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
        .qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
        .qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
        .qpair_abort_request = nvmf_vfio_user_qpair_abort_request,

        .poll_group_dump_stat = nvmf_vfio_user_poll_group_dump_stat,
};

SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)
SPDK_LOG_REGISTER_COMPONENT(vfio_user_db)
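/*
 * Usage sketch (assumptions based on SPDK's vfio-user documentation, not on
 * this file alone): the transport registered above is enabled at runtime via
 * the NVMf RPCs, e.g.
 *
 *   scripts/rpc.py nvmf_create_transport -t VFIOUSER
 *
 * followed by adding a VFIOUSER listener whose traddr is a directory; a
 * vfio-user client (for example QEMU with a vfio-user-pci device) then
 * connects to the control socket created under that directory.
 */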