Lines Matching +full:storage +full:- +full:target (NVMe over vfio-user transport)
1 /* SPDX-License-Identifier: BSD-3-Clause
3 * Copyright (c) 2019-2022, Nutanix Inc. All rights reserved.
8 * NVMe over vfio-user transport
13 #include <vfio-user/libvfio-user.h>
14 #include <vfio-user/pci_defs.h>
40 #define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE ((NVMF_REQ_MAX_BUFFERS - 1) << SHIFT_4KB)
58 * available on PCI-X 2.0 and PCI Express buses
105 * Support for live migration in NVMf/vfio-user: live migration is implemented
107 * stop-and-copy state and then trivially, and most importantly safely,
108 * collecting migration state and providing it to the vfio-user client. We
109 * don't provide any migration state at the pre-copy state as that's too
143 * -------------------------------------------------------------------------
145 * -------------------------------------------------------------------------
275 /* Quiesce requested by libvfio-user */
315 * and SQ re-connect response in the destination VM, for the prior case,
317 * re-connecting SQs in the destination VM.
371 * the controller belongs are if no vfio-user message has been
377 * vfio-user file descriptor has been ready or explicitly
384 * ctrlr_intr - ctrlr_kicks is the number of times the
385 * vfio-user poll file descriptor has been ready.
481 * Shadow doorbells PRPs to provide during the stop-and-copy state.
489 /* Endpoint in vfio-user is associated with a socket file, which
566 return mapping->iov.iov_base;
579 return &sq->head;
586 return sq->dbl_tailp;
593 return cq->dbl_headp;
600 return &cq->tail;
608 assert(*sq_headp(sq) < sq->size);
611 if (spdk_unlikely(*sq_headp(sq) == sq->size)) {
621 assert(*cq_tailp(cq) < cq->size);
624 if (spdk_unlikely(*cq_tailp(cq) == cq->size)) {
626 cq->phase = !cq->phase;
640 if (vu_ctrlr->cqs[qid] == NULL) {
644 return (vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_DELETED &&
645 vu_ctrlr->cqs[qid]->cq_state != VFIO_USER_CQ_UNUSED);
648 if (vu_ctrlr->sqs[qid] == NULL) {
652 return (vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_DELETED &&
653 vu_ctrlr->sqs[qid]->sq_state != VFIO_USER_SQ_UNUSED);
659 return endpoint->trid.traddr;
665 if (!ctrlr || !ctrlr->endpoint) {
669 return endpoint_id(ctrlr->endpoint);
676 return SPDK_CONTAINEROF(vu_ctrlr->sqs[0]->group,
684 return SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group,
691 return vu_pg->group.group->thread;
710 vu_transport->intr_mode_supported;
735 vu_ctrlr_group->stats.ctrlr_kicks++;
742 * Force a wake-up for this particular poll group and its contained SQs.
747 vu_group->stats.pg_kicks++;
748 assert(vu_group->need_kick);
749 vu_group->need_kick = false;
750 eventfd_write(vu_group->intr_fd, 1);
773 if (ret == -1) {
780 addr, addr + len, prot, -(ret + 1));
792 assert(iov->iov_base != NULL);
793 return iov->iov_base;
810 prp1 = cmd->dptr.prp.prp1;
811 prp2 = cmd->dptr.prp.prp2;
814 residue_len = mps - (prp1 % mps);
820 return -EINVAL;
822 len -= residue_len;
825 return -ERANGE;
833 return -EINVAL;
843 return -EINVAL;
849 nents = (len + mps - 1) / mps;
852 return -ERANGE;
859 return -EINVAL;
869 return -EINVAL;
873 len -= residue_len;
896 return -ERANGE;
902 return -EINVAL;
907 return -EINVAL;
928 sgl = &cmd->dptr.sgl1;
931 if (sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
933 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, MAP_RW);
936 return -EINVAL;
939 iovs[0].iov_len = sgl->unkeyed.length;
940 assert(sgl->unkeyed.length == len);
946 if (spdk_unlikely((sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_SEGMENT) &&
947 (sgl->unkeyed.type != SPDK_NVME_SGL_TYPE_LAST_SEGMENT))) {
948 SPDK_ERRLOG("Invalid SGL type %u\n", sgl->unkeyed.type);
949 return -EINVAL;
952 seg_len = sgl->unkeyed.length;
955 return -EINVAL;
959 vva = gpa_to_vva(prv, sgl->address, sgl->unkeyed.length, MAP_R);
962 return -EINVAL;
967 last_sgl = &sgl[num_sgls - 1];
970 if (last_sgl->unkeyed.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
973 max_iovcnt - total_iovcnt, gpa_to_vva);
984 ret = nvme_cmd_map_sgls_data(prv, sgl, num_sgls - 1, &iovs[total_iovcnt],
985 max_iovcnt - total_iovcnt, gpa_to_vva);
1004 if (cmd->psdt == SPDK_NVME_PSDT_PRP) {
1020 volatile uint32_t *doorbells = shadow ? ctrlr->sdbl->shadow_doorbells :
1021 ctrlr->bar0_doorbells;
1026 struct nvmf_vfio_user_sq *sq = ctrlr->sqs[i];
1027 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[i];
1030 sq->dbl_tailp = doorbells + queue_index(sq->qid, false);
1032 ctrlr->sqs[i]->need_rearm = shadow;
1036 cq->dbl_headp = doorbells + queue_index(cq->qid, true);
1049 * non-NULL. If that is the case, no memory should have been mapped.
1051 if (sdbl->iovs == NULL || sdbl->sgs == NULL) {
1059 if (!sdbl->iovs[i].iov_len) {
1063 sg = index_to_sg_t(sdbl->sgs, i);
1064 iov = sdbl->iovs + i;
1080 * sdbl->shadow_doorbells and sdbl->eventidxs were mapped,
1083 free(sdbl->sgs);
1084 free(sdbl->iovs);
1102 sdbl->sgs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, dma_sg_size());
1103 sdbl->iovs = calloc(NVMF_VFIO_USER_SHADOW_DOORBELLS_BUFFER_COUNT, sizeof(*sdbl->iovs));
1104 if (sdbl->sgs == NULL || sdbl->iovs == NULL) {
1109 p = map_one(vfu_ctx, prp1, len, sdbl->sgs, sdbl->iovs, MAP_RW);
1120 sg2 = index_to_sg_t(sdbl->sgs, 1);
1122 p = map_one(vfu_ctx, prp2, len, sg2, sdbl->iovs + 1, MAP_RW);
1128 sdbl->shadow_doorbells = (uint32_t *)sdbl->iovs[0].iov_base;
1129 sdbl->eventidxs = (uint32_t *)sdbl->iovs[1].iov_base;
1156 if (ctrlr->sqs[i] != NULL) {
1160 if (ctrlr->cqs[i] != NULL) {
1172 assert(vu_ctrlr->ctrlr != NULL);
1174 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr);
1175 if (regs->csts.bits.cfs == 0) {
1179 nvmf_ctrlr_set_fatal_status(vu_ctrlr->ctrlr);
1186 assert(vu_ctrlr->endpoint != NULL);
1188 vfu_pci_config_space_t *pci = vu_ctrlr->endpoint->pci_config_space;
1190 return (!pci->hdr.cmd.id || vu_ctrlr->endpoint->msix->mxc.mxe);
1198 spdk_interrupt_unregister(&endpoint->accept_intr);
1199 spdk_poller_unregister(&endpoint->accept_poller);
1201 if (endpoint->bar0_doorbells) {
1202 munmap((void *)endpoint->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
1205 if (endpoint->devmem_fd > 0) {
1206 close(endpoint->devmem_fd);
1209 if (endpoint->migr_data) {
1210 munmap(endpoint->migr_data, vfio_user_migr_data_len());
1213 if (endpoint->migr_fd > 0) {
1214 close(endpoint->migr_fd);
1217 if (endpoint->vfu_ctx) {
1218 vfu_destroy_ctx(endpoint->vfu_ctx);
1221 pthread_mutex_destroy(&endpoint->lock);
1238 pthread_mutex_destroy(&vu_transport->lock);
1239 pthread_mutex_destroy(&vu_transport->pg_lock);
1241 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
1242 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
1289 if (opts->max_qpairs_per_ctrlr > NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR) {
1291 opts->max_qpairs_per_ctrlr, NVMF_VFIO_USER_MAX_QPAIRS_PER_CTRLR);
1301 err = pthread_mutex_init(&vu_transport->lock, NULL);
1306 TAILQ_INIT(&vu_transport->endpoints);
1308 err = pthread_mutex_init(&vu_transport->pg_lock, NULL);
1310 pthread_mutex_destroy(&vu_transport->lock);
1314 TAILQ_INIT(&vu_transport->poll_groups);
1316 if (opts->transport_specific != NULL &&
1317 spdk_json_decode_object_relaxed(opts->transport_specific, vfio_user_transport_opts_decoder,
1326 * mappable BAR0 disabled: we need a vfio-user message to wake us up
1328 * libvfio-user socket fd.
1330 vu_transport->intr_mode_supported =
1331 vu_transport->transport_opts.disable_mappable_bar0;
1337 if (!vu_transport->transport_opts.disable_mappable_bar0) {
1338 vu_transport->transport_opts.disable_shadow_doorbells = true;
1342 if (!vu_transport->intr_mode_supported) {
1352 vu_transport->transport_opts.disable_adaptive_irq = true;
1356 vu_transport->transport_opts.disable_mappable_bar0);
1358 vu_transport->transport_opts.disable_adaptive_irq);
1360 vu_transport->transport_opts.disable_shadow_doorbells);
1362 return &vu_transport->transport;
1365 pthread_mutex_destroy(&vu_transport->lock);
1366 pthread_mutex_destroy(&vu_transport->pg_lock);
1376 assert(vu_ctrlr->ctrlr != NULL);
1378 return vu_ctrlr->ctrlr->vcprop.cap.bits.mqes + 1;
1385 assert(vu_ctrlr->ctrlr != NULL);
1387 return vu_ctrlr->ctrlr->vcprop.cap.bits.dstrd;
1393 uint32_t memory_page_shift = vu_ctrlr->ctrlr->vcprop.cc.bits.mps + 12;
1400 return ~(memory_page_size(ctrlr) - 1);
1409 assert(mapping->len != 0);
1412 ret = map_one(vu_ctrlr->endpoint->vfu_ctx, mapping->prp1, mapping->len,
1413 mapping->sg, &mapping->iov, flags);
1415 return -EFAULT;
1419 memset(q_addr(mapping), 0, mapping->len);
1429 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx, mapping->sg,
1430 &mapping->iov, 1);
1431 mapping->iov.iov_base = NULL;
1444 sq = ctrlr->sqs[0];
1447 assert(q_addr(&sq->mapping) == NULL);
1448 /* XXX ctrlr->asq == 0 is a valid memory address */
1450 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
1451 sq->qid = 0;
1452 sq->size = regs->aqa.bits.asqs + 1;
1453 sq->mapping.prp1 = regs->asq;
1454 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd);
1456 sq->cqid = 0;
1458 ret = map_q(ctrlr, &sq->mapping, MAP_INITIALIZE);
1464 sq->dbl_tailp = ctrlr->bar0_doorbells + queue_index(0, false);
1476 * not looking - or during the event index update. In that case, we must retry,
1487 assert(sq->ctrlr != NULL);
1488 assert(sq->ctrlr->sdbl != NULL);
1489 assert(sq->need_rearm);
1490 assert(sq->qid != 0);
1492 ctrlr = sq->ctrlr;
1495 ctrlr_id(ctrlr), sq->qid);
1497 sq_tail_eidx = ctrlr->sdbl->eventidxs + queue_index(sq->qid, false);
1499 assert(ctrlr->endpoint != NULL);
1501 if (!ctrlr->endpoint->interrupt_mode) {
1511 * Ensure that the event index is updated before re-reading the tail
1540 "sq_head=%u\n", ctrlr_id(ctrlr), sq->qid, old_tail,
1544 sq->need_rearm = false;
1562 * Arrange for an SQ to interrupt us if written. Returns non-zero if we
1573 assert(sq->need_rearm);
1580 vu_group->stats.won++;
1594 vu_group->need_kick = true;
1600 vu_group->stats.lost++;
1601 vu_group->stats.lost_count += count;
1610 * every SQ that needs re-arming.
1612 * Returns non-zero if we processed something.
1620 vu_group->stats.rearms++;
1622 TAILQ_FOREACH(sq, &vu_group->sqs, link) {
1623 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
1627 if (sq->need_rearm) {
1628 count += vfio_user_sq_rearm(sq->ctrlr, sq, vu_group);
1632 if (vu_group->need_kick) {
1648 cq = ctrlr->cqs[0];
1652 assert(q_addr(&cq->mapping) == NULL);
1654 regs = spdk_nvmf_ctrlr_get_regs(ctrlr->ctrlr);
1656 cq->qid = 0;
1657 cq->size = regs->aqa.bits.acqs + 1;
1658 cq->mapping.prp1 = regs->acq;
1659 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl);
1661 cq->ien = true;
1662 cq->phase = true;
1663 cq->nr_outstanding = 0;
1665 ret = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_INITIALIZE);
1671 cq->dbl_headp = ctrlr->bar0_doorbells + queue_index(0, true);
1688 qpair = req->qpair;
1692 assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
1693 ret = map_one(sq->ctrlr->endpoint->vfu_ctx, addr, len,
1694 index_to_sg_t(vu_req->sg, vu_req->iovcnt),
1695 &vu_req->iov[vu_req->iovcnt], flags);
1697 vu_req->iovcnt++;
1709 return nvme_map_cmd(req, &req->cmd->nvme_cmd, iov, NVMF_REQ_MAX_BUFFERS,
1723 if (cq->tail == cq->last_head) {
1724 free_slots = cq->size;
1725 } else if (cq->tail > cq->last_head) {
1726 free_slots = cq->size - (cq->tail - cq->last_head);
1728 free_slots = cq->last_head - cq->tail;
1732 return free_slots - 1;
1749 cq->last_head = *cq_dbl_headp(cq);
1759 * @ctrlr: the vfio-user controller
1777 if (spdk_unlikely(cq == NULL || q_addr(&cq->mapping) == NULL)) {
1781 if (cq->qid == 0) {
1782 assert(spdk_get_thread() == cq->group->group->thread);
1794 ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq),
1796 return -1;
1799 cpl = ((struct spdk_nvme_cpl *)q_addr(&cq->mapping)) + *cq_tailp(cq);
1801 assert(ctrlr->sqs[sqid] != NULL);
1805 *sq_headp(ctrlr->sqs[sqid]), *cq_tailp(cq));
1807 cpl->sqhd = *sq_headp(ctrlr->sqs[sqid]);
1808 cpl->sqid = sqid;
1809 cpl->cid = cid;
1810 cpl->cdw0 = cdw0;
1814 * directly in cpl->status, which would cause a read-modify-write cycle,
1820 cpl_status.p = cq->phase;
1821 cpl->status = cpl_status;
1823 cq->nr_outstanding--;
1829 if ((cq->qid == 0 || !ctrlr->adaptive_irqs_enabled) &&
1830 cq->ien && ctrlr_interrupt_enabled(ctrlr)) {
1831 err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
1845 while (!TAILQ_EMPTY(&sq->free_reqs)) {
1846 struct nvmf_vfio_user_req *vu_req = TAILQ_FIRST(&sq->free_reqs);
1847 TAILQ_REMOVE(&sq->free_reqs, vu_req, link);
1855 assert(cq->cq_ref == 0);
1856 unmap_q(ctrlr, &cq->mapping);
1857 cq->size = 0;
1858 cq->cq_state = VFIO_USER_CQ_DELETED;
1859 cq->group = NULL;
1860 cq->nr_outstanding = 0;
1864 * and the controller is being shut down/reset or vfio-user client disconnects,
1874 sq->qid, sq);
1877 unmap_q(vu_ctrlr, &sq->mapping);
1881 sq->size = 0;
1883 sq->sq_state = VFIO_USER_SQ_DELETED;
1889 if (vu_ctrlr->reset_shn || vu_ctrlr->disconnect) {
1890 cqid = sq->cqid;
1891 cq = vu_ctrlr->cqs[cqid];
1894 cq->qid, cq);
1896 assert(cq->cq_ref > 0);
1897 if (--cq->cq_ref == 0) {
1913 sq = ctrlr->sqs[qid];
1916 unmap_q(ctrlr, &sq->mapping);
1920 free(sq->mapping.sg);
1922 ctrlr->sqs[qid] = NULL;
1925 cq = ctrlr->cqs[qid];
1928 unmap_q(ctrlr, &cq->mapping);
1929 free(cq->mapping.sg);
1931 ctrlr->cqs[qid] = NULL;
1943 assert(ctrlr->sqs[id] == NULL);
1947 return -ENOMEM;
1949 sq->mapping.sg = calloc(1, dma_sg_size());
1950 if (sq->mapping.sg == NULL) {
1952 return -ENOMEM;
1955 sq->qid = id;
1956 sq->qpair.qid = id;
1957 sq->qpair.transport = transport;
1958 sq->ctrlr = ctrlr;
1959 ctrlr->sqs[id] = sq;
1961 TAILQ_INIT(&sq->free_reqs);
1972 assert(vu_ctrlr->cqs[id] == NULL);
1976 return -ENOMEM;
1978 cq->mapping.sg = calloc(1, dma_sg_size());
1979 if (cq->mapping.sg == NULL) {
1981 return -ENOMEM;
1984 cq->qid = id;
1985 vu_ctrlr->cqs[id] = cq;
2000 for (i = 0; i < sq->size; i++) {
2008 req = &vu_req->req;
2009 req->qpair = &sq->qpair;
2010 req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
2011 req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;
2012 req->stripped_data = NULL;
2014 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link);
2020 TAILQ_FOREACH_SAFE(vu_req, &sq->free_reqs, link, tmp) {
2023 return -ENOMEM;
2029 return ctrlr->sdbl != NULL ?
2030 ctrlr->sdbl->shadow_doorbells :
2031 ctrlr->bar0_doorbells;
2038 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport;
2045 qid = cmd->cdw10_bits.create_io_q.qid;
2046 cqid = cmd->cdw11_bits.create_io_sq.cqid;
2047 qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
2049 if (ctrlr->sqs[qid] == NULL) {
2050 err = init_sq(ctrlr, ctrlr->sqs[0]->qpair.transport, qid);
2057 if (cqid == 0 || cqid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) {
2070 if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
2071 SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
2076 sq = ctrlr->sqs[qid];
2077 sq->size = qsize;
2082 sq->mapping.prp1 = cmd->dptr.prp.prp1;
2083 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd);
2085 err = map_q(ctrlr, &sq->mapping, MAP_INITIALIZE);
2093 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1,
2094 q_addr(&sq->mapping));
2103 sq->cqid = cqid;
2104 ctrlr->cqs[sq->cqid]->cq_ref++;
2105 sq->sq_state = VFIO_USER_SQ_CREATED;
2108 sq->dbl_tailp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, false);
2115 * don't reset the shadow doorbell buffer after a Queue-Level or
2116 * Controller-Level reset, which means that we're left with garbage
2121 if (ctrlr->sdbl != NULL) {
2122 sq->need_rearm = true;
2141 sq->create_io_sq_cmd = *cmd;
2142 sq->post_create_io_sq_completion = true;
2144 spdk_nvmf_tgt_new_qpair(ctrlr->transport->transport.tgt,
2145 &sq->qpair);
2160 qid = cmd->cdw10_bits.create_io_q.qid;
2161 qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
2163 if (ctrlr->cqs[qid] == NULL) {
2171 if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
2172 SPDK_ERRLOG("%s: non-PC CQ not supported\n", ctrlr_id(ctrlr));
2177 if (cmd->cdw11_bits.create_io_cq.iv > NVMF_VFIO_USER_MSIX_NUM - 1) {
2183 cq = ctrlr->cqs[qid];
2184 cq->size = qsize;
2186 cq->mapping.prp1 = cmd->dptr.prp.prp1;
2187 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl);
2189 cq->dbl_headp = ctrlr_doorbell_ptr(ctrlr) + queue_index(qid, true);
2191 err = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_INITIALIZE);
2199 ctrlr_id(ctrlr), qid, cmd->dptr.prp.prp1,
2200 q_addr(&cq->mapping));
2202 cq->ien = cmd->cdw11_bits.create_io_cq.ien;
2203 cq->iv = cmd->cdw11_bits.create_io_cq.iv;
2204 cq->phase = true;
2205 cq->cq_state = VFIO_USER_CQ_CREATED;
2214 * don't reset the shadow doorbell buffer after a Queue-Level or
2215 * Controller-Level reset, which means that we're left with garbage
2225 * Creates a completion or submission I/O queue. Returns 0 on success, -errno
2232 struct nvmf_vfio_user_transport *vu_transport = ctrlr->transport;
2241 qid = cmd->cdw10_bits.create_io_q.qid;
2242 if (qid == 0 || qid >= vu_transport->transport.opts.max_qpairs_per_ctrlr) {
2244 qid, vu_transport->transport.opts.max_qpairs_per_ctrlr);
2258 qsize = cmd->cdw10_bits.create_io_q.qsize + 1;
2279 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct);
2294 struct nvmf_vfio_user_ctrlr *vu_ctrlr = ctx->vu_ctrlr;
2295 struct nvmf_vfio_user_cq *admin_cq = vu_ctrlr->cqs[0];
2298 assert(admin_cq->group != NULL);
2299 assert(admin_cq->group->group->thread != NULL);
2300 if (admin_cq->group->group->thread != spdk_get_thread()) {
2301 spdk_thread_send_msg(admin_cq->group->group->thread,
2306 ctx->cid,
2326 cmd->cdw10_bits.delete_io_q.qid);
2328 if (!io_q_exists(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq)) {
2330 is_cq ? 'c' : 's', cmd->cdw10_bits.delete_io_q.qid);
2337 cq = ctrlr->cqs[cmd->cdw10_bits.delete_io_q.qid];
2338 if (cq->cq_ref) {
2351 sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid];
2352 sq->delete_ctx = calloc(1, sizeof(*sq->delete_ctx));
2353 if (!sq->delete_ctx) {
2358 sq->delete_ctx->vu_ctrlr = ctrlr;
2359 sq->delete_ctx->cid = cmd->cid;
2360 sq->sq_state = VFIO_USER_SQ_DELETED;
2361 assert(ctrlr->cqs[sq->cqid]->cq_ref);
2362 ctrlr->cqs[sq->cqid]->cq_ref--;
2364 spdk_nvmf_qpair_disconnect(&sq->qpair);
2369 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct);
2386 assert(ctrlr->endpoint != NULL);
2402 if (cmd->psdt != SPDK_NVME_PSDT_PRP) {
2409 prp1 = cmd->dptr.prp.prp1;
2410 prp2 = cmd->dptr.prp.prp2;
2426 sdbl = map_sdbl(ctrlr->endpoint->vfu_ctx, prp1, prp2, page_size);
2434 ctrlr->shadow_doorbell_buffer = prp1;
2435 ctrlr->eventidx_buffer = prp2;
2440 sdbl->iovs[0].iov_base,
2441 sdbl->iovs[0].iov_base + sdbl->iovs[0].iov_len,
2442 sdbl->iovs[1].iov_base,
2443 sdbl->iovs[1].iov_base + sdbl->iovs[1].iov_len);
2454 sdbl->eventidxs[queue_index(i, true)] = NVMF_VFIO_USER_EVENTIDX_POLL;
2458 SWAP(ctrlr->sdbl, sdbl);
2468 sdbl->shadow_doorbells : ctrlr->bar0_doorbells,
2469 ctrlr->sdbl->shadow_doorbells);
2484 * then we've just swapped a NULL from ctrlr->sdbl into sdbl, so
2487 free_sdbl(ctrlr->endpoint->vfu_ctx, sdbl);
2489 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid, sc, sct);
2492 /* Returns 0 on success and -errno on error. */
2499 if (cmd->fuse != 0) {
2501 return post_completion(ctrlr, ctrlr->cqs[0], 0, 0, cmd->cid,
2506 switch (cmd->opc) {
2510 cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ);
2514 cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ);
2518 !ctrlr->transport->transport_opts.disable_shadow_doorbells);
2519 if (!ctrlr->transport->transport_opts.disable_shadow_doorbells) {
2524 return handle_cmd_req(ctrlr, cmd, ctrlr->sqs[0]);
2532 struct nvmf_vfio_user_ctrlr *vu_ctrlr = sq->ctrlr;
2539 if (spdk_likely(vu_req->iovcnt)) {
2540 vfu_sgl_put(vu_ctrlr->endpoint->vfu_ctx,
2541 index_to_sg_t(vu_req->sg, 0),
2542 vu_req->iov, vu_req->iovcnt);
2544 sqid = sq->qid;
2545 cqid = sq->cqid;
2547 return post_completion(vu_ctrlr, vu_ctrlr->cqs[cqid],
2548 vu_req->req.rsp->nvme_cpl.cdw0,
2550 vu_req->req.cmd->nvme_cmd.cid,
2551 vu_req->req.rsp->nvme_cpl.status.sc,
2552 vu_req->req.rsp->nvme_cpl.status.sct);
2560 if (spdk_unlikely(nvmf_qpair_is_admin_queue(&sq->qpair))) {
2573 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid];
2580 if (ctrlr->sdbl != NULL && sq->qid != 0) {
2583 * needs to be re-armed before we go to sleep.
2585 sq->need_rearm = true;
2589 queue = q_addr(&sq->mapping);
2609 if ((free_cq_slots-- <= cq->nr_outstanding)) {
2611 cq->last_head = *cq_dbl_headp(cq);
2614 if (free_cq_slots > cq->nr_outstanding) {
2620 vu_group->stats.cq_full++;
2628 if (in_interrupt_mode(ctrlr->transport)) {
2629 vu_group->need_kick = true;
2637 cq->nr_outstanding++;
2665 if (getsockopt(vfu_get_poll_fd(endpoint->vfu_ctx), SOL_SOCKET, SO_PEERCRED, &ucred,
2688 if (!info->vaddr) {
2692 map_start = info->mapping.iov_base;
2693 map_end = info->mapping.iov_base + info->mapping.iov_len;
2695 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
2696 (info->mapping.iov_len & MASK_2MB)) {
2697 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n",
2698 info->vaddr, map_start, map_end);
2703 if (endpoint->ctrlr == NULL) {
2706 ctrlr = endpoint->ctrlr;
2708 SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %p-%p\n", endpoint_id(endpoint),
2715 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) {
2716 ret = spdk_mem_register(info->mapping.iov_base, info->mapping.iov_len);
2718 SPDK_ERRLOG("Memory region register %p-%p failed, ret=%d\n",
2723 pthread_mutex_lock(&endpoint->lock);
2724 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) {
2725 if (sq->sq_state != VFIO_USER_SQ_INACTIVE) {
2729 cq = ctrlr->cqs[sq->cqid];
2732 if (cq->size && q_addr(&cq->mapping) == NULL) {
2733 ret = map_q(ctrlr, &cq->mapping, MAP_RW | MAP_QUIET);
2735 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap cqid:%d %#lx-%#lx\n",
2736 cq->qid, cq->mapping.prp1,
2737 cq->mapping.prp1 + cq->mapping.len);
2742 if (sq->size) {
2743 ret = map_q(ctrlr, &sq->mapping, MAP_R | MAP_QUIET);
2745 SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap sqid:%d %#lx-%#lx\n",
2746 sq->qid, sq->mapping.prp1,
2747 sq->mapping.prp1 + sq->mapping.len);
2751 sq->sq_state = VFIO_USER_SQ_ACTIVE;
2752 SPDK_DEBUGLOG(nvmf_vfio, "Remap sqid:%u successfully\n", sq->qid);
2754 pthread_mutex_unlock(&endpoint->lock);
2766 if (!info->vaddr) {
2770 map_start = info->mapping.iov_base;
2771 map_end = info->mapping.iov_base + info->mapping.iov_len;
2773 if (((uintptr_t)info->mapping.iov_base & MASK_2MB) ||
2774 (info->mapping.iov_len & MASK_2MB)) {
2775 SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA %p-%p\n",
2776 info->vaddr, map_start, map_end);
2781 SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %p-%p\n", endpoint_id(endpoint),
2784 if (endpoint->ctrlr != NULL) {
2786 ctrlr = endpoint->ctrlr;
2788 pthread_mutex_lock(&endpoint->lock);
2789 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) {
2790 if (q_addr(&sq->mapping) >= map_start && q_addr(&sq->mapping) <= map_end) {
2791 unmap_q(ctrlr, &sq->mapping);
2792 sq->sq_state = VFIO_USER_SQ_INACTIVE;
2795 cq = ctrlr->cqs[sq->cqid];
2796 if (q_addr(&cq->mapping) >= map_start && q_addr(&cq->mapping) <= map_end) {
2797 unmap_q(ctrlr, &cq->mapping);
2801 if (ctrlr->sdbl != NULL) {
2805 const void *const iov_base = ctrlr->sdbl->iovs[i].iov_base;
2809 ctrlr->sdbl->shadow_doorbells,
2810 ctrlr->bar0_doorbells);
2812 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl);
2813 ctrlr->sdbl = NULL;
2819 pthread_mutex_unlock(&endpoint->lock);
2822 if (info->prot == (PROT_WRITE | PROT_READ) && !is_peer_same_process(endpoint)) {
2823 ret = spdk_mem_unregister(info->mapping.iov_base, info->mapping.iov_len);
2825 SPDK_ERRLOG("Memory region unregister %p-%p failed, ret=%d\n",
2831 /* Used to initiate a controller-level reset or a controller shutdown. */
2839 assert(vu_ctrlr->sqs[0] != NULL);
2840 assert(vu_ctrlr->cqs[0] != NULL);
2842 unmap_q(vu_ctrlr, &vu_ctrlr->sqs[0]->mapping);
2843 unmap_q(vu_ctrlr, &vu_ctrlr->cqs[0]->mapping);
2845 vu_ctrlr->sqs[0]->size = 0;
2846 *sq_headp(vu_ctrlr->sqs[0]) = 0;
2848 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_INACTIVE;
2850 vu_ctrlr->cqs[0]->size = 0;
2851 *cq_tailp(vu_ctrlr->cqs[0]) = 0;
2857 spdk_nvmf_ctrlr_abort_aer(vu_ctrlr->ctrlr);
2861 free_sdbl(vu_ctrlr->endpoint->vfu_ctx, vu_ctrlr->sdbl);
2862 vu_ctrlr->sdbl = NULL;
2865 /* Used to re-enable the controller after a controller-level reset. */
2885 vu_ctrlr->sqs[0]->sq_state = VFIO_USER_SQ_ACTIVE;
2897 assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET);
2898 assert(sq->ctrlr != NULL);
2899 vu_ctrlr = sq->ctrlr;
2901 if (req->req.cmd->prop_set_cmd.ofst != offsetof(struct spdk_nvme_registers, cc)) {
2905 cc.raw = req->req.cmd->prop_set_cmd.value.u64;
2906 diff.raw = cc.raw ^ req->cc.raw;
2915 vu_ctrlr->reset_shn = false;
2917 vu_ctrlr->reset_shn = true;
2923 vu_ctrlr->reset_shn = true;
2927 if (vu_ctrlr->reset_shn) {
2941 if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) {
2942 assert(sq->ctrlr != NULL);
2945 memcpy(req->req.iov[0].iov_base,
2946 &req->req.rsp->prop_get_rsp.value.u64,
2947 req->req.length);
2955 * Handles a write at offset 0x1000 or more; this is the non-mapped path when a
2974 return -1;
2981 return -1;
2984 pos -= NVME_DOORBELLS_OFFSET;
2990 return -1;
2999 return -1;
3002 ctrlr->bar0_doorbells[pos] = *buf;
3007 group->stats.cqh_admin_writes++;
3009 group->stats.cqh_io_writes++;
3030 return -1;
3034 req = get_nvmf_vfio_user_req(vu_ctrlr->sqs[0]);
3037 return -1;
3039 regs = spdk_nvmf_ctrlr_get_regs(vu_ctrlr->ctrlr);
3040 req->cc.raw = regs->cc.raw;
3042 req->cb_fn = nvmf_vfio_user_prop_req_rsp;
3043 req->cb_arg = vu_ctrlr->sqs[0];
3044 req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC;
3045 req->req.cmd->prop_set_cmd.cid = 0;
3047 req->req.cmd->prop_set_cmd.attrib.size = 0;
3049 req->req.cmd->prop_set_cmd.attrib.size = 1;
3051 req->req.cmd->prop_set_cmd.ofst = pos;
3053 req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET;
3054 if (req->req.cmd->prop_set_cmd.attrib.size) {
3055 req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf;
3057 req->req.cmd->prop_set_cmd.value.u32.high = 0;
3058 req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf;
3061 req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET;
3063 req->req.length = count;
3064 SPDK_IOV_ONE(req->req.iov, &req->req.iovcnt, buf, req->req.length);
3066 spdk_nvmf_request_exec(&req->req);
3079 ctrlr = endpoint->ctrlr;
3080 if (spdk_unlikely(endpoint->need_async_destroy || !ctrlr)) {
3082 return -1;
3110 SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n",
3113 return -1;
3121 return -1;
3124 memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count);
3168 p->hdr.bars[0].raw = 0x0;
3170 p->hdr.bars[1].raw = 0x0;
3173 p->hdr.bars[3].raw = 0x0;
3174 p->hdr.bars[4].raw = 0x0;
3175 p->hdr.bars[5].raw = 0x0;
3178 p->hdr.intr.ipin = 0x1;
3193 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3195 endpoint->need_resume = false;
3201 if (!vu_ctrlr->queued_quiesce) {
3202 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
3209 if (in_interrupt_mode(endpoint->transport)) {
3218 * unquiesced from libvfio-user's perspective so from the moment
3219 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device
3235 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3243 spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_endpoint_resume_done_msg, endpoint);
3250 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint;
3251 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3261 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING);
3262 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
3263 vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status);
3264 vu_ctrlr->queued_quiesce = false;
3268 * so we need to re-check `vu_ctrlr->state`.
3270 if (vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING) {
3276 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
3277 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
3280 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
3290 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint;
3291 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3298 quiesce_ctx->status = status;
3303 spdk_thread_send_msg(vu_ctrlr->thread,
3309 * we've already set ctrlr->state, so we won't process new entries, but we need
3320 struct nvmf_vfio_user_endpoint *endpoint = quiesce_ctx->endpoint;
3321 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3322 struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group;
3323 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem;
3333 quiesce_ctx->group = TAILQ_NEXT(vu_group, link);
3334 if (quiesce_ctx->group != NULL) {
3335 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group),
3345 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
3356 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
3365 quiesce_ctx->endpoint = vu_ctrlr->endpoint;
3366 quiesce_ctx->status = 0;
3367 quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups);
3369 spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group),
3377 struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem;
3378 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3387 if (!nvmf_subsystem_get_ctrlr(subsystem, vu_ctrlr->cntlid)) {
3396 if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) {
3398 } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) {
3400 } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) {
3404 switch (vu_ctrlr->state) {
3412 vu_ctrlr->queued_quiesce = true;
3414 vu_ctrlr->state);
3417 assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING);
3422 return -1;
3438 regs = &migr_data->nvmf_data.regs;
3439 doorbell_base = (uint32_t *)&migr_data->doorbells;
3442 SPDK_NOTICELOG("CSTS 0x%x\n", regs->csts.raw);
3443 SPDK_NOTICELOG("CAP 0x%"PRIx64"\n", regs->cap.raw);
3444 SPDK_NOTICELOG("VS 0x%x\n", regs->vs.raw);
3445 SPDK_NOTICELOG("CC 0x%x\n", regs->cc.raw);
3446 SPDK_NOTICELOG("AQA 0x%x\n", regs->aqa.raw);
3447 SPDK_NOTICELOG("ASQ 0x%"PRIx64"\n", regs->asq);
3448 SPDK_NOTICELOG("ACQ 0x%"PRIx64"\n", regs->acq);
3450 SPDK_NOTICELOG("Number of IO Queues %u\n", migr_data->ctrlr_header.num_io_queues);
3454 migr_data->ctrlr_header.shadow_doorbell_buffer);
3456 migr_data->ctrlr_header.eventidx_buffer);
3460 sq = &migr_data->qps[i].sq;
3461 cq = &migr_data->qps[i].cq;
3463 if (sq->size) {
3464 SPDK_NOTICELOG("sqid:%u, bar0_doorbell:%u\n", sq->sqid, doorbell_base[i * 2]);
3467 sq->sqid,
3468 sdbl->shadow_doorbells[queue_index(i, false)],
3469 sdbl->eventidxs[queue_index(i, false)]);
3472 sq->sqid, sq->cqid, sq->head, sq->size, sq->dma_addr);
3475 if (cq->size) {
3476 SPDK_NOTICELOG("cqid:%u, bar0_doorbell:%u\n", cq->cqid, doorbell_base[i * 2 + 1]);
3479 cq->cqid,
3480 sdbl->shadow_doorbells[queue_index(i, true)],
3481 sdbl->eventidxs[queue_index(i, true)]);
3484 cq->cqid, cq->phase, cq->tail, cq->size, cq->iv, cq->ien, cq->dma_addr);
3496 void *data_ptr = endpoint->migr_data;
3499 memcpy(&migr_state->ctrlr_header, data_ptr, sizeof(struct vfio_user_nvme_migr_header));
3501 if (migr_state->ctrlr_header.magic != VFIO_USER_NVME_MIGR_MAGIC) {
3502 SPDK_ERRLOG("%s: bad magic number %x\n", endpoint_id(endpoint), migr_state->ctrlr_header.magic);
3503 return -EINVAL;
3507 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.nvmf_data_offset;
3508 memcpy(&migr_state->nvmf_data, data_ptr, migr_state->ctrlr_header.nvmf_data_len);
3511 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.qp_offset;
3512 memcpy(&migr_state->qps, data_ptr, migr_state->ctrlr_header.qp_len);
3515 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_BAR0_REGION_IDX];
3516 memcpy(&migr_state->doorbells, data_ptr,
3517 migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_BAR0_REGION_IDX]);
3520 data_ptr = endpoint->migr_data + migr_state->ctrlr_header.bar_offset[VFU_PCI_DEV_CFG_REGION_IDX];
3521 memcpy(&migr_state->cfg, data_ptr, migr_state->ctrlr_header.bar_len[VFU_PCI_DEV_CFG_REGION_IDX]);
3530 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr;
3531 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
3558 TAILQ_FOREACH(sq, &vu_ctrlr->connected_sqs, tailq) {
3560 sqid = sq->qid;
3561 migr_state.qps[sqid].sq.sqid = sq->qid;
3562 migr_state.qps[sqid].sq.cqid = sq->cqid;
3564 migr_state.qps[sqid].sq.size = sq->size;
3565 migr_state.qps[sqid].sq.dma_addr = sq->mapping.prp1;
3568 cqid = sq->cqid;
3569 cq = vu_ctrlr->cqs[cqid];
3572 migr_state.qps[cqid].cq.ien = cq->ien;
3573 migr_state.qps[cqid].cq.iv = cq->iv;
3574 migr_state.qps[cqid].cq.size = cq->size;
3575 migr_state.qps[cqid].cq.phase = cq->phase;
3576 migr_state.qps[cqid].cq.dma_addr = cq->mapping.prp1;
3581 migr_state.ctrlr_header.num_io_queues = i - 1;
3585 memcpy(doorbell_base, (void *)vu_ctrlr->bar0_doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
3588 memcpy(&migr_state.cfg, (void *)endpoint->pci_config_space, NVME_REG_CFG_SIZE);
3591 data_ptr = endpoint->migr_data;
3623 if (vu_ctrlr->sdbl != NULL) {
3625 migr_state.ctrlr_header.shadow_doorbell_buffer = vu_ctrlr->shadow_doorbell_buffer;
3626 migr_state.ctrlr_header.eventidx_buffer = vu_ctrlr->eventidx_buffer;
3630 memcpy(endpoint->migr_data, &migr_state.ctrlr_header, sizeof(struct vfio_user_nvme_migr_header));
3633 vfio_user_ctrlr_dump_migr_data("SAVE", &migr_state, vu_ctrlr->sdbl);
3645 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr;
3651 spdk_interrupt_unregister(&ctrlr->intr);
3652 ctrlr->intr_fd = -1;
3658 if (ctrlr->sdbl != NULL) {
3660 free_sdbl(vfu_ctx, ctrlr->sdbl);
3661 ctrlr->sdbl = NULL;
3681 vfio_user_ctrlr_dump_migr_data("RESUME", migr_state, vu_ctrlr->sdbl);
3686 migr_qp = migr_state->qps[i];
3695 return -EINVAL;
3699 if (vu_ctrlr->sqs[sqid] == NULL) {
3700 ret = init_sq(vu_ctrlr, &vu_ctrlr->transport->transport, sqid);
3703 return -EFAULT;
3707 sq = vu_ctrlr->sqs[sqid];
3708 sq->size = qsize;
3713 return -EFAULT;
3717 sq->sq_state = VFIO_USER_SQ_CREATED;
3718 sq->cqid = migr_qp.sq.cqid;
3720 sq->mapping.prp1 = migr_qp.sq.dma_addr;
3721 sq->mapping.len = sq->size * sizeof(struct spdk_nvme_cmd);
3722 addr = map_one(vu_ctrlr->endpoint->vfu_ctx,
3723 sq->mapping.prp1, sq->mapping.len,
3724 sq->mapping.sg, &sq->mapping.iov,
3728 sqid, sq->mapping.prp1, sq->size);
3729 return -EFAULT;
3731 cqs_ref[sq->cqid]++;
3737 migr_qp = migr_state->qps[i];
3748 if (vu_ctrlr->cqs[cqid] == NULL) {
3752 return -EFAULT;
3756 cq = vu_ctrlr->cqs[cqid];
3758 cq->size = qsize;
3760 cq->cq_state = VFIO_USER_CQ_CREATED;
3761 cq->cq_ref = cqs_ref[cqid];
3763 cq->mapping.prp1 = migr_qp.cq.dma_addr;
3764 cq->mapping.len = cq->size * sizeof(struct spdk_nvme_cpl);
3765 cq->ien = migr_qp.cq.ien;
3766 cq->iv = migr_qp.cq.iv;
3767 cq->phase = migr_qp.cq.phase;
3768 addr = map_one(vu_ctrlr->endpoint->vfu_ctx,
3769 cq->mapping.prp1, cq->mapping.len,
3770 cq->mapping.sg, &cq->mapping.iov,
3774 cqid, cq->mapping.prp1, cq->size);
3775 return -EFAULT;
3786 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
3787 struct spdk_nvmf_ctrlr *ctrlr = vu_ctrlr->ctrlr;
3800 assert(endpoint->migr_data != NULL);
3810 sdbl = map_sdbl(vu_ctrlr->endpoint->vfu_ctx,
3815 SPDK_ERRLOG("%s: failed to re-map shadow doorbell buffers\n",
3817 return -1;
3820 vu_ctrlr->shadow_doorbell_buffer = migr_state.ctrlr_header.shadow_doorbell_buffer;
3821 vu_ctrlr->eventidx_buffer = migr_state.ctrlr_header.eventidx_buffer;
3823 SWAP(vu_ctrlr->sdbl, sdbl);
3832 memcpy((void *)endpoint->pci_config_space, &migr_state.cfg, NVME_REG_CFG_SIZE);
3836 memcpy((void *)vu_ctrlr->bar0_doorbells, doorbell_base, NVMF_VFIO_USER_DOORBELLS_SIZE);
3851 rc = handle_cmd_req(vu_ctrlr, &cmd, vu_ctrlr->sqs[0]);
3868 if (vu_ctrlr->sqs[0] != NULL) {
3869 vu_ctrlr->sqs[0]->dbl_tailp = vu_ctrlr->bar0_doorbells +
3873 if (vu_ctrlr->cqs[0] != NULL) {
3874 vu_ctrlr->cqs[0]->dbl_headp = vu_ctrlr->bar0_doorbells +
3878 vfio_user_ctrlr_switch_doorbells(vu_ctrlr, vu_ctrlr->sdbl != NULL);
3881 sq = vu_ctrlr->sqs[i];
3882 if (!sq || !sq->size) {
3886 if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
3888 sq->sq_state = VFIO_USER_SQ_ACTIVE;
3890 spdk_nvmf_tgt_new_qpair(vu_ctrlr->transport->transport.tgt, &sq->qpair);
3896 * We are in stop-and-copy state, but still potentially have some current dirty
3907 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
3909 assert(vu_ctrlr->state == VFIO_USER_CTRLR_MIGRATING);
3912 struct nvmf_vfio_user_cq *cq = vu_ctrlr->cqs[i];
3914 if (cq == NULL || q_addr(&cq->mapping) == NULL) {
3918 vfu_sgl_mark_dirty(endpoint->vfu_ctx, cq->mapping.sg, 1);
3921 if (vu_ctrlr->sdbl != NULL) {
3928 if (!vu_ctrlr->sdbl->iovs[i].iov_len) {
3932 sg = index_to_sg_t(vu_ctrlr->sdbl->sgs, i);
3934 vfu_sgl_mark_dirty(endpoint->vfu_ctx, sg, 1);
3943 struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
3948 vu_ctrlr->state, state);
3952 vu_ctrlr->in_source_vm = true;
3953 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
3958 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
3961 * subsystem can be re-used for another new client.
3963 if (vu_ctrlr->in_source_vm) {
3964 endpoint->need_resume = true;
3968 assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSED);
3976 if (vu_ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
3980 assert(!vu_ctrlr->in_source_vm);
3981 vu_ctrlr->state = VFIO_USER_CTRLR_MIGRATING;
3983 sq = TAILQ_FIRST(&vu_ctrlr->connected_sqs);
3985 assert(sq->qpair.qid == 0);
3986 sq->sq_state = VFIO_USER_SQ_INACTIVE;
3992 sq->size = 0;
3996 if (vu_ctrlr->state != VFIO_USER_CTRLR_MIGRATING) {
4000 if (!vu_ctrlr->in_source_vm) {
4009 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
4013 vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING;
4014 ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
4018 vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
4022 vu_ctrlr->migr_data_prepared = false;
4023 vu_ctrlr->in_source_vm = false;
4027 return -EINVAL;
4037 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr;
4040 if (ctrlr->migr_data_prepared) {
4041 assert(ctrlr->state == VFIO_USER_CTRLR_MIGRATING);
4049 endpoint_id(endpoint), ctrlr->state, pending_bytes);
4058 struct nvmf_vfio_user_ctrlr *ctrlr = endpoint->ctrlr;
4061 * When transitioning to pre-copy state we set pending_bytes to 0,
4062 * so the vfio-user client shouldn't attempt to read any migration
4063 * data. This is not yet guaranteed by libvfio-user.
4065 if (ctrlr->state != VFIO_USER_CTRLR_MIGRATING) {
4072 if (ctrlr->in_source_vm) { /* migration source */
4078 assert(!ctrlr->migr_data_prepared);
4081 ctrlr->migr_data_prepared = true;
4083 SPDK_DEBUGLOG(nvmf_vfio, "%s current state %u\n", endpoint_id(endpoint), ctrlr->state);
4097 return -1;
4109 return -1;
4122 return -1;
4134 vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx;
4147 .mxc.ts = NVMF_VFIO_USER_MSIX_NUM - 1,
4177 * 0x08, non-volatile memory controller
4178 * 0x01, mass storage controller
4201 access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
4207 if (vu_transport->transport_opts.disable_mappable_bar0) {
4210 NULL, 0, -1, 0);
4214 sparse_mmap, 1, endpoint->devmem_fd, 0);
4223 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
4230 NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
4267 1, endpoint->migr_fd, 0);
4286 endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx);
4287 assert(endpoint->pci_config_space != NULL);
4288 init_pci_config_space(endpoint->pci_config_space);
4291 endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset);
4299 * Register an "accept" poller: this is polling for incoming vfio-user socket
4308 uint64_t poll_rate_us = endpoint->transport->transport.opts.acceptor_poll_rate;
4312 endpoint->accept_poller = SPDK_POLLER_REGISTER(nvmf_vfio_user_accept,
4315 if (!endpoint->accept_poller) {
4316 return -1;
4319 endpoint->accept_thread = spdk_get_thread();
4320 endpoint->need_relisten = false;
4326 endpoint->accept_intr_fd = vfu_get_poll_fd(endpoint->vfu_ctx);
4327 assert(endpoint->accept_intr_fd != -1);
4329 endpoint->accept_intr = SPDK_INTERRUPT_REGISTER(endpoint->accept_intr_fd,
4332 assert(endpoint->accept_intr != NULL);
4334 spdk_poller_register_interrupt(endpoint->accept_poller, NULL, NULL);
4350 struct nvmf_vfio_user_endpoint *endpoint = ctrlr->endpoint;
4352 free_sdbl(endpoint->vfu_ctx, ctrlr->sdbl);
4354 spdk_interrupt_unregister(&ctrlr->intr);
4355 ctrlr->intr_fd = -1;
4356 spdk_poller_unregister(&ctrlr->vfu_ctx_poller);
4360 if (endpoint->need_async_destroy) {
4362 } else if (endpoint->need_relisten) {
4363 spdk_thread_send_msg(endpoint->accept_thread,
4375 thread = ctrlr->thread ? ctrlr->thread : spdk_get_thread();
4395 /* First, construct a vfio-user CUSTOM transport controller */
4398 err = -ENOMEM;
4402 * We can only support one connection for now, but generate a unique cntlid in case vfio-user
4403 * transport is used together with RDMA or TCP transports in the same target
4405 ctrlr->cntlid = nvmf_subsystem_gen_cntlid(endpoint->subsystem);
4406 ctrlr->intr_fd = -1;
4407 ctrlr->transport = transport;
4408 ctrlr->endpoint = endpoint;
4409 ctrlr->bar0_doorbells = endpoint->bar0_doorbells;
4410 TAILQ_INIT(&ctrlr->connected_sqs);
4412 ctrlr->adaptive_irqs_enabled =
4413 !transport->transport_opts.disable_adaptive_irq;
4416 err = init_sq(ctrlr, &transport->transport, 0);
4428 ctrlr->sqs[0]->size = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
4430 err = alloc_sq_reqs(ctrlr, ctrlr->sqs[0]);
4435 endpoint->ctrlr = ctrlr;
4438 spdk_nvmf_tgt_new_qpair(transport->transport.tgt, &ctrlr->sqs[0]->qpair);
4442 SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n",
4443 endpoint_id(endpoint), strerror(-err));
4463 pthread_mutex_lock(&vu_transport->lock);
4464 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
4466 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
4467 pthread_mutex_unlock(&vu_transport->lock);
4468 return -EEXIST;
4471 pthread_mutex_unlock(&vu_transport->lock);
4475 return -ENOMEM;
4478 pthread_mutex_init(&endpoint->lock, NULL);
4479 endpoint->devmem_fd = -1;
4480 memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));
4481 endpoint->transport = vu_transport;
4486 ret = -1;
4491 if (ret == -1) {
4498 endpoint->devmem_fd = ret;
4499 ret = ftruncate(endpoint->devmem_fd,
4507 endpoint->bar0_doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
4508 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->devmem_fd, NVME_DOORBELLS_OFFSET);
4509 if (endpoint->bar0_doorbells == MAP_FAILED) {
4511 endpoint->bar0_doorbells = NULL;
4512 ret = -1;
4520 ret = -1;
4524 if (ret == -1) {
4531 endpoint->migr_fd = ret;
4532 ret = ftruncate(endpoint->migr_fd,
4540 endpoint->migr_data = mmap(NULL, vfio_user_migr_data_len(),
4541 PROT_READ | PROT_WRITE, MAP_SHARED, endpoint->migr_fd, vfu_get_migr_register_area_size());
4542 if (endpoint->migr_data == MAP_FAILED) {
4544 endpoint->migr_data = NULL;
4545 ret = -1;
4552 ret = -1;
4556 endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
4558 if (endpoint->vfu_ctx == NULL) {
4561 ret = -1;
4565 ret = vfu_setup_log(endpoint->vfu_ctx, vfio_user_log,
4583 pthread_mutex_lock(&vu_transport->lock);
4584 TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
4585 pthread_mutex_unlock(&vu_transport->lock);
4603 assert(trid->traddr != NULL);
4605 SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
4610 pthread_mutex_lock(&vu_transport->lock);
4611 TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
4612 if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
4613 TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
4616 * 1. kill nvmf target while VM is connected
4620 if (endpoint->ctrlr) {
4621 assert(!endpoint->need_async_destroy);
4622 endpoint->need_async_destroy = true;
4623 pthread_mutex_unlock(&vu_transport->lock);
4628 pthread_mutex_unlock(&vu_transport->lock);
4632 pthread_mutex_unlock(&vu_transport->lock);
4634 SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr);
4646 cdata->vid = SPDK_PCI_VID_NUTANIX;
4647 cdata->ssvid = SPDK_PCI_VID_NUTANIX;
4648 cdata->ieee[0] = 0x8d;
4649 cdata->ieee[1] = 0x6b;
4650 cdata->ieee[2] = 0x50;
4651 memset(&cdata->sgls, 0, sizeof(struct spdk_nvme_cdata_sgls));
4652 cdata->sgls.supported = SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED;
4653 cdata->oncs.compare = !vu_transport->transport_opts.disable_compare;
4654 /* libvfio-user can only support 1 connection for now */
4655 cdata->oncs.reservations = 0;
4656 cdata->oacs.doorbell_buffer_config = !vu_transport->transport_opts.disable_shadow_doorbells;
4657 cdata->fuses.compare_and_write = !vu_transport->transport_opts.disable_compare;
4670 pthread_mutex_lock(&vu_transport->lock);
4671 TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
4672 if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
4676 pthread_mutex_unlock(&vu_transport->lock);
4679 return -ENOENT;
4682 /* Drop const - we will later need to pause/unpause. */
4683 endpoint->subsystem = (struct spdk_nvmf_subsystem *)subsystem;
4692 * For this endpoint (which at the libvfio-user level corresponds to a socket),
4703 vu_transport = endpoint->transport;
4705 if (endpoint->ctrlr != NULL) {
4713 if (endpoint->need_resume) {
4717 err = vfu_attach_ctx(endpoint->vfu_ctx);
4728 spdk_interrupt_unregister(&endpoint->accept_intr);
4729 spdk_poller_unregister(&endpoint->accept_poller);
4753 vu_group->intr_fd = eventfd(0, EFD_NONBLOCK);
4754 assert(vu_group->intr_fd != -1);
4756 vu_group->intr = SPDK_INTERRUPT_REGISTER(vu_group->intr_fd,
4758 assert(vu_group->intr != NULL);
4783 TAILQ_INIT(&vu_group->sqs);
4785 pthread_mutex_lock(&vu_transport->pg_lock);
4786 TAILQ_INSERT_TAIL(&vu_transport->poll_groups, vu_group, link);
4787 if (vu_transport->next_pg == NULL) {
4788 vu_transport->next_pg = vu_group;
4790 pthread_mutex_unlock(&vu_transport->pg_lock);
4792 return &vu_group->group;
4806 cq = sq->ctrlr->cqs[sq->cqid];
4808 vu_transport = SPDK_CONTAINEROF(qpair->transport, struct nvmf_vfio_user_transport, transport);
4810 pthread_mutex_lock(&vu_transport->pg_lock);
4811 if (TAILQ_EMPTY(&vu_transport->poll_groups)) {
4821 if (cq->group != NULL) {
4822 result = cq->group;
4833 !vu_transport->transport_opts.enable_intr_mode_sq_spreading) {
4834 result = sq->ctrlr->sqs[0]->group;
4840 vu_group = &vu_transport->next_pg;
4843 result = &(*vu_group)->group;
4846 *vu_group = TAILQ_FIRST(&vu_transport->poll_groups);
4850 if (cq->group == NULL) {
4851 cq->group = result;
4854 pthread_mutex_unlock(&vu_transport->pg_lock);
4861 assert(vu_group->intr_fd != -1);
4863 spdk_interrupt_unregister(&vu_group->intr);
4865 close(vu_group->intr_fd);
4866 vu_group->intr_fd = -1;
4879 vu_transport = SPDK_CONTAINEROF(vu_group->group.transport, struct nvmf_vfio_user_transport,
4886 pthread_mutex_lock(&vu_transport->pg_lock);
4888 TAILQ_REMOVE(&vu_transport->poll_groups, vu_group, link);
4890 next_tgroup = TAILQ_FIRST(&vu_transport->poll_groups);
4892 if (vu_transport->next_pg == vu_group) {
4893 vu_transport->next_pg = next_tgroup;
4895 pthread_mutex_unlock(&vu_transport->pg_lock);
4905 spdk_nvmf_qpair_disconnect(&sq->qpair);
4917 endpoint = ctrlr->endpoint;
4920 pthread_mutex_lock(&endpoint->lock);
4921 endpoint->need_relisten = true;
4922 ctrlr->disconnect = true;
4923 if (TAILQ_EMPTY(&ctrlr->connected_sqs)) {
4924 endpoint->ctrlr = NULL;
4926 pthread_mutex_unlock(&endpoint->lock);
4930 TAILQ_FOREACH(sq, &ctrlr->connected_sqs, tailq) {
4932 spdk_thread_send_msg(ctrlr->thread, _vfio_user_qpair_disconnect, sq);
4934 pthread_mutex_unlock(&endpoint->lock);
4940 * Poll for and process any incoming vfio-user messages.
4952 ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
4953 if (spdk_unlikely(ret == -1)) {
4958 spdk_poller_unregister(&ctrlr->vfu_ctx_poller);
4973 spdk_interrupt_unregister(&ctrlr->intr);
4974 ctrlr->intr_fd = -1;
4992 post_completion(cpl_ctx->ctrlr, cpl_ctx->cq, cpl_ctx->cpl.cdw0, cpl_ctx->cpl.sqid,
4993 cpl_ctx->cpl.cid, cpl_ctx->cpl.status.sc, cpl_ctx->cpl.status.sct);
5007 ret |= nvmf_vfio_user_poll_group_poll(&vu_group->group);
5010 * Re-arm the event indexes. NB: this also could rearm other
5015 vu_group->stats.pg_process_count++;
5025 eventfd_read(vu_group->intr_fd, &val);
5027 vu_group->stats.intr++;
5049 vu_ctrlr_group->stats.ctrlr_intr++;
5052 * Poll vfio-user for this controller. We need to do this before polling
5061 if (vu_ctrlr->sqs[0] == NULL) {
5065 if (vu_ctrlr->transport->transport_opts.enable_intr_mode_sq_spreading) {
5071 TAILQ_FOREACH(vu_group, &vu_ctrlr->transport->poll_groups, link) {
5074 eventfd_write(vu_group->intr_fd, 1);
5090 assert(ctrlr->endpoint != NULL);
5099 ctrlr->endpoint->interrupt_mode = interrupt_mode;
5112 struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
5114 vu_ctrlr->ctrlr = ctrlr;
5115 vu_ctrlr->cntlid = ctrlr->cntlid;
5116 vu_ctrlr->thread = spdk_get_thread();
5117 vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
5119 if (!in_interrupt_mode(endpoint->transport)) {
5120 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx,
5125 vu_ctrlr->vfu_ctx_poller = SPDK_POLLER_REGISTER(vfio_user_poll_vfu_ctx,
5128 vu_ctrlr->intr_fd = vfu_get_poll_fd(vu_ctrlr->endpoint->vfu_ctx);
5129 assert(vu_ctrlr->intr_fd != -1);
5131 vu_ctrlr->intr = SPDK_INTERRUPT_REGISTER(vu_ctrlr->intr_fd,
5134 assert(vu_ctrlr->intr != NULL);
5136 spdk_poller_register_interrupt(vu_ctrlr->vfu_ctx_poller,
5153 vu_ctrlr = sq->ctrlr;
5155 endpoint = vu_ctrlr->endpoint;
5158 if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) {
5159 SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct);
5160 endpoint->ctrlr = NULL;
5162 return -1;
5165 vu_group = SPDK_CONTAINEROF(sq->group, struct nvmf_vfio_user_poll_group, group);
5166 TAILQ_INSERT_TAIL(&vu_group->sqs, sq, link);
5168 admin_cq = vu_ctrlr->cqs[0];
5170 assert(admin_cq->group != NULL);
5171 assert(admin_cq->group->group->thread != NULL);
5173 pthread_mutex_lock(&endpoint->lock);
5174 if (nvmf_qpair_is_admin_queue(&sq->qpair)) {
5175 assert(admin_cq->group->group->thread == spdk_get_thread());
5180 admin_cq->cq_ref = 1;
5181 start_ctrlr(vu_ctrlr, sq->qpair.ctrlr);
5187 if (sq->post_create_io_sq_completion) {
5188 if (admin_cq->group->group->thread != spdk_get_thread()) {
5193 return -ENOMEM;
5195 cpl_ctx->ctrlr = vu_ctrlr;
5196 cpl_ctx->cq = admin_cq;
5197 cpl_ctx->cpl.sqid = 0;
5198 cpl_ctx->cpl.cdw0 = 0;
5199 cpl_ctx->cpl.cid = sq->create_io_sq_cmd.cid;
5200 cpl_ctx->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
5201 cpl_ctx->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
5203 spdk_thread_send_msg(admin_cq->group->group->thread,
5208 sq->create_io_sq_cmd.cid, SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
5210 sq->post_create_io_sq_completion = false;
5211 } else if (in_interrupt_mode(endpoint->transport)) {
5222 sq->sq_state = VFIO_USER_SQ_ACTIVE;
5225 TAILQ_INSERT_TAIL(&vu_ctrlr->connected_sqs, sq, tailq);
5226 pthread_mutex_unlock(&endpoint->lock);
5228 free(req->req.iov[0].iov_base);
5229 req->req.iov[0].iov_base = NULL;
5230 req->req.iovcnt = 0;
5259 sq->group = group;
5260 ctrlr = sq->ctrlr;
5263 ctrlr_id(ctrlr), sq->qpair.qid,
5266 admin = nvmf_qpair_is_admin_queue(&sq->qpair);
5270 return -1;
5273 req = &vu_req->req;
5274 req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
5275 req->cmd->connect_cmd.cid = 0;
5276 req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
5277 req->cmd->connect_cmd.recfmt = 0;
5278 req->cmd->connect_cmd.sqsize = sq->size - 1;
5279 req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;
5281 req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
5283 data = calloc(1, req->length);
5286 return -ENOMEM;
5289 SPDK_IOV_ONE(req->iov, &req->iovcnt, data, req->length);
5291 data->cntlid = ctrlr->cntlid;
5292 snprintf(data->subnqn, sizeof(data->subnqn), "%s",
5293 spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));
5295 vu_req->cb_fn = handle_queue_connect_rsp;
5296 vu_req->cb_arg = sq;
5300 ctrlr_id(ctrlr), qpair->qid, data->cntlid);
5324 ctrlr_id(sq->ctrlr), qpair->qid, qpair, group);
5328 TAILQ_REMOVE(&vu_group->sqs, sq, link);
5336 memset(&vu_req->cmd, 0, sizeof(vu_req->cmd));
5337 memset(&vu_req->rsp, 0, sizeof(vu_req->rsp));
5338 vu_req->iovcnt = 0;
5339 vu_req->req.iovcnt = 0;
5340 vu_req->req.length = 0;
5341 vu_req->state = VFIO_USER_REQUEST_STATE_FREE;
5343 TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link);
5355 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);
5371 sq = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);
5373 if (vu_req->cb_fn != NULL) {
5374 if (vu_req->cb_fn(vu_req, vu_req->cb_arg) != 0) {
5375 fail_ctrlr(sq->ctrlr);
5395 vu_ctrlr = sq->ctrlr;
5396 endpoint = vu_ctrlr->endpoint;
5397 del_ctx = sq->delete_ctx;
5398 sq->delete_ctx = NULL;
5400 pthread_mutex_lock(&endpoint->lock);
5401 TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq);
5403 if (TAILQ_EMPTY(&vu_ctrlr->connected_sqs)) {
5404 endpoint->ctrlr = NULL;
5405 if (vu_ctrlr->in_source_vm && endpoint->need_resume) {
5410 spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem,
5415 pthread_mutex_unlock(&endpoint->lock);
5438 req = TAILQ_FIRST(&sq->free_reqs);
5443 TAILQ_REMOVE(&sq->free_reqs, req, link);
5453 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
5454 struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
5457 nsid = cmd->nsid;
5458 ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
5459 if (ns == NULL || ns->bdev == NULL) {
5460 SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid);
5461 return -EINVAL;
5464 if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
5465 nr = cmd->cdw10_bits.dsm.nr + 1;
5469 if (cmd->opc == SPDK_NVME_OPC_COPY) {
5470 nr = (cmd->cdw12 & 0x000000ffu) + 1;
5474 nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
5475 return nlb * spdk_bdev_desc_get_block_size(ns->desc);
5481 struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
5486 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
5488 if (req->xfer == SPDK_NVME_DATA_NONE) {
5492 switch (cmd->opc) {
5497 numdw = ((((uint32_t)cmd->cdw11_bits.get_log_page.numdu << 16) |
5498 cmd->cdw10_bits.get_log_page.numdl) + 1);
5500 return -EINVAL;
5506 fid = cmd->cdw10_bits.set_features.fid;
5521 if (cmd->cdw11_bits.feat_host_identifier.bits.exhid) {
5532 return -ENOTSUP;
5538 if (cmd->psdt != 0) {
5539 return -EINVAL;
5542 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, len);
5545 ctrlr_id(ctrlr), cmd->opc);
5546 return -1;
5548 req->length = len;
5549 req->iovcnt = iovcnt;
5557 * Returns 0 on success and -errno on failure.
5568 cmd = &req->cmd->nvme_cmd;
5569 req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
5571 if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
5577 return -EINVAL;
5579 req->length = len;
5581 iovcnt = vfio_user_map_cmd(ctrlr, req, req->iov, req->length);
5583 SPDK_ERRLOG("%s: failed to map IO OPC %u\n", ctrlr_id(ctrlr), cmd->opc);
5584 return -EFAULT;
5586 req->iovcnt = iovcnt;
5604 SPDK_ERRLOG("%s: no request for NVMe command opc 0x%x\n", ctrlr_id(ctrlr), cmd->opc);
5605 return post_completion(ctrlr, ctrlr->cqs[sq->cqid], 0, 0, cmd->cid,
5609 req = &vu_req->req;
5611 assert(req->qpair != NULL);
5613 ctrlr_id(ctrlr), req->qpair->qid, cmd->opc, cmd->cid);
5615 vu_req->cb_fn = handle_cmd_rsp;
5616 vu_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_sq, qpair);
5617 req->cmd->nvme_cmd = *cmd;
5619 if (nvmf_qpair_is_admin_queue(req->qpair)) {
5622 switch (cmd->opc) {
5628 err = -ENOTSUP;
5638 ctrlr_id(ctrlr), cmd->opc);
5639 req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
5640 req->rsp->nvme_cpl.status.sc = err == -ENOTSUP ?
5643 err = handle_cmd_rsp(vu_req, vu_req->cb_arg);
5648 vu_req->state = VFIO_USER_REQUEST_STATE_EXECUTING;
5657 * the queue (i.e. ->last_head isn't changing), we need an IRQ.
5663 struct nvmf_vfio_user_cq *cq = ctrlr->cqs[sq->cqid];
5667 if (!cq->ien || cq->qid == 0 || !ctrlr_interrupt_enabled(ctrlr)) {
5674 if (cq_tail == cq->last_trigger_irq_tail) {
5681 if (cq_head != cq_tail && cq_head == cq->last_head) {
5682 int err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
5687 cq->last_trigger_irq_tail = cq_tail;
5691 cq->last_head = cq_head;
5704 ctrlr = sq->ctrlr;
5710 if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
5714 if (ctrlr->adaptive_irqs_enabled) {
5719 * on SPDK target side. This is because there is memory type mismatch
5721 * device memory while on SPDK target side, it is treated as normal
5724 * Memory-aliasing-and-mismatched-memory-types". Only using spdk_mb()
5730 /* Load-Acquire. */
5734 if (spdk_unlikely(new_tail >= sq->size)) {
5735 SPDK_DEBUGLOG(nvmf_vfio, "%s: invalid sqid:%u doorbell value %u\n", ctrlr_id(ctrlr), sq->qid,
5737 spdk_nvmf_ctrlr_async_event_error_event(ctrlr->ctrlr, SPDK_NVME_ASYNC_EVENT_INVALID_DB_WRITE);
5739 return -1;
5747 ctrlr_id(ctrlr), sq->qid, *sq_headp(sq), new_tail);
5748 if (ctrlr->sdbl != NULL) {
5751 ctrlr_id(ctrlr), sq->qid,
5752 ctrlr->bar0_doorbells[queue_index(sq->qid, false)],
5753 ctrlr->sdbl->shadow_doorbells[queue_index(sq->qid, false)],
5754 ctrlr->sdbl->eventidxs[queue_index(sq->qid, false)]);
5760 * update the SQ tail doorbell (their Store-Release).
5773 * vfio-user transport poll handler. Note that the library context is polled in
5774 * a separate poller (->vfu_ctx_poller), so this poller only needs to poll the
5792 TAILQ_FOREACH_SAFE(sq, &vu_group->sqs, link, tmp) {
5795 if (spdk_unlikely(sq->sq_state != VFIO_USER_SQ_ACTIVE || !sq->size)) {
5808 vu_group->stats.polls++;
5809 vu_group->stats.poll_reqs += count;
5810 vu_group->stats.poll_reqs_squared += count * count;
5812 vu_group->stats.polls_spurious++;
5815 if (vu_group->need_kick) {
5830 ctrlr = sq->ctrlr;
5832 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
5851 ctrlr = sq->ctrlr;
5853 memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
5865 cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
5867 TAILQ_FOREACH(temp_req, &qpair->outstanding, link) {
5872 if (vu_req->state == VFIO_USER_REQUEST_STATE_EXECUTING && vu_req->cmd.cid == cid) {
5883 req->req_to_abort = req_to_abort;
5895 spdk_json_write_named_uint64(w, "ctrlr_intr", vu_group->stats.ctrlr_intr);
5896 spdk_json_write_named_uint64(w, "ctrlr_kicks", vu_group->stats.ctrlr_kicks);
5897 spdk_json_write_named_uint64(w, "pg_kicks", vu_group->stats.pg_kicks);
5898 spdk_json_write_named_uint64(w, "won", vu_group->stats.won);
5899 spdk_json_write_named_uint64(w, "lost", vu_group->stats.lost);
5900 spdk_json_write_named_uint64(w, "lost_count", vu_group->stats.lost_count);
5901 spdk_json_write_named_uint64(w, "rearms", vu_group->stats.rearms);
5902 spdk_json_write_named_uint64(w, "cq_full", vu_group->stats.cq_full);
5903 spdk_json_write_named_uint64(w, "pg_process_count", vu_group->stats.pg_process_count);
5904 spdk_json_write_named_uint64(w, "intr", vu_group->stats.intr);
5905 spdk_json_write_named_uint64(w, "polls", vu_group->stats.polls);
5906 spdk_json_write_named_uint64(w, "polls_spurious", vu_group->stats.polls_spurious);
5907 spdk_json_write_named_uint64(w, "poll_reqs", vu_group->stats.poll_reqs);
5908 polls_denom = vu_group->stats.polls * (vu_group->stats.polls - 1);
5910 uint64_t n = vu_group->stats.polls * vu_group->stats.poll_reqs_squared - vu_group->stats.poll_reqs *
5911 vu_group->stats.poll_reqs;
5915 spdk_json_write_named_uint64(w, "cqh_admin_writes", vu_group->stats.cqh_admin_writes);
5916 spdk_json_write_named_uint64(w, "cqh_io_writes", vu_group->stats.cqh_io_writes);
5922 opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
5923 opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
5924 opts->in_capsule_data_size = 0;
5925 opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
5926 opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
5927 opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
5928 opts->num_shared_buffers = 0;
5929 opts->buf_cache_size = 0;
5930 opts->association_timeout = 0;
5931 opts->transport_specific = NULL;