xref: /dpdk/drivers/vdpa/ifc/ifcvf_vdpa.c (revision 2b843cac232eb3f2fa79e4254e21766817e2019f)
15c060bf1SMatan Azrad /* SPDX-License-Identifier: BSD-3-Clause
25c060bf1SMatan Azrad  * Copyright(c) 2018 Intel Corporation
35c060bf1SMatan Azrad  */
45c060bf1SMatan Azrad 
55c060bf1SMatan Azrad #include <unistd.h>
65c060bf1SMatan Azrad #include <pthread.h>
75c060bf1SMatan Azrad #include <fcntl.h>
85c060bf1SMatan Azrad #include <string.h>
95c060bf1SMatan Azrad #include <sys/ioctl.h>
105c060bf1SMatan Azrad #include <sys/epoll.h>
115c060bf1SMatan Azrad #include <linux/virtio_net.h>
125c060bf1SMatan Azrad #include <stdbool.h>
135c060bf1SMatan Azrad 
14924e6b76SThomas Monjalon #include <rte_eal_paging.h>
155c060bf1SMatan Azrad #include <rte_malloc.h>
165c060bf1SMatan Azrad #include <rte_memory.h>
171f37cb2bSDavid Marchand #include <bus_pci_driver.h>
185c060bf1SMatan Azrad #include <rte_vhost.h>
195c060bf1SMatan Azrad #include <rte_vdpa.h>
2094c16e89SMaxime Coquelin #include <vdpa_driver.h>
215c060bf1SMatan Azrad #include <rte_vfio.h>
225c060bf1SMatan Azrad #include <rte_spinlock.h>
235c060bf1SMatan Azrad #include <rte_log.h>
245c060bf1SMatan Azrad #include <rte_kvargs.h>
255c060bf1SMatan Azrad #include <rte_devargs.h>
265c060bf1SMatan Azrad 
275c060bf1SMatan Azrad #include "base/ifcvf.h"
285c060bf1SMatan Azrad 
29b97f361cSAndy Pei /*
30b97f361cSAndy Pei  * RTE_MIN() cannot be used since braced-group within expression allowed
31b97f361cSAndy Pei  * only inside a function.
32b97f361cSAndy Pei  */
33b97f361cSAndy Pei #define MIN(v1, v2)	((v1) < (v2) ? (v1) : (v2))
34b97f361cSAndy Pei 
35b1641987SThomas Monjalon RTE_LOG_REGISTER(ifcvf_vdpa_logtype, pmd.vdpa.ifcvf, NOTICE);
36*2b843cacSDavid Marchand #define RTE_LOGTYPE_IFCVF_VDPA ifcvf_vdpa_logtype
37*2b843cacSDavid Marchand #define DRV_LOG(level, ...) \
38*2b843cacSDavid Marchand 	RTE_LOG_LINE_PREFIX(level, IFCVF_VDPA, "%s(): ", __func__, __VA_ARGS__)
395c060bf1SMatan Azrad 
/*
 * Number of bytes covered by a used ring holding @size elements: the element
 * array itself plus three uint16_t words.
 */
#define IFCVF_USED_RING_LEN(size) \
	(sizeof(struct vring_used_elem) * (size) + 3 * sizeof(uint16_t))

/* Device argument keys understood by this driver. */
#define IFCVF_VDPA_MODE		"vdpa"
#define IFCVF_SW_FALLBACK_LM	"sw-live-migration"

#define THREAD_NAME_LEN	16

/* NULL-terminated list of the accepted device argument keys. */
static const char * const ifcvf_valid_arguments[] = {
	IFCVF_VDPA_MODE,
	IFCVF_SW_FALLBACK_LM,
	NULL,
};
535c060bf1SMatan Azrad 
545c060bf1SMatan Azrad struct ifcvf_internal {
555c060bf1SMatan Azrad 	struct rte_pci_device *pdev;
565c060bf1SMatan Azrad 	struct ifcvf_hw hw;
57e2a1a08aSChenbo Xia 	int configured;
585c060bf1SMatan Azrad 	int vfio_container_fd;
595c060bf1SMatan Azrad 	int vfio_group_fd;
605c060bf1SMatan Azrad 	int vfio_dev_fd;
61a7ba40b2SThomas Monjalon 	rte_thread_t tid; /* thread for notify relay */
62a7ba40b2SThomas Monjalon 	rte_thread_t intr_tid; /* thread for config space change interrupt relay */
635c060bf1SMatan Azrad 	int epfd;
6465575dadSAndy Pei 	int csc_epfd;
655c060bf1SMatan Azrad 	int vid;
6681a6b7feSMaxime Coquelin 	struct rte_vdpa_device *vdev;
675c060bf1SMatan Azrad 	uint16_t max_queues;
685c060bf1SMatan Azrad 	uint64_t features;
695c060bf1SMatan Azrad 	rte_atomic32_t started;
705c060bf1SMatan Azrad 	rte_atomic32_t dev_attached;
715c060bf1SMatan Azrad 	rte_atomic32_t running;
725c060bf1SMatan Azrad 	rte_spinlock_t lock;
735c060bf1SMatan Azrad 	bool sw_lm;
745c060bf1SMatan Azrad 	bool sw_fallback_running;
755c060bf1SMatan Azrad 	/* mediated vring for sw fallback */
765c060bf1SMatan Azrad 	struct vring m_vring[IFCVF_MAX_QUEUES * 2];
775c060bf1SMatan Azrad 	/* eventfd for used ring interrupt */
785c060bf1SMatan Azrad 	int intr_fd[IFCVF_MAX_QUEUES * 2];
795c060bf1SMatan Azrad };
805c060bf1SMatan Azrad 
/* Node of the global device list; wraps one driver context. */
struct internal_list {
	TAILQ_ENTRY(internal_list) next;	/* linkage in internal_list */
	struct ifcvf_internal *internal;	/* context carried by this node */
};
855c060bf1SMatan Azrad 
/* vDPA device info: the device features and the device operations. */
struct rte_vdpa_dev_info {
	uint64_t features;		/* device feature bits */
	struct rte_vdpa_dev_ops *ops;	/* device operations */
};
91a60b747dSAndy Pei 
/* Global list of driver contexts; the lookup helpers walk it under the lock. */
TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

/* Serializes access to internal_list. */
static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Defined further down in this file. */
static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
995c060bf1SMatan Azrad 
1005c060bf1SMatan Azrad static struct internal_list *
10181a6b7feSMaxime Coquelin find_internal_resource_by_vdev(struct rte_vdpa_device *vdev)
1025c060bf1SMatan Azrad {
1035c060bf1SMatan Azrad 	int found = 0;
1045c060bf1SMatan Azrad 	struct internal_list *list;
1055c060bf1SMatan Azrad 
1065c060bf1SMatan Azrad 	pthread_mutex_lock(&internal_list_lock);
1075c060bf1SMatan Azrad 
1085c060bf1SMatan Azrad 	TAILQ_FOREACH(list, &internal_list, next) {
10981a6b7feSMaxime Coquelin 		if (vdev == list->internal->vdev) {
1105c060bf1SMatan Azrad 			found = 1;
1115c060bf1SMatan Azrad 			break;
1125c060bf1SMatan Azrad 		}
1135c060bf1SMatan Azrad 	}
1145c060bf1SMatan Azrad 
1155c060bf1SMatan Azrad 	pthread_mutex_unlock(&internal_list_lock);
1165c060bf1SMatan Azrad 
1175c060bf1SMatan Azrad 	if (!found)
1185c060bf1SMatan Azrad 		return NULL;
1195c060bf1SMatan Azrad 
1205c060bf1SMatan Azrad 	return list;
1215c060bf1SMatan Azrad }
1225c060bf1SMatan Azrad 
1235c060bf1SMatan Azrad static struct internal_list *
124146247f4SAndy Pei find_internal_resource_by_pci_dev(struct rte_pci_device *pdev)
1255c060bf1SMatan Azrad {
1265c060bf1SMatan Azrad 	int found = 0;
1275c060bf1SMatan Azrad 	struct internal_list *list;
1285c060bf1SMatan Azrad 
1295c060bf1SMatan Azrad 	pthread_mutex_lock(&internal_list_lock);
1305c060bf1SMatan Azrad 
1315c060bf1SMatan Azrad 	TAILQ_FOREACH(list, &internal_list, next) {
13238f8ab0bSMaxime Coquelin 		if (!rte_pci_addr_cmp(&pdev->addr,
13338f8ab0bSMaxime Coquelin 					&list->internal->pdev->addr)) {
1345c060bf1SMatan Azrad 			found = 1;
1355c060bf1SMatan Azrad 			break;
1365c060bf1SMatan Azrad 		}
1375c060bf1SMatan Azrad 	}
1385c060bf1SMatan Azrad 
1395c060bf1SMatan Azrad 	pthread_mutex_unlock(&internal_list_lock);
1405c060bf1SMatan Azrad 
1415c060bf1SMatan Azrad 	if (!found)
1425c060bf1SMatan Azrad 		return NULL;
1435c060bf1SMatan Azrad 
1445c060bf1SMatan Azrad 	return list;
1455c060bf1SMatan Azrad }
1465c060bf1SMatan Azrad 
14710d0458aSAndy Pei static struct internal_list *
14810d0458aSAndy Pei find_internal_resource_by_rte_dev(struct rte_device *rte_dev)
14910d0458aSAndy Pei {
15010d0458aSAndy Pei 	int found = 0;
15110d0458aSAndy Pei 	struct internal_list *list;
15210d0458aSAndy Pei 
15310d0458aSAndy Pei 	pthread_mutex_lock(&internal_list_lock);
15410d0458aSAndy Pei 
15510d0458aSAndy Pei 	TAILQ_FOREACH(list, &internal_list, next) {
15610d0458aSAndy Pei 		if (rte_dev == &list->internal->pdev->device) {
15710d0458aSAndy Pei 			found = 1;
15810d0458aSAndy Pei 			break;
15910d0458aSAndy Pei 		}
16010d0458aSAndy Pei 	}
16110d0458aSAndy Pei 
16210d0458aSAndy Pei 	pthread_mutex_unlock(&internal_list_lock);
16310d0458aSAndy Pei 
16410d0458aSAndy Pei 	if (!found)
16510d0458aSAndy Pei 		return NULL;
16610d0458aSAndy Pei 
16710d0458aSAndy Pei 	return list;
16810d0458aSAndy Pei }
16910d0458aSAndy Pei 
1705c060bf1SMatan Azrad static int
1715c060bf1SMatan Azrad ifcvf_vfio_setup(struct ifcvf_internal *internal)
1725c060bf1SMatan Azrad {
1735c060bf1SMatan Azrad 	struct rte_pci_device *dev = internal->pdev;
1745c060bf1SMatan Azrad 	char devname[RTE_DEV_NAME_MAX_LEN] = {0};
1755c060bf1SMatan Azrad 	int iommu_group_num;
1765c060bf1SMatan Azrad 	int i, ret;
1775c060bf1SMatan Azrad 
1785c060bf1SMatan Azrad 	internal->vfio_dev_fd = -1;
1795c060bf1SMatan Azrad 	internal->vfio_group_fd = -1;
1805c060bf1SMatan Azrad 	internal->vfio_container_fd = -1;
1815c060bf1SMatan Azrad 
1825c060bf1SMatan Azrad 	rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
1835c060bf1SMatan Azrad 	ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
1845c060bf1SMatan Azrad 			&iommu_group_num);
1855c060bf1SMatan Azrad 	if (ret <= 0) {
1865c060bf1SMatan Azrad 		DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
1875c060bf1SMatan Azrad 		return -1;
1885c060bf1SMatan Azrad 	}
1895c060bf1SMatan Azrad 
1905c060bf1SMatan Azrad 	internal->vfio_container_fd = rte_vfio_container_create();
1915c060bf1SMatan Azrad 	if (internal->vfio_container_fd < 0)
1925c060bf1SMatan Azrad 		return -1;
1935c060bf1SMatan Azrad 
1945c060bf1SMatan Azrad 	internal->vfio_group_fd = rte_vfio_container_group_bind(
1955c060bf1SMatan Azrad 			internal->vfio_container_fd, iommu_group_num);
1965c060bf1SMatan Azrad 	if (internal->vfio_group_fd < 0)
1975c060bf1SMatan Azrad 		goto err;
1985c060bf1SMatan Azrad 
1995c060bf1SMatan Azrad 	if (rte_pci_map_device(dev))
2005c060bf1SMatan Azrad 		goto err;
2015c060bf1SMatan Azrad 
202d61138d4SHarman Kalra 	internal->vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
2035c060bf1SMatan Azrad 
2045c060bf1SMatan Azrad 	for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
2055c060bf1SMatan Azrad 			i++) {
2065c060bf1SMatan Azrad 		internal->hw.mem_resource[i].addr =
2075c060bf1SMatan Azrad 			internal->pdev->mem_resource[i].addr;
2085c060bf1SMatan Azrad 		internal->hw.mem_resource[i].phys_addr =
2095c060bf1SMatan Azrad 			internal->pdev->mem_resource[i].phys_addr;
2105c060bf1SMatan Azrad 		internal->hw.mem_resource[i].len =
2115c060bf1SMatan Azrad 			internal->pdev->mem_resource[i].len;
2125c060bf1SMatan Azrad 	}
2135c060bf1SMatan Azrad 
2145c060bf1SMatan Azrad 	return 0;
2155c060bf1SMatan Azrad 
2165c060bf1SMatan Azrad err:
2175c060bf1SMatan Azrad 	rte_vfio_container_destroy(internal->vfio_container_fd);
2185c060bf1SMatan Azrad 	return -1;
2195c060bf1SMatan Azrad }
2205c060bf1SMatan Azrad 
2215c060bf1SMatan Azrad static int
2225abb634cSJilei Chen ifcvf_dma_map(struct ifcvf_internal *internal, bool do_map)
2235c060bf1SMatan Azrad {
2245c060bf1SMatan Azrad 	uint32_t i;
2255c060bf1SMatan Azrad 	int ret;
2265c060bf1SMatan Azrad 	struct rte_vhost_memory *mem = NULL;
2275c060bf1SMatan Azrad 	int vfio_container_fd;
2285c060bf1SMatan Azrad 
2295c060bf1SMatan Azrad 	ret = rte_vhost_get_mem_table(internal->vid, &mem);
2305c060bf1SMatan Azrad 	if (ret < 0) {
2315c060bf1SMatan Azrad 		DRV_LOG(ERR, "failed to get VM memory layout.");
2325c060bf1SMatan Azrad 		goto exit;
2335c060bf1SMatan Azrad 	}
2345c060bf1SMatan Azrad 
2355c060bf1SMatan Azrad 	vfio_container_fd = internal->vfio_container_fd;
2365c060bf1SMatan Azrad 
2375c060bf1SMatan Azrad 	for (i = 0; i < mem->nregions; i++) {
2385c060bf1SMatan Azrad 		struct rte_vhost_mem_region *reg;
2395c060bf1SMatan Azrad 
2405c060bf1SMatan Azrad 		reg = &mem->regions[i];
2415c060bf1SMatan Azrad 		DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
2425c060bf1SMatan Azrad 			"GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
2435c060bf1SMatan Azrad 			do_map ? "DMA map" : "DMA unmap", i,
2445c060bf1SMatan Azrad 			reg->host_user_addr, reg->guest_phys_addr, reg->size);
2455c060bf1SMatan Azrad 
2465c060bf1SMatan Azrad 		if (do_map) {
2475c060bf1SMatan Azrad 			ret = rte_vfio_container_dma_map(vfio_container_fd,
2485c060bf1SMatan Azrad 				reg->host_user_addr, reg->guest_phys_addr,
2495c060bf1SMatan Azrad 				reg->size);
2505c060bf1SMatan Azrad 			if (ret < 0) {
2515c060bf1SMatan Azrad 				DRV_LOG(ERR, "DMA map failed.");
2525c060bf1SMatan Azrad 				goto exit;
2535c060bf1SMatan Azrad 			}
2545c060bf1SMatan Azrad 		} else {
2555c060bf1SMatan Azrad 			ret = rte_vfio_container_dma_unmap(vfio_container_fd,
2565c060bf1SMatan Azrad 				reg->host_user_addr, reg->guest_phys_addr,
2575c060bf1SMatan Azrad 				reg->size);
2585c060bf1SMatan Azrad 			if (ret < 0) {
2595c060bf1SMatan Azrad 				DRV_LOG(ERR, "DMA unmap failed.");
2605c060bf1SMatan Azrad 				goto exit;
2615c060bf1SMatan Azrad 			}
2625c060bf1SMatan Azrad 		}
2635c060bf1SMatan Azrad 	}
2645c060bf1SMatan Azrad 
2655c060bf1SMatan Azrad exit:
2665c060bf1SMatan Azrad 	free(mem);
2675c060bf1SMatan Azrad 	return ret;
2685c060bf1SMatan Azrad }
2695c060bf1SMatan Azrad 
2705c060bf1SMatan Azrad static uint64_t
2715c060bf1SMatan Azrad hva_to_gpa(int vid, uint64_t hva)
2725c060bf1SMatan Azrad {
2735c060bf1SMatan Azrad 	struct rte_vhost_memory *mem = NULL;
2745c060bf1SMatan Azrad 	struct rte_vhost_mem_region *reg;
2755c060bf1SMatan Azrad 	uint32_t i;
2765c060bf1SMatan Azrad 	uint64_t gpa = 0;
2775c060bf1SMatan Azrad 
2785c060bf1SMatan Azrad 	if (rte_vhost_get_mem_table(vid, &mem) < 0)
2795c060bf1SMatan Azrad 		goto exit;
2805c060bf1SMatan Azrad 
2815c060bf1SMatan Azrad 	for (i = 0; i < mem->nregions; i++) {
2825c060bf1SMatan Azrad 		reg = &mem->regions[i];
2835c060bf1SMatan Azrad 
2845c060bf1SMatan Azrad 		if (hva >= reg->host_user_addr &&
2855c060bf1SMatan Azrad 				hva < reg->host_user_addr + reg->size) {
2865c060bf1SMatan Azrad 			gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
2875c060bf1SMatan Azrad 			break;
2885c060bf1SMatan Azrad 		}
2895c060bf1SMatan Azrad 	}
2905c060bf1SMatan Azrad 
2915c060bf1SMatan Azrad exit:
2925c060bf1SMatan Azrad 	free(mem);
2935c060bf1SMatan Azrad 	return gpa;
2945c060bf1SMatan Azrad }
2955c060bf1SMatan Azrad 
2965c060bf1SMatan Azrad static int
2975c060bf1SMatan Azrad vdpa_ifcvf_start(struct ifcvf_internal *internal)
2985c060bf1SMatan Azrad {
2995c060bf1SMatan Azrad 	struct ifcvf_hw *hw = &internal->hw;
3005c060bf1SMatan Azrad 	int i, nr_vring;
3015c060bf1SMatan Azrad 	int vid;
3025c060bf1SMatan Azrad 	struct rte_vhost_vring vq;
3035c060bf1SMatan Azrad 	uint64_t gpa;
3045c060bf1SMatan Azrad 
3055c060bf1SMatan Azrad 	vid = internal->vid;
3065c060bf1SMatan Azrad 	nr_vring = rte_vhost_get_vring_num(vid);
3075c060bf1SMatan Azrad 	rte_vhost_get_negotiated_features(vid, &hw->req_features);
3085c060bf1SMatan Azrad 
3095c060bf1SMatan Azrad 	for (i = 0; i < nr_vring; i++) {
31083c7370aSAndy Pei 		if (!hw->vring[i].enable)
31183c7370aSAndy Pei 			continue;
3125c060bf1SMatan Azrad 		rte_vhost_get_vhost_vring(vid, i, &vq);
3135c060bf1SMatan Azrad 		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
3145c060bf1SMatan Azrad 		if (gpa == 0) {
3155c060bf1SMatan Azrad 			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
3165c060bf1SMatan Azrad 			return -1;
3175c060bf1SMatan Azrad 		}
3185c060bf1SMatan Azrad 		hw->vring[i].desc = gpa;
3195c060bf1SMatan Azrad 
3205c060bf1SMatan Azrad 		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
3215c060bf1SMatan Azrad 		if (gpa == 0) {
3225c060bf1SMatan Azrad 			DRV_LOG(ERR, "Fail to get GPA for available ring.");
3235c060bf1SMatan Azrad 			return -1;
3245c060bf1SMatan Azrad 		}
3255c060bf1SMatan Azrad 		hw->vring[i].avail = gpa;
3265c060bf1SMatan Azrad 
3275c060bf1SMatan Azrad 		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
3285c060bf1SMatan Azrad 		if (gpa == 0) {
3295c060bf1SMatan Azrad 			DRV_LOG(ERR, "Fail to get GPA for used ring.");
3305c060bf1SMatan Azrad 			return -1;
3315c060bf1SMatan Azrad 		}
3325c060bf1SMatan Azrad 		hw->vring[i].used = gpa;
3335c060bf1SMatan Azrad 
3345c060bf1SMatan Azrad 		hw->vring[i].size = vq.size;
3355c060bf1SMatan Azrad 		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
3365c060bf1SMatan Azrad 				&hw->vring[i].last_used_idx);
3375c060bf1SMatan Azrad 	}
3385c060bf1SMatan Azrad 	hw->nr_vring = i;
3395c060bf1SMatan Azrad 
3405c060bf1SMatan Azrad 	return ifcvf_start_hw(&internal->hw);
3415c060bf1SMatan Azrad }
3425c060bf1SMatan Azrad 
3435c060bf1SMatan Azrad static void
3445c060bf1SMatan Azrad vdpa_ifcvf_stop(struct ifcvf_internal *internal)
3455c060bf1SMatan Azrad {
3465c060bf1SMatan Azrad 	struct ifcvf_hw *hw = &internal->hw;
3475c060bf1SMatan Azrad 	uint32_t i;
3485c060bf1SMatan Azrad 	int vid;
3495c060bf1SMatan Azrad 	uint64_t features = 0;
3505c060bf1SMatan Azrad 	uint64_t log_base = 0, log_size = 0;
3515c060bf1SMatan Azrad 	uint64_t len;
3527015b657SAndy Pei 	u32 ring_state = 0;
3535c060bf1SMatan Azrad 
3545c060bf1SMatan Azrad 	vid = internal->vid;
3557015b657SAndy Pei 
3567015b657SAndy Pei 	/* to make sure no packet is lost for blk device
3577015b657SAndy Pei 	 * do not stop until last_avail_idx == last_used_idx
3587015b657SAndy Pei 	 */
3597015b657SAndy Pei 	if (internal->hw.device_type == IFCVF_BLK) {
3607015b657SAndy Pei 		for (i = 0; i < hw->nr_vring; i++) {
3617015b657SAndy Pei 			do {
3627015b657SAndy Pei 				if (hw->lm_cfg != NULL)
3637015b657SAndy Pei 					ring_state = *(u32 *)(hw->lm_cfg +
3647015b657SAndy Pei 						IFCVF_LM_RING_STATE_OFFSET +
3657015b657SAndy Pei 						i * IFCVF_LM_CFG_SIZE);
3667015b657SAndy Pei 				hw->vring[i].last_avail_idx =
3677015b657SAndy Pei 					(u16)(ring_state & IFCVF_16_BIT_MASK);
3687015b657SAndy Pei 				hw->vring[i].last_used_idx =
3697015b657SAndy Pei 					(u16)(ring_state >> 16);
3707015b657SAndy Pei 				usleep(10);
3717015b657SAndy Pei 			} while (hw->vring[i].last_avail_idx !=
3727015b657SAndy Pei 				hw->vring[i].last_used_idx);
3737015b657SAndy Pei 		}
3747015b657SAndy Pei 	}
3757015b657SAndy Pei 
3765c060bf1SMatan Azrad 	ifcvf_stop_hw(hw);
3775c060bf1SMatan Azrad 
3785c060bf1SMatan Azrad 	for (i = 0; i < hw->nr_vring; i++)
3795c060bf1SMatan Azrad 		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
3805c060bf1SMatan Azrad 				hw->vring[i].last_used_idx);
3815c060bf1SMatan Azrad 
3825c060bf1SMatan Azrad 	if (internal->sw_lm)
3835c060bf1SMatan Azrad 		return;
3845c060bf1SMatan Azrad 
3855c060bf1SMatan Azrad 	rte_vhost_get_negotiated_features(vid, &features);
3865c060bf1SMatan Azrad 	if (RTE_VHOST_NEED_LOG(features)) {
3875c060bf1SMatan Azrad 		ifcvf_disable_logging(hw);
3885c060bf1SMatan Azrad 		rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
3895c060bf1SMatan Azrad 		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
3905c060bf1SMatan Azrad 				log_base, IFCVF_LOG_BASE, log_size);
3915c060bf1SMatan Azrad 		/*
3925c060bf1SMatan Azrad 		 * IFCVF marks dirty memory pages for only packet buffer,
3935c060bf1SMatan Azrad 		 * SW helps to mark the used ring as dirty after device stops.
3945c060bf1SMatan Azrad 		 */
3955c060bf1SMatan Azrad 		for (i = 0; i < hw->nr_vring; i++) {
3965c060bf1SMatan Azrad 			len = IFCVF_USED_RING_LEN(hw->vring[i].size);
3975c060bf1SMatan Azrad 			rte_vhost_log_used_vring(vid, i, 0, len);
3985c060bf1SMatan Azrad 		}
3995c060bf1SMatan Azrad 	}
4005c060bf1SMatan Azrad }
4015c060bf1SMatan Azrad 
4025c060bf1SMatan Azrad #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
4035c060bf1SMatan Azrad 		sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
4045c060bf1SMatan Azrad static int
4055c060bf1SMatan Azrad vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
4065c060bf1SMatan Azrad {
4075c060bf1SMatan Azrad 	int ret;
4085c060bf1SMatan Azrad 	uint32_t i, nr_vring;
4095c060bf1SMatan Azrad 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
4105c060bf1SMatan Azrad 	struct vfio_irq_set *irq_set;
4115c060bf1SMatan Azrad 	int *fd_ptr;
4125c060bf1SMatan Azrad 	struct rte_vhost_vring vring;
4135c060bf1SMatan Azrad 	int fd;
4145c060bf1SMatan Azrad 
4155c060bf1SMatan Azrad 	vring.callfd = -1;
4165c060bf1SMatan Azrad 
4175c060bf1SMatan Azrad 	nr_vring = rte_vhost_get_vring_num(internal->vid);
4182a213b79SDavid Marchand 	if (nr_vring > IFCVF_MAX_QUEUES * 2)
4192a213b79SDavid Marchand 		return -1;
4205c060bf1SMatan Azrad 
4215c060bf1SMatan Azrad 	irq_set = (struct vfio_irq_set *)irq_set_buf;
4225c060bf1SMatan Azrad 	irq_set->argsz = sizeof(irq_set_buf);
4235c060bf1SMatan Azrad 	irq_set->count = nr_vring + 1;
4245c060bf1SMatan Azrad 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
4255c060bf1SMatan Azrad 			 VFIO_IRQ_SET_ACTION_TRIGGER;
4265c060bf1SMatan Azrad 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
4275c060bf1SMatan Azrad 	irq_set->start = 0;
4285c060bf1SMatan Azrad 	fd_ptr = (int *)&irq_set->data;
429ff53e977SAndy Pei 	/* The first interrupt is for the configure space change notification */
430d61138d4SHarman Kalra 	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] =
431d61138d4SHarman Kalra 		rte_intr_fd_get(internal->pdev->intr_handle);
4325c060bf1SMatan Azrad 
4335c060bf1SMatan Azrad 	for (i = 0; i < nr_vring; i++)
4345c060bf1SMatan Azrad 		internal->intr_fd[i] = -1;
4355c060bf1SMatan Azrad 
4365c060bf1SMatan Azrad 	for (i = 0; i < nr_vring; i++) {
4375c060bf1SMatan Azrad 		rte_vhost_get_vhost_vring(internal->vid, i, &vring);
4385c060bf1SMatan Azrad 		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
439ff53e977SAndy Pei 		if (m_rx == true &&
440ff53e977SAndy Pei 			((i & 1) == 0 || internal->hw.device_type == IFCVF_BLK)) {
441ff53e977SAndy Pei 			/* For the net we only need to relay rx queue,
442ff53e977SAndy Pei 			 * which will change the mem of VM.
443ff53e977SAndy Pei 			 * For the blk we need to relay all the read cmd
444ff53e977SAndy Pei 			 * of each queue
445ff53e977SAndy Pei 			 */
4465c060bf1SMatan Azrad 			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
4475c060bf1SMatan Azrad 			if (fd < 0) {
4485c060bf1SMatan Azrad 				DRV_LOG(ERR, "can't setup eventfd: %s",
4495c060bf1SMatan Azrad 					strerror(errno));
4505c060bf1SMatan Azrad 				return -1;
4515c060bf1SMatan Azrad 			}
4525c060bf1SMatan Azrad 			internal->intr_fd[i] = fd;
4535c060bf1SMatan Azrad 			fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
4545c060bf1SMatan Azrad 		}
4555c060bf1SMatan Azrad 	}
4565c060bf1SMatan Azrad 
4575c060bf1SMatan Azrad 	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
4585c060bf1SMatan Azrad 	if (ret) {
4595c060bf1SMatan Azrad 		DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
4605c060bf1SMatan Azrad 				strerror(errno));
4615c060bf1SMatan Azrad 		return -1;
4625c060bf1SMatan Azrad 	}
4635c060bf1SMatan Azrad 
4645c060bf1SMatan Azrad 	return 0;
4655c060bf1SMatan Azrad }
4665c060bf1SMatan Azrad 
4675c060bf1SMatan Azrad static int
4685c060bf1SMatan Azrad vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
4695c060bf1SMatan Azrad {
4705c060bf1SMatan Azrad 	int ret;
4715c060bf1SMatan Azrad 	uint32_t i, nr_vring;
4725c060bf1SMatan Azrad 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
4735c060bf1SMatan Azrad 	struct vfio_irq_set *irq_set;
4745c060bf1SMatan Azrad 
4755c060bf1SMatan Azrad 	irq_set = (struct vfio_irq_set *)irq_set_buf;
4765c060bf1SMatan Azrad 	irq_set->argsz = sizeof(irq_set_buf);
4775c060bf1SMatan Azrad 	irq_set->count = 0;
4785c060bf1SMatan Azrad 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
4795c060bf1SMatan Azrad 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
4805c060bf1SMatan Azrad 	irq_set->start = 0;
4815c060bf1SMatan Azrad 
4825c060bf1SMatan Azrad 	nr_vring = rte_vhost_get_vring_num(internal->vid);
4835c060bf1SMatan Azrad 	for (i = 0; i < nr_vring; i++) {
4845c060bf1SMatan Azrad 		if (internal->intr_fd[i] >= 0)
4855c060bf1SMatan Azrad 			close(internal->intr_fd[i]);
4865c060bf1SMatan Azrad 		internal->intr_fd[i] = -1;
4875c060bf1SMatan Azrad 	}
4885c060bf1SMatan Azrad 
4895c060bf1SMatan Azrad 	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
4905c060bf1SMatan Azrad 	if (ret) {
4915c060bf1SMatan Azrad 		DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
4925c060bf1SMatan Azrad 				strerror(errno));
4935c060bf1SMatan Azrad 		return -1;
4945c060bf1SMatan Azrad 	}
4955c060bf1SMatan Azrad 
4965c060bf1SMatan Azrad 	return 0;
4975c060bf1SMatan Azrad }
4985c060bf1SMatan Azrad 
499a7ba40b2SThomas Monjalon static uint32_t
5005c060bf1SMatan Azrad notify_relay(void *arg)
5015c060bf1SMatan Azrad {
5025c060bf1SMatan Azrad 	int i, kickfd, epfd, nfds = 0;
5035c060bf1SMatan Azrad 	uint32_t qid, q_num;
5045c060bf1SMatan Azrad 	struct epoll_event events[IFCVF_MAX_QUEUES * 2];
5055c060bf1SMatan Azrad 	struct epoll_event ev;
5065c060bf1SMatan Azrad 	uint64_t buf;
5075c060bf1SMatan Azrad 	int nbytes;
5085c060bf1SMatan Azrad 	struct rte_vhost_vring vring;
5095c060bf1SMatan Azrad 	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
5105c060bf1SMatan Azrad 	struct ifcvf_hw *hw = &internal->hw;
5115c060bf1SMatan Azrad 
5125c060bf1SMatan Azrad 	q_num = rte_vhost_get_vring_num(internal->vid);
5135c060bf1SMatan Azrad 
5145c060bf1SMatan Azrad 	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
5155c060bf1SMatan Azrad 	if (epfd < 0) {
5165c060bf1SMatan Azrad 		DRV_LOG(ERR, "failed to create epoll instance.");
517a7ba40b2SThomas Monjalon 		return 1;
5185c060bf1SMatan Azrad 	}
5195c060bf1SMatan Azrad 	internal->epfd = epfd;
5205c060bf1SMatan Azrad 
5215c060bf1SMatan Azrad 	vring.kickfd = -1;
5225c060bf1SMatan Azrad 	for (qid = 0; qid < q_num; qid++) {
52383c7370aSAndy Pei 		if (!hw->vring[qid].enable)
52483c7370aSAndy Pei 			continue;
5255c060bf1SMatan Azrad 		ev.events = EPOLLIN | EPOLLPRI;
5265c060bf1SMatan Azrad 		rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
5275c060bf1SMatan Azrad 		ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
5285c060bf1SMatan Azrad 		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
5295c060bf1SMatan Azrad 			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
530a7ba40b2SThomas Monjalon 			return 1;
5315c060bf1SMatan Azrad 		}
5325c060bf1SMatan Azrad 	}
5335c060bf1SMatan Azrad 
5345c060bf1SMatan Azrad 	for (;;) {
5355c060bf1SMatan Azrad 		nfds = epoll_wait(epfd, events, q_num, -1);
5365c060bf1SMatan Azrad 		if (nfds < 0) {
5375c060bf1SMatan Azrad 			if (errno == EINTR)
5385c060bf1SMatan Azrad 				continue;
539f665790aSDavid Marchand 			DRV_LOG(ERR, "epoll_wait return fail");
540a7ba40b2SThomas Monjalon 			return 1;
5415c060bf1SMatan Azrad 		}
5425c060bf1SMatan Azrad 
5435c060bf1SMatan Azrad 		for (i = 0; i < nfds; i++) {
5445c060bf1SMatan Azrad 			qid = events[i].data.u32;
5455c060bf1SMatan Azrad 			kickfd = (uint32_t)(events[i].data.u64 >> 32);
5465c060bf1SMatan Azrad 			do {
5475c060bf1SMatan Azrad 				nbytes = read(kickfd, &buf, 8);
5485c060bf1SMatan Azrad 				if (nbytes < 0) {
5495c060bf1SMatan Azrad 					if (errno == EINTR ||
5505c060bf1SMatan Azrad 					    errno == EWOULDBLOCK ||
5515c060bf1SMatan Azrad 					    errno == EAGAIN)
5525c060bf1SMatan Azrad 						continue;
5535c060bf1SMatan Azrad 					DRV_LOG(INFO, "Error reading "
5545c060bf1SMatan Azrad 						"kickfd: %s",
5555c060bf1SMatan Azrad 						strerror(errno));
5565c060bf1SMatan Azrad 				}
5575c060bf1SMatan Azrad 				break;
5585c060bf1SMatan Azrad 			} while (1);
5595c060bf1SMatan Azrad 
5605c060bf1SMatan Azrad 			ifcvf_notify_queue(hw, qid);
5615c060bf1SMatan Azrad 		}
5625c060bf1SMatan Azrad 	}
5635c060bf1SMatan Azrad 
564a7ba40b2SThomas Monjalon 	return 0;
5655c060bf1SMatan Azrad }
5665c060bf1SMatan Azrad 
5675c060bf1SMatan Azrad static int
5685c060bf1SMatan Azrad setup_notify_relay(struct ifcvf_internal *internal)
5695c060bf1SMatan Azrad {
570a7ba40b2SThomas Monjalon 	char name[RTE_THREAD_INTERNAL_NAME_SIZE];
5715c060bf1SMatan Azrad 	int ret;
5725c060bf1SMatan Azrad 
573a7ba40b2SThomas Monjalon 	snprintf(name, sizeof(name), "ifc-noti%d", internal->vid);
574a7ba40b2SThomas Monjalon 	ret = rte_thread_create_internal_control(&internal->tid, name,
575a7ba40b2SThomas Monjalon 			notify_relay, internal);
576a011555fSChengwen Feng 	if (ret != 0) {
5775c060bf1SMatan Azrad 		DRV_LOG(ERR, "failed to create notify relay pthread.");
5785c060bf1SMatan Azrad 		return -1;
5795c060bf1SMatan Azrad 	}
580a011555fSChengwen Feng 
5815c060bf1SMatan Azrad 	return 0;
5825c060bf1SMatan Azrad }
5835c060bf1SMatan Azrad 
5845c060bf1SMatan Azrad static int
5855c060bf1SMatan Azrad unset_notify_relay(struct ifcvf_internal *internal)
5865c060bf1SMatan Azrad {
587a7ba40b2SThomas Monjalon 	if (internal->tid.opaque_id != 0) {
588a7ba40b2SThomas Monjalon 		pthread_cancel((pthread_t)internal->tid.opaque_id);
589a7ba40b2SThomas Monjalon 		rte_thread_join(internal->tid, NULL);
5905c060bf1SMatan Azrad 	}
591a7ba40b2SThomas Monjalon 	internal->tid.opaque_id = 0;
5925c060bf1SMatan Azrad 
5935c060bf1SMatan Azrad 	if (internal->epfd >= 0)
5945c060bf1SMatan Azrad 		close(internal->epfd);
5955c060bf1SMatan Azrad 	internal->epfd = -1;
5965c060bf1SMatan Azrad 
5975c060bf1SMatan Azrad 	return 0;
5985c060bf1SMatan Azrad }
5995c060bf1SMatan Azrad 
60065575dadSAndy Pei static void
60165575dadSAndy Pei virtio_interrupt_handler(struct ifcvf_internal *internal)
60265575dadSAndy Pei {
60365575dadSAndy Pei 	int vid = internal->vid;
60465575dadSAndy Pei 	int ret;
60565575dadSAndy Pei 
60671998eb6SNobuhiro Miki 	ret = rte_vhost_backend_config_change(vid, 1);
60765575dadSAndy Pei 	if (ret)
60865575dadSAndy Pei 		DRV_LOG(ERR, "failed to notify the guest about configuration space change.");
60965575dadSAndy Pei }
61065575dadSAndy Pei 
611a7ba40b2SThomas Monjalon static uint32_t
61265575dadSAndy Pei intr_relay(void *arg)
61365575dadSAndy Pei {
61465575dadSAndy Pei 	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
61565575dadSAndy Pei 	struct epoll_event csc_event;
61665575dadSAndy Pei 	struct epoll_event ev;
61765575dadSAndy Pei 	uint64_t buf;
61865575dadSAndy Pei 	int nbytes;
61965575dadSAndy Pei 	int csc_epfd, csc_val = 0;
62065575dadSAndy Pei 
62165575dadSAndy Pei 	csc_epfd = epoll_create(1);
62265575dadSAndy Pei 	if (csc_epfd < 0) {
62365575dadSAndy Pei 		DRV_LOG(ERR, "failed to create epoll for config space change.");
624a7ba40b2SThomas Monjalon 		return 1;
62565575dadSAndy Pei 	}
62665575dadSAndy Pei 
62765575dadSAndy Pei 	ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
62865575dadSAndy Pei 	ev.data.fd = rte_intr_fd_get(internal->pdev->intr_handle);
62965575dadSAndy Pei 	if (epoll_ctl(csc_epfd, EPOLL_CTL_ADD,
63065575dadSAndy Pei 		rte_intr_fd_get(internal->pdev->intr_handle), &ev) < 0) {
63165575dadSAndy Pei 		DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
63265575dadSAndy Pei 		goto out;
63365575dadSAndy Pei 	}
63465575dadSAndy Pei 
63565575dadSAndy Pei 	internal->csc_epfd = csc_epfd;
63665575dadSAndy Pei 
63765575dadSAndy Pei 	for (;;) {
63865575dadSAndy Pei 		csc_val = epoll_wait(csc_epfd, &csc_event, 1, -1);
63965575dadSAndy Pei 		if (csc_val < 0) {
64065575dadSAndy Pei 			if (errno == EINTR)
64165575dadSAndy Pei 				continue;
64265575dadSAndy Pei 			DRV_LOG(ERR, "epoll_wait return fail.");
64365575dadSAndy Pei 			goto out;
64465575dadSAndy Pei 		} else if (csc_val == 0) {
64565575dadSAndy Pei 			continue;
64665575dadSAndy Pei 		} else {
64765575dadSAndy Pei 			/* csc_val > 0 */
64865575dadSAndy Pei 			nbytes = read(csc_event.data.fd, &buf, 8);
64965575dadSAndy Pei 			if (nbytes < 0) {
65065575dadSAndy Pei 				if (errno == EINTR ||
65165575dadSAndy Pei 				    errno == EWOULDBLOCK ||
65265575dadSAndy Pei 				    errno == EAGAIN)
65365575dadSAndy Pei 					continue;
654f665790aSDavid Marchand 				DRV_LOG(ERR, "Error reading from file descriptor %d: %s",
65565575dadSAndy Pei 					csc_event.data.fd,
65665575dadSAndy Pei 					strerror(errno));
65765575dadSAndy Pei 				goto out;
65865575dadSAndy Pei 			} else if (nbytes == 0) {
659f665790aSDavid Marchand 				DRV_LOG(ERR, "Read nothing from file descriptor %d",
66065575dadSAndy Pei 					csc_event.data.fd);
66165575dadSAndy Pei 				continue;
66265575dadSAndy Pei 			} else {
66365575dadSAndy Pei 				virtio_interrupt_handler(internal);
66465575dadSAndy Pei 			}
66565575dadSAndy Pei 		}
66665575dadSAndy Pei 	}
66765575dadSAndy Pei 
66865575dadSAndy Pei out:
66965575dadSAndy Pei 	if (csc_epfd >= 0)
67065575dadSAndy Pei 		close(csc_epfd);
67165575dadSAndy Pei 	internal->csc_epfd = -1;
67265575dadSAndy Pei 
673a7ba40b2SThomas Monjalon 	return 0;
67465575dadSAndy Pei }
67565575dadSAndy Pei 
67665575dadSAndy Pei static int
67765575dadSAndy Pei setup_intr_relay(struct ifcvf_internal *internal)
67865575dadSAndy Pei {
679a7ba40b2SThomas Monjalon 	char name[RTE_THREAD_INTERNAL_NAME_SIZE];
68065575dadSAndy Pei 	int ret;
68165575dadSAndy Pei 
682a7ba40b2SThomas Monjalon 	snprintf(name, sizeof(name), "ifc-int%d", internal->vid);
683a7ba40b2SThomas Monjalon 	ret = rte_thread_create_internal_control(&internal->intr_tid, name,
68465575dadSAndy Pei 			intr_relay, (void *)internal);
68565575dadSAndy Pei 	if (ret) {
68665575dadSAndy Pei 		DRV_LOG(ERR, "failed to create notify relay pthread.");
68765575dadSAndy Pei 		return -1;
68865575dadSAndy Pei 	}
68965575dadSAndy Pei 	return 0;
69065575dadSAndy Pei }
69165575dadSAndy Pei 
69265575dadSAndy Pei static void
69365575dadSAndy Pei unset_intr_relay(struct ifcvf_internal *internal)
69465575dadSAndy Pei {
695a7ba40b2SThomas Monjalon 	if (internal->intr_tid.opaque_id != 0) {
696a7ba40b2SThomas Monjalon 		pthread_cancel((pthread_t)internal->intr_tid.opaque_id);
697a7ba40b2SThomas Monjalon 		rte_thread_join(internal->intr_tid, NULL);
69865575dadSAndy Pei 	}
699a7ba40b2SThomas Monjalon 	internal->intr_tid.opaque_id = 0;
70065575dadSAndy Pei 
70165575dadSAndy Pei 	if (internal->csc_epfd >= 0)
70265575dadSAndy Pei 		close(internal->csc_epfd);
70365575dadSAndy Pei 	internal->csc_epfd = -1;
70465575dadSAndy Pei }
70565575dadSAndy Pei 
7065c060bf1SMatan Azrad static int
7075c060bf1SMatan Azrad update_datapath(struct ifcvf_internal *internal)
7085c060bf1SMatan Azrad {
7095c060bf1SMatan Azrad 	int ret;
7105c060bf1SMatan Azrad 
7115c060bf1SMatan Azrad 	rte_spinlock_lock(&internal->lock);
7125c060bf1SMatan Azrad 
7135c060bf1SMatan Azrad 	if (!rte_atomic32_read(&internal->running) &&
7145c060bf1SMatan Azrad 	    (rte_atomic32_read(&internal->started) &&
7155c060bf1SMatan Azrad 	     rte_atomic32_read(&internal->dev_attached))) {
7165abb634cSJilei Chen 		ret = ifcvf_dma_map(internal, true);
7175c060bf1SMatan Azrad 		if (ret)
7185c060bf1SMatan Azrad 			goto err;
7195c060bf1SMatan Azrad 
7205abb634cSJilei Chen 		ret = vdpa_enable_vfio_intr(internal, false);
7215c060bf1SMatan Azrad 		if (ret)
7225c060bf1SMatan Azrad 			goto err;
7235c060bf1SMatan Azrad 
7245c060bf1SMatan Azrad 		ret = vdpa_ifcvf_start(internal);
7255c060bf1SMatan Azrad 		if (ret)
7265c060bf1SMatan Azrad 			goto err;
7275c060bf1SMatan Azrad 
7285c060bf1SMatan Azrad 		ret = setup_notify_relay(internal);
7295c060bf1SMatan Azrad 		if (ret)
7305c060bf1SMatan Azrad 			goto err;
7315c060bf1SMatan Azrad 
73265575dadSAndy Pei 		ret = setup_intr_relay(internal);
73365575dadSAndy Pei 		if (ret)
73465575dadSAndy Pei 			goto err;
73565575dadSAndy Pei 
7365c060bf1SMatan Azrad 		rte_atomic32_set(&internal->running, 1);
7375c060bf1SMatan Azrad 	} else if (rte_atomic32_read(&internal->running) &&
7385c060bf1SMatan Azrad 		   (!rte_atomic32_read(&internal->started) ||
7395c060bf1SMatan Azrad 		    !rte_atomic32_read(&internal->dev_attached))) {
74065575dadSAndy Pei 		unset_intr_relay(internal);
74165575dadSAndy Pei 
7425c060bf1SMatan Azrad 		ret = unset_notify_relay(internal);
7435c060bf1SMatan Azrad 		if (ret)
7445c060bf1SMatan Azrad 			goto err;
7455c060bf1SMatan Azrad 
7465c060bf1SMatan Azrad 		vdpa_ifcvf_stop(internal);
7475c060bf1SMatan Azrad 
7485c060bf1SMatan Azrad 		ret = vdpa_disable_vfio_intr(internal);
7495c060bf1SMatan Azrad 		if (ret)
7505c060bf1SMatan Azrad 			goto err;
7515c060bf1SMatan Azrad 
7525abb634cSJilei Chen 		ret = ifcvf_dma_map(internal, false);
7535c060bf1SMatan Azrad 		if (ret)
7545c060bf1SMatan Azrad 			goto err;
7555c060bf1SMatan Azrad 
7565c060bf1SMatan Azrad 		rte_atomic32_set(&internal->running, 0);
7575c060bf1SMatan Azrad 	}
7585c060bf1SMatan Azrad 
7595c060bf1SMatan Azrad 	rte_spinlock_unlock(&internal->lock);
7605c060bf1SMatan Azrad 	return 0;
7615c060bf1SMatan Azrad err:
7625c060bf1SMatan Azrad 	rte_spinlock_unlock(&internal->lock);
7635c060bf1SMatan Azrad 	return ret;
7645c060bf1SMatan Azrad }
7655c060bf1SMatan Azrad 
7665c060bf1SMatan Azrad static int
7675c060bf1SMatan Azrad m_ifcvf_start(struct ifcvf_internal *internal)
7685c060bf1SMatan Azrad {
7695c060bf1SMatan Azrad 	struct ifcvf_hw *hw = &internal->hw;
7705c060bf1SMatan Azrad 	uint32_t i, nr_vring;
7715c060bf1SMatan Azrad 	int vid, ret;
7725c060bf1SMatan Azrad 	struct rte_vhost_vring vq;
7735c060bf1SMatan Azrad 	void *vring_buf;
7745c060bf1SMatan Azrad 	uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
7755c060bf1SMatan Azrad 	uint64_t size;
7765c060bf1SMatan Azrad 	uint64_t gpa;
7775c060bf1SMatan Azrad 
7785c060bf1SMatan Azrad 	memset(&vq, 0, sizeof(vq));
7795c060bf1SMatan Azrad 	vid = internal->vid;
7805c060bf1SMatan Azrad 	nr_vring = rte_vhost_get_vring_num(vid);
7815c060bf1SMatan Azrad 	rte_vhost_get_negotiated_features(vid, &hw->req_features);
7825c060bf1SMatan Azrad 
7835c060bf1SMatan Azrad 	for (i = 0; i < nr_vring; i++) {
7845c060bf1SMatan Azrad 		rte_vhost_get_vhost_vring(vid, i, &vq);
7855c060bf1SMatan Azrad 
786924e6b76SThomas Monjalon 		size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
787924e6b76SThomas Monjalon 				rte_mem_page_size());
788924e6b76SThomas Monjalon 		vring_buf = rte_zmalloc("ifcvf", size, rte_mem_page_size());
7895c060bf1SMatan Azrad 		vring_init(&internal->m_vring[i], vq.size, vring_buf,
790924e6b76SThomas Monjalon 				rte_mem_page_size());
7915c060bf1SMatan Azrad 
7925c060bf1SMatan Azrad 		ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
7935c060bf1SMatan Azrad 			(uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
7945c060bf1SMatan Azrad 		if (ret < 0) {
7955c060bf1SMatan Azrad 			DRV_LOG(ERR, "mediated vring DMA map failed.");
7965c060bf1SMatan Azrad 			goto error;
7975c060bf1SMatan Azrad 		}
7985c060bf1SMatan Azrad 
7995c060bf1SMatan Azrad 		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
8005c060bf1SMatan Azrad 		if (gpa == 0) {
8015c060bf1SMatan Azrad 			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
8025c060bf1SMatan Azrad 			return -1;
8035c060bf1SMatan Azrad 		}
8045c060bf1SMatan Azrad 		hw->vring[i].desc = gpa;
8055c060bf1SMatan Azrad 
8065c060bf1SMatan Azrad 		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
8075c060bf1SMatan Azrad 		if (gpa == 0) {
8085c060bf1SMatan Azrad 			DRV_LOG(ERR, "Fail to get GPA for available ring.");
8095c060bf1SMatan Azrad 			return -1;
8105c060bf1SMatan Azrad 		}
8115c060bf1SMatan Azrad 		hw->vring[i].avail = gpa;
8125c060bf1SMatan Azrad 
8137015b657SAndy Pei 		/* NET: Direct I/O for Tx queue, relay for Rx queue
8147015b657SAndy Pei 		 * BLK: relay every queue
8157015b657SAndy Pei 		 */
8167015b657SAndy Pei 		if ((internal->hw.device_type == IFCVF_NET) && (i & 1)) {
8175c060bf1SMatan Azrad 			gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
8185c060bf1SMatan Azrad 			if (gpa == 0) {
8195c060bf1SMatan Azrad 				DRV_LOG(ERR, "Fail to get GPA for used ring.");
8205c060bf1SMatan Azrad 				return -1;
8215c060bf1SMatan Azrad 			}
8225c060bf1SMatan Azrad 			hw->vring[i].used = gpa;
8235c060bf1SMatan Azrad 		} else {
8245c060bf1SMatan Azrad 			hw->vring[i].used = m_vring_iova +
8255c060bf1SMatan Azrad 				(char *)internal->m_vring[i].used -
8265c060bf1SMatan Azrad 				(char *)internal->m_vring[i].desc;
8275c060bf1SMatan Azrad 		}
8285c060bf1SMatan Azrad 
8295c060bf1SMatan Azrad 		hw->vring[i].size = vq.size;
8305c060bf1SMatan Azrad 
8315c060bf1SMatan Azrad 		rte_vhost_get_vring_base(vid, i,
8325c060bf1SMatan Azrad 				&internal->m_vring[i].avail->idx,
8335c060bf1SMatan Azrad 				&internal->m_vring[i].used->idx);
8345c060bf1SMatan Azrad 
8355c060bf1SMatan Azrad 		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
8365c060bf1SMatan Azrad 				&hw->vring[i].last_used_idx);
8375c060bf1SMatan Azrad 
8385c060bf1SMatan Azrad 		m_vring_iova += size;
8395c060bf1SMatan Azrad 	}
8405c060bf1SMatan Azrad 	hw->nr_vring = nr_vring;
8415c060bf1SMatan Azrad 
8425c060bf1SMatan Azrad 	return ifcvf_start_hw(&internal->hw);
8435c060bf1SMatan Azrad 
8445c060bf1SMatan Azrad error:
8455c060bf1SMatan Azrad 	for (i = 0; i < nr_vring; i++)
8465c060bf1SMatan Azrad 		rte_free(internal->m_vring[i].desc);
8475c060bf1SMatan Azrad 
8485c060bf1SMatan Azrad 	return -1;
8495c060bf1SMatan Azrad }
8505c060bf1SMatan Azrad 
8515c060bf1SMatan Azrad static int
8525c060bf1SMatan Azrad m_ifcvf_stop(struct ifcvf_internal *internal)
8535c060bf1SMatan Azrad {
8545c060bf1SMatan Azrad 	int vid;
8555c060bf1SMatan Azrad 	uint32_t i;
8565c060bf1SMatan Azrad 	struct rte_vhost_vring vq;
8575c060bf1SMatan Azrad 	struct ifcvf_hw *hw = &internal->hw;
8585c060bf1SMatan Azrad 	uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
8595c060bf1SMatan Azrad 	uint64_t size, len;
8608c34be17SAndy Pei 	u32 ring_state = 0;
8615c060bf1SMatan Azrad 
8625c060bf1SMatan Azrad 	vid = internal->vid;
8638c34be17SAndy Pei 
8648c34be17SAndy Pei 	/* to make sure no packet is lost for blk device
8658c34be17SAndy Pei 	 * do not stop until last_avail_idx == last_used_idx
8668c34be17SAndy Pei 	 */
8678c34be17SAndy Pei 	if (internal->hw.device_type == IFCVF_BLK) {
8688c34be17SAndy Pei 		for (i = 0; i < hw->nr_vring; i++) {
8698c34be17SAndy Pei 			do {
8708c34be17SAndy Pei 				if (hw->lm_cfg != NULL)
8718c34be17SAndy Pei 					ring_state = *(u32 *)(hw->lm_cfg +
8728c34be17SAndy Pei 						IFCVF_LM_RING_STATE_OFFSET +
8738c34be17SAndy Pei 						i * IFCVF_LM_CFG_SIZE);
8748c34be17SAndy Pei 				hw->vring[i].last_avail_idx =
8758c34be17SAndy Pei 					(u16)(ring_state & IFCVF_16_BIT_MASK);
8768c34be17SAndy Pei 				hw->vring[i].last_used_idx =
8778c34be17SAndy Pei 					(u16)(ring_state >> 16);
8788c34be17SAndy Pei 				usleep(10);
8798c34be17SAndy Pei 			} while (hw->vring[i].last_avail_idx !=
8808c34be17SAndy Pei 				hw->vring[i].last_used_idx);
8818c34be17SAndy Pei 		}
8828c34be17SAndy Pei 	}
8838c34be17SAndy Pei 
8845c060bf1SMatan Azrad 	ifcvf_stop_hw(hw);
8855c060bf1SMatan Azrad 
8865c060bf1SMatan Azrad 	for (i = 0; i < hw->nr_vring; i++) {
8875c060bf1SMatan Azrad 		/* synchronize remaining new used entries if any */
8887015b657SAndy Pei 		if (internal->hw.device_type == IFCVF_NET) {
8895c060bf1SMatan Azrad 			if ((i & 1) == 0)
8905c060bf1SMatan Azrad 				update_used_ring(internal, i);
8917015b657SAndy Pei 		} else if (internal->hw.device_type == IFCVF_BLK) {
8927015b657SAndy Pei 			update_used_ring(internal, i);
8937015b657SAndy Pei 		}
8945c060bf1SMatan Azrad 
8955c060bf1SMatan Azrad 		rte_vhost_get_vhost_vring(vid, i, &vq);
8965c060bf1SMatan Azrad 		len = IFCVF_USED_RING_LEN(vq.size);
8975c060bf1SMatan Azrad 		rte_vhost_log_used_vring(vid, i, 0, len);
8985c060bf1SMatan Azrad 
899924e6b76SThomas Monjalon 		size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
900924e6b76SThomas Monjalon 				rte_mem_page_size());
9015c060bf1SMatan Azrad 		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
9025c060bf1SMatan Azrad 			(uint64_t)(uintptr_t)internal->m_vring[i].desc,
9035c060bf1SMatan Azrad 			m_vring_iova, size);
9045c060bf1SMatan Azrad 
9055c060bf1SMatan Azrad 		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
9065c060bf1SMatan Azrad 				hw->vring[i].last_used_idx);
9075c060bf1SMatan Azrad 		rte_free(internal->m_vring[i].desc);
9085c060bf1SMatan Azrad 		m_vring_iova += size;
9095c060bf1SMatan Azrad 	}
9105c060bf1SMatan Azrad 
9115c060bf1SMatan Azrad 	return 0;
9125c060bf1SMatan Azrad }
9135c060bf1SMatan Azrad 
9145c060bf1SMatan Azrad static void
9155c060bf1SMatan Azrad update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
9165c060bf1SMatan Azrad {
9175c060bf1SMatan Azrad 	rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
9185c060bf1SMatan Azrad 	rte_vhost_vring_call(internal->vid, qid);
9195c060bf1SMatan Azrad }
9205c060bf1SMatan Azrad 
921a7ba40b2SThomas Monjalon static uint32_t
9225c060bf1SMatan Azrad vring_relay(void *arg)
9235c060bf1SMatan Azrad {
9245c060bf1SMatan Azrad 	int i, vid, epfd, fd, nfds;
9255c060bf1SMatan Azrad 	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
9265c060bf1SMatan Azrad 	struct rte_vhost_vring vring;
9275c060bf1SMatan Azrad 	uint16_t qid, q_num;
9285c060bf1SMatan Azrad 	struct epoll_event events[IFCVF_MAX_QUEUES * 4];
9295c060bf1SMatan Azrad 	struct epoll_event ev;
9305c060bf1SMatan Azrad 	int nbytes;
9315c060bf1SMatan Azrad 	uint64_t buf;
9325c060bf1SMatan Azrad 
9335c060bf1SMatan Azrad 	vid = internal->vid;
9345c060bf1SMatan Azrad 	q_num = rte_vhost_get_vring_num(vid);
9355c060bf1SMatan Azrad 
9365c060bf1SMatan Azrad 	/* add notify fd and interrupt fd to epoll */
9375c060bf1SMatan Azrad 	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
9385c060bf1SMatan Azrad 	if (epfd < 0) {
9395c060bf1SMatan Azrad 		DRV_LOG(ERR, "failed to create epoll instance.");
940a7ba40b2SThomas Monjalon 		return 1;
9415c060bf1SMatan Azrad 	}
9425c060bf1SMatan Azrad 	internal->epfd = epfd;
9435c060bf1SMatan Azrad 
9445c060bf1SMatan Azrad 	vring.kickfd = -1;
9455c060bf1SMatan Azrad 	for (qid = 0; qid < q_num; qid++) {
9465c060bf1SMatan Azrad 		ev.events = EPOLLIN | EPOLLPRI;
9475c060bf1SMatan Azrad 		rte_vhost_get_vhost_vring(vid, qid, &vring);
9485c060bf1SMatan Azrad 		ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
9495c060bf1SMatan Azrad 		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
9505c060bf1SMatan Azrad 			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
951a7ba40b2SThomas Monjalon 			return 1;
9525c060bf1SMatan Azrad 		}
9535c060bf1SMatan Azrad 	}
9545c060bf1SMatan Azrad 
9557015b657SAndy Pei 	for (qid = 0; qid < q_num; qid += 1) {
9567015b657SAndy Pei 		if ((internal->hw.device_type == IFCVF_NET) && (qid & 1))
9577015b657SAndy Pei 			continue;
9585c060bf1SMatan Azrad 		ev.events = EPOLLIN | EPOLLPRI;
9595c060bf1SMatan Azrad 		/* leave a flag to mark it's for interrupt */
9605c060bf1SMatan Azrad 		ev.data.u64 = 1 | qid << 1 |
9615c060bf1SMatan Azrad 			(uint64_t)internal->intr_fd[qid] << 32;
9625c060bf1SMatan Azrad 		if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
9635c060bf1SMatan Azrad 				< 0) {
9645c060bf1SMatan Azrad 			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
965a7ba40b2SThomas Monjalon 			return 1;
9665c060bf1SMatan Azrad 		}
9675c060bf1SMatan Azrad 		update_used_ring(internal, qid);
9685c060bf1SMatan Azrad 	}
9695c060bf1SMatan Azrad 
9705c060bf1SMatan Azrad 	/* start relay with a first kick */
9715c060bf1SMatan Azrad 	for (qid = 0; qid < q_num; qid++)
9725c060bf1SMatan Azrad 		ifcvf_notify_queue(&internal->hw, qid);
9735c060bf1SMatan Azrad 
9745c060bf1SMatan Azrad 	/* listen to the events and react accordingly */
9755c060bf1SMatan Azrad 	for (;;) {
9765c060bf1SMatan Azrad 		nfds = epoll_wait(epfd, events, q_num * 2, -1);
9775c060bf1SMatan Azrad 		if (nfds < 0) {
9785c060bf1SMatan Azrad 			if (errno == EINTR)
9795c060bf1SMatan Azrad 				continue;
98065575dadSAndy Pei 			DRV_LOG(ERR, "epoll_wait return fail.");
981a7ba40b2SThomas Monjalon 			return 1;
9825c060bf1SMatan Azrad 		}
9835c060bf1SMatan Azrad 
9845c060bf1SMatan Azrad 		for (i = 0; i < nfds; i++) {
9855c060bf1SMatan Azrad 			fd = (uint32_t)(events[i].data.u64 >> 32);
9865c060bf1SMatan Azrad 			do {
9875c060bf1SMatan Azrad 				nbytes = read(fd, &buf, 8);
9885c060bf1SMatan Azrad 				if (nbytes < 0) {
9895c060bf1SMatan Azrad 					if (errno == EINTR ||
9905c060bf1SMatan Azrad 					    errno == EWOULDBLOCK ||
9915c060bf1SMatan Azrad 					    errno == EAGAIN)
9925c060bf1SMatan Azrad 						continue;
9935c060bf1SMatan Azrad 					DRV_LOG(INFO, "Error reading "
9945c060bf1SMatan Azrad 						"kickfd: %s",
9955c060bf1SMatan Azrad 						strerror(errno));
9965c060bf1SMatan Azrad 				}
9975c060bf1SMatan Azrad 				break;
9985c060bf1SMatan Azrad 			} while (1);
9995c060bf1SMatan Azrad 
10005c060bf1SMatan Azrad 			qid = events[i].data.u32 >> 1;
10015c060bf1SMatan Azrad 
10025c060bf1SMatan Azrad 			if (events[i].data.u32 & 1)
10035c060bf1SMatan Azrad 				update_used_ring(internal, qid);
10045c060bf1SMatan Azrad 			else
10055c060bf1SMatan Azrad 				ifcvf_notify_queue(&internal->hw, qid);
10065c060bf1SMatan Azrad 		}
10075c060bf1SMatan Azrad 	}
10085c060bf1SMatan Azrad 
1009a7ba40b2SThomas Monjalon 	return 0;
10105c060bf1SMatan Azrad }
10115c060bf1SMatan Azrad 
10125c060bf1SMatan Azrad static int
10135c060bf1SMatan Azrad setup_vring_relay(struct ifcvf_internal *internal)
10145c060bf1SMatan Azrad {
1015a7ba40b2SThomas Monjalon 	char name[RTE_THREAD_INTERNAL_NAME_SIZE];
10165c060bf1SMatan Azrad 	int ret;
10175c060bf1SMatan Azrad 
1018a7ba40b2SThomas Monjalon 	snprintf(name, sizeof(name), "ifc-ring%d", internal->vid);
1019a7ba40b2SThomas Monjalon 	ret = rte_thread_create_internal_control(&internal->tid, name,
1020a7ba40b2SThomas Monjalon 			vring_relay, internal);
1021a011555fSChengwen Feng 	if (ret != 0) {
10225c060bf1SMatan Azrad 		DRV_LOG(ERR, "failed to create ring relay pthread.");
10235c060bf1SMatan Azrad 		return -1;
10245c060bf1SMatan Azrad 	}
1025a011555fSChengwen Feng 
10265c060bf1SMatan Azrad 	return 0;
10275c060bf1SMatan Azrad }
10285c060bf1SMatan Azrad 
10295c060bf1SMatan Azrad static int
10305c060bf1SMatan Azrad unset_vring_relay(struct ifcvf_internal *internal)
10315c060bf1SMatan Azrad {
1032a7ba40b2SThomas Monjalon 	if (internal->tid.opaque_id != 0) {
1033a7ba40b2SThomas Monjalon 		pthread_cancel((pthread_t)internal->tid.opaque_id);
1034a7ba40b2SThomas Monjalon 		rte_thread_join(internal->tid, NULL);
10355c060bf1SMatan Azrad 	}
1036a7ba40b2SThomas Monjalon 	internal->tid.opaque_id = 0;
10375c060bf1SMatan Azrad 
10385c060bf1SMatan Azrad 	if (internal->epfd >= 0)
10395c060bf1SMatan Azrad 		close(internal->epfd);
10405c060bf1SMatan Azrad 	internal->epfd = -1;
10415c060bf1SMatan Azrad 
10425c060bf1SMatan Azrad 	return 0;
10435c060bf1SMatan Azrad }
10445c060bf1SMatan Azrad 
10455c060bf1SMatan Azrad static int
10465c060bf1SMatan Azrad ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
10475c060bf1SMatan Azrad {
10485c060bf1SMatan Azrad 	int ret;
10495c060bf1SMatan Azrad 	int vid = internal->vid;
10505c060bf1SMatan Azrad 
10515c060bf1SMatan Azrad 	/* stop the direct IO data path */
10525c060bf1SMatan Azrad 	unset_notify_relay(internal);
10535c060bf1SMatan Azrad 	vdpa_ifcvf_stop(internal);
105465575dadSAndy Pei 
105565575dadSAndy Pei 	unset_intr_relay(internal);
105665575dadSAndy Pei 
10575c060bf1SMatan Azrad 	vdpa_disable_vfio_intr(internal);
10585c060bf1SMatan Azrad 
1059ef3be7e2SAndy Pei 	rte_atomic32_set(&internal->running, 0);
1060ef3be7e2SAndy Pei 
10610329868dSMatan Azrad 	ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
10625c060bf1SMatan Azrad 	if (ret && ret != -ENOTSUP)
10635c060bf1SMatan Azrad 		goto error;
10645c060bf1SMatan Azrad 
10655c060bf1SMatan Azrad 	/* set up interrupt for interrupt relay */
10665abb634cSJilei Chen 	ret = vdpa_enable_vfio_intr(internal, true);
10675c060bf1SMatan Azrad 	if (ret)
10685c060bf1SMatan Azrad 		goto unmap;
10695c060bf1SMatan Azrad 
10705c060bf1SMatan Azrad 	/* config the VF */
10715c060bf1SMatan Azrad 	ret = m_ifcvf_start(internal);
10725c060bf1SMatan Azrad 	if (ret)
10735c060bf1SMatan Azrad 		goto unset_intr;
10745c060bf1SMatan Azrad 
10755c060bf1SMatan Azrad 	/* set up vring relay thread */
10765c060bf1SMatan Azrad 	ret = setup_vring_relay(internal);
10775c060bf1SMatan Azrad 	if (ret)
10785c060bf1SMatan Azrad 		goto stop_vf;
10795c060bf1SMatan Azrad 
10800329868dSMatan Azrad 	rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);
10815c060bf1SMatan Azrad 
10825c060bf1SMatan Azrad 	internal->sw_fallback_running = true;
10835c060bf1SMatan Azrad 
10845c060bf1SMatan Azrad 	return 0;
10855c060bf1SMatan Azrad 
10865c060bf1SMatan Azrad stop_vf:
10875c060bf1SMatan Azrad 	m_ifcvf_stop(internal);
10885c060bf1SMatan Azrad unset_intr:
10895c060bf1SMatan Azrad 	vdpa_disable_vfio_intr(internal);
10905c060bf1SMatan Azrad unmap:
10915abb634cSJilei Chen 	ifcvf_dma_map(internal, false);
10925c060bf1SMatan Azrad error:
10935c060bf1SMatan Azrad 	return -1;
10945c060bf1SMatan Azrad }
10955c060bf1SMatan Azrad 
10965c060bf1SMatan Azrad static int
10975c060bf1SMatan Azrad ifcvf_dev_config(int vid)
10985c060bf1SMatan Azrad {
109981a6b7feSMaxime Coquelin 	struct rte_vdpa_device *vdev;
11005c060bf1SMatan Azrad 	struct internal_list *list;
11015c060bf1SMatan Azrad 	struct ifcvf_internal *internal;
110283c7370aSAndy Pei 	struct ifcvf_hw *hw;
110383c7370aSAndy Pei 	uint16_t i;
11045c060bf1SMatan Azrad 
11052263f139SMaxime Coquelin 	vdev = rte_vhost_get_vdpa_device(vid);
110681a6b7feSMaxime Coquelin 	list = find_internal_resource_by_vdev(vdev);
11075c060bf1SMatan Azrad 	if (list == NULL) {
110881a6b7feSMaxime Coquelin 		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
11095c060bf1SMatan Azrad 		return -1;
11105c060bf1SMatan Azrad 	}
11115c060bf1SMatan Azrad 
11125c060bf1SMatan Azrad 	internal = list->internal;
11135c060bf1SMatan Azrad 	internal->vid = vid;
11145c060bf1SMatan Azrad 	rte_atomic32_set(&internal->dev_attached, 1);
1115903ec2b1STaekyung Kim 	if (update_datapath(internal) < 0) {
1116903ec2b1STaekyung Kim 		DRV_LOG(ERR, "failed to update datapath for vDPA device %s",
1117903ec2b1STaekyung Kim 			vdev->device->name);
1118903ec2b1STaekyung Kim 		rte_atomic32_set(&internal->dev_attached, 0);
1119903ec2b1STaekyung Kim 		return -1;
1120903ec2b1STaekyung Kim 	}
11215c060bf1SMatan Azrad 
112283c7370aSAndy Pei 	hw = &internal->hw;
112383c7370aSAndy Pei 	for (i = 0; i < hw->nr_vring; i++) {
112483c7370aSAndy Pei 		if (!hw->vring[i].enable)
112583c7370aSAndy Pei 			continue;
112683c7370aSAndy Pei 		if (rte_vhost_host_notifier_ctrl(vid, i, true) != 0)
112781a6b7feSMaxime Coquelin 			DRV_LOG(NOTICE, "vDPA (%s): software relay is used.",
112881a6b7feSMaxime Coquelin 				vdev->device->name);
112983c7370aSAndy Pei 	}
11305c060bf1SMatan Azrad 
1131e2a1a08aSChenbo Xia 	internal->configured = 1;
113283c7370aSAndy Pei 	DRV_LOG(INFO, "vDPA device %s is configured", vdev->device->name);
11335c060bf1SMatan Azrad 	return 0;
11345c060bf1SMatan Azrad }
11355c060bf1SMatan Azrad 
11365c060bf1SMatan Azrad static int
11375c060bf1SMatan Azrad ifcvf_dev_close(int vid)
11385c060bf1SMatan Azrad {
113981a6b7feSMaxime Coquelin 	struct rte_vdpa_device *vdev;
11405c060bf1SMatan Azrad 	struct internal_list *list;
11415c060bf1SMatan Azrad 	struct ifcvf_internal *internal;
11425c060bf1SMatan Azrad 
11432263f139SMaxime Coquelin 	vdev = rte_vhost_get_vdpa_device(vid);
114481a6b7feSMaxime Coquelin 	list = find_internal_resource_by_vdev(vdev);
11455c060bf1SMatan Azrad 	if (list == NULL) {
114681a6b7feSMaxime Coquelin 		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
11475c060bf1SMatan Azrad 		return -1;
11485c060bf1SMatan Azrad 	}
11495c060bf1SMatan Azrad 
11505c060bf1SMatan Azrad 	internal = list->internal;
11515c060bf1SMatan Azrad 
11525c060bf1SMatan Azrad 	if (internal->sw_fallback_running) {
11535c060bf1SMatan Azrad 		/* unset ring relay */
11545c060bf1SMatan Azrad 		unset_vring_relay(internal);
11555c060bf1SMatan Azrad 
11565c060bf1SMatan Azrad 		/* reset VF */
11575c060bf1SMatan Azrad 		m_ifcvf_stop(internal);
11585c060bf1SMatan Azrad 
11595c060bf1SMatan Azrad 		/* remove interrupt setting */
11605c060bf1SMatan Azrad 		vdpa_disable_vfio_intr(internal);
11615c060bf1SMatan Azrad 
11625c060bf1SMatan Azrad 		/* unset DMA map for guest memory */
11635abb634cSJilei Chen 		ifcvf_dma_map(internal, false);
11645c060bf1SMatan Azrad 
11655c060bf1SMatan Azrad 		internal->sw_fallback_running = false;
11665c060bf1SMatan Azrad 	} else {
11675c060bf1SMatan Azrad 		rte_atomic32_set(&internal->dev_attached, 0);
1168903ec2b1STaekyung Kim 		if (update_datapath(internal) < 0) {
1169903ec2b1STaekyung Kim 			DRV_LOG(ERR, "failed to update datapath for vDPA device %s",
1170903ec2b1STaekyung Kim 				vdev->device->name);
1171903ec2b1STaekyung Kim 			internal->configured = 0;
1172903ec2b1STaekyung Kim 			return -1;
1173903ec2b1STaekyung Kim 		}
11745c060bf1SMatan Azrad 	}
11755c060bf1SMatan Azrad 
1176e2a1a08aSChenbo Xia 	internal->configured = 0;
11775c060bf1SMatan Azrad 	return 0;
11785c060bf1SMatan Azrad }
11795c060bf1SMatan Azrad 
11805c060bf1SMatan Azrad static int
11815c060bf1SMatan Azrad ifcvf_set_features(int vid)
11825c060bf1SMatan Azrad {
11835c060bf1SMatan Azrad 	uint64_t features = 0;
118481a6b7feSMaxime Coquelin 	struct rte_vdpa_device *vdev;
11855c060bf1SMatan Azrad 	struct internal_list *list;
11865c060bf1SMatan Azrad 	struct ifcvf_internal *internal;
11875c060bf1SMatan Azrad 	uint64_t log_base = 0, log_size = 0;
11885c060bf1SMatan Azrad 
11892263f139SMaxime Coquelin 	vdev = rte_vhost_get_vdpa_device(vid);
119081a6b7feSMaxime Coquelin 	list = find_internal_resource_by_vdev(vdev);
11915c060bf1SMatan Azrad 	if (list == NULL) {
119281a6b7feSMaxime Coquelin 		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
11935c060bf1SMatan Azrad 		return -1;
11945c060bf1SMatan Azrad 	}
11955c060bf1SMatan Azrad 
11965c060bf1SMatan Azrad 	internal = list->internal;
11975c060bf1SMatan Azrad 	rte_vhost_get_negotiated_features(vid, &features);
11985c060bf1SMatan Azrad 
11995c060bf1SMatan Azrad 	if (!RTE_VHOST_NEED_LOG(features))
12005c060bf1SMatan Azrad 		return 0;
12015c060bf1SMatan Azrad 
12025c060bf1SMatan Azrad 	if (internal->sw_lm) {
12035c060bf1SMatan Azrad 		ifcvf_sw_fallback_switchover(internal);
12045c060bf1SMatan Azrad 	} else {
12055c060bf1SMatan Azrad 		rte_vhost_get_log_base(vid, &log_base, &log_size);
12065c060bf1SMatan Azrad 		rte_vfio_container_dma_map(internal->vfio_container_fd,
12075c060bf1SMatan Azrad 				log_base, IFCVF_LOG_BASE, log_size);
12085c060bf1SMatan Azrad 		ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
12095c060bf1SMatan Azrad 	}
12105c060bf1SMatan Azrad 
12115c060bf1SMatan Azrad 	return 0;
12125c060bf1SMatan Azrad }
12135c060bf1SMatan Azrad 
12145c060bf1SMatan Azrad static int
12155c060bf1SMatan Azrad ifcvf_get_vfio_group_fd(int vid)
12165c060bf1SMatan Azrad {
121781a6b7feSMaxime Coquelin 	struct rte_vdpa_device *vdev;
12185c060bf1SMatan Azrad 	struct internal_list *list;
12195c060bf1SMatan Azrad 
12202263f139SMaxime Coquelin 	vdev = rte_vhost_get_vdpa_device(vid);
122181a6b7feSMaxime Coquelin 	list = find_internal_resource_by_vdev(vdev);
12225c060bf1SMatan Azrad 	if (list == NULL) {
122381a6b7feSMaxime Coquelin 		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
12245c060bf1SMatan Azrad 		return -1;
12255c060bf1SMatan Azrad 	}
12265c060bf1SMatan Azrad 
12275c060bf1SMatan Azrad 	return list->internal->vfio_group_fd;
12285c060bf1SMatan Azrad }
12295c060bf1SMatan Azrad 
12305c060bf1SMatan Azrad static int
12315c060bf1SMatan Azrad ifcvf_get_vfio_device_fd(int vid)
12325c060bf1SMatan Azrad {
123381a6b7feSMaxime Coquelin 	struct rte_vdpa_device *vdev;
12345c060bf1SMatan Azrad 	struct internal_list *list;
12355c060bf1SMatan Azrad 
12362263f139SMaxime Coquelin 	vdev = rte_vhost_get_vdpa_device(vid);
123781a6b7feSMaxime Coquelin 	list = find_internal_resource_by_vdev(vdev);
12385c060bf1SMatan Azrad 	if (list == NULL) {
123981a6b7feSMaxime Coquelin 		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
12405c060bf1SMatan Azrad 		return -1;
12415c060bf1SMatan Azrad 	}
12425c060bf1SMatan Azrad 
12435c060bf1SMatan Azrad 	return list->internal->vfio_dev_fd;
12445c060bf1SMatan Azrad }
12455c060bf1SMatan Azrad 
12465c060bf1SMatan Azrad static int
12475c060bf1SMatan Azrad ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
12485c060bf1SMatan Azrad {
124981a6b7feSMaxime Coquelin 	struct rte_vdpa_device *vdev;
12505c060bf1SMatan Azrad 	struct internal_list *list;
12515c060bf1SMatan Azrad 	struct ifcvf_internal *internal;
12525c060bf1SMatan Azrad 	struct vfio_region_info reg = { .argsz = sizeof(reg) };
12535c060bf1SMatan Azrad 	int ret;
12545c060bf1SMatan Azrad 
12552263f139SMaxime Coquelin 	vdev = rte_vhost_get_vdpa_device(vid);
125681a6b7feSMaxime Coquelin 	list = find_internal_resource_by_vdev(vdev);
12575c060bf1SMatan Azrad 	if (list == NULL) {
125881a6b7feSMaxime Coquelin 		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
12595c060bf1SMatan Azrad 		return -1;
12605c060bf1SMatan Azrad 	}
12615c060bf1SMatan Azrad 
12625c060bf1SMatan Azrad 	internal = list->internal;
12635c060bf1SMatan Azrad 
12645c060bf1SMatan Azrad 	reg.index = ifcvf_get_notify_region(&internal->hw);
12655c060bf1SMatan Azrad 	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
12665c060bf1SMatan Azrad 	if (ret) {
12675c060bf1SMatan Azrad 		DRV_LOG(ERR, "Get not get device region info: %s",
12685c060bf1SMatan Azrad 				strerror(errno));
12695c060bf1SMatan Azrad 		return -1;
12705c060bf1SMatan Azrad 	}
12715c060bf1SMatan Azrad 
12725c060bf1SMatan Azrad 	*offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
12735c060bf1SMatan Azrad 	*size = 0x1000;
12745c060bf1SMatan Azrad 
12755c060bf1SMatan Azrad 	return 0;
12765c060bf1SMatan Azrad }
12775c060bf1SMatan Azrad 
12785c060bf1SMatan Azrad static int
127981a6b7feSMaxime Coquelin ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
12805c060bf1SMatan Azrad {
12815c060bf1SMatan Azrad 	struct internal_list *list;
12825c060bf1SMatan Azrad 
128381a6b7feSMaxime Coquelin 	list = find_internal_resource_by_vdev(vdev);
12845c060bf1SMatan Azrad 	if (list == NULL) {
128581a6b7feSMaxime Coquelin 		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
12865c060bf1SMatan Azrad 		return -1;
12875c060bf1SMatan Azrad 	}
12885c060bf1SMatan Azrad 
12895c060bf1SMatan Azrad 	*queue_num = list->internal->max_queues;
12905c060bf1SMatan Azrad 
12915c060bf1SMatan Azrad 	return 0;
12925c060bf1SMatan Azrad }
12935c060bf1SMatan Azrad 
12945c060bf1SMatan Azrad static int
129581a6b7feSMaxime Coquelin ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
12965c060bf1SMatan Azrad {
12975c060bf1SMatan Azrad 	struct internal_list *list;
12985c060bf1SMatan Azrad 
129981a6b7feSMaxime Coquelin 	list = find_internal_resource_by_vdev(vdev);
13005c060bf1SMatan Azrad 	if (list == NULL) {
130181a6b7feSMaxime Coquelin 		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
13025c060bf1SMatan Azrad 		return -1;
13035c060bf1SMatan Azrad 	}
13045c060bf1SMatan Azrad 
13055c060bf1SMatan Azrad 	*features = list->internal->features;
13065c060bf1SMatan Azrad 
13075c060bf1SMatan Azrad 	return 0;
13085c060bf1SMatan Azrad }
13095c060bf1SMatan Azrad 
/* vhost-user protocol feature bits reported by ifcvf_get_protocol_features(). */
#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
		((1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \
		 (1ULL << VHOST_USER_PROTOCOL_F_BACKEND_REQ) | \
		 (1ULL << VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD) | \
		 (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
		 (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) | \
		 (1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
		 (1ULL << VHOST_USER_PROTOCOL_F_STATUS))

/* Additional protocol feature bit defined for BLK devices. */
#define VDPA_BLK_PROTOCOL_FEATURES \
		(1ULL << VHOST_USER_PROTOCOL_F_CONFIG)
1321856d03bcSAndy Pei 
13225c060bf1SMatan Azrad static int
132381a6b7feSMaxime Coquelin ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
13245c060bf1SMatan Azrad {
132581a6b7feSMaxime Coquelin 	RTE_SET_USED(vdev);
132681a6b7feSMaxime Coquelin 
13275c060bf1SMatan Azrad 	*features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
13285c060bf1SMatan Azrad 	return 0;
13295c060bf1SMatan Azrad }
13305c060bf1SMatan Azrad 
1331e2a1a08aSChenbo Xia static int
1332a9a56423SHuang Wei ifcvf_config_vring(struct ifcvf_internal *internal, int vring)
1333a9a56423SHuang Wei {
1334a9a56423SHuang Wei 	struct ifcvf_hw *hw = &internal->hw;
1335a9a56423SHuang Wei 	int vid = internal->vid;
1336a9a56423SHuang Wei 	struct rte_vhost_vring vq;
1337a9a56423SHuang Wei 	uint64_t gpa;
1338a9a56423SHuang Wei 
1339a9a56423SHuang Wei 	if (hw->vring[vring].enable) {
1340a9a56423SHuang Wei 		rte_vhost_get_vhost_vring(vid, vring, &vq);
1341a9a56423SHuang Wei 		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
1342a9a56423SHuang Wei 		if (gpa == 0) {
1343a9a56423SHuang Wei 			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
1344a9a56423SHuang Wei 			return -1;
1345a9a56423SHuang Wei 		}
1346a9a56423SHuang Wei 		hw->vring[vring].desc = gpa;
1347a9a56423SHuang Wei 
1348a9a56423SHuang Wei 		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
1349a9a56423SHuang Wei 		if (gpa == 0) {
1350a9a56423SHuang Wei 			DRV_LOG(ERR, "Fail to get GPA for available ring.");
1351a9a56423SHuang Wei 			return -1;
1352a9a56423SHuang Wei 		}
1353a9a56423SHuang Wei 		hw->vring[vring].avail = gpa;
1354a9a56423SHuang Wei 
1355a9a56423SHuang Wei 		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
1356a9a56423SHuang Wei 		if (gpa == 0) {
1357a9a56423SHuang Wei 			DRV_LOG(ERR, "Fail to get GPA for used ring.");
1358a9a56423SHuang Wei 			return -1;
1359a9a56423SHuang Wei 		}
1360a9a56423SHuang Wei 		hw->vring[vring].used = gpa;
1361a9a56423SHuang Wei 
1362a9a56423SHuang Wei 		hw->vring[vring].size = vq.size;
1363a9a56423SHuang Wei 		rte_vhost_get_vring_base(vid, vring,
1364a9a56423SHuang Wei 				&hw->vring[vring].last_avail_idx,
1365a9a56423SHuang Wei 				&hw->vring[vring].last_used_idx);
1366a9a56423SHuang Wei 		ifcvf_enable_vring_hw(&internal->hw, vring);
1367a9a56423SHuang Wei 	} else {
1368a9a56423SHuang Wei 		ifcvf_disable_vring_hw(&internal->hw, vring);
1369a9a56423SHuang Wei 		rte_vhost_set_vring_base(vid, vring,
1370a9a56423SHuang Wei 				hw->vring[vring].last_avail_idx,
1371a9a56423SHuang Wei 				hw->vring[vring].last_used_idx);
1372a9a56423SHuang Wei 	}
1373a9a56423SHuang Wei 
1374a9a56423SHuang Wei 	return 0;
1375a9a56423SHuang Wei }
1376a9a56423SHuang Wei 
1377a9a56423SHuang Wei static int
1378e2a1a08aSChenbo Xia ifcvf_set_vring_state(int vid, int vring, int state)
1379e2a1a08aSChenbo Xia {
1380e2a1a08aSChenbo Xia 	struct rte_vdpa_device *vdev;
1381e2a1a08aSChenbo Xia 	struct internal_list *list;
1382e2a1a08aSChenbo Xia 	struct ifcvf_internal *internal;
1383e2a1a08aSChenbo Xia 	struct ifcvf_hw *hw;
1384a9a56423SHuang Wei 	bool enable = !!state;
1385e2a1a08aSChenbo Xia 	int ret = 0;
1386e2a1a08aSChenbo Xia 
1387e2a1a08aSChenbo Xia 	vdev = rte_vhost_get_vdpa_device(vid);
1388e2a1a08aSChenbo Xia 	list = find_internal_resource_by_vdev(vdev);
1389e2a1a08aSChenbo Xia 	if (list == NULL) {
1390e2a1a08aSChenbo Xia 		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1391e2a1a08aSChenbo Xia 		return -1;
1392e2a1a08aSChenbo Xia 	}
1393e2a1a08aSChenbo Xia 
1394a9a56423SHuang Wei 	DRV_LOG(INFO, "%s queue %d of vDPA device %s",
1395a9a56423SHuang Wei 		enable ? "enable" : "disable", vring, vdev->device->name);
1396a9a56423SHuang Wei 
1397e2a1a08aSChenbo Xia 	internal = list->internal;
1398e2a1a08aSChenbo Xia 	if (vring < 0 || vring >= internal->max_queues * 2) {
1399e2a1a08aSChenbo Xia 		DRV_LOG(ERR, "Vring index %d not correct", vring);
1400e2a1a08aSChenbo Xia 		return -1;
1401e2a1a08aSChenbo Xia 	}
1402e2a1a08aSChenbo Xia 
1403e2a1a08aSChenbo Xia 	hw = &internal->hw;
1404a9a56423SHuang Wei 	hw->vring[vring].enable = enable;
1405a9a56423SHuang Wei 
1406e2a1a08aSChenbo Xia 	if (!internal->configured)
1407a9a56423SHuang Wei 		return 0;
1408e2a1a08aSChenbo Xia 
1409a9a56423SHuang Wei 	unset_notify_relay(internal);
1410e2a1a08aSChenbo Xia 
14115abb634cSJilei Chen 	ret = vdpa_enable_vfio_intr(internal, false);
1412a9a56423SHuang Wei 	if (ret) {
1413a9a56423SHuang Wei 		DRV_LOG(ERR, "failed to set vfio interrupt of vDPA device %s",
1414a9a56423SHuang Wei 			vdev->device->name);
1415e2a1a08aSChenbo Xia 		return ret;
1416e2a1a08aSChenbo Xia 	}
1417e2a1a08aSChenbo Xia 
1418a9a56423SHuang Wei 	ret = ifcvf_config_vring(internal, vring);
1419a9a56423SHuang Wei 	if (ret) {
1420a9a56423SHuang Wei 		DRV_LOG(ERR, "failed to configure queue %d of vDPA device %s",
1421a9a56423SHuang Wei 			vring, vdev->device->name);
1422a9a56423SHuang Wei 		return ret;
1423a9a56423SHuang Wei 	}
1424a9a56423SHuang Wei 
1425a9a56423SHuang Wei 	ret = setup_notify_relay(internal);
1426a9a56423SHuang Wei 	if (ret) {
1427a9a56423SHuang Wei 		DRV_LOG(ERR, "failed to setup notify relay of vDPA device %s",
1428a9a56423SHuang Wei 			vdev->device->name);
1429a9a56423SHuang Wei 		return ret;
1430a9a56423SHuang Wei 	}
1431a9a56423SHuang Wei 
1432a9a56423SHuang Wei 	ret = rte_vhost_host_notifier_ctrl(vid, vring, enable);
1433a9a56423SHuang Wei 	if (ret) {
1434a9a56423SHuang Wei 		DRV_LOG(ERR, "vDPA device %s queue %d host notifier ctrl fail",
1435a9a56423SHuang Wei 			vdev->device->name, vring);
1436a9a56423SHuang Wei 		return ret;
1437a9a56423SHuang Wei 	}
1438a9a56423SHuang Wei 
1439e2a1a08aSChenbo Xia 	return 0;
1440e2a1a08aSChenbo Xia }
1441e2a1a08aSChenbo Xia 
14422872943cSAndy Pei static int
14432872943cSAndy Pei ifcvf_get_device_type(struct rte_vdpa_device *vdev,
14442872943cSAndy Pei 	uint32_t *type)
14452872943cSAndy Pei {
14462872943cSAndy Pei 	struct ifcvf_internal *internal;
14472872943cSAndy Pei 	struct internal_list *list;
144810d0458aSAndy Pei 	struct rte_device *rte_dev = vdev->device;
14492872943cSAndy Pei 
145010d0458aSAndy Pei 	list = find_internal_resource_by_rte_dev(rte_dev);
14512872943cSAndy Pei 	if (list == NULL) {
145210d0458aSAndy Pei 		DRV_LOG(ERR, "Invalid rte device: %p", rte_dev);
14532872943cSAndy Pei 		return -1;
14542872943cSAndy Pei 	}
14552872943cSAndy Pei 
14562872943cSAndy Pei 	internal = list->internal;
14572872943cSAndy Pei 
14582872943cSAndy Pei 	if (internal->hw.device_type == IFCVF_BLK)
14592872943cSAndy Pei 		*type = RTE_VHOST_VDPA_DEVICE_TYPE_BLK;
14602872943cSAndy Pei 	else
14612872943cSAndy Pei 		*type = RTE_VHOST_VDPA_DEVICE_TYPE_NET;
14622872943cSAndy Pei 
14632872943cSAndy Pei 	return 0;
14642872943cSAndy Pei }
14652872943cSAndy Pei 
1466a60b747dSAndy Pei static struct rte_vdpa_dev_ops ifcvf_net_ops = {
14675c060bf1SMatan Azrad 	.get_queue_num = ifcvf_get_queue_num,
14685c060bf1SMatan Azrad 	.get_features = ifcvf_get_vdpa_features,
14695c060bf1SMatan Azrad 	.get_protocol_features = ifcvf_get_protocol_features,
14705c060bf1SMatan Azrad 	.dev_conf = ifcvf_dev_config,
14715c060bf1SMatan Azrad 	.dev_close = ifcvf_dev_close,
1472e2a1a08aSChenbo Xia 	.set_vring_state = ifcvf_set_vring_state,
14735c060bf1SMatan Azrad 	.set_features = ifcvf_set_features,
14745c060bf1SMatan Azrad 	.migration_done = NULL,
14755c060bf1SMatan Azrad 	.get_vfio_group_fd = ifcvf_get_vfio_group_fd,
14765c060bf1SMatan Azrad 	.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
14775c060bf1SMatan Azrad 	.get_notify_area = ifcvf_get_notify_area,
14782872943cSAndy Pei 	.get_dev_type = ifcvf_get_device_type,
14795c060bf1SMatan Azrad };
14805c060bf1SMatan Azrad 
/*
 * kvargs handler: parse an unsigned integer argument into an int.
 *
 * The value is parsed with strtoul() using base 0, so decimal, octal (0...)
 * and hexadecimal (0x...) notations are accepted. The whole string must be
 * consumed and the result must fit in 16 bits.
 *
 * @param key         Argument name (unused).
 * @param value       Argument text to parse.
 * @param extra_args  Points to the int that receives the parsed value; the
 *                    callers in this file pass the address of an int.
 * @return            0 on success, -EINVAL for a NULL pointer or a string
 *                    that is not entirely a number, -1 if the number is out
 *                    of range. The output is only written on success.
 */
static inline int
open_int(const char *key, const char *value, void *extra_args)
{
	int *out = extra_args;
	unsigned long parsed;
	char *end = NULL;

	(void)key;

	if (value == NULL || extra_args == NULL)
		return -EINVAL;

	/* strtoul() only sets errno on failure; clear any stale value first. */
	errno = 0;
	parsed = strtoul(value, &end, 0);
	if (end == value || *end != '\0')
		return -EINVAL;
	if (errno == ERANGE || parsed > USHRT_MAX)
		return -1;

	*out = (int)parsed;

	return 0;
}
14955c060bf1SMatan Azrad 
1496a60b747dSAndy Pei static int16_t
1497a60b747dSAndy Pei ifcvf_pci_get_device_type(struct rte_pci_device *pci_dev)
1498a60b747dSAndy Pei {
1499a60b747dSAndy Pei 	uint16_t pci_device_id = pci_dev->id.device_id;
1500a60b747dSAndy Pei 	uint16_t device_id;
1501a60b747dSAndy Pei 
1502a60b747dSAndy Pei 	if (pci_device_id < 0x1000 || pci_device_id > 0x107f) {
1503f665790aSDavid Marchand 		DRV_LOG(ERR, "Probe device is not a virtio device");
1504a60b747dSAndy Pei 		return -1;
1505a60b747dSAndy Pei 	}
1506a60b747dSAndy Pei 
1507a60b747dSAndy Pei 	if (pci_device_id < 0x1040) {
1508a60b747dSAndy Pei 		/* Transitional devices: use the PCI subsystem device id as
1509a60b747dSAndy Pei 		 * virtio device id, same as legacy driver always did.
1510a60b747dSAndy Pei 		 */
1511a60b747dSAndy Pei 		device_id = pci_dev->id.subsystem_device_id;
1512a60b747dSAndy Pei 	} else {
1513a60b747dSAndy Pei 		/* Modern devices: simply use PCI device id,
1514a60b747dSAndy Pei 		 * but start from 0x1040.
1515a60b747dSAndy Pei 		 */
1516a60b747dSAndy Pei 		device_id = pci_device_id - 0x1040;
1517a60b747dSAndy Pei 	}
1518a60b747dSAndy Pei 
1519a60b747dSAndy Pei 	return device_id;
1520a60b747dSAndy Pei }
1521a60b747dSAndy Pei 
1522856d03bcSAndy Pei static int
1523856d03bcSAndy Pei ifcvf_blk_get_config(int vid, uint8_t *config, uint32_t size)
1524856d03bcSAndy Pei {
1525856d03bcSAndy Pei 	struct virtio_blk_config *dev_cfg;
1526856d03bcSAndy Pei 	struct ifcvf_internal *internal;
1527856d03bcSAndy Pei 	struct rte_vdpa_device *vdev;
1528856d03bcSAndy Pei 	struct internal_list *list;
1529856d03bcSAndy Pei 	uint32_t i;
1530856d03bcSAndy Pei 	uint64_t capacity = 0;
1531856d03bcSAndy Pei 	uint8_t *byte;
1532856d03bcSAndy Pei 
153398c60961SAndy Pei 	if (size < sizeof(struct virtio_blk_config)) {
1534856d03bcSAndy Pei 		DRV_LOG(ERR, "Invalid len: %u, required: %u",
1535856d03bcSAndy Pei 			size, (uint32_t)sizeof(struct virtio_blk_config));
1536856d03bcSAndy Pei 		return -1;
1537856d03bcSAndy Pei 	}
1538856d03bcSAndy Pei 
1539856d03bcSAndy Pei 	vdev = rte_vhost_get_vdpa_device(vid);
1540856d03bcSAndy Pei 	if (vdev == NULL) {
1541856d03bcSAndy Pei 		DRV_LOG(ERR, "Invalid vDPA device vid: %d", vid);
1542856d03bcSAndy Pei 		return -1;
1543856d03bcSAndy Pei 	}
1544856d03bcSAndy Pei 
1545856d03bcSAndy Pei 	list = find_internal_resource_by_vdev(vdev);
1546856d03bcSAndy Pei 	if (list == NULL) {
1547856d03bcSAndy Pei 		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1548856d03bcSAndy Pei 		return -1;
1549856d03bcSAndy Pei 	}
1550856d03bcSAndy Pei 
1551856d03bcSAndy Pei 	internal = list->internal;
1552856d03bcSAndy Pei 
1553856d03bcSAndy Pei 	for (i = 0; i < sizeof(struct virtio_blk_config); i++)
1554856d03bcSAndy Pei 		config[i] = *((u8 *)internal->hw.blk_cfg + i);
1555856d03bcSAndy Pei 
1556856d03bcSAndy Pei 	dev_cfg = (struct virtio_blk_config *)internal->hw.blk_cfg;
1557856d03bcSAndy Pei 
1558856d03bcSAndy Pei 	/* cannot read 64-bit register in one attempt, so read byte by byte. */
1559856d03bcSAndy Pei 	for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
1560856d03bcSAndy Pei 		byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
1561856d03bcSAndy Pei 		capacity |= (uint64_t)*byte << (i * 8);
1562856d03bcSAndy Pei 	}
1563856d03bcSAndy Pei 	/* The capacity is number of sectors in 512-byte.
1564856d03bcSAndy Pei 	 * So right shift 1 bit  we get in K,
1565856d03bcSAndy Pei 	 * another right shift 10 bits we get in M,
1566856d03bcSAndy Pei 	 * right shift 10 more bits, we get in G.
1567856d03bcSAndy Pei 	 * To show capacity in G, we right shift 21 bits in total.
1568856d03bcSAndy Pei 	 */
1569856d03bcSAndy Pei 	DRV_LOG(DEBUG, "capacity  : %"PRIu64"G", capacity >> 21);
1570856d03bcSAndy Pei 
1571856d03bcSAndy Pei 	DRV_LOG(DEBUG, "size_max  : 0x%08x", dev_cfg->size_max);
1572856d03bcSAndy Pei 	DRV_LOG(DEBUG, "seg_max   : 0x%08x", dev_cfg->seg_max);
1573856d03bcSAndy Pei 	DRV_LOG(DEBUG, "blk_size  : 0x%08x", dev_cfg->blk_size);
1574856d03bcSAndy Pei 	DRV_LOG(DEBUG, "geometry");
1575856d03bcSAndy Pei 	DRV_LOG(DEBUG, "      cylinders: %u", dev_cfg->geometry.cylinders);
1576856d03bcSAndy Pei 	DRV_LOG(DEBUG, "      heads    : %u", dev_cfg->geometry.heads);
1577856d03bcSAndy Pei 	DRV_LOG(DEBUG, "      sectors  : %u", dev_cfg->geometry.sectors);
1578856d03bcSAndy Pei 	DRV_LOG(DEBUG, "num_queues: 0x%08x", dev_cfg->num_queues);
1579856d03bcSAndy Pei 
1580f665790aSDavid Marchand 	DRV_LOG(DEBUG, "config: [%x] [%x] [%x] [%x] [%x] [%x] [%x] [%x]",
1581856d03bcSAndy Pei 		config[0], config[1], config[2], config[3], config[4],
1582856d03bcSAndy Pei 		config[5], config[6], config[7]);
1583856d03bcSAndy Pei 	return 0;
1584856d03bcSAndy Pei }
1585856d03bcSAndy Pei 
1586856d03bcSAndy Pei static int
1587856d03bcSAndy Pei ifcvf_blk_get_protocol_features(struct rte_vdpa_device *vdev,
1588856d03bcSAndy Pei 	uint64_t *features)
1589856d03bcSAndy Pei {
1590856d03bcSAndy Pei 	RTE_SET_USED(vdev);
1591856d03bcSAndy Pei 
1592856d03bcSAndy Pei 	*features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1593856d03bcSAndy Pei 	*features |= VDPA_BLK_PROTOCOL_FEATURES;
1594856d03bcSAndy Pei 	return 0;
1595856d03bcSAndy Pei }
1596856d03bcSAndy Pei 
1597856d03bcSAndy Pei static struct rte_vdpa_dev_ops ifcvf_blk_ops = {
1598856d03bcSAndy Pei 	.get_queue_num = ifcvf_get_queue_num,
1599856d03bcSAndy Pei 	.get_features = ifcvf_get_vdpa_features,
1600856d03bcSAndy Pei 	.set_features = ifcvf_set_features,
1601856d03bcSAndy Pei 	.get_protocol_features = ifcvf_blk_get_protocol_features,
1602856d03bcSAndy Pei 	.dev_conf = ifcvf_dev_config,
1603856d03bcSAndy Pei 	.dev_close = ifcvf_dev_close,
1604856d03bcSAndy Pei 	.set_vring_state = ifcvf_set_vring_state,
1605856d03bcSAndy Pei 	.migration_done = NULL,
1606856d03bcSAndy Pei 	.get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1607856d03bcSAndy Pei 	.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1608856d03bcSAndy Pei 	.get_notify_area = ifcvf_get_notify_area,
1609856d03bcSAndy Pei 	.get_config = ifcvf_blk_get_config,
16102872943cSAndy Pei 	.get_dev_type = ifcvf_get_device_type,
1611856d03bcSAndy Pei };
1612856d03bcSAndy Pei 
1613a60b747dSAndy Pei struct rte_vdpa_dev_info dev_info[] = {
1614a60b747dSAndy Pei 	{
1615a60b747dSAndy Pei 		.features = (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
1616a60b747dSAndy Pei 			    (1ULL << VIRTIO_NET_F_CTRL_VQ) |
1617a60b747dSAndy Pei 			    (1ULL << VIRTIO_NET_F_STATUS) |
1618a60b747dSAndy Pei 			    (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1619a60b747dSAndy Pei 			    (1ULL << VHOST_F_LOG_ALL),
1620a60b747dSAndy Pei 		.ops = &ifcvf_net_ops,
1621a60b747dSAndy Pei 	},
1622a60b747dSAndy Pei 	{
1623a60b747dSAndy Pei 		.features = (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1624a60b747dSAndy Pei 			    (1ULL << VHOST_F_LOG_ALL),
1625856d03bcSAndy Pei 		.ops = &ifcvf_blk_ops,
1626a60b747dSAndy Pei 	},
1627a60b747dSAndy Pei };
1628a60b747dSAndy Pei 
16295c060bf1SMatan Azrad static int
16305c060bf1SMatan Azrad ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
16315c060bf1SMatan Azrad 		struct rte_pci_device *pci_dev)
16325c060bf1SMatan Azrad {
16335c060bf1SMatan Azrad 	uint64_t features;
16345c060bf1SMatan Azrad 	struct ifcvf_internal *internal = NULL;
16355c060bf1SMatan Azrad 	struct internal_list *list = NULL;
16365c060bf1SMatan Azrad 	int vdpa_mode = 0;
16375c060bf1SMatan Azrad 	int sw_fallback_lm = 0;
16385c060bf1SMatan Azrad 	struct rte_kvargs *kvlist = NULL;
16395c060bf1SMatan Azrad 	int ret = 0;
1640a60b747dSAndy Pei 	int16_t device_id;
1641dc4406a5SAndy Pei 	uint64_t capacity = 0;
1642dc4406a5SAndy Pei 	uint8_t *byte;
1643dc4406a5SAndy Pei 	uint32_t i;
1644b97f361cSAndy Pei 	uint16_t queue_pairs;
16455c060bf1SMatan Azrad 
16465c060bf1SMatan Azrad 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
16475c060bf1SMatan Azrad 		return 0;
16485c060bf1SMatan Azrad 
16495c060bf1SMatan Azrad 	if (!pci_dev->device.devargs)
16505c060bf1SMatan Azrad 		return 1;
16515c060bf1SMatan Azrad 
16525c060bf1SMatan Azrad 	kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
16535c060bf1SMatan Azrad 			ifcvf_valid_arguments);
16545c060bf1SMatan Azrad 	if (kvlist == NULL)
16555c060bf1SMatan Azrad 		return 1;
16565c060bf1SMatan Azrad 
16575c060bf1SMatan Azrad 	/* probe only when vdpa mode is specified */
16585c060bf1SMatan Azrad 	if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
16595c060bf1SMatan Azrad 		rte_kvargs_free(kvlist);
16605c060bf1SMatan Azrad 		return 1;
16615c060bf1SMatan Azrad 	}
16625c060bf1SMatan Azrad 
16635c060bf1SMatan Azrad 	ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
16645c060bf1SMatan Azrad 			&vdpa_mode);
16655c060bf1SMatan Azrad 	if (ret < 0 || vdpa_mode == 0) {
16665c060bf1SMatan Azrad 		rte_kvargs_free(kvlist);
16675c060bf1SMatan Azrad 		return 1;
16685c060bf1SMatan Azrad 	}
16695c060bf1SMatan Azrad 
16705c060bf1SMatan Azrad 	list = rte_zmalloc("ifcvf", sizeof(*list), 0);
16715c060bf1SMatan Azrad 	if (list == NULL)
16725c060bf1SMatan Azrad 		goto error;
16735c060bf1SMatan Azrad 
16745c060bf1SMatan Azrad 	internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
16755c060bf1SMatan Azrad 	if (internal == NULL)
16765c060bf1SMatan Azrad 		goto error;
16775c060bf1SMatan Azrad 
16785c060bf1SMatan Azrad 	internal->pdev = pci_dev;
16795c060bf1SMatan Azrad 	rte_spinlock_init(&internal->lock);
16805c060bf1SMatan Azrad 
16815c060bf1SMatan Azrad 	if (ifcvf_vfio_setup(internal) < 0) {
16825c060bf1SMatan Azrad 		DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
16835c060bf1SMatan Azrad 		goto error;
16845c060bf1SMatan Azrad 	}
16855c060bf1SMatan Azrad 
16865c060bf1SMatan Azrad 	if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
16875c060bf1SMatan Azrad 		DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
16885c060bf1SMatan Azrad 		goto error;
16895c060bf1SMatan Azrad 	}
16905c060bf1SMatan Azrad 
1691e2a1a08aSChenbo Xia 	internal->configured = 0;
16925c060bf1SMatan Azrad 	features = ifcvf_get_features(&internal->hw);
1693a60b747dSAndy Pei 
1694a60b747dSAndy Pei 	device_id = ifcvf_pci_get_device_type(pci_dev);
1695a60b747dSAndy Pei 	if (device_id < 0) {
1696a60b747dSAndy Pei 		DRV_LOG(ERR, "failed to get device %s type", pci_dev->name);
1697a60b747dSAndy Pei 		goto error;
1698a60b747dSAndy Pei 	}
1699a60b747dSAndy Pei 
1700a60b747dSAndy Pei 	if (device_id == VIRTIO_ID_NET) {
1701a60b747dSAndy Pei 		internal->hw.device_type = IFCVF_NET;
1702b97f361cSAndy Pei 		/*
1703b97f361cSAndy Pei 		 * ifc device always has CTRL_VQ,
1704b97f361cSAndy Pei 		 * and supports VIRTIO_NET_F_CTRL_VQ feature.
1705b97f361cSAndy Pei 		 */
1706b97f361cSAndy Pei 		queue_pairs = (internal->hw.common_cfg->num_queues - 1) / 2;
1707b97f361cSAndy Pei 		DRV_LOG(INFO, "%s support %u queue pairs", pci_dev->name,
1708b97f361cSAndy Pei 			queue_pairs);
1709b97f361cSAndy Pei 		internal->max_queues = MIN(IFCVF_MAX_QUEUES, queue_pairs);
1710a60b747dSAndy Pei 		internal->features = features &
1711a60b747dSAndy Pei 					~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
1712a60b747dSAndy Pei 		internal->features |= dev_info[IFCVF_NET].features;
1713a60b747dSAndy Pei 	} else if (device_id == VIRTIO_ID_BLOCK) {
1714a60b747dSAndy Pei 		internal->hw.device_type = IFCVF_BLK;
1715a60b747dSAndy Pei 		internal->features = features &
1716a60b747dSAndy Pei 					~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
1717a60b747dSAndy Pei 		internal->features |= dev_info[IFCVF_BLK].features;
1718dc4406a5SAndy Pei 
1719dc4406a5SAndy Pei 		/* cannot read 64-bit register in one attempt,
1720dc4406a5SAndy Pei 		 * so read byte by byte.
1721dc4406a5SAndy Pei 		 */
1722dc4406a5SAndy Pei 		for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
1723dc4406a5SAndy Pei 			byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
1724dc4406a5SAndy Pei 			capacity |= (uint64_t)*byte << (i * 8);
1725dc4406a5SAndy Pei 		}
1726dc4406a5SAndy Pei 		/* The capacity is number of sectors in 512-byte.
1727dc4406a5SAndy Pei 		 * So right shift 1 bit  we get in K,
1728dc4406a5SAndy Pei 		 * another right shift 10 bits we get in M,
1729dc4406a5SAndy Pei 		 * right shift 10 more bits, we get in G.
1730dc4406a5SAndy Pei 		 * To show capacity in G, we right shift 21 bits in total.
1731dc4406a5SAndy Pei 		 */
1732dc4406a5SAndy Pei 		DRV_LOG(DEBUG, "capacity  : %"PRIu64"G", capacity >> 21);
1733dc4406a5SAndy Pei 
1734dc4406a5SAndy Pei 		DRV_LOG(DEBUG, "size_max  : 0x%08x",
1735dc4406a5SAndy Pei 			internal->hw.blk_cfg->size_max);
1736dc4406a5SAndy Pei 		DRV_LOG(DEBUG, "seg_max   : 0x%08x",
1737dc4406a5SAndy Pei 			internal->hw.blk_cfg->seg_max);
1738dc4406a5SAndy Pei 		DRV_LOG(DEBUG, "blk_size  : 0x%08x",
1739dc4406a5SAndy Pei 			internal->hw.blk_cfg->blk_size);
1740dc4406a5SAndy Pei 		DRV_LOG(DEBUG, "geometry");
1741dc4406a5SAndy Pei 		DRV_LOG(DEBUG, "    cylinders: %u",
1742dc4406a5SAndy Pei 			internal->hw.blk_cfg->geometry.cylinders);
1743dc4406a5SAndy Pei 		DRV_LOG(DEBUG, "    heads    : %u",
1744dc4406a5SAndy Pei 			internal->hw.blk_cfg->geometry.heads);
1745dc4406a5SAndy Pei 		DRV_LOG(DEBUG, "    sectors  : %u",
1746dc4406a5SAndy Pei 			internal->hw.blk_cfg->geometry.sectors);
1747dc4406a5SAndy Pei 		DRV_LOG(DEBUG, "num_queues: 0x%08x",
1748dc4406a5SAndy Pei 			internal->hw.blk_cfg->num_queues);
1749b97f361cSAndy Pei 
1750b97f361cSAndy Pei 		internal->max_queues = MIN(IFCVF_MAX_QUEUES,
1751b97f361cSAndy Pei 			internal->hw.blk_cfg->num_queues);
1752a60b747dSAndy Pei 	}
17535c060bf1SMatan Azrad 
17545c060bf1SMatan Azrad 	list->internal = internal;
17555c060bf1SMatan Azrad 
17565c060bf1SMatan Azrad 	if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
17575c060bf1SMatan Azrad 		ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
17585c060bf1SMatan Azrad 				&open_int, &sw_fallback_lm);
17595c060bf1SMatan Azrad 		if (ret < 0)
17605c060bf1SMatan Azrad 			goto error;
17615c060bf1SMatan Azrad 	}
17625c060bf1SMatan Azrad 	internal->sw_lm = sw_fallback_lm;
176397d2dfc4SAndy Pei 	if (!internal->sw_lm && !internal->hw.lm_cfg) {
176497d2dfc4SAndy Pei 		DRV_LOG(ERR, "Device %s does not support HW assist live migration, please enable sw-live-migration!",
176597d2dfc4SAndy Pei 			pci_dev->name);
176697d2dfc4SAndy Pei 		goto error;
176797d2dfc4SAndy Pei 	}
17685c060bf1SMatan Azrad 
17690c3094b0SAndy Pei 	pthread_mutex_lock(&internal_list_lock);
17700c3094b0SAndy Pei 	TAILQ_INSERT_TAIL(&internal_list, list, next);
17710c3094b0SAndy Pei 	pthread_mutex_unlock(&internal_list_lock);
17720c3094b0SAndy Pei 
1773a60b747dSAndy Pei 	internal->vdev = rte_vdpa_register_device(&pci_dev->device,
1774a60b747dSAndy Pei 				dev_info[internal->hw.device_type].ops);
177581a6b7feSMaxime Coquelin 	if (internal->vdev == NULL) {
17765c060bf1SMatan Azrad 		DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
17770c3094b0SAndy Pei 		pthread_mutex_lock(&internal_list_lock);
17780c3094b0SAndy Pei 		TAILQ_REMOVE(&internal_list, list, next);
17790c3094b0SAndy Pei 		pthread_mutex_unlock(&internal_list_lock);
17805c060bf1SMatan Azrad 		goto error;
17815c060bf1SMatan Azrad 	}
17825c060bf1SMatan Azrad 
17835c060bf1SMatan Azrad 	rte_atomic32_set(&internal->started, 1);
1784903ec2b1STaekyung Kim 	if (update_datapath(internal) < 0) {
1785903ec2b1STaekyung Kim 		DRV_LOG(ERR, "failed to update datapath %s", pci_dev->name);
1786903ec2b1STaekyung Kim 		rte_atomic32_set(&internal->started, 0);
1787903ec2b1STaekyung Kim 		rte_vdpa_unregister_device(internal->vdev);
1788903ec2b1STaekyung Kim 		pthread_mutex_lock(&internal_list_lock);
1789903ec2b1STaekyung Kim 		TAILQ_REMOVE(&internal_list, list, next);
1790903ec2b1STaekyung Kim 		pthread_mutex_unlock(&internal_list_lock);
1791903ec2b1STaekyung Kim 		goto error;
1792903ec2b1STaekyung Kim 	}
17935c060bf1SMatan Azrad 
17945c060bf1SMatan Azrad 	rte_kvargs_free(kvlist);
17955c060bf1SMatan Azrad 	return 0;
17965c060bf1SMatan Azrad 
17975c060bf1SMatan Azrad error:
17985c060bf1SMatan Azrad 	rte_kvargs_free(kvlist);
17995c060bf1SMatan Azrad 	rte_free(list);
18005c060bf1SMatan Azrad 	rte_free(internal);
18015c060bf1SMatan Azrad 	return -1;
18025c060bf1SMatan Azrad }
18035c060bf1SMatan Azrad 
18045c060bf1SMatan Azrad static int
18055c060bf1SMatan Azrad ifcvf_pci_remove(struct rte_pci_device *pci_dev)
18065c060bf1SMatan Azrad {
18075c060bf1SMatan Azrad 	struct ifcvf_internal *internal;
18085c060bf1SMatan Azrad 	struct internal_list *list;
18095c060bf1SMatan Azrad 
18105c060bf1SMatan Azrad 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
18115c060bf1SMatan Azrad 		return 0;
18125c060bf1SMatan Azrad 
1813146247f4SAndy Pei 	list = find_internal_resource_by_pci_dev(pci_dev);
18145c060bf1SMatan Azrad 	if (list == NULL) {
18155c060bf1SMatan Azrad 		DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
18165c060bf1SMatan Azrad 		return -1;
18175c060bf1SMatan Azrad 	}
18185c060bf1SMatan Azrad 
18195c060bf1SMatan Azrad 	internal = list->internal;
18205c060bf1SMatan Azrad 	rte_atomic32_set(&internal->started, 0);
1821903ec2b1STaekyung Kim 	if (update_datapath(internal) < 0)
1822903ec2b1STaekyung Kim 		DRV_LOG(ERR, "failed to update datapath %s", pci_dev->name);
18235c060bf1SMatan Azrad 
18245c060bf1SMatan Azrad 	rte_pci_unmap_device(internal->pdev);
18255c060bf1SMatan Azrad 	rte_vfio_container_destroy(internal->vfio_container_fd);
182681a6b7feSMaxime Coquelin 	rte_vdpa_unregister_device(internal->vdev);
18275c060bf1SMatan Azrad 
18285c060bf1SMatan Azrad 	pthread_mutex_lock(&internal_list_lock);
18295c060bf1SMatan Azrad 	TAILQ_REMOVE(&internal_list, list, next);
18305c060bf1SMatan Azrad 	pthread_mutex_unlock(&internal_list_lock);
18315c060bf1SMatan Azrad 
18325c060bf1SMatan Azrad 	rte_free(list);
18335c060bf1SMatan Azrad 	rte_free(internal);
18345c060bf1SMatan Azrad 
18355c060bf1SMatan Azrad 	return 0;
18365c060bf1SMatan Azrad }
18375c060bf1SMatan Azrad 
18385c060bf1SMatan Azrad /*
18395c060bf1SMatan Azrad  * IFCVF has the same vendor ID and device ID as virtio net PCI
18405c060bf1SMatan Azrad  * device, with its specific subsystem vendor ID and device ID.
18415c060bf1SMatan Azrad  */
18425c060bf1SMatan Azrad static const struct rte_pci_id pci_id_ifcvf_map[] = {
18435c060bf1SMatan Azrad 	{ .class_id = RTE_CLASS_ANY_ID,
18445c060bf1SMatan Azrad 	  .vendor_id = IFCVF_VENDOR_ID,
18455c806b94SHuang Wei 	  .device_id = IFCVF_NET_MODERN_DEVICE_ID,
18465c060bf1SMatan Azrad 	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
18475c060bf1SMatan Azrad 	  .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
18485c060bf1SMatan Azrad 	},
18495c060bf1SMatan Azrad 
1850a60b747dSAndy Pei 	{ .class_id = RTE_CLASS_ANY_ID,
1851a60b747dSAndy Pei 	  .vendor_id = IFCVF_VENDOR_ID,
18525c806b94SHuang Wei 	  .device_id = IFCVF_NET_TRANSITIONAL_DEVICE_ID,
18535c806b94SHuang Wei 	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
18545c806b94SHuang Wei 	  .subsystem_device_id = IFCVF_SUBSYS_NET_DEVICE_ID,
18555c806b94SHuang Wei 	},
18565c806b94SHuang Wei 
18575c806b94SHuang Wei 	{ .class_id = RTE_CLASS_ANY_ID,
18585c806b94SHuang Wei 	  .vendor_id = IFCVF_VENDOR_ID,
1859a60b747dSAndy Pei 	  .device_id = IFCVF_BLK_TRANSITIONAL_DEVICE_ID,
1860a60b747dSAndy Pei 	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
18615c806b94SHuang Wei 	  .subsystem_device_id = IFCVF_SUBSYS_BLK_DEVICE_ID,
1862a60b747dSAndy Pei 	},
1863a60b747dSAndy Pei 
1864a60b747dSAndy Pei 	{ .class_id = RTE_CLASS_ANY_ID,
1865a60b747dSAndy Pei 	  .vendor_id = IFCVF_VENDOR_ID,
1866a60b747dSAndy Pei 	  .device_id = IFCVF_BLK_MODERN_DEVICE_ID,
1867a60b747dSAndy Pei 	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
18685c806b94SHuang Wei 	  .subsystem_device_id = IFCVF_SUBSYS_BLK_DEVICE_ID,
1869a60b747dSAndy Pei 	},
1870a60b747dSAndy Pei 
1871419a7be5SAbhishek Maheshwari 	{ .class_id = RTE_CLASS_ANY_ID,
1872419a7be5SAbhishek Maheshwari 	  .vendor_id = IFCVF_VENDOR_ID,
1873419a7be5SAbhishek Maheshwari 	  .device_id = IFCVF_BLK_MODERN_DEVICE_ID,
1874419a7be5SAbhishek Maheshwari 	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1875419a7be5SAbhishek Maheshwari 	  .subsystem_device_id = IFCVF_SUBSYS_DEFAULT_DEVICE_ID,
1876419a7be5SAbhishek Maheshwari 	}, /* virtio-blk devices with default subsystem IDs */
1877419a7be5SAbhishek Maheshwari 
18785c060bf1SMatan Azrad 	{ .vendor_id = 0, /* sentinel */
18795c060bf1SMatan Azrad 	},
18805c060bf1SMatan Azrad };
18815c060bf1SMatan Azrad 
18825c060bf1SMatan Azrad static struct rte_pci_driver rte_ifcvf_vdpa = {
18835c060bf1SMatan Azrad 	.id_table = pci_id_ifcvf_map,
18845c060bf1SMatan Azrad 	.drv_flags = 0,
18855c060bf1SMatan Azrad 	.probe = ifcvf_pci_probe,
18865c060bf1SMatan Azrad 	.remove = ifcvf_pci_remove,
18875c060bf1SMatan Azrad };
18885c060bf1SMatan Azrad 
18895c060bf1SMatan Azrad RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
18905c060bf1SMatan Azrad RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
18915c060bf1SMatan Azrad RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");
1892