xref: /dpdk/drivers/bus/pci/linux/pci_vfio.c (revision 5f7b98189de733080656989d63b0e7ffd249830a)
15566a3e3SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause
25566a3e3SBruce Richardson  * Copyright(c) 2010-2014 Intel Corporation
3c752998bSGaetan Rivet  */
4c752998bSGaetan Rivet 
5c41a103cSPhilip Prindeville #include <unistd.h>
6c752998bSGaetan Rivet #include <string.h>
7c752998bSGaetan Rivet #include <fcntl.h>
8c752998bSGaetan Rivet #include <sys/eventfd.h>
9c752998bSGaetan Rivet #include <sys/socket.h>
10c752998bSGaetan Rivet #include <sys/ioctl.h>
11c752998bSGaetan Rivet #include <sys/mman.h>
12c752998bSGaetan Rivet #include <stdbool.h>
13c752998bSGaetan Rivet 
14c752998bSGaetan Rivet #include <rte_log.h>
15c752998bSGaetan Rivet #include <rte_pci.h>
16c752998bSGaetan Rivet #include <rte_bus_pci.h>
172fd3567eSTal Shnaiderman #include <rte_eal_paging.h>
18c752998bSGaetan Rivet #include <rte_malloc.h>
19c752998bSGaetan Rivet #include <rte_vfio.h>
20c115fd00SJeff Guo #include <rte_eal.h>
21a04322f6SDavid Marchand #include <bus_driver.h>
228ffe7386SJeff Guo #include <rte_spinlock.h>
23028669bcSAnatoly Burakov #include <rte_tailq.h>
24c752998bSGaetan Rivet 
25c752998bSGaetan Rivet #include "eal_filesystem.h"
26c752998bSGaetan Rivet 
27c752998bSGaetan Rivet #include "pci_init.h"
28c752998bSGaetan Rivet #include "private.h"
29c752998bSGaetan Rivet 
30c752998bSGaetan Rivet /**
31c752998bSGaetan Rivet  * @file
32aa777f00SThomas Monjalon  * PCI probing using Linux VFIO.
33c752998bSGaetan Rivet  *
34c752998bSGaetan Rivet  * This code tries to determine if the PCI device is bound to VFIO driver,
35c752998bSGaetan Rivet  * and initialize it (map BARs, set up interrupts) if that's the case.
36c752998bSGaetan Rivet  *
37c752998bSGaetan Rivet  */
38c752998bSGaetan Rivet 
39bc104bb8SFerruh Yigit #ifdef VFIO_PRESENT
40c752998bSGaetan Rivet 
/* Tailq holding all VFIO-mapped PCI resources; registered with the EAL
 * tailq registry so it is shared between primary and secondary processes.
 */
static struct rte_tailq_elem rte_vfio_tailq = {
	.name = "VFIO_RESOURCE_LIST",
};
EAL_REGISTER_TAILQ(rte_vfio_tailq)
45c752998bSGaetan Rivet 
464b741542SChenbo Xia static int
474b741542SChenbo Xia pci_vfio_get_region(const struct rte_pci_device *dev, int index,
484b741542SChenbo Xia 		    uint64_t *size, uint64_t *offset)
49c752998bSGaetan Rivet {
504b741542SChenbo Xia 	const struct rte_pci_device_internal *pdev =
514b741542SChenbo Xia 		RTE_PCI_DEVICE_INTERNAL_CONST(dev);
52d61138d4SHarman Kalra 
534b741542SChenbo Xia 	if (index >= VFIO_PCI_NUM_REGIONS || index >= RTE_MAX_PCI_REGIONS)
54aedd054cSHarman Kalra 		return -1;
55aedd054cSHarman Kalra 
564b741542SChenbo Xia 	if (pdev->region[index].size == 0 && pdev->region[index].offset == 0)
574b741542SChenbo Xia 		return -1;
584b741542SChenbo Xia 
594b741542SChenbo Xia 	*size   = pdev->region[index].size;
604b741542SChenbo Xia 	*offset = pdev->region[index].offset;
614b741542SChenbo Xia 
624b741542SChenbo Xia 	return 0;
63c752998bSGaetan Rivet }
64c752998bSGaetan Rivet 
65c752998bSGaetan Rivet int
664b741542SChenbo Xia pci_vfio_read_config(const struct rte_pci_device *dev,
674b741542SChenbo Xia 		    void *buf, size_t len, off_t offs)
68c752998bSGaetan Rivet {
694b741542SChenbo Xia 	uint64_t size, offset;
704b741542SChenbo Xia 	int fd;
71d61138d4SHarman Kalra 
724b741542SChenbo Xia 	fd = rte_intr_dev_fd_get(dev->intr_handle);
73becb028aSChenbo Xia 	if (fd < 0)
74becb028aSChenbo Xia 		return -1;
754b741542SChenbo Xia 
764b741542SChenbo Xia 	if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX,
774b741542SChenbo Xia 				&size, &offset) != 0)
78aedd054cSHarman Kalra 		return -1;
79aedd054cSHarman Kalra 
804b741542SChenbo Xia 	if ((uint64_t)len + offs > size)
814b741542SChenbo Xia 		return -1;
824b741542SChenbo Xia 
83884f83ccSDavid Marchand 	return pread(fd, buf, len, offset + offs);
844b741542SChenbo Xia }
854b741542SChenbo Xia 
864b741542SChenbo Xia int
874b741542SChenbo Xia pci_vfio_write_config(const struct rte_pci_device *dev,
884b741542SChenbo Xia 		    const void *buf, size_t len, off_t offs)
894b741542SChenbo Xia {
904b741542SChenbo Xia 	uint64_t size, offset;
914b741542SChenbo Xia 	int fd;
924b741542SChenbo Xia 
934b741542SChenbo Xia 	fd = rte_intr_dev_fd_get(dev->intr_handle);
94becb028aSChenbo Xia 	if (fd < 0)
95becb028aSChenbo Xia 		return -1;
964b741542SChenbo Xia 
974b741542SChenbo Xia 	if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX,
984b741542SChenbo Xia 				&size, &offset) != 0)
994b741542SChenbo Xia 		return -1;
1004b741542SChenbo Xia 
1014b741542SChenbo Xia 	if ((uint64_t)len + offs > size)
1024b741542SChenbo Xia 		return -1;
1034b741542SChenbo Xia 
104884f83ccSDavid Marchand 	return pwrite(fd, buf, len, offset + offs);
105c752998bSGaetan Rivet }
106c752998bSGaetan Rivet 
107c752998bSGaetan Rivet /* get PCI BAR number where MSI-X interrupts are */
108c752998bSGaetan Rivet static int
1093dae12acSDavid Marchand pci_vfio_get_msix_bar(const struct rte_pci_device *dev,
1104b741542SChenbo Xia 	struct pci_msix_table *msix_table)
111c752998bSGaetan Rivet {
112a10b6e53SDavid Marchand 	off_t cap_offset;
113a10b6e53SDavid Marchand 
114baa9c550SDavid Marchand 	cap_offset = rte_pci_find_capability(dev, RTE_PCI_CAP_ID_MSIX);
115a10b6e53SDavid Marchand 	if (cap_offset < 0)
116a10b6e53SDavid Marchand 		return -1;
117a10b6e53SDavid Marchand 
118a10b6e53SDavid Marchand 	if (cap_offset != 0) {
119c752998bSGaetan Rivet 		uint16_t flags;
120a10b6e53SDavid Marchand 		uint32_t reg;
121c752998bSGaetan Rivet 
1227bb1168dSDavid Marchand 		if (rte_pci_read_config(dev, &reg, sizeof(reg), cap_offset +
1237bb1168dSDavid Marchand 				RTE_PCI_MSIX_TABLE) < 0) {
124849f773bSDavid Marchand 			PCI_LOG(ERR, "Cannot read MSIX table from PCI config space!");
125c752998bSGaetan Rivet 			return -1;
126c752998bSGaetan Rivet 		}
127c752998bSGaetan Rivet 
1287bb1168dSDavid Marchand 		if (rte_pci_read_config(dev, &flags, sizeof(flags), cap_offset +
1297bb1168dSDavid Marchand 				RTE_PCI_MSIX_FLAGS) < 0) {
130849f773bSDavid Marchand 			PCI_LOG(ERR, "Cannot read MSIX flags from PCI config space!");
131c752998bSGaetan Rivet 			return -1;
132c752998bSGaetan Rivet 		}
133c752998bSGaetan Rivet 
134c752998bSGaetan Rivet 		msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
135c752998bSGaetan Rivet 		msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
136a10b6e53SDavid Marchand 		msix_table->size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
137a10b6e53SDavid Marchand 	}
138c752998bSGaetan Rivet 
139c752998bSGaetan Rivet 	return 0;
140c752998bSGaetan Rivet }
141c752998bSGaetan Rivet 
14254f3fb12SHaiyue Wang /* enable PCI bus memory space */
14354f3fb12SHaiyue Wang static int
1444b741542SChenbo Xia pci_vfio_enable_bus_memory(struct rte_pci_device *dev, int dev_fd)
14554f3fb12SHaiyue Wang {
1464b741542SChenbo Xia 	uint64_t size, offset;
14754f3fb12SHaiyue Wang 	uint16_t cmd;
14854f3fb12SHaiyue Wang 	int ret;
14954f3fb12SHaiyue Wang 
1504b741542SChenbo Xia 	if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX,
1514b741542SChenbo Xia 		&size, &offset) != 0) {
152849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot get offset of CONFIG region.");
1534b741542SChenbo Xia 		return -1;
1544b741542SChenbo Xia 	}
1554b741542SChenbo Xia 
156884f83ccSDavid Marchand 	ret = pread(dev_fd, &cmd, sizeof(cmd), offset + RTE_PCI_COMMAND);
15754f3fb12SHaiyue Wang 
15854f3fb12SHaiyue Wang 	if (ret != sizeof(cmd)) {
159849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot read command from PCI config space!");
16054f3fb12SHaiyue Wang 		return -1;
16154f3fb12SHaiyue Wang 	}
16254f3fb12SHaiyue Wang 
163c89450cbSDavid Marchand 	if (cmd & RTE_PCI_COMMAND_MEMORY)
16454f3fb12SHaiyue Wang 		return 0;
16554f3fb12SHaiyue Wang 
166c89450cbSDavid Marchand 	cmd |= RTE_PCI_COMMAND_MEMORY;
167884f83ccSDavid Marchand 	ret = pwrite(dev_fd, &cmd, sizeof(cmd), offset + RTE_PCI_COMMAND);
16854f3fb12SHaiyue Wang 
16954f3fb12SHaiyue Wang 	if (ret != sizeof(cmd)) {
170849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot write command to PCI config space!");
17154f3fb12SHaiyue Wang 		return -1;
17254f3fb12SHaiyue Wang 	}
17354f3fb12SHaiyue Wang 
17454f3fb12SHaiyue Wang 	return 0;
17554f3fb12SHaiyue Wang }
17654f3fb12SHaiyue Wang 
177c752998bSGaetan Rivet /* set up interrupt support (but not enable interrupts) */
178c752998bSGaetan Rivet static int
179c752998bSGaetan Rivet pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
180c752998bSGaetan Rivet {
18189aac60eSDavid Marchand 	int i, ret, intr_idx;
18233543fb3SNithin Dabilpuram 	enum rte_intr_mode intr_mode;
183c752998bSGaetan Rivet 
184c752998bSGaetan Rivet 	/* default to invalid index */
185c752998bSGaetan Rivet 	intr_idx = VFIO_PCI_NUM_IRQS;
186c752998bSGaetan Rivet 
187c752998bSGaetan Rivet 	/* Get default / configured intr_mode */
188c752998bSGaetan Rivet 	intr_mode = rte_eal_vfio_intr_mode();
189c752998bSGaetan Rivet 
190c752998bSGaetan Rivet 	/* get interrupt type from internal config (MSI-X by default, can be
191c752998bSGaetan Rivet 	 * overridden from the command line
192c752998bSGaetan Rivet 	 */
193c752998bSGaetan Rivet 	switch (intr_mode) {
194c752998bSGaetan Rivet 	case RTE_INTR_MODE_MSIX:
195c752998bSGaetan Rivet 		intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
196c752998bSGaetan Rivet 		break;
197c752998bSGaetan Rivet 	case RTE_INTR_MODE_MSI:
198c752998bSGaetan Rivet 		intr_idx = VFIO_PCI_MSI_IRQ_INDEX;
199c752998bSGaetan Rivet 		break;
200c752998bSGaetan Rivet 	case RTE_INTR_MODE_LEGACY:
201c752998bSGaetan Rivet 		intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
202c752998bSGaetan Rivet 		break;
203c752998bSGaetan Rivet 	/* don't do anything if we want to automatically determine interrupt type */
204c752998bSGaetan Rivet 	case RTE_INTR_MODE_NONE:
205c752998bSGaetan Rivet 		break;
206c752998bSGaetan Rivet 	default:
207849f773bSDavid Marchand 		PCI_LOG(ERR, "Unknown default interrupt type!");
208c752998bSGaetan Rivet 		return -1;
209c752998bSGaetan Rivet 	}
210c752998bSGaetan Rivet 
211c752998bSGaetan Rivet 	/* start from MSI-X interrupt type */
212c752998bSGaetan Rivet 	for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
213c752998bSGaetan Rivet 		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
21433543fb3SNithin Dabilpuram 		int fd = -1;
215c752998bSGaetan Rivet 
216c752998bSGaetan Rivet 		/* skip interrupt modes we don't want */
217c752998bSGaetan Rivet 		if (intr_mode != RTE_INTR_MODE_NONE &&
218c752998bSGaetan Rivet 				i != intr_idx)
219c752998bSGaetan Rivet 			continue;
220c752998bSGaetan Rivet 
221c752998bSGaetan Rivet 		irq.index = i;
222c752998bSGaetan Rivet 
223c752998bSGaetan Rivet 		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
224c752998bSGaetan Rivet 		if (ret < 0) {
225849f773bSDavid Marchand 			PCI_LOG(ERR, "Cannot get VFIO IRQ info, error %i (%s)",
226849f773bSDavid Marchand 				errno, strerror(errno));
227c752998bSGaetan Rivet 			return -1;
228c752998bSGaetan Rivet 		}
229c752998bSGaetan Rivet 
230c752998bSGaetan Rivet 		/* if this vector cannot be used with eventfd, fail if we explicitly
231c752998bSGaetan Rivet 		 * specified interrupt type, otherwise continue */
23233543fb3SNithin Dabilpuram 		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
233c752998bSGaetan Rivet 			if (intr_mode != RTE_INTR_MODE_NONE) {
234849f773bSDavid Marchand 				PCI_LOG(ERR, "Interrupt vector does not support eventfd!");
235c752998bSGaetan Rivet 				return -1;
23633543fb3SNithin Dabilpuram 			} else
23733543fb3SNithin Dabilpuram 				continue;
238c752998bSGaetan Rivet 		}
239c752998bSGaetan Rivet 
240eb89595dSDavid Marchand 		/* Reallocate the efds and elist fields of intr_handle based
241eb89595dSDavid Marchand 		 * on PCI device MSIX size.
242eb89595dSDavid Marchand 		 */
243eb89595dSDavid Marchand 		if (i == VFIO_PCI_MSIX_IRQ_INDEX &&
244eb89595dSDavid Marchand 				(uint32_t)rte_intr_nb_intr_get(dev->intr_handle) < irq.count &&
245eb89595dSDavid Marchand 				rte_intr_event_list_update(dev->intr_handle, irq.count))
246eb89595dSDavid Marchand 			return -1;
247eb89595dSDavid Marchand 
24833543fb3SNithin Dabilpuram 		/* set up an eventfd for interrupts */
249c752998bSGaetan Rivet 		fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
250c752998bSGaetan Rivet 		if (fd < 0) {
251849f773bSDavid Marchand 			PCI_LOG(ERR, "Cannot set up eventfd, error %i (%s)",
252849f773bSDavid Marchand 				errno, strerror(errno));
253c752998bSGaetan Rivet 			return -1;
254c752998bSGaetan Rivet 		}
255c752998bSGaetan Rivet 
256d61138d4SHarman Kalra 		if (rte_intr_fd_set(dev->intr_handle, fd))
257d61138d4SHarman Kalra 			return -1;
258d61138d4SHarman Kalra 
25933543fb3SNithin Dabilpuram 		switch (i) {
26033543fb3SNithin Dabilpuram 		case VFIO_PCI_MSIX_IRQ_INDEX:
26133543fb3SNithin Dabilpuram 			intr_mode = RTE_INTR_MODE_MSIX;
262d61138d4SHarman Kalra 			rte_intr_type_set(dev->intr_handle,
263d61138d4SHarman Kalra 						 RTE_INTR_HANDLE_VFIO_MSIX);
26433543fb3SNithin Dabilpuram 			break;
26533543fb3SNithin Dabilpuram 		case VFIO_PCI_MSI_IRQ_INDEX:
26633543fb3SNithin Dabilpuram 			intr_mode = RTE_INTR_MODE_MSI;
267d61138d4SHarman Kalra 			rte_intr_type_set(dev->intr_handle,
268d61138d4SHarman Kalra 						 RTE_INTR_HANDLE_VFIO_MSI);
26933543fb3SNithin Dabilpuram 			break;
27033543fb3SNithin Dabilpuram 		case VFIO_PCI_INTX_IRQ_INDEX:
27133543fb3SNithin Dabilpuram 			intr_mode = RTE_INTR_MODE_LEGACY;
272d61138d4SHarman Kalra 			rte_intr_type_set(dev->intr_handle,
273d61138d4SHarman Kalra 						 RTE_INTR_HANDLE_VFIO_LEGACY);
27433543fb3SNithin Dabilpuram 			break;
27533543fb3SNithin Dabilpuram 		default:
276849f773bSDavid Marchand 			PCI_LOG(ERR, "Unknown interrupt type!");
27733543fb3SNithin Dabilpuram 			return -1;
27833543fb3SNithin Dabilpuram 		}
279c752998bSGaetan Rivet 
280c752998bSGaetan Rivet 		return 0;
281c752998bSGaetan Rivet 	}
282c752998bSGaetan Rivet 
28333543fb3SNithin Dabilpuram 	/* if we're here, we haven't found a suitable interrupt vector */
28433543fb3SNithin Dabilpuram 	return -1;
28533543fb3SNithin Dabilpuram }
28633543fb3SNithin Dabilpuram 
287cda94419SJeff Guo #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
288b91bc6f3SThomas Monjalon /*
289b91bc6f3SThomas Monjalon  * Spinlock for device hot-unplug failure handling.
290b91bc6f3SThomas Monjalon  * If it tries to access bus or device, such as handle sigbus on bus
291b91bc6f3SThomas Monjalon  * or handle memory failure for device, just need to use this lock.
292b91bc6f3SThomas Monjalon  * It could protect the bus and the device to avoid race condition.
293b91bc6f3SThomas Monjalon  */
294b91bc6f3SThomas Monjalon static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
295b91bc6f3SThomas Monjalon 
296c115fd00SJeff Guo static void
297c115fd00SJeff Guo pci_vfio_req_handler(void *param)
298c115fd00SJeff Guo {
299c115fd00SJeff Guo 	struct rte_bus *bus;
300c115fd00SJeff Guo 	int ret;
301c115fd00SJeff Guo 	struct rte_device *device = (struct rte_device *)param;
302c115fd00SJeff Guo 
3038ffe7386SJeff Guo 	rte_spinlock_lock(&failure_handle_lock);
304c115fd00SJeff Guo 	bus = rte_bus_find_by_device(device);
305c115fd00SJeff Guo 	if (bus == NULL) {
306849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot find bus for device (%s)", device->name);
3078ffe7386SJeff Guo 		goto handle_end;
308c115fd00SJeff Guo 	}
309c115fd00SJeff Guo 
310c115fd00SJeff Guo 	/*
311c115fd00SJeff Guo 	 * vfio kernel module request user space to release allocated
312c115fd00SJeff Guo 	 * resources before device be deleted in kernel, so it can directly
313c115fd00SJeff Guo 	 * call the vfio bus hot-unplug handler to process it.
314c115fd00SJeff Guo 	 */
315c115fd00SJeff Guo 	ret = bus->hot_unplug_handler(device);
316c115fd00SJeff Guo 	if (ret)
317849f773bSDavid Marchand 		PCI_LOG(ERR, "Can not handle hot-unplug for device (%s)", device->name);
3188ffe7386SJeff Guo handle_end:
3198ffe7386SJeff Guo 	rte_spinlock_unlock(&failure_handle_lock);
320c115fd00SJeff Guo }
321c115fd00SJeff Guo 
322c115fd00SJeff Guo /* enable notifier (only enable req now) */
323c115fd00SJeff Guo static int
324c115fd00SJeff Guo pci_vfio_enable_notifier(struct rte_pci_device *dev, int vfio_dev_fd)
325c115fd00SJeff Guo {
326c115fd00SJeff Guo 	int ret;
327c115fd00SJeff Guo 	int fd = -1;
328c115fd00SJeff Guo 
329c115fd00SJeff Guo 	/* set up an eventfd for req notifier */
330c115fd00SJeff Guo 	fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
331c115fd00SJeff Guo 	if (fd < 0) {
332849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot set up eventfd, error %i (%s)",
333c115fd00SJeff Guo 			errno, strerror(errno));
334c115fd00SJeff Guo 		return -1;
335c115fd00SJeff Guo 	}
336c115fd00SJeff Guo 
337d61138d4SHarman Kalra 	if (rte_intr_fd_set(dev->vfio_req_intr_handle, fd))
338d61138d4SHarman Kalra 		return -1;
339c115fd00SJeff Guo 
340d61138d4SHarman Kalra 	if (rte_intr_type_set(dev->vfio_req_intr_handle, RTE_INTR_HANDLE_VFIO_REQ))
341d61138d4SHarman Kalra 		return -1;
342d61138d4SHarman Kalra 
343d61138d4SHarman Kalra 	if (rte_intr_dev_fd_set(dev->vfio_req_intr_handle, vfio_dev_fd))
344d61138d4SHarman Kalra 		return -1;
345d61138d4SHarman Kalra 
346d61138d4SHarman Kalra 	ret = rte_intr_callback_register(dev->vfio_req_intr_handle,
347c115fd00SJeff Guo 					 pci_vfio_req_handler,
348c115fd00SJeff Guo 					 (void *)&dev->device);
349c115fd00SJeff Guo 	if (ret) {
350849f773bSDavid Marchand 		PCI_LOG(ERR, "Fail to register req notifier handler.");
351c115fd00SJeff Guo 		goto error;
352c115fd00SJeff Guo 	}
353c115fd00SJeff Guo 
354d61138d4SHarman Kalra 	ret = rte_intr_enable(dev->vfio_req_intr_handle);
355c115fd00SJeff Guo 	if (ret) {
356849f773bSDavid Marchand 		PCI_LOG(ERR, "Fail to enable req notifier.");
357d61138d4SHarman Kalra 		ret = rte_intr_callback_unregister(dev->vfio_req_intr_handle,
358c115fd00SJeff Guo 						 pci_vfio_req_handler,
359c115fd00SJeff Guo 						 (void *)&dev->device);
360d59ba029SDarek Stojaczyk 		if (ret < 0)
361849f773bSDavid Marchand 			PCI_LOG(ERR, "Fail to unregister req notifier handler.");
362c115fd00SJeff Guo 		goto error;
363c115fd00SJeff Guo 	}
364c115fd00SJeff Guo 
365c115fd00SJeff Guo 	return 0;
366c115fd00SJeff Guo error:
367c115fd00SJeff Guo 	close(fd);
368c115fd00SJeff Guo 
369d61138d4SHarman Kalra 	rte_intr_fd_set(dev->vfio_req_intr_handle, -1);
370d61138d4SHarman Kalra 	rte_intr_type_set(dev->vfio_req_intr_handle, RTE_INTR_HANDLE_UNKNOWN);
371d61138d4SHarman Kalra 	rte_intr_dev_fd_set(dev->vfio_req_intr_handle, -1);
372c115fd00SJeff Guo 
373c115fd00SJeff Guo 	return -1;
374c115fd00SJeff Guo }
375c115fd00SJeff Guo 
376c115fd00SJeff Guo /* disable notifier (only disable req now) */
377c115fd00SJeff Guo static int
378c115fd00SJeff Guo pci_vfio_disable_notifier(struct rte_pci_device *dev)
379c115fd00SJeff Guo {
380c115fd00SJeff Guo 	int ret;
381c115fd00SJeff Guo 
382d61138d4SHarman Kalra 	ret = rte_intr_disable(dev->vfio_req_intr_handle);
383c115fd00SJeff Guo 	if (ret) {
384849f773bSDavid Marchand 		PCI_LOG(ERR, "fail to disable req notifier.");
385c115fd00SJeff Guo 		return -1;
386c115fd00SJeff Guo 	}
387c115fd00SJeff Guo 
388d61138d4SHarman Kalra 	ret = rte_intr_callback_unregister_sync(dev->vfio_req_intr_handle,
389c115fd00SJeff Guo 					   pci_vfio_req_handler,
390c115fd00SJeff Guo 					   (void *)&dev->device);
391d59ba029SDarek Stojaczyk 	if (ret < 0) {
392849f773bSDavid Marchand 		PCI_LOG(ERR, "fail to unregister req notifier handler.");
393c115fd00SJeff Guo 		return -1;
394c115fd00SJeff Guo 	}
395c115fd00SJeff Guo 
396d61138d4SHarman Kalra 	close(rte_intr_fd_get(dev->vfio_req_intr_handle));
397c115fd00SJeff Guo 
398d61138d4SHarman Kalra 	rte_intr_fd_set(dev->vfio_req_intr_handle, -1);
399d61138d4SHarman Kalra 	rte_intr_type_set(dev->vfio_req_intr_handle, RTE_INTR_HANDLE_UNKNOWN);
400d61138d4SHarman Kalra 	rte_intr_dev_fd_set(dev->vfio_req_intr_handle, -1);
401c115fd00SJeff Guo 
402c115fd00SJeff Guo 	return 0;
403c115fd00SJeff Guo }
404cda94419SJeff Guo #endif
405c115fd00SJeff Guo 
406c752998bSGaetan Rivet static int
4074b741542SChenbo Xia pci_vfio_is_ioport_bar(const struct rte_pci_device *dev, int vfio_dev_fd,
4084b741542SChenbo Xia 	int bar_index)
409c752998bSGaetan Rivet {
4104b741542SChenbo Xia 	uint64_t size, offset;
411c752998bSGaetan Rivet 	uint32_t ioport_bar;
412c752998bSGaetan Rivet 	int ret;
413c752998bSGaetan Rivet 
4144b741542SChenbo Xia 	if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX,
4154b741542SChenbo Xia 		&size, &offset) != 0) {
416849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot get offset of CONFIG region.");
4174b741542SChenbo Xia 		return -1;
4184b741542SChenbo Xia 	}
4194b741542SChenbo Xia 
420884f83ccSDavid Marchand 	ret = pread(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
42187146142SDavid Marchand 			  offset + RTE_PCI_BASE_ADDRESS_0 + bar_index * 4);
422c752998bSGaetan Rivet 	if (ret != sizeof(ioport_bar)) {
423849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot read command (%x) from config space!",
42487146142SDavid Marchand 			RTE_PCI_BASE_ADDRESS_0 + bar_index*4);
425c752998bSGaetan Rivet 		return -1;
426c752998bSGaetan Rivet 	}
427c752998bSGaetan Rivet 
42887146142SDavid Marchand 	return (ioport_bar & RTE_PCI_BASE_ADDRESS_SPACE_IO) != 0;
429c752998bSGaetan Rivet }
430c752998bSGaetan Rivet 
431c752998bSGaetan Rivet static int
43277dad68cSGaetan Rivet pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
433c752998bSGaetan Rivet {
434c752998bSGaetan Rivet 	if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
435849f773bSDavid Marchand 		PCI_LOG(ERR, "Error setting up interrupts!");
436c752998bSGaetan Rivet 		return -1;
437c752998bSGaetan Rivet 	}
438c752998bSGaetan Rivet 
4394b741542SChenbo Xia 	if (pci_vfio_enable_bus_memory(dev, vfio_dev_fd)) {
440849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot enable bus memory!");
44154f3fb12SHaiyue Wang 		return -1;
44254f3fb12SHaiyue Wang 	}
44354f3fb12SHaiyue Wang 
444b3d590a0SDavid Marchand 	if (rte_pci_set_bus_master(dev, true)) {
445849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot set up bus mastering!");
446c752998bSGaetan Rivet 		return -1;
447c752998bSGaetan Rivet 	}
448c752998bSGaetan Rivet 
4496fb00f8bSJerin Jacob 	/*
4506fb00f8bSJerin Jacob 	 * Reset the device. If the device is not capable of resetting,
4516fb00f8bSJerin Jacob 	 * then it updates errno as EINVAL.
4526fb00f8bSJerin Jacob 	 */
4536fb00f8bSJerin Jacob 	if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
454849f773bSDavid Marchand 		PCI_LOG(ERR, "Unable to reset device! Error: %d (%s)", errno, strerror(errno));
455f25f8f36SJonas Pfefferle 		return -1;
456f25f8f36SJonas Pfefferle 	}
457c752998bSGaetan Rivet 
458c752998bSGaetan Rivet 	return 0;
459c752998bSGaetan Rivet }
460c752998bSGaetan Rivet 
/*
 * Map one PCI BAR of a VFIO device into process memory.
 *
 * The whole BAR range is first reserved with an inaccessible anonymous
 * mapping, then the real device pages are mapped over it. If the BAR
 * hosts the MSI-X table, the table pages are skipped by splitting the
 * BAR into up to two sub-ranges (before and after the table), since
 * VFIO refuses to mmap the table itself.
 *
 * Returns 0 on success (including "BAR skipped"), -1 on mapping failure.
 */
static int
pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
		int bar_index, int additional_flags)
{
	/* up to two sub-ranges of the BAR to actually map */
	struct memreg {
		uint64_t offset;
		size_t   size;
	} memreg[2] = {};
	void *bar_addr;
	struct pci_msix_table *msix_table = &vfio_res->msix_table;
	struct pci_map *bar = &vfio_res->maps[bar_index];

	if (bar->size == 0) {
		PCI_LOG(DEBUG, "Bar size is 0, skip BAR%d", bar_index);
		return 0;
	}

	if (msix_table->bar_index == bar_index) {
		/*
		 * VFIO will not let us map the MSI-X table,
		 * but we can map around it.
		 */
		uint32_t table_start = msix_table->offset;
		uint32_t table_end = table_start + msix_table->size;
		/* expand the excluded hole to page boundaries */
		table_end = RTE_ALIGN(table_end, rte_mem_page_size());
		table_start = RTE_ALIGN_FLOOR(table_start, rte_mem_page_size());

		/* If page-aligned start of MSI-X table is less than the
		 * actual MSI-X table start address, reassign to the actual
		 * start address.
		 */
		if (table_start < msix_table->offset)
			table_start = msix_table->offset;

		if (table_start == 0 && table_end >= bar->size) {
			/* Cannot map this BAR */
			PCI_LOG(DEBUG, "Skipping BAR%d", bar_index);
			bar->size = 0;
			bar->addr = 0;
			return 0;
		}

		/* first part: from BAR start up to the table */
		memreg[0].offset = bar->offset;
		memreg[0].size = table_start;
		if (bar->size < table_end) {
			/*
			 * If MSI-X table end is beyond BAR end, don't attempt
			 * to perform second mapping.
			 */
			memreg[1].offset = 0;
			memreg[1].size = 0;
		} else {
			/* second part: from table end to BAR end */
			memreg[1].offset = bar->offset + table_end;
			memreg[1].size = bar->size - table_end;
		}

		PCI_LOG(DEBUG, "Trying to map BAR%d that contains the MSI-X table. "
			"Trying offsets: 0x%04" PRIx64 ":0x%04zx, 0x%04" PRIx64 ":0x%04zx",
			bar_index,
			memreg[0].offset, memreg[0].size,
			memreg[1].offset, memreg[1].size);
	} else {
		/* no MSI-X table in this BAR: map it in one piece */
		memreg[0].offset = bar->offset;
		memreg[0].size = bar->size;
	}

	/* reserve the address using an inaccessible mapping */
	bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
			MAP_ANONYMOUS | additional_flags, -1, 0);
	if (bar_addr != MAP_FAILED) {
		void *map_addr = NULL;
		if (memreg[0].size) {
			/* actual map of first part */
			map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
							memreg[0].offset,
							memreg[0].size,
							RTE_MAP_FORCE_ADDRESS);
		}

		/*
		 * Regarding "memreg[0].size == 0":
		 * If this BAR has MSI-X table, memreg[0].size (the
		 * first part or the part before the table) can
		 * legitimately be 0 for hardware using vector table
		 * offset 0 (i.e. first part does not exist).
		 *
		 * When memreg[0].size is 0, "mapping the first part"
		 * never happens, and map_addr is NULL at this
		 * point. So check that mapping has been actually
		 * attempted.
		 */
		/* if there's a second part, try to map it */
		if ((map_addr != NULL || memreg[0].size == 0)
			&& memreg[1].offset && memreg[1].size) {
			void *second_addr = RTE_PTR_ADD(bar_addr,
						(uintptr_t)(memreg[1].offset -
						bar->offset));
			map_addr = pci_map_resource(second_addr,
							vfio_dev_fd,
							memreg[1].offset,
							memreg[1].size,
							RTE_MAP_FORCE_ADDRESS);
		}

		if (map_addr == NULL) {
			/* release the whole reservation on failure */
			munmap(bar_addr, bar->size);
			bar_addr = MAP_FAILED;
			PCI_LOG(ERR, "Failed to map pci BAR%d", bar_index);
			return -1;
		}
	} else {
		PCI_LOG(ERR, "Failed to create inaccessible mapping for BAR%d", bar_index);
		return -1;
	}

	bar->addr = bar_addr;
	return 0;
}
579c752998bSGaetan Rivet 
/*
 * Map a PCI BAR using VFIO sparse-mmap information: the full BAR range
 * is reserved with an inaccessible anonymous mapping, then only the
 * mmap-able areas advertised by the kernel (bar->areas/nr_areas) are
 * mapped on top of it.
 *
 * Returns 0 on success, -1 on failure (nr_areas is reset to 0).
 */
static int
pci_vfio_sparse_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
		int bar_index, int additional_flags)
{
	struct pci_map *bar = &vfio_res->maps[bar_index];
	struct vfio_region_sparse_mmap_area *sparse;
	void *bar_addr;
	uint32_t i;

	if (bar->size == 0) {
		PCI_LOG(DEBUG, "Bar size is 0, skip BAR%d", bar_index);
		return 0;
	}

	/* reserve the address using an inaccessible mapping */
	bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
			MAP_ANONYMOUS | additional_flags, -1, 0);
	if (bar_addr != MAP_FAILED) {
		void *map_addr = NULL;
		/* overlay each kernel-advertised sparse area */
		for (i = 0; i < bar->nr_areas; i++) {
			sparse = &bar->areas[i];
			if (sparse->size) {
				/* area offset is relative to the BAR start */
				void *addr = RTE_PTR_ADD(bar_addr, (uintptr_t)sparse->offset);
				map_addr = pci_map_resource(addr, vfio_dev_fd,
					bar->offset + sparse->offset, sparse->size,
					RTE_MAP_FORCE_ADDRESS);
				if (map_addr == NULL) {
					/* unmapping the reservation drops any
					 * areas already mapped over it */
					munmap(bar_addr, bar->size);
					PCI_LOG(ERR, "Failed to map pci BAR%d", bar_index);
					goto err_map;
				}
			}
		}
	} else {
		PCI_LOG(ERR, "Failed to create inaccessible mapping for BAR%d", bar_index);
		goto err_map;
	}

	bar->addr = bar_addr;
	return 0;

err_map:
	bar->nr_areas = 0;
	return -1;
}
625f60abf97SMiao Li 
62603ba15caSAnatoly Burakov /*
62703ba15caSAnatoly Burakov  * region info may contain capability headers, so we need to keep reallocating
62803ba15caSAnatoly Burakov  * the memory until we match allocated memory size with argsz.
62903ba15caSAnatoly Burakov  */
63003ba15caSAnatoly Burakov static int
63103ba15caSAnatoly Burakov pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
63203ba15caSAnatoly Burakov 		int region)
63303ba15caSAnatoly Burakov {
63403ba15caSAnatoly Burakov 	struct vfio_region_info *ri;
63503ba15caSAnatoly Burakov 	size_t argsz = sizeof(*ri);
63603ba15caSAnatoly Burakov 	int ret;
63703ba15caSAnatoly Burakov 
63803ba15caSAnatoly Burakov 	ri = malloc(sizeof(*ri));
63903ba15caSAnatoly Burakov 	if (ri == NULL) {
640849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot allocate memory for VFIO region info");
64103ba15caSAnatoly Burakov 		return -1;
64203ba15caSAnatoly Burakov 	}
64303ba15caSAnatoly Burakov again:
64403ba15caSAnatoly Burakov 	memset(ri, 0, argsz);
64503ba15caSAnatoly Burakov 	ri->argsz = argsz;
64603ba15caSAnatoly Burakov 	ri->index = region;
64703ba15caSAnatoly Burakov 
64803ba15caSAnatoly Burakov 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
64903ba15caSAnatoly Burakov 	if (ret < 0) {
65003ba15caSAnatoly Burakov 		free(ri);
65103ba15caSAnatoly Burakov 		return ret;
65203ba15caSAnatoly Burakov 	}
65303ba15caSAnatoly Burakov 	if (ri->argsz != argsz) {
65403ba15caSAnatoly Burakov 		struct vfio_region_info *tmp;
65503ba15caSAnatoly Burakov 
65603ba15caSAnatoly Burakov 		argsz = ri->argsz;
65703ba15caSAnatoly Burakov 		tmp = realloc(ri, argsz);
65803ba15caSAnatoly Burakov 
65903ba15caSAnatoly Burakov 		if (tmp == NULL) {
66003ba15caSAnatoly Burakov 			/* realloc failed but the ri is still there */
66103ba15caSAnatoly Burakov 			free(ri);
662849f773bSDavid Marchand 			PCI_LOG(ERR, "Cannot reallocate memory for VFIO region info");
66303ba15caSAnatoly Burakov 			return -1;
66403ba15caSAnatoly Burakov 		}
66503ba15caSAnatoly Burakov 		ri = tmp;
66603ba15caSAnatoly Burakov 		goto again;
66703ba15caSAnatoly Burakov 	}
66803ba15caSAnatoly Burakov 	*info = ri;
66903ba15caSAnatoly Burakov 
67003ba15caSAnatoly Burakov 	return 0;
67103ba15caSAnatoly Burakov }
67203ba15caSAnatoly Burakov 
67303ba15caSAnatoly Burakov static struct vfio_info_cap_header *
67403ba15caSAnatoly Burakov pci_vfio_info_cap(struct vfio_region_info *info, int cap)
67503ba15caSAnatoly Burakov {
67603ba15caSAnatoly Burakov 	struct vfio_info_cap_header *h;
67703ba15caSAnatoly Burakov 	size_t offset;
67803ba15caSAnatoly Burakov 
67903ba15caSAnatoly Burakov 	if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
68003ba15caSAnatoly Burakov 		/* VFIO info does not advertise capabilities */
68103ba15caSAnatoly Burakov 		return NULL;
68203ba15caSAnatoly Burakov 	}
68303ba15caSAnatoly Burakov 
68403ba15caSAnatoly Burakov 	offset = VFIO_CAP_OFFSET(info);
68503ba15caSAnatoly Burakov 	while (offset != 0) {
68603ba15caSAnatoly Burakov 		h = RTE_PTR_ADD(info, offset);
68703ba15caSAnatoly Burakov 		if (h->id == cap)
68803ba15caSAnatoly Burakov 			return h;
68903ba15caSAnatoly Burakov 		offset = h->next;
69003ba15caSAnatoly Burakov 	}
69103ba15caSAnatoly Burakov 	return NULL;
69203ba15caSAnatoly Burakov }
69303ba15caSAnatoly Burakov 
69403ba15caSAnatoly Burakov static int
69503ba15caSAnatoly Burakov pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
69603ba15caSAnatoly Burakov {
6974b741542SChenbo Xia 	struct vfio_region_info *info = NULL;
69803ba15caSAnatoly Burakov 	int ret;
69903ba15caSAnatoly Burakov 
70003ba15caSAnatoly Burakov 	ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
70103ba15caSAnatoly Burakov 	if (ret < 0)
70203ba15caSAnatoly Burakov 		return -1;
70303ba15caSAnatoly Burakov 
70403ba15caSAnatoly Burakov 	ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
70503ba15caSAnatoly Burakov 
70603ba15caSAnatoly Burakov 	/* cleanup */
70703ba15caSAnatoly Burakov 	free(info);
70803ba15caSAnatoly Burakov 
70903ba15caSAnatoly Burakov 	return ret;
71003ba15caSAnatoly Burakov }
71103ba15caSAnatoly Burakov 
7124b741542SChenbo Xia static int
7134b741542SChenbo Xia pci_vfio_fill_regions(struct rte_pci_device *dev, int vfio_dev_fd,
7144b741542SChenbo Xia 		      struct vfio_device_info *device_info)
7154b741542SChenbo Xia {
7164b741542SChenbo Xia 	struct rte_pci_device_internal *pdev = RTE_PCI_DEVICE_INTERNAL(dev);
7174b741542SChenbo Xia 	struct vfio_region_info *reg = NULL;
7184b741542SChenbo Xia 	int nb_maps, i, ret;
7194b741542SChenbo Xia 
7204b741542SChenbo Xia 	nb_maps = RTE_MIN((int)device_info->num_regions,
7214b741542SChenbo Xia 			VFIO_PCI_CONFIG_REGION_INDEX + 1);
7224b741542SChenbo Xia 
7234b741542SChenbo Xia 	for (i = 0; i < nb_maps; i++) {
7244b741542SChenbo Xia 		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
7254b741542SChenbo Xia 		if (ret < 0) {
726849f773bSDavid Marchand 			PCI_LOG(DEBUG, "%s cannot get device region info error %i (%s)",
7274b741542SChenbo Xia 				dev->name, errno, strerror(errno));
7284b741542SChenbo Xia 			return -1;
7294b741542SChenbo Xia 		}
7304b741542SChenbo Xia 
7314b741542SChenbo Xia 		pdev->region[i].size = reg->size;
7324b741542SChenbo Xia 		pdev->region[i].offset = reg->offset;
7334b741542SChenbo Xia 
7344b741542SChenbo Xia 		free(reg);
7354b741542SChenbo Xia 	}
7364b741542SChenbo Xia 
7374b741542SChenbo Xia 	return 0;
7384b741542SChenbo Xia }
73903ba15caSAnatoly Burakov 
740c752998bSGaetan Rivet static int
741c752998bSGaetan Rivet pci_vfio_map_resource_primary(struct rte_pci_device *dev)
742c752998bSGaetan Rivet {
7434b741542SChenbo Xia 	struct rte_pci_device_internal *pdev = RTE_PCI_DEVICE_INTERNAL(dev);
744c752998bSGaetan Rivet 	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
7454b741542SChenbo Xia 	struct vfio_region_info *reg = NULL;
746c752998bSGaetan Rivet 	char pci_addr[PATH_MAX] = {0};
747c752998bSGaetan Rivet 	int vfio_dev_fd;
748c752998bSGaetan Rivet 	struct rte_pci_addr *loc = &dev->addr;
749f60abf97SMiao Li 	int i, j, ret;
750c752998bSGaetan Rivet 	struct mapped_pci_resource *vfio_res = NULL;
751c752998bSGaetan Rivet 	struct mapped_pci_res_list *vfio_res_list =
752c752998bSGaetan Rivet 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
753c752998bSGaetan Rivet 
754c752998bSGaetan Rivet 	struct pci_map *maps;
755c752998bSGaetan Rivet 
756d61138d4SHarman Kalra 	if (rte_intr_fd_set(dev->intr_handle, -1))
757d61138d4SHarman Kalra 		return -1;
758d61138d4SHarman Kalra 
759cda94419SJeff Guo #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
760d61138d4SHarman Kalra 	if (rte_intr_fd_set(dev->vfio_req_intr_handle, -1))
761d61138d4SHarman Kalra 		return -1;
762cda94419SJeff Guo #endif
763c752998bSGaetan Rivet 
764c752998bSGaetan Rivet 	/* store PCI address string */
765c752998bSGaetan Rivet 	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
766c752998bSGaetan Rivet 			loc->domain, loc->bus, loc->devid, loc->function);
767c752998bSGaetan Rivet 
768c52dd394SThomas Monjalon 	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
769c752998bSGaetan Rivet 					&vfio_dev_fd, &device_info);
770c752998bSGaetan Rivet 	if (ret)
771c752998bSGaetan Rivet 		return ret;
772c752998bSGaetan Rivet 
7733dae12acSDavid Marchand 	if (rte_intr_dev_fd_set(dev->intr_handle, vfio_dev_fd))
7743dae12acSDavid Marchand 		goto err_vfio_dev_fd;
7753dae12acSDavid Marchand 
776c752998bSGaetan Rivet 	/* allocate vfio_res and get region info */
777c752998bSGaetan Rivet 	vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
778c752998bSGaetan Rivet 	if (vfio_res == NULL) {
779849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot store VFIO mmap details");
780c752998bSGaetan Rivet 		goto err_vfio_dev_fd;
781c752998bSGaetan Rivet 	}
782c752998bSGaetan Rivet 	memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));
783c752998bSGaetan Rivet 
784c752998bSGaetan Rivet 	/* get number of registers (up to BAR5) */
785c752998bSGaetan Rivet 	vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
786c752998bSGaetan Rivet 			VFIO_PCI_BAR5_REGION_INDEX + 1);
787c752998bSGaetan Rivet 
788c752998bSGaetan Rivet 	/* map BARs */
789c752998bSGaetan Rivet 	maps = vfio_res->maps;
790c752998bSGaetan Rivet 
7914b741542SChenbo Xia 	ret = pci_vfio_get_region_info(vfio_dev_fd, &reg,
7924b741542SChenbo Xia 		VFIO_PCI_CONFIG_REGION_INDEX);
7934b741542SChenbo Xia 	if (ret < 0) {
794849f773bSDavid Marchand 		PCI_LOG(ERR, "%s cannot get device region info error %i (%s)",
7954b741542SChenbo Xia 			dev->name, errno, strerror(errno));
7964b741542SChenbo Xia 		goto err_vfio_res;
7974b741542SChenbo Xia 	}
7984b741542SChenbo Xia 	pdev->region[VFIO_PCI_CONFIG_REGION_INDEX].size = reg->size;
7994b741542SChenbo Xia 	pdev->region[VFIO_PCI_CONFIG_REGION_INDEX].offset = reg->offset;
8004b741542SChenbo Xia 	free(reg);
8014b741542SChenbo Xia 
802c752998bSGaetan Rivet 	vfio_res->msix_table.bar_index = -1;
803c752998bSGaetan Rivet 	/* get MSI-X BAR, if any (we have to know where it is because we can't
804c752998bSGaetan Rivet 	 * easily mmap it when using VFIO)
805c752998bSGaetan Rivet 	 */
8063dae12acSDavid Marchand 	ret = pci_vfio_get_msix_bar(dev, &vfio_res->msix_table);
807c752998bSGaetan Rivet 	if (ret < 0) {
808849f773bSDavid Marchand 		PCI_LOG(ERR, "%s cannot get MSI-X BAR number!", pci_addr);
80903ba15caSAnatoly Burakov 		goto err_vfio_res;
81003ba15caSAnatoly Burakov 	}
81103ba15caSAnatoly Burakov 	/* if we found our MSI-X BAR region, check if we can mmap it */
81203ba15caSAnatoly Burakov 	if (vfio_res->msix_table.bar_index != -1) {
81303ba15caSAnatoly Burakov 		int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
81403ba15caSAnatoly Burakov 				vfio_res->msix_table.bar_index);
81503ba15caSAnatoly Burakov 		if (ret < 0) {
816849f773bSDavid Marchand 			PCI_LOG(ERR, "Couldn't check if MSI-X BAR is mappable");
81703ba15caSAnatoly Burakov 			goto err_vfio_res;
81803ba15caSAnatoly Burakov 		} else if (ret != 0) {
81903ba15caSAnatoly Burakov 			/* we can map it, so we don't care where it is */
820849f773bSDavid Marchand 			PCI_LOG(DEBUG, "VFIO reports MSI-X BAR as mappable");
82103ba15caSAnatoly Burakov 			vfio_res->msix_table.bar_index = -1;
82203ba15caSAnatoly Burakov 		}
823c752998bSGaetan Rivet 	}
824c752998bSGaetan Rivet 
82540094b8eSYunjian Wang 	for (i = 0; i < vfio_res->nb_maps; i++) {
826c752998bSGaetan Rivet 		void *bar_addr;
827f60abf97SMiao Li 		struct vfio_info_cap_header *hdr;
828f60abf97SMiao Li 		struct vfio_region_info_cap_sparse_mmap *sparse;
829c752998bSGaetan Rivet 
83003ba15caSAnatoly Burakov 		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
83103ba15caSAnatoly Burakov 		if (ret < 0) {
832849f773bSDavid Marchand 			PCI_LOG(ERR, "%s cannot get device region info error %i (%s)",
833849f773bSDavid Marchand 				pci_addr, errno, strerror(errno));
834f60abf97SMiao Li 			goto err_map;
835c752998bSGaetan Rivet 		}
836c752998bSGaetan Rivet 
8374b741542SChenbo Xia 		pdev->region[i].size = reg->size;
8384b741542SChenbo Xia 		pdev->region[i].offset = reg->offset;
8394b741542SChenbo Xia 
840c752998bSGaetan Rivet 		/* chk for io port region */
8414b741542SChenbo Xia 		ret = pci_vfio_is_ioport_bar(dev, vfio_dev_fd, i);
84203ba15caSAnatoly Burakov 		if (ret < 0) {
84303ba15caSAnatoly Burakov 			free(reg);
844f60abf97SMiao Li 			goto err_map;
84503ba15caSAnatoly Burakov 		} else if (ret) {
846849f773bSDavid Marchand 			PCI_LOG(INFO, "Ignore mapping IO port bar(%d)", i);
84703ba15caSAnatoly Burakov 			free(reg);
848c752998bSGaetan Rivet 			continue;
849c752998bSGaetan Rivet 		}
850c752998bSGaetan Rivet 
8517be78d02SJosh Soref 		/* skip non-mmappable BARs */
85203ba15caSAnatoly Burakov 		if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
85303ba15caSAnatoly Burakov 			free(reg);
854c752998bSGaetan Rivet 			continue;
85503ba15caSAnatoly Burakov 		}
856c752998bSGaetan Rivet 
857c752998bSGaetan Rivet 		/* try mapping somewhere close to the end of hugepages */
858c752998bSGaetan Rivet 		if (pci_map_addr == NULL)
859c752998bSGaetan Rivet 			pci_map_addr = pci_find_max_end_va();
860c752998bSGaetan Rivet 
861c752998bSGaetan Rivet 		bar_addr = pci_map_addr;
86203ba15caSAnatoly Burakov 		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
863c752998bSGaetan Rivet 
864d25ab4b7SWangyu (Eric) 		pci_map_addr = RTE_PTR_ALIGN(pci_map_addr,
865d25ab4b7SWangyu (Eric) 					sysconf(_SC_PAGE_SIZE));
866d25ab4b7SWangyu (Eric) 
867c752998bSGaetan Rivet 		maps[i].addr = bar_addr;
86803ba15caSAnatoly Burakov 		maps[i].offset = reg->offset;
86903ba15caSAnatoly Burakov 		maps[i].size = reg->size;
870c752998bSGaetan Rivet 		maps[i].path = NULL; /* vfio doesn't have per-resource paths */
871c752998bSGaetan Rivet 
872f60abf97SMiao Li 		hdr = pci_vfio_info_cap(reg, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
873f60abf97SMiao Li 
874f60abf97SMiao Li 		if (hdr != NULL) {
875f60abf97SMiao Li 			sparse = container_of(hdr,
876f60abf97SMiao Li 				struct vfio_region_info_cap_sparse_mmap, header);
877f60abf97SMiao Li 			if (sparse->nr_areas > 0) {
878f60abf97SMiao Li 				maps[i].nr_areas = sparse->nr_areas;
879f60abf97SMiao Li 				maps[i].areas = rte_zmalloc(NULL,
880f60abf97SMiao Li 					sizeof(*maps[i].areas) * maps[i].nr_areas, 0);
881f60abf97SMiao Li 				if (maps[i].areas == NULL) {
882849f773bSDavid Marchand 					PCI_LOG(ERR, "Cannot alloc memory for sparse map areas");
883f60abf97SMiao Li 					goto err_map;
884f60abf97SMiao Li 				}
885f60abf97SMiao Li 				memcpy(maps[i].areas, sparse->areas,
886f60abf97SMiao Li 					sizeof(*maps[i].areas) * maps[i].nr_areas);
887f60abf97SMiao Li 			}
888f60abf97SMiao Li 		}
889f60abf97SMiao Li 
890f60abf97SMiao Li 		if (maps[i].nr_areas > 0) {
891f60abf97SMiao Li 			ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
892f60abf97SMiao Li 			if (ret < 0) {
893849f773bSDavid Marchand 				PCI_LOG(ERR, "%s sparse mapping BAR%i failed: %s",
894f60abf97SMiao Li 					pci_addr, i, strerror(errno));
895f60abf97SMiao Li 				free(reg);
896f60abf97SMiao Li 				goto err_map;
897f60abf97SMiao Li 			}
898f60abf97SMiao Li 		} else {
899c752998bSGaetan Rivet 			ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
900c752998bSGaetan Rivet 			if (ret < 0) {
901849f773bSDavid Marchand 				PCI_LOG(ERR, "%s mapping BAR%i failed: %s",
902c752998bSGaetan Rivet 					pci_addr, i, strerror(errno));
90303ba15caSAnatoly Burakov 				free(reg);
904f60abf97SMiao Li 				goto err_map;
905f60abf97SMiao Li 			}
906c752998bSGaetan Rivet 		}
907c752998bSGaetan Rivet 
908c752998bSGaetan Rivet 		dev->mem_resource[i].addr = maps[i].addr;
90903ba15caSAnatoly Burakov 
91003ba15caSAnatoly Burakov 		free(reg);
911c752998bSGaetan Rivet 	}
912c752998bSGaetan Rivet 
91377dad68cSGaetan Rivet 	if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
914849f773bSDavid Marchand 		PCI_LOG(ERR, "%s setup device failed", pci_addr);
915f60abf97SMiao Li 		goto err_map;
916c752998bSGaetan Rivet 	}
917c752998bSGaetan Rivet 
918cda94419SJeff Guo #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
919c115fd00SJeff Guo 	if (pci_vfio_enable_notifier(dev, vfio_dev_fd) != 0) {
920849f773bSDavid Marchand 		PCI_LOG(ERR, "Error setting up notifier!");
921f60abf97SMiao Li 		goto err_map;
922c115fd00SJeff Guo 	}
923c115fd00SJeff Guo 
924cda94419SJeff Guo #endif
925c752998bSGaetan Rivet 	TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);
926c752998bSGaetan Rivet 
927c752998bSGaetan Rivet 	return 0;
928f60abf97SMiao Li err_map:
929f60abf97SMiao Li 	for (j = 0; j < i; j++) {
930f60abf97SMiao Li 		if (maps[j].addr)
931f60abf97SMiao Li 			pci_unmap_resource(maps[j].addr, maps[j].size);
932f60abf97SMiao Li 		if (maps[j].nr_areas > 0)
933f60abf97SMiao Li 			rte_free(maps[j].areas);
934f60abf97SMiao Li 	}
935c752998bSGaetan Rivet err_vfio_res:
936c752998bSGaetan Rivet 	rte_free(vfio_res);
937c752998bSGaetan Rivet err_vfio_dev_fd:
9382a089d2dSYunjian Wang 	rte_vfio_release_device(rte_pci_get_sysfs_path(),
9392a089d2dSYunjian Wang 			pci_addr, vfio_dev_fd);
940c752998bSGaetan Rivet 	return -1;
941c752998bSGaetan Rivet }
942c752998bSGaetan Rivet 
943c752998bSGaetan Rivet static int
944c752998bSGaetan Rivet pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
945c752998bSGaetan Rivet {
946c752998bSGaetan Rivet 	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
947c752998bSGaetan Rivet 	char pci_addr[PATH_MAX] = {0};
948c752998bSGaetan Rivet 	int vfio_dev_fd;
949c752998bSGaetan Rivet 	struct rte_pci_addr *loc = &dev->addr;
950f60abf97SMiao Li 	int i, j, ret;
951c752998bSGaetan Rivet 	struct mapped_pci_resource *vfio_res = NULL;
952c752998bSGaetan Rivet 	struct mapped_pci_res_list *vfio_res_list =
953c752998bSGaetan Rivet 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
954c752998bSGaetan Rivet 
955c752998bSGaetan Rivet 	struct pci_map *maps;
956c752998bSGaetan Rivet 
957d61138d4SHarman Kalra 	if (rte_intr_fd_set(dev->intr_handle, -1))
958d61138d4SHarman Kalra 		return -1;
959cda94419SJeff Guo #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
960d61138d4SHarman Kalra 	if (rte_intr_fd_set(dev->vfio_req_intr_handle, -1))
961d61138d4SHarman Kalra 		return -1;
962cda94419SJeff Guo #endif
963c752998bSGaetan Rivet 
964c752998bSGaetan Rivet 	/* store PCI address string */
965c752998bSGaetan Rivet 	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
966c752998bSGaetan Rivet 			loc->domain, loc->bus, loc->devid, loc->function);
967c752998bSGaetan Rivet 
968c752998bSGaetan Rivet 	/* if we're in a secondary process, just find our tailq entry */
969c752998bSGaetan Rivet 	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
9700e3ef055SGaetan Rivet 		if (rte_pci_addr_cmp(&vfio_res->pci_addr,
971c752998bSGaetan Rivet 						 &dev->addr))
972c752998bSGaetan Rivet 			continue;
973c752998bSGaetan Rivet 		break;
974c752998bSGaetan Rivet 	}
975c752998bSGaetan Rivet 	/* if we haven't found our tailq entry, something's wrong */
976c752998bSGaetan Rivet 	if (vfio_res == NULL) {
977849f773bSDavid Marchand 		PCI_LOG(ERR, "%s cannot find TAILQ entry for PCI device!", pci_addr);
978047e3f9fSDarek Stojaczyk 		return -1;
979c752998bSGaetan Rivet 	}
980c752998bSGaetan Rivet 
981047e3f9fSDarek Stojaczyk 	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
982047e3f9fSDarek Stojaczyk 					&vfio_dev_fd, &device_info);
983047e3f9fSDarek Stojaczyk 	if (ret)
984047e3f9fSDarek Stojaczyk 		return ret;
985047e3f9fSDarek Stojaczyk 
9864b741542SChenbo Xia 	ret = pci_vfio_fill_regions(dev, vfio_dev_fd, &device_info);
9874b741542SChenbo Xia 	if (ret)
9884b741542SChenbo Xia 		return ret;
9894b741542SChenbo Xia 
990c752998bSGaetan Rivet 	/* map BARs */
991c752998bSGaetan Rivet 	maps = vfio_res->maps;
992c752998bSGaetan Rivet 
99340094b8eSYunjian Wang 	for (i = 0; i < vfio_res->nb_maps; i++) {
994f60abf97SMiao Li 		if (maps[i].nr_areas > 0) {
995f60abf97SMiao Li 			ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
996f60abf97SMiao Li 			if (ret < 0) {
997849f773bSDavid Marchand 				PCI_LOG(ERR, "%s sparse mapping BAR%i failed: %s",
998f60abf97SMiao Li 					pci_addr, i, strerror(errno));
999f60abf97SMiao Li 				goto err_vfio_dev_fd;
1000f60abf97SMiao Li 			}
1001f60abf97SMiao Li 		} else {
1002c752998bSGaetan Rivet 			ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
1003c752998bSGaetan Rivet 			if (ret < 0) {
1004849f773bSDavid Marchand 				PCI_LOG(ERR, "%s mapping BAR%i failed: %s",
1005c752998bSGaetan Rivet 					pci_addr, i, strerror(errno));
1006c752998bSGaetan Rivet 				goto err_vfio_dev_fd;
1007c752998bSGaetan Rivet 			}
1008f60abf97SMiao Li 		}
1009c752998bSGaetan Rivet 
1010c752998bSGaetan Rivet 		dev->mem_resource[i].addr = maps[i].addr;
1011c752998bSGaetan Rivet 	}
1012c752998bSGaetan Rivet 
1013ab53203eSQi Zhang 	/* we need save vfio_dev_fd, so it can be used during release */
1014d61138d4SHarman Kalra 	if (rte_intr_dev_fd_set(dev->intr_handle, vfio_dev_fd))
1015d61138d4SHarman Kalra 		goto err_vfio_dev_fd;
1016cda94419SJeff Guo #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
1017d61138d4SHarman Kalra 	if (rte_intr_dev_fd_set(dev->vfio_req_intr_handle, vfio_dev_fd))
1018d61138d4SHarman Kalra 		goto err_vfio_dev_fd;
1019cda94419SJeff Guo #endif
1020ab53203eSQi Zhang 
1021c752998bSGaetan Rivet 	return 0;
1022c752998bSGaetan Rivet err_vfio_dev_fd:
1023f60abf97SMiao Li 	for (j = 0; j < i; j++) {
1024f60abf97SMiao Li 		if (maps[j].addr)
1025f60abf97SMiao Li 			pci_unmap_resource(maps[j].addr, maps[j].size);
1026f60abf97SMiao Li 	}
10272a089d2dSYunjian Wang 	rte_vfio_release_device(rte_pci_get_sysfs_path(),
10282a089d2dSYunjian Wang 			pci_addr, vfio_dev_fd);
1029c752998bSGaetan Rivet 	return -1;
1030c752998bSGaetan Rivet }
1031c752998bSGaetan Rivet 
1032c752998bSGaetan Rivet /*
1033c752998bSGaetan Rivet  * map the PCI resources of a PCI device in virtual memory (VFIO version).
1034c752998bSGaetan Rivet  * primary and secondary processes follow almost exactly the same path
1035c752998bSGaetan Rivet  */
1036c752998bSGaetan Rivet int
1037c752998bSGaetan Rivet pci_vfio_map_resource(struct rte_pci_device *dev)
1038c752998bSGaetan Rivet {
1039c752998bSGaetan Rivet 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
1040c752998bSGaetan Rivet 		return pci_vfio_map_resource_primary(dev);
1041c752998bSGaetan Rivet 	else
1042c752998bSGaetan Rivet 		return pci_vfio_map_resource_secondary(dev);
1043c752998bSGaetan Rivet }
1044c752998bSGaetan Rivet 
1045ab53203eSQi Zhang static struct mapped_pci_resource *
1046ab53203eSQi Zhang find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
1047ab53203eSQi Zhang 			struct rte_pci_device *dev,
1048ab53203eSQi Zhang 			const char *pci_addr)
1049ab53203eSQi Zhang {
1050ab53203eSQi Zhang 	struct mapped_pci_resource *vfio_res = NULL;
1051ab53203eSQi Zhang 	struct pci_map *maps;
1052ab53203eSQi Zhang 	int i;
1053ab53203eSQi Zhang 
1054ab53203eSQi Zhang 	/* Get vfio_res */
1055ab53203eSQi Zhang 	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
1056ab53203eSQi Zhang 		if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr))
1057ab53203eSQi Zhang 			continue;
1058ab53203eSQi Zhang 		break;
1059ab53203eSQi Zhang 	}
1060ab53203eSQi Zhang 
1061ab53203eSQi Zhang 	if (vfio_res == NULL)
1062ab53203eSQi Zhang 		return vfio_res;
1063ab53203eSQi Zhang 
1064849f773bSDavid Marchand 	PCI_LOG(INFO, "Releasing PCI mapped resource for %s", pci_addr);
1065ab53203eSQi Zhang 
1066ab53203eSQi Zhang 	maps = vfio_res->maps;
106740094b8eSYunjian Wang 	for (i = 0; i < vfio_res->nb_maps; i++) {
1068ab53203eSQi Zhang 
1069ab53203eSQi Zhang 		/*
1070ab53203eSQi Zhang 		 * We do not need to be aware of MSI-X table BAR mappings as
1071ab53203eSQi Zhang 		 * when mapping. Just using current maps array is enough
1072ab53203eSQi Zhang 		 */
1073ab53203eSQi Zhang 		if (maps[i].addr) {
1074849f773bSDavid Marchand 			PCI_LOG(INFO, "Calling pci_unmap_resource for %s at %p",
1075ab53203eSQi Zhang 				pci_addr, maps[i].addr);
1076ab53203eSQi Zhang 			pci_unmap_resource(maps[i].addr, maps[i].size);
1077ab53203eSQi Zhang 		}
1078f60abf97SMiao Li 
1079f60abf97SMiao Li 		if (maps[i].nr_areas > 0)
1080f60abf97SMiao Li 			rte_free(maps[i].areas);
1081ab53203eSQi Zhang 	}
1082ab53203eSQi Zhang 
1083ab53203eSQi Zhang 	return vfio_res;
1084ab53203eSQi Zhang }
1085ab53203eSQi Zhang 
/*
 * Tear down a primary process' VFIO mapping for @dev.
 *
 * Order matters: the request notifier (when compiled in) is disabled first,
 * then the interrupt eventfd is closed, bus mastering is cleared, the VFIO
 * device is released, and finally the mapped-resource entry is unmapped,
 * unlinked from the tailq and freed.
 *
 * Returns 0 on success, negative value on failure.
 */
static int
pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
{
	char pci_addr[PATH_MAX] = {0};
	struct rte_pci_addr *loc = &dev->addr;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list;
	int ret, vfio_dev_fd;

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	ret = pci_vfio_disable_notifier(dev);
	if (ret) {
		PCI_LOG(ERR, "fail to disable req notifier.");
		return -1;
	}

#endif
	/* the interrupt eventfd must have been set up during mapping */
	if (rte_intr_fd_get(dev->intr_handle) < 0)
		return -1;

	if (close(rte_intr_fd_get(dev->intr_handle)) < 0) {
		PCI_LOG(INFO, "Error when closing eventfd file descriptor for %s", pci_addr);
		return -1;
	}

	vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
	if (vfio_dev_fd < 0)
		return -1;

	/* quiesce the device before handing it back to the kernel */
	if (rte_pci_set_bus_master(dev, false)) {
		PCI_LOG(ERR, "%s cannot unset bus mastering for PCI device!", pci_addr);
		return -1;
	}

	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
				      vfio_dev_fd);
	if (ret < 0) {
		PCI_LOG(ERR, "Cannot release VFIO device");
		return ret;
	}

	/* unmap the recorded BARs and fetch the tailq entry for this device */
	vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);

	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		PCI_LOG(ERR, "%s cannot find TAILQ entry for PCI device!", pci_addr);
		return -1;
	}

	TAILQ_REMOVE(vfio_res_list, vfio_res, next);
	rte_free(vfio_res);
	return 0;
}
1145c752998bSGaetan Rivet 
1146ab53203eSQi Zhang static int
1147ab53203eSQi Zhang pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev)
1148ab53203eSQi Zhang {
1149ab53203eSQi Zhang 	char pci_addr[PATH_MAX] = {0};
1150ab53203eSQi Zhang 	struct rte_pci_addr *loc = &dev->addr;
1151ab53203eSQi Zhang 	struct mapped_pci_resource *vfio_res = NULL;
1152ab53203eSQi Zhang 	struct mapped_pci_res_list *vfio_res_list;
1153d61138d4SHarman Kalra 	int ret, vfio_dev_fd;
1154ab53203eSQi Zhang 
1155ab53203eSQi Zhang 	/* store PCI address string */
1156ab53203eSQi Zhang 	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
1157ab53203eSQi Zhang 			loc->domain, loc->bus, loc->devid, loc->function);
1158ab53203eSQi Zhang 
1159d61138d4SHarman Kalra 	vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
1160aedd054cSHarman Kalra 	if (vfio_dev_fd < 0)
1161aedd054cSHarman Kalra 		return -1;
1162aedd054cSHarman Kalra 
1163ab53203eSQi Zhang 	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
1164d61138d4SHarman Kalra 				      vfio_dev_fd);
1165ab53203eSQi Zhang 	if (ret < 0) {
1166849f773bSDavid Marchand 		PCI_LOG(ERR, "Cannot release VFIO device");
1167ab53203eSQi Zhang 		return ret;
1168ab53203eSQi Zhang 	}
1169ab53203eSQi Zhang 
1170ab53203eSQi Zhang 	vfio_res_list =
1171ab53203eSQi Zhang 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
1172ab53203eSQi Zhang 	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);
1173ab53203eSQi Zhang 
1174ab53203eSQi Zhang 	/* if we haven't found our tailq entry, something's wrong */
1175ab53203eSQi Zhang 	if (vfio_res == NULL) {
1176849f773bSDavid Marchand 		PCI_LOG(ERR, "%s cannot find TAILQ entry for PCI device!", pci_addr);
1177ab53203eSQi Zhang 		return -1;
1178ab53203eSQi Zhang 	}
1179ab53203eSQi Zhang 
1180ab53203eSQi Zhang 	return 0;
1181ab53203eSQi Zhang }
1182ab53203eSQi Zhang 
1183ab53203eSQi Zhang int
1184ab53203eSQi Zhang pci_vfio_unmap_resource(struct rte_pci_device *dev)
1185ab53203eSQi Zhang {
1186ab53203eSQi Zhang 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
1187ab53203eSQi Zhang 		return pci_vfio_unmap_resource_primary(dev);
1188ab53203eSQi Zhang 	else
1189ab53203eSQi Zhang 		return pci_vfio_unmap_resource_secondary(dev);
1190ab53203eSQi Zhang }
1191ab53203eSQi Zhang 
/*
 * Set up @p so that BAR @bar of @dev can be accessed through VFIO region
 * read/write: no mmap is performed, p->base records the VFIO file offset
 * of the region and the ioport read/write helpers use pread/pwrite on the
 * device fd.
 *
 * In a secondary process the device fd and region table may not have been
 * set up yet; they are fetched lazily here on first use.
 *
 * Returns 0 on success, -1 on failure.
 */
int
pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
		    struct rte_pci_ioport *p)
{
	uint64_t size, offset;

	if (bar < VFIO_PCI_BAR0_REGION_INDEX ||
	    bar > VFIO_PCI_BAR5_REGION_INDEX) {
		PCI_LOG(ERR, "invalid bar (%d)!", bar);
		return -1;
	}

	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
		struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
		char pci_addr[PATH_MAX];
		int vfio_dev_fd;
		struct rte_pci_addr *loc = &dev->addr;

		/* store PCI address string */
		snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
				loc->domain, loc->bus, loc->devid, loc->function);

		vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
		if (vfio_dev_fd < 0) {
			return -1;
		} else if (vfio_dev_fd == 0) {
			/* NOTE(review): fd == 0 appears to mean "not set up yet"
			 * in this path — fetch the fd and cache region info now.
			 */
			if (rte_vfio_get_device_info(rte_pci_get_sysfs_path(), pci_addr,
				&vfio_dev_fd, &device_info) != 0)
				return -1;
			/* save vfio_dev_fd so it can be used during release */
			if (rte_intr_dev_fd_set(dev->intr_handle, vfio_dev_fd) != 0)
				return -1;

			if (pci_vfio_fill_regions(dev, vfio_dev_fd, &device_info) != 0)
				return -1;
		}
	}

	/* look up the cached size/offset of the requested region */
	if (pci_vfio_get_region(dev, bar, &size, &offset) != 0) {
		PCI_LOG(ERR, "Cannot get offset of region %d.", bar);
		return -1;
	}

	p->dev = dev;
	p->base = offset;
	return 0;
}
1239c752998bSGaetan Rivet 
1240c752998bSGaetan Rivet void
1241c752998bSGaetan Rivet pci_vfio_ioport_read(struct rte_pci_ioport *p,
1242c752998bSGaetan Rivet 		     void *data, size_t len, off_t offset)
1243c752998bSGaetan Rivet {
1244d61138d4SHarman Kalra 	const struct rte_intr_handle *intr_handle = p->dev->intr_handle;
1245d61138d4SHarman Kalra 	int vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
1246c752998bSGaetan Rivet 
1247aedd054cSHarman Kalra 	if (vfio_dev_fd < 0)
1248aedd054cSHarman Kalra 		return;
1249aedd054cSHarman Kalra 
1250884f83ccSDavid Marchand 	if (pread(vfio_dev_fd, data,
1251c752998bSGaetan Rivet 		    len, p->base + offset) <= 0)
1252849f773bSDavid Marchand 		PCI_LOG(ERR, "Can't read from PCI bar (%" PRIu64 ") : offset (%x)",
1253c752998bSGaetan Rivet 			VFIO_GET_REGION_IDX(p->base), (int)offset);
1254c752998bSGaetan Rivet }
1255c752998bSGaetan Rivet 
1256c752998bSGaetan Rivet void
1257c752998bSGaetan Rivet pci_vfio_ioport_write(struct rte_pci_ioport *p,
1258c752998bSGaetan Rivet 		      const void *data, size_t len, off_t offset)
1259c752998bSGaetan Rivet {
1260d61138d4SHarman Kalra 	const struct rte_intr_handle *intr_handle = p->dev->intr_handle;
1261d61138d4SHarman Kalra 	int vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
1262c752998bSGaetan Rivet 
1263aedd054cSHarman Kalra 	if (vfio_dev_fd < 0)
1264aedd054cSHarman Kalra 		return;
1265aedd054cSHarman Kalra 
1266884f83ccSDavid Marchand 	if (pwrite(vfio_dev_fd, data,
1267c752998bSGaetan Rivet 		     len, p->base + offset) <= 0)
1268849f773bSDavid Marchand 		PCI_LOG(ERR, "Can't write to PCI bar (%" PRIu64 ") : offset (%x)",
1269c752998bSGaetan Rivet 			VFIO_GET_REGION_IDX(p->base), (int)offset);
1270c752998bSGaetan Rivet }
1271c752998bSGaetan Rivet 
int
pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
{
	/* VFIO ioport regions are not mmapped, so there is nothing to undo */
	RTE_SET_USED(p);
	return -1;
}
1278c752998bSGaetan Rivet 
1279c752998bSGaetan Rivet int
1280095cf6e6SChenbo Xia pci_vfio_mmio_read(const struct rte_pci_device *dev, int bar,
1281095cf6e6SChenbo Xia 		   void *buf, size_t len, off_t offs)
1282095cf6e6SChenbo Xia {
1283095cf6e6SChenbo Xia 	uint64_t size, offset;
1284095cf6e6SChenbo Xia 	int fd;
1285095cf6e6SChenbo Xia 
1286095cf6e6SChenbo Xia 	fd = rte_intr_dev_fd_get(dev->intr_handle);
1287becb028aSChenbo Xia 	if (fd < 0)
1288becb028aSChenbo Xia 		return -1;
1289095cf6e6SChenbo Xia 
1290095cf6e6SChenbo Xia 	if (pci_vfio_get_region(dev, bar, &size, &offset) != 0)
1291095cf6e6SChenbo Xia 		return -1;
1292095cf6e6SChenbo Xia 
1293095cf6e6SChenbo Xia 	if ((uint64_t)len + offs > size)
1294095cf6e6SChenbo Xia 		return -1;
1295095cf6e6SChenbo Xia 
1296884f83ccSDavid Marchand 	return pread(fd, buf, len, offset + offs);
1297095cf6e6SChenbo Xia }
1298095cf6e6SChenbo Xia 
1299095cf6e6SChenbo Xia int
1300095cf6e6SChenbo Xia pci_vfio_mmio_write(const struct rte_pci_device *dev, int bar,
1301095cf6e6SChenbo Xia 		    const void *buf, size_t len, off_t offs)
1302095cf6e6SChenbo Xia {
1303095cf6e6SChenbo Xia 	uint64_t size, offset;
1304095cf6e6SChenbo Xia 	int fd;
1305095cf6e6SChenbo Xia 
1306095cf6e6SChenbo Xia 	fd = rte_intr_dev_fd_get(dev->intr_handle);
1307becb028aSChenbo Xia 	if (fd < 0)
1308becb028aSChenbo Xia 		return -1;
1309095cf6e6SChenbo Xia 
1310095cf6e6SChenbo Xia 	if (pci_vfio_get_region(dev, bar, &size, &offset) != 0)
1311095cf6e6SChenbo Xia 		return -1;
1312095cf6e6SChenbo Xia 
1313095cf6e6SChenbo Xia 	if ((uint64_t)len + offs > size)
1314095cf6e6SChenbo Xia 		return -1;
1315095cf6e6SChenbo Xia 
1316884f83ccSDavid Marchand 	return pwrite(fd, buf, len, offset + offs);
1317095cf6e6SChenbo Xia }
1318095cf6e6SChenbo Xia 
1319095cf6e6SChenbo Xia int
1320c752998bSGaetan Rivet pci_vfio_is_enabled(void)
1321c752998bSGaetan Rivet {
1322*5f7b9818SDavid Marchand 	int status = rte_vfio_is_enabled("vfio_pci");
1323*5f7b9818SDavid Marchand 
1324*5f7b9818SDavid Marchand 	if (!status) {
1325*5f7b9818SDavid Marchand 		rte_vfio_enable("vfio");
1326*5f7b9818SDavid Marchand 		status = rte_vfio_is_enabled("vfio_pci");
1327*5f7b9818SDavid Marchand 	}
1328*5f7b9818SDavid Marchand 	return status;
1329c752998bSGaetan Rivet }
1330c752998bSGaetan Rivet #endif
1331