/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <sys/eventfd.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdbool.h>

#include <rte_log.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_eal_paging.h>
#include <rte_malloc.h>
#include <rte_vfio.h>
#include <rte_eal.h>
#include <bus_driver.h>
#include <rte_spinlock.h>
#include <rte_tailq.h>

#include "eal_filesystem.h"

#include "pci_init.h"
#include "private.h"

/**
 * @file
 * PCI probing using Linux VFIO.
 *
 * This code determines whether the PCI device is bound to the VFIO driver,
 * and if so, initializes it (maps BARs, sets up interrupts).
 *
 */

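/*
 * Illustrative sketch (not part of this file): drivers normally reach these
 * helpers indirectly through the generic PCI accessors, e.g.
 *
 *	uint16_t vendor_id;
 *	if (rte_pci_read_config(dev, &vendor_id, sizeof(vendor_id), 0) < 0)
 *		return -1;
 *
 * which dispatches to pci_vfio_read_config() when the device is bound to
 * vfio-pci.
 */
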
#ifdef VFIO_PRESENT

static struct rte_tailq_elem rte_vfio_tailq = {
	.name = "VFIO_RESOURCE_LIST",
};
EAL_REGISTER_TAILQ(rte_vfio_tailq)

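/*
 * Look up the size and file offset of a VFIO region from the per-device
 * cache filled in at map time, sparing callers an extra
 * VFIO_DEVICE_GET_REGION_INFO ioctl on every access.
 */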
static int
pci_vfio_get_region(const struct rte_pci_device *dev, int index,
		    uint64_t *size, uint64_t *offset)
{
	const struct rte_pci_device_internal *pdev =
		RTE_PCI_DEVICE_INTERNAL_CONST(dev);

	if (index >= VFIO_PCI_NUM_REGIONS || index >= RTE_MAX_PCI_REGIONS)
		return -1;

	if (pdev->region[index].size == 0 && pdev->region[index].offset == 0)
		return -1;

	*size   = pdev->region[index].size;
	*offset = pdev->region[index].offset;

	return 0;
}

int
pci_vfio_read_config(const struct rte_pci_device *dev,
		    void *buf, size_t len, off_t offs)
{
	uint64_t size, offset;
	int fd;

	fd = rte_intr_dev_fd_get(dev->intr_handle);
	if (fd < 0)
		return -1;

	if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX,
				&size, &offset) != 0)
		return -1;

	if ((uint64_t)len + offs > size)
		return -1;

	return pread(fd, buf, len, offset + offs);
}

int
pci_vfio_write_config(const struct rte_pci_device *dev,
		    const void *buf, size_t len, off_t offs)
{
	uint64_t size, offset;
	int fd;

	fd = rte_intr_dev_fd_get(dev->intr_handle);
	if (fd < 0)
		return -1;

	if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX,
				&size, &offset) != 0)
		return -1;

	if ((uint64_t)len + offs > size)
		return -1;

	return pwrite(fd, buf, len, offset + offs);
}

/* get the PCI BAR number where the MSI-X table resides */
static int
pci_vfio_get_msix_bar(const struct rte_pci_device *dev,
	struct pci_msix_table *msix_table)
{
	off_t cap_offset;

	cap_offset = rte_pci_find_capability(dev, RTE_PCI_CAP_ID_MSIX);
	if (cap_offset < 0)
		return -1;

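	/*
	 * The MSI-X "table offset/BIR" register encodes the BAR index in its
	 * low 3 bits and the table offset in the remaining bits; the message
	 * control word encodes the table size as (entries - 1), each table
	 * entry being 16 bytes.
	 */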
	if (cap_offset != 0) {
		uint16_t flags;
		uint32_t reg;

		if (rte_pci_read_config(dev, &reg, sizeof(reg), cap_offset +
				RTE_PCI_MSIX_TABLE) < 0) {
			PCI_LOG(ERR, "Cannot read MSIX table from PCI config space!");
			return -1;
		}

		if (rte_pci_read_config(dev, &flags, sizeof(flags), cap_offset +
				RTE_PCI_MSIX_FLAGS) < 0) {
			PCI_LOG(ERR, "Cannot read MSIX flags from PCI config space!");
			return -1;
		}

		msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
		msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
		msix_table->size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
	}

	return 0;
}

/* enable PCI bus memory space */
static int
pci_vfio_enable_bus_memory(struct rte_pci_device *dev, int dev_fd)
{
	uint64_t size, offset;
	uint16_t cmd;
	int ret;

	if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX,
		&size, &offset) != 0) {
		PCI_LOG(ERR, "Cannot get offset of CONFIG region.");
		return -1;
	}

	ret = pread(dev_fd, &cmd, sizeof(cmd), offset + RTE_PCI_COMMAND);

	if (ret != sizeof(cmd)) {
		PCI_LOG(ERR, "Cannot read command from PCI config space!");
		return -1;
	}

	if (cmd & RTE_PCI_COMMAND_MEMORY)
		return 0;

	cmd |= RTE_PCI_COMMAND_MEMORY;
	ret = pwrite(dev_fd, &cmd, sizeof(cmd), offset + RTE_PCI_COMMAND);

	if (ret != sizeof(cmd)) {
		PCI_LOG(ERR, "Cannot write command to PCI config space!");
		return -1;
	}

	return 0;
}

/* set up interrupt support (but do not enable interrupts) */
static int
pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
{
	int i, ret, intr_idx;
	enum rte_intr_mode intr_mode;

	/* default to invalid index */
	intr_idx = VFIO_PCI_NUM_IRQS;

	/* Get default / configured intr_mode */
	intr_mode = rte_eal_vfio_intr_mode();

	/* get interrupt type from internal config (MSI-X by default, can be
	 * overridden from the command line)
	 */
	switch (intr_mode) {
	case RTE_INTR_MODE_MSIX:
		intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
		break;
	case RTE_INTR_MODE_MSI:
		intr_idx = VFIO_PCI_MSI_IRQ_INDEX;
		break;
	case RTE_INTR_MODE_LEGACY:
		intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
		break;
	/* don't do anything if we want to automatically determine interrupt type */
	case RTE_INTR_MODE_NONE:
		break;
	default:
		PCI_LOG(ERR, "Unknown default interrupt type!");
		return -1;
	}

	/* start from MSI-X: VFIO IRQ indices are ordered MSI-X > MSI > legacy
	 * INTx, so counting down tries the most capable mode first
	 */
	for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
		int fd = -1;

		/* skip interrupt modes we don't want */
		if (intr_mode != RTE_INTR_MODE_NONE &&
				i != intr_idx)
			continue;

		irq.index = i;

		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
		if (ret < 0) {
			PCI_LOG(ERR, "Cannot get VFIO IRQ info, error %i (%s)",
				errno, strerror(errno));
			return -1;
		}

		/* if this vector cannot be used with eventfd, fail if we explicitly
		 * specified interrupt type, otherwise continue */
		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
			if (intr_mode != RTE_INTR_MODE_NONE) {
				PCI_LOG(ERR, "Interrupt vector does not support eventfd!");
				return -1;
			} else
				continue;
		}

		/* Reallocate the efds and elist fields of intr_handle based
		 * on PCI device MSIX size.
		 */
		if (i == VFIO_PCI_MSIX_IRQ_INDEX &&
				(uint32_t)rte_intr_nb_intr_get(dev->intr_handle) < irq.count &&
				rte_intr_event_list_update(dev->intr_handle, irq.count))
			return -1;

		/* set up an eventfd for interrupts */
		fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
		if (fd < 0) {
			PCI_LOG(ERR, "Cannot set up eventfd, error %i (%s)",
				errno, strerror(errno));
			return -1;
		}

		if (rte_intr_fd_set(dev->intr_handle, fd))
			return -1;

		switch (i) {
		case VFIO_PCI_MSIX_IRQ_INDEX:
			intr_mode = RTE_INTR_MODE_MSIX;
			rte_intr_type_set(dev->intr_handle,
						 RTE_INTR_HANDLE_VFIO_MSIX);
			break;
		case VFIO_PCI_MSI_IRQ_INDEX:
			intr_mode = RTE_INTR_MODE_MSI;
			rte_intr_type_set(dev->intr_handle,
						 RTE_INTR_HANDLE_VFIO_MSI);
			break;
		case VFIO_PCI_INTX_IRQ_INDEX:
			intr_mode = RTE_INTR_MODE_LEGACY;
			rte_intr_type_set(dev->intr_handle,
						 RTE_INTR_HANDLE_VFIO_LEGACY);
			break;
		default:
			PCI_LOG(ERR, "Unknown interrupt type!");
			return -1;
		}

		return 0;
	}

	/* if we're here, we haven't found a suitable interrupt vector */
	return -1;
}

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
/*
 * Spinlock for device hot-unplug failure handling.
 * Any code that accesses the bus or a device (e.g. handling SIGBUS on the
 * bus, or a memory failure on a device) must take this lock to protect
 * the bus and the device against race conditions.
 */
static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;

static void
pci_vfio_req_handler(void *param)
{
	struct rte_bus *bus;
	int ret;
	struct rte_device *device = (struct rte_device *)param;

	rte_spinlock_lock(&failure_handle_lock);
	bus = rte_bus_find_by_device(device);
	if (bus == NULL) {
		PCI_LOG(ERR, "Cannot find bus for device (%s)", device->name);
		goto handle_end;
	}

	/*
	 * The VFIO kernel module asks user space to release allocated
	 * resources before the device is deleted in the kernel, so we can
	 * directly call the VFIO bus hot-unplug handler to process this.
	 */
	ret = bus->hot_unplug_handler(device);
	if (ret)
		PCI_LOG(ERR, "Cannot handle hot-unplug for device (%s)", device->name);
handle_end:
	rte_spinlock_unlock(&failure_handle_lock);
}

/* enable notifier (only the req notifier is enabled for now) */
static int
pci_vfio_enable_notifier(struct rte_pci_device *dev, int vfio_dev_fd)
{
	int ret;
	int fd = -1;

	/* set up an eventfd for req notifier */
	fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
	if (fd < 0) {
		PCI_LOG(ERR, "Cannot set up eventfd, error %i (%s)",
			errno, strerror(errno));
		return -1;
	}

	if (rte_intr_fd_set(dev->vfio_req_intr_handle, fd))
		return -1;

	if (rte_intr_type_set(dev->vfio_req_intr_handle, RTE_INTR_HANDLE_VFIO_REQ))
		return -1;

	if (rte_intr_dev_fd_set(dev->vfio_req_intr_handle, vfio_dev_fd))
		return -1;

	ret = rte_intr_callback_register(dev->vfio_req_intr_handle,
					 pci_vfio_req_handler,
					 (void *)&dev->device);
	if (ret) {
		PCI_LOG(ERR, "Failed to register req notifier handler.");
		goto error;
	}

	ret = rte_intr_enable(dev->vfio_req_intr_handle);
	if (ret) {
		PCI_LOG(ERR, "Failed to enable req notifier.");
		ret = rte_intr_callback_unregister(dev->vfio_req_intr_handle,
						 pci_vfio_req_handler,
						 (void *)&dev->device);
		if (ret < 0)
			PCI_LOG(ERR, "Failed to unregister req notifier handler.");
		goto error;
	}

	return 0;
error:
	close(fd);

	rte_intr_fd_set(dev->vfio_req_intr_handle, -1);
	rte_intr_type_set(dev->vfio_req_intr_handle, RTE_INTR_HANDLE_UNKNOWN);
	rte_intr_dev_fd_set(dev->vfio_req_intr_handle, -1);

	return -1;
}

/* disable notifier (only the req notifier is disabled for now) */
static int
pci_vfio_disable_notifier(struct rte_pci_device *dev)
{
	int ret;

	ret = rte_intr_disable(dev->vfio_req_intr_handle);
	if (ret) {
		PCI_LOG(ERR, "Failed to disable req notifier.");
		return -1;
	}

	ret = rte_intr_callback_unregister_sync(dev->vfio_req_intr_handle,
					   pci_vfio_req_handler,
					   (void *)&dev->device);
	if (ret < 0) {
		PCI_LOG(ERR, "Failed to unregister req notifier handler.");
		return -1;
	}

	close(rte_intr_fd_get(dev->vfio_req_intr_handle));

	rte_intr_fd_set(dev->vfio_req_intr_handle, -1);
	rte_intr_type_set(dev->vfio_req_intr_handle, RTE_INTR_HANDLE_UNKNOWN);
	rte_intr_dev_fd_set(dev->vfio_req_intr_handle, -1);

	return 0;
}
#endif

static int
pci_vfio_is_ioport_bar(const struct rte_pci_device *dev, int vfio_dev_fd,
	int bar_index)
{
	uint64_t size, offset;
	uint32_t ioport_bar;
	int ret;

	if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX,
		&size, &offset) != 0) {
		PCI_LOG(ERR, "Cannot get offset of CONFIG region.");
		return -1;
	}

	ret = pread(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
			  offset + RTE_PCI_BASE_ADDRESS_0 + bar_index * 4);
	if (ret != sizeof(ioport_bar)) {
		PCI_LOG(ERR, "Cannot read BAR (%x) from config space!",
			RTE_PCI_BASE_ADDRESS_0 + bar_index * 4);
		return -1;
	}

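	/* bit 0 of a BAR selects I/O space (1) over memory space (0) */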
	return (ioport_bar & RTE_PCI_BASE_ADDRESS_SPACE_IO) != 0;
}

static int
pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
{
	if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
		PCI_LOG(ERR, "Error setting up interrupts!");
		return -1;
	}

	if (pci_vfio_enable_bus_memory(dev, vfio_dev_fd)) {
		PCI_LOG(ERR, "Cannot enable bus memory!");
		return -1;
	}

	if (rte_pci_set_bus_master(dev, true)) {
		PCI_LOG(ERR, "Cannot set up bus mastering!");
		return -1;
	}

	/*
	 * Reset the device. If the device is not capable of resetting,
	 * the ioctl fails and errno is set to EINVAL.
	 */
	if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
		PCI_LOG(ERR, "Unable to reset device! Error: %d (%s)", errno, strerror(errno));
		return -1;
	}

	return 0;
}

static int
pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
		int bar_index, int additional_flags)
{
	struct memreg {
		uint64_t offset;
		size_t   size;
	} memreg[2] = {};
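	/*
	 * memreg[0] describes the part of the BAR before the MSI-X table
	 * (or the whole BAR when the table lives elsewhere), memreg[1] the
	 * part after it.
	 */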
	void *bar_addr;
	struct pci_msix_table *msix_table = &vfio_res->msix_table;
	struct pci_map *bar = &vfio_res->maps[bar_index];

	if (bar->size == 0) {
		PCI_LOG(DEBUG, "BAR size is 0, skip BAR%d", bar_index);
		return 0;
	}

	if (msix_table->bar_index == bar_index) {
		/*
		 * VFIO will not let us map the MSI-X table,
		 * but we can map around it.
		 */
		uint32_t table_start = msix_table->offset;
		uint32_t table_end = table_start + msix_table->size;
		table_end = RTE_ALIGN(table_end, rte_mem_page_size());
		table_start = RTE_ALIGN_FLOOR(table_start, rte_mem_page_size());

		/* If page-aligned start of MSI-X table is less than the
		 * actual MSI-X table start address, reassign to the actual
		 * start address.
		 */
		if (table_start < msix_table->offset)
			table_start = msix_table->offset;

		if (table_start == 0 && table_end >= bar->size) {
			/* Cannot map this BAR */
			PCI_LOG(DEBUG, "Skipping BAR%d", bar_index);
			bar->size = 0;
			bar->addr = 0;
			return 0;
		}

		memreg[0].offset = bar->offset;
		memreg[0].size = table_start;
		if (bar->size < table_end) {
			/*
			 * If MSI-X table end is beyond BAR end, don't attempt
			 * to perform second mapping.
			 */
			memreg[1].offset = 0;
			memreg[1].size = 0;
		} else {
			memreg[1].offset = bar->offset + table_end;
			memreg[1].size = bar->size - table_end;
		}

		PCI_LOG(DEBUG, "Trying to map BAR%d that contains the MSI-X table. "
			"Trying offsets: 0x%04" PRIx64 ":0x%04zx, 0x%04" PRIx64 ":0x%04zx",
			bar_index,
			memreg[0].offset, memreg[0].size,
			memreg[1].offset, memreg[1].size);
	} else {
		memreg[0].offset = bar->offset;
		memreg[0].size = bar->size;
	}

	/* reserve the address using an inaccessible mapping */
	bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
			MAP_ANONYMOUS | additional_flags, -1, 0);
	if (bar_addr != MAP_FAILED) {
		void *map_addr = NULL;
		if (memreg[0].size) {
			/* actual map of first part */
			map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
							memreg[0].offset,
							memreg[0].size,
							RTE_MAP_FORCE_ADDRESS);
		}

		/*
		 * Regarding "memreg[0].size == 0":
		 * If this BAR has an MSI-X table, memreg[0].size (the part
		 * before the table) can legitimately be 0 for hardware using
		 * vector table offset 0 (i.e. the first part does not exist).
		 *
		 * When memreg[0].size is 0, "mapping the first part" never
		 * happens, and map_addr is NULL at this point. So check that
		 * the mapping has actually been attempted.
		 */
		/* if there's a second part, try to map it */
		if ((map_addr != NULL || memreg[0].size == 0)
			&& memreg[1].offset && memreg[1].size) {
			void *second_addr = RTE_PTR_ADD(bar_addr,
						(uintptr_t)(memreg[1].offset -
						bar->offset));
			map_addr = pci_map_resource(second_addr,
							vfio_dev_fd,
							memreg[1].offset,
							memreg[1].size,
							RTE_MAP_FORCE_ADDRESS);
		}

		if (map_addr == NULL) {
			munmap(bar_addr, bar->size);
			bar_addr = MAP_FAILED;
			PCI_LOG(ERR, "Failed to map pci BAR%d", bar_index);
			return -1;
		}
	} else {
		PCI_LOG(ERR, "Failed to create inaccessible mapping for BAR%d", bar_index);
		return -1;
	}

	bar->addr = bar_addr;
	return 0;
}

static int
pci_vfio_sparse_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
		int bar_index, int additional_flags)
{
	struct pci_map *bar = &vfio_res->maps[bar_index];
	struct vfio_region_sparse_mmap_area *sparse;
	void *bar_addr;
	uint32_t i;

	if (bar->size == 0) {
		PCI_LOG(DEBUG, "BAR size is 0, skip BAR%d", bar_index);
		return 0;
	}

	/* reserve the address using an inaccessible mapping */
	bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
			MAP_ANONYMOUS | additional_flags, -1, 0);
	if (bar_addr != MAP_FAILED) {
		void *map_addr = NULL;
		for (i = 0; i < bar->nr_areas; i++) {
			sparse = &bar->areas[i];
			if (sparse->size) {
				void *addr = RTE_PTR_ADD(bar_addr, (uintptr_t)sparse->offset);
				map_addr = pci_map_resource(addr, vfio_dev_fd,
					bar->offset + sparse->offset, sparse->size,
					RTE_MAP_FORCE_ADDRESS);
				if (map_addr == NULL) {
					munmap(bar_addr, bar->size);
					PCI_LOG(ERR, "Failed to map pci BAR%d", bar_index);
					goto err_map;
				}
			}
		}
	} else {
		PCI_LOG(ERR, "Failed to create inaccessible mapping for BAR%d", bar_index);
		goto err_map;
	}

	bar->addr = bar_addr;
	return 0;

err_map:
	bar->nr_areas = 0;
	return -1;
}

/*
 * Region info may contain capability headers, so we need to keep
 * reallocating the memory until the allocated size matches the argsz
 * reported by the kernel.
 */
static int
pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
		int region)
{
	struct vfio_region_info *ri;
	size_t argsz = sizeof(*ri);
	int ret;

	ri = malloc(sizeof(*ri));
	if (ri == NULL) {
		PCI_LOG(ERR, "Cannot allocate memory for VFIO region info");
		return -1;
	}
again:
	memset(ri, 0, argsz);
	ri->argsz = argsz;
	ri->index = region;

	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
	if (ret < 0) {
		free(ri);
		return ret;
	}
	if (ri->argsz != argsz) {
		struct vfio_region_info *tmp;

		argsz = ri->argsz;
		tmp = realloc(ri, argsz);

		if (tmp == NULL) {
			/* realloc failed but ri is still there */
			free(ri);
			PCI_LOG(ERR, "Cannot reallocate memory for VFIO region info");
			return -1;
		}
		ri = tmp;
		goto again;
	}
	*info = ri;

	return 0;
}

static struct vfio_info_cap_header *
pci_vfio_info_cap(struct vfio_region_info *info, int cap)
{
	struct vfio_info_cap_header *h;
	size_t offset;

	if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
		/* VFIO info does not advertise capabilities */
		return NULL;
	}

	offset = VFIO_CAP_OFFSET(info);
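	/* a 'next' offset of zero terminates the capability chain */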
	while (offset != 0) {
		h = RTE_PTR_ADD(info, offset);
		if (h->id == cap)
			return h;
		offset = h->next;
	}
	return NULL;
}

static int
pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
{
	struct vfio_region_info *info = NULL;
	int ret;

	ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
	if (ret < 0)
		return -1;

	ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;

	/* cleanup */
	free(info);

	return ret;
}

static int
pci_vfio_fill_regions(struct rte_pci_device *dev, int vfio_dev_fd,
		      struct vfio_device_info *device_info)
{
	struct rte_pci_device_internal *pdev = RTE_PCI_DEVICE_INTERNAL(dev);
	struct vfio_region_info *reg = NULL;
	int nb_maps, i, ret;

	nb_maps = RTE_MIN((int)device_info->num_regions,
			VFIO_PCI_CONFIG_REGION_INDEX + 1);

	for (i = 0; i < nb_maps; i++) {
		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
		if (ret < 0) {
			PCI_LOG(DEBUG, "%s cannot get device region info error %i (%s)",
				dev->name, errno, strerror(errno));
			return -1;
		}

		pdev->region[i].size = reg->size;
		pdev->region[i].offset = reg->offset;

		free(reg);
	}

	return 0;
}

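/*
 * Set up a device in the primary process: obtain the VFIO device fd, cache
 * region info, locate the MSI-X table (so it can be mapped around if the
 * kernel does not allow mapping it), mmap the BARs, configure interrupts
 * and, where available, the request notifier, then publish the mappings
 * for secondary processes via the shared tailq.
 */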
static int
pci_vfio_map_resource_primary(struct rte_pci_device *dev)
{
	struct rte_pci_device_internal *pdev = RTE_PCI_DEVICE_INTERNAL(dev);
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	struct vfio_region_info *reg = NULL;
	char pci_addr[PATH_MAX] = {0};
	int vfio_dev_fd;
	struct rte_pci_addr *loc = &dev->addr;
	int i, j, ret;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

	struct pci_map *maps;

	if (rte_intr_fd_set(dev->intr_handle, -1))
		return -1;

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	if (rte_intr_fd_set(dev->vfio_req_intr_handle, -1))
		return -1;
#endif

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
					&vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	if (rte_intr_dev_fd_set(dev->intr_handle, vfio_dev_fd))
		goto err_vfio_dev_fd;

	/* allocate vfio_res and get region info */
	vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
	if (vfio_res == NULL) {
		PCI_LOG(ERR, "Cannot store VFIO mmap details");
		goto err_vfio_dev_fd;
	}
	memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));

	/* get number of regions (up to BAR5) */
	vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
			VFIO_PCI_BAR5_REGION_INDEX + 1);

	/* map BARs */
	maps = vfio_res->maps;

	ret = pci_vfio_get_region_info(vfio_dev_fd, &reg,
		VFIO_PCI_CONFIG_REGION_INDEX);
	if (ret < 0) {
		PCI_LOG(ERR, "%s cannot get device region info error %i (%s)",
			dev->name, errno, strerror(errno));
		goto err_vfio_res;
	}
	pdev->region[VFIO_PCI_CONFIG_REGION_INDEX].size = reg->size;
	pdev->region[VFIO_PCI_CONFIG_REGION_INDEX].offset = reg->offset;
	free(reg);

	vfio_res->msix_table.bar_index = -1;
	/* get MSI-X BAR, if any (we have to know where it is because we can't
	 * easily mmap it when using VFIO)
	 */
	ret = pci_vfio_get_msix_bar(dev, &vfio_res->msix_table);
	if (ret < 0) {
		PCI_LOG(ERR, "%s cannot get MSI-X BAR number!", pci_addr);
		goto err_vfio_res;
	}
	/* if we found our MSI-X BAR region, check if we can mmap it */
	if (vfio_res->msix_table.bar_index != -1) {
		int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
				vfio_res->msix_table.bar_index);
		if (ret < 0) {
			PCI_LOG(ERR, "Couldn't check if MSI-X BAR is mappable");
			goto err_vfio_res;
		} else if (ret != 0) {
			/* we can map it, so we don't care where it is */
			PCI_LOG(DEBUG, "VFIO reports MSI-X BAR as mappable");
			vfio_res->msix_table.bar_index = -1;
		}
	}

	for (i = 0; i < vfio_res->nb_maps; i++) {
		void *bar_addr;
		struct vfio_info_cap_header *hdr;
		struct vfio_region_info_cap_sparse_mmap *sparse;

		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
		if (ret < 0) {
			PCI_LOG(ERR, "%s cannot get device region info error %i (%s)",
				pci_addr, errno, strerror(errno));
			goto err_map;
		}

		pdev->region[i].size = reg->size;
		pdev->region[i].offset = reg->offset;

		/* check for I/O port region */
		ret = pci_vfio_is_ioport_bar(dev, vfio_dev_fd, i);
		if (ret < 0) {
			free(reg);
			goto err_map;
		} else if (ret) {
			PCI_LOG(INFO, "Ignore mapping IO port bar(%d)", i);
			free(reg);
			continue;
		}

		/* skip non-mmappable BARs */
		if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
			free(reg);
			continue;
		}

		/* try mapping somewhere close to the end of hugepages */
		if (pci_map_addr == NULL)
			pci_map_addr = pci_find_max_end_va();

		bar_addr = pci_map_addr;
		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);

		pci_map_addr = RTE_PTR_ALIGN(pci_map_addr,
					sysconf(_SC_PAGE_SIZE));

		maps[i].addr = bar_addr;
		maps[i].offset = reg->offset;
		maps[i].size = reg->size;
		maps[i].path = NULL; /* vfio doesn't have per-resource paths */

		hdr = pci_vfio_info_cap(reg, VFIO_REGION_INFO_CAP_SPARSE_MMAP);

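		/*
		 * If the kernel advertises sparse mmap areas for this region,
		 * record them so that only the mmappable sub-areas get mapped
		 * later.
		 */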
		if (hdr != NULL) {
			sparse = container_of(hdr,
				struct vfio_region_info_cap_sparse_mmap, header);
			if (sparse->nr_areas > 0) {
				maps[i].nr_areas = sparse->nr_areas;
				maps[i].areas = rte_zmalloc(NULL,
					sizeof(*maps[i].areas) * maps[i].nr_areas, 0);
				if (maps[i].areas == NULL) {
					PCI_LOG(ERR, "Cannot alloc memory for sparse map areas");
					goto err_map;
				}
				memcpy(maps[i].areas, sparse->areas,
					sizeof(*maps[i].areas) * maps[i].nr_areas);
			}
		}

		if (maps[i].nr_areas > 0) {
			ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
			if (ret < 0) {
				PCI_LOG(ERR, "%s sparse mapping BAR%i failed: %s",
					pci_addr, i, strerror(errno));
				free(reg);
				goto err_map;
			}
		} else {
			ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
			if (ret < 0) {
				PCI_LOG(ERR, "%s mapping BAR%i failed: %s",
					pci_addr, i, strerror(errno));
				free(reg);
				goto err_map;
			}
		}

		dev->mem_resource[i].addr = maps[i].addr;

		free(reg);
	}

	if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
		PCI_LOG(ERR, "%s setup device failed", pci_addr);
		goto err_map;
	}

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	if (pci_vfio_enable_notifier(dev, vfio_dev_fd) != 0) {
		PCI_LOG(ERR, "Error setting up notifier!");
		goto err_map;
	}
#endif

	TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);

	return 0;
err_map:
	for (j = 0; j < i; j++) {
		if (maps[j].addr)
			pci_unmap_resource(maps[j].addr, maps[j].size);
		if (maps[j].nr_areas > 0)
			rte_free(maps[j].areas);
	}
err_vfio_res:
	rte_free(vfio_res);
err_vfio_dev_fd:
	rte_vfio_release_device(rte_pci_get_sysfs_path(),
			pci_addr, vfio_dev_fd);
	return -1;
}

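/*
 * In a secondary process, reuse the mappings published by the primary:
 * find the device's tailq entry and remap every BAR at the same virtual
 * address with MAP_FIXED, so pointers shared between processes stay valid.
 */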
static int
pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
{
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	char pci_addr[PATH_MAX] = {0};
	int vfio_dev_fd;
	struct rte_pci_addr *loc = &dev->addr;
	int i, j, ret;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

	struct pci_map *maps;

	if (rte_intr_fd_set(dev->intr_handle, -1))
		return -1;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	if (rte_intr_fd_set(dev->vfio_req_intr_handle, -1))
		return -1;
#endif

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	/* if we're in a secondary process, just find our tailq entry */
	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
		if (rte_pci_addr_cmp(&vfio_res->pci_addr,
						 &dev->addr))
			continue;
		break;
	}
	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		PCI_LOG(ERR, "%s cannot find TAILQ entry for PCI device!", pci_addr);
		return -1;
	}

	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
					&vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	ret = pci_vfio_fill_regions(dev, vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	/* map BARs */
	maps = vfio_res->maps;

	for (i = 0; i < vfio_res->nb_maps; i++) {
		if (maps[i].nr_areas > 0) {
			ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
			if (ret < 0) {
				PCI_LOG(ERR, "%s sparse mapping BAR%i failed: %s",
					pci_addr, i, strerror(errno));
				goto err_vfio_dev_fd;
			}
		} else {
			ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
			if (ret < 0) {
				PCI_LOG(ERR, "%s mapping BAR%i failed: %s",
					pci_addr, i, strerror(errno));
				goto err_vfio_dev_fd;
			}
		}

		dev->mem_resource[i].addr = maps[i].addr;
	}

	/* we need to save vfio_dev_fd so it can be used during release */
	if (rte_intr_dev_fd_set(dev->intr_handle, vfio_dev_fd))
		goto err_vfio_dev_fd;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	if (rte_intr_dev_fd_set(dev->vfio_req_intr_handle, vfio_dev_fd))
		goto err_vfio_dev_fd;
#endif

	return 0;
err_vfio_dev_fd:
	for (j = 0; j < i; j++) {
		if (maps[j].addr)
			pci_unmap_resource(maps[j].addr, maps[j].size);
	}
	rte_vfio_release_device(rte_pci_get_sysfs_path(),
			pci_addr, vfio_dev_fd);
	return -1;
}

/*
 * Map the PCI resources of a PCI device into virtual memory (VFIO version).
 * Primary and secondary processes follow almost exactly the same path.
 */
int
pci_vfio_map_resource(struct rte_pci_device *dev)
{
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return pci_vfio_map_resource_primary(dev);
	else
		return pci_vfio_map_resource_secondary(dev);
}

static struct mapped_pci_resource *
find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
			struct rte_pci_device *dev,
			const char *pci_addr)
{
	struct mapped_pci_resource *vfio_res = NULL;
	struct pci_map *maps;
	int i;

	/* Get vfio_res */
	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
		if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr))
			continue;
		break;
	}

	if (vfio_res == NULL)
		return vfio_res;

	PCI_LOG(INFO, "Releasing PCI mapped resource for %s", pci_addr);

	maps = vfio_res->maps;
	for (i = 0; i < vfio_res->nb_maps; i++) {
		/*
		 * We do not need to treat the MSI-X table BAR specially here,
		 * as we did when mapping; just using the current maps array
		 * is enough.
		 */
		if (maps[i].addr) {
			PCI_LOG(INFO, "Calling pci_unmap_resource for %s at %p",
				pci_addr, maps[i].addr);
			pci_unmap_resource(maps[i].addr, maps[i].size);
		}

		if (maps[i].nr_areas > 0)
			rte_free(maps[i].areas);
	}

	return vfio_res;
}

static int
pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
{
	char pci_addr[PATH_MAX] = {0};
	struct rte_pci_addr *loc = &dev->addr;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list;
	int ret, vfio_dev_fd;

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	ret = pci_vfio_disable_notifier(dev);
	if (ret) {
		PCI_LOG(ERR, "Failed to disable req notifier.");
		return -1;
	}
#endif

	if (rte_intr_fd_get(dev->intr_handle) < 0)
		return -1;

	if (close(rte_intr_fd_get(dev->intr_handle)) < 0) {
		PCI_LOG(INFO, "Error when closing eventfd file descriptor for %s", pci_addr);
		return -1;
	}

	vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
	if (vfio_dev_fd < 0)
		return -1;

	if (rte_pci_set_bus_master(dev, false)) {
		PCI_LOG(ERR, "%s cannot unset bus mastering for PCI device!", pci_addr);
		return -1;
	}

	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
				      vfio_dev_fd);
	if (ret < 0) {
		PCI_LOG(ERR, "Cannot release VFIO device");
		return ret;
	}

	vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);

	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		PCI_LOG(ERR, "%s cannot find TAILQ entry for PCI device!", pci_addr);
		return -1;
	}

	TAILQ_REMOVE(vfio_res_list, vfio_res, next);
	rte_free(vfio_res);
	return 0;
}

static int
pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev)
{
	char pci_addr[PATH_MAX] = {0};
	struct rte_pci_addr *loc = &dev->addr;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list;
	int ret, vfio_dev_fd;

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
	if (vfio_dev_fd < 0)
		return -1;

	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
				      vfio_dev_fd);
	if (ret < 0) {
		PCI_LOG(ERR, "Cannot release VFIO device");
		return ret;
	}

	vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);

	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		PCI_LOG(ERR, "%s cannot find TAILQ entry for PCI device!", pci_addr);
		return -1;
	}

	return 0;
}

int
pci_vfio_unmap_resource(struct rte_pci_device *dev)
{
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return pci_vfio_unmap_resource_primary(dev);
	else
		return pci_vfio_unmap_resource_secondary(dev);
}

int
pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
		    struct rte_pci_ioport *p)
{
	uint64_t size, offset;

	if (bar < VFIO_PCI_BAR0_REGION_INDEX ||
	    bar > VFIO_PCI_BAR5_REGION_INDEX) {
		PCI_LOG(ERR, "invalid bar (%d)!", bar);
		return -1;
	}

	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
		struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
		char pci_addr[PATH_MAX];
		int vfio_dev_fd;
		struct rte_pci_addr *loc = &dev->addr;

		/* store PCI address string */
		snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
				loc->domain, loc->bus, loc->devid, loc->function);

		vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
		if (vfio_dev_fd < 0) {
			return -1;
		} else if (vfio_dev_fd == 0) {
			if (rte_vfio_get_device_info(rte_pci_get_sysfs_path(), pci_addr,
				&vfio_dev_fd, &device_info) != 0)
				return -1;
			/* save vfio_dev_fd so it can be used during release */
			if (rte_intr_dev_fd_set(dev->intr_handle, vfio_dev_fd) != 0)
				return -1;

			if (pci_vfio_fill_regions(dev, vfio_dev_fd, &device_info) != 0)
				return -1;
		}
	}

	if (pci_vfio_get_region(dev, bar, &size, &offset) != 0) {
		PCI_LOG(ERR, "Cannot get offset of region %d.", bar);
		return -1;
	}

	p->dev = dev;
	p->base = offset;
	return 0;
}

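/*
 * Illustrative sketch (not part of this file): drivers use the generic
 * ioport helpers, which route here for vfio-pci bound devices, e.g.
 *
 *	struct rte_pci_ioport p;
 *	uint8_t val;
 *	if (rte_pci_ioport_map(dev, 0, &p) == 0)
 *		rte_pci_ioport_read(&p, &val, sizeof(val), 0);
 */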
void
pci_vfio_ioport_read(struct rte_pci_ioport *p,
		     void *data, size_t len, off_t offset)
{
	const struct rte_intr_handle *intr_handle = p->dev->intr_handle;
	int vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);

	if (vfio_dev_fd < 0)
		return;

	if (pread(vfio_dev_fd, data,
		    len, p->base + offset) <= 0)
		PCI_LOG(ERR, "Can't read from PCI bar (%" PRIu64 ") : offset (%x)",
			VFIO_GET_REGION_IDX(p->base), (int)offset);
}

void
pci_vfio_ioport_write(struct rte_pci_ioport *p,
		      const void *data, size_t len, off_t offset)
{
	const struct rte_intr_handle *intr_handle = p->dev->intr_handle;
	int vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);

	if (vfio_dev_fd < 0)
		return;

	if (pwrite(vfio_dev_fd, data,
		     len, p->base + offset) <= 0)
		PCI_LOG(ERR, "Can't write to PCI bar (%" PRIu64 ") : offset (%x)",
			VFIO_GET_REGION_IDX(p->base), (int)offset);
}

int
pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
{
	RTE_SET_USED(p);
	return -1;
}

int
pci_vfio_mmio_read(const struct rte_pci_device *dev, int bar,
		   void *buf, size_t len, off_t offs)
{
	uint64_t size, offset;
	int fd;

	fd = rte_intr_dev_fd_get(dev->intr_handle);
	if (fd < 0)
		return -1;

	if (pci_vfio_get_region(dev, bar, &size, &offset) != 0)
		return -1;

	if ((uint64_t)len + offs > size)
		return -1;

	return pread(fd, buf, len, offset + offs);
}

int
pci_vfio_mmio_write(const struct rte_pci_device *dev, int bar,
		    const void *buf, size_t len, off_t offs)
{
	uint64_t size, offset;
	int fd;

	fd = rte_intr_dev_fd_get(dev->intr_handle);
	if (fd < 0)
		return -1;

	if (pci_vfio_get_region(dev, bar, &size, &offset) != 0)
		return -1;

	if ((uint64_t)len + offs > size)
		return -1;

	return pwrite(fd, buf, len, offset + offs);
}

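/*
 * Check whether VFIO is usable with the vfio-pci driver; if the first
 * check fails, try enabling VFIO support and check once more.
 */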
int
pci_vfio_is_enabled(void)
{
	int status = rte_vfio_is_enabled("vfio_pci");

	if (!status) {
		rte_vfio_enable("vfio");
		status = rte_vfio_is_enabled("vfio_pci");
	}
	return status;
}
#endif