xref: /dpdk/drivers/bus/pci/linux/pci_vfio.c (revision c7f5dba7d4bb7971fac51755aad09b71b10cef90)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4 
5 #include <string.h>
6 #include <fcntl.h>
7 #include <linux/pci_regs.h>
8 #include <sys/eventfd.h>
9 #include <sys/socket.h>
10 #include <sys/ioctl.h>
11 #include <sys/mman.h>
12 #include <stdbool.h>
13 
14 #include <rte_log.h>
15 #include <rte_pci.h>
16 #include <rte_bus_pci.h>
17 #include <rte_eal_memconfig.h>
18 #include <rte_malloc.h>
19 #include <rte_vfio.h>
20 
21 #include "eal_filesystem.h"
22 
23 #include "pci_init.h"
24 #include "private.h"
25 
26 /**
27  * @file
28  * PCI probing under linux (VFIO version)
29  *
30  * This code tries to determine if the PCI device is bound to VFIO driver,
31  * and initialize it (map BARs, set up interrupts) if that's the case.
32  *
33  * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
34  */
35 
36 #ifdef VFIO_PRESENT
37 
38 #define PAGE_SIZE   (sysconf(_SC_PAGESIZE))
39 #define PAGE_MASK   (~(PAGE_SIZE - 1))
40 
/* shared tailq holding one entry per VFIO-mapped PCI device, so that
 * secondary processes can look up and replay the primary's mappings */
static struct rte_tailq_elem rte_vfio_tailq = {
	.name = "VFIO_RESOURCE_LIST",
};
EAL_REGISTER_TAILQ(rte_vfio_tailq)
45 
46 int
47 pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
48 		    void *buf, size_t len, off_t offs)
49 {
50 	return pread64(intr_handle->vfio_dev_fd, buf, len,
51 	       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
52 }
53 
54 int
55 pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
56 		    const void *buf, size_t len, off_t offs)
57 {
58 	return pwrite64(intr_handle->vfio_dev_fd, buf, len,
59 	       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
60 }
61 
62 /* get PCI BAR number where MSI-X interrupts are */
63 static int
64 pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
65 {
66 	int ret;
67 	uint32_t reg;
68 	uint16_t flags;
69 	uint8_t cap_id, cap_offset;
70 
71 	/* read PCI capability pointer from config space */
72 	ret = pread64(fd, &reg, sizeof(reg),
73 			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
74 			PCI_CAPABILITY_LIST);
75 	if (ret != sizeof(reg)) {
76 		RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
77 				"config space!\n");
78 		return -1;
79 	}
80 
81 	/* we need first byte */
82 	cap_offset = reg & 0xFF;
83 
84 	while (cap_offset) {
85 
86 		/* read PCI capability ID */
87 		ret = pread64(fd, &reg, sizeof(reg),
88 				VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
89 				cap_offset);
90 		if (ret != sizeof(reg)) {
91 			RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI "
92 					"config space!\n");
93 			return -1;
94 		}
95 
96 		/* we need first byte */
97 		cap_id = reg & 0xFF;
98 
99 		/* if we haven't reached MSI-X, check next capability */
100 		if (cap_id != PCI_CAP_ID_MSIX) {
101 			ret = pread64(fd, &reg, sizeof(reg),
102 					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
103 					cap_offset);
104 			if (ret != sizeof(reg)) {
105 				RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
106 						"config space!\n");
107 				return -1;
108 			}
109 
110 			/* we need second byte */
111 			cap_offset = (reg & 0xFF00) >> 8;
112 
113 			continue;
114 		}
115 		/* else, read table offset */
116 		else {
117 			/* table offset resides in the next 4 bytes */
118 			ret = pread64(fd, &reg, sizeof(reg),
119 					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
120 					cap_offset + 4);
121 			if (ret != sizeof(reg)) {
122 				RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config "
123 						"space!\n");
124 				return -1;
125 			}
126 
127 			ret = pread64(fd, &flags, sizeof(flags),
128 					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
129 					cap_offset + 2);
130 			if (ret != sizeof(flags)) {
131 				RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config "
132 						"space!\n");
133 				return -1;
134 			}
135 
136 			msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
137 			msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
138 			msix_table->size =
139 				16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
140 
141 			return 0;
142 		}
143 	}
144 	return 0;
145 }
146 
147 /* set PCI bus mastering */
148 static int
149 pci_vfio_set_bus_master(int dev_fd, bool op)
150 {
151 	uint16_t reg;
152 	int ret;
153 
154 	ret = pread64(dev_fd, &reg, sizeof(reg),
155 			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
156 			PCI_COMMAND);
157 	if (ret != sizeof(reg)) {
158 		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
159 		return -1;
160 	}
161 
162 	if (op)
163 		/* set the master bit */
164 		reg |= PCI_COMMAND_MASTER;
165 	else
166 		reg &= ~(PCI_COMMAND_MASTER);
167 
168 	ret = pwrite64(dev_fd, &reg, sizeof(reg),
169 			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
170 			PCI_COMMAND);
171 
172 	if (ret != sizeof(reg)) {
173 		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
174 		return -1;
175 	}
176 
177 	return 0;
178 }
179 
180 /* set up interrupt support (but not enable interrupts) */
181 static int
182 pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
183 {
184 	int i, ret, intr_idx;
185 	enum rte_intr_mode intr_mode;
186 
187 	/* default to invalid index */
188 	intr_idx = VFIO_PCI_NUM_IRQS;
189 
190 	/* Get default / configured intr_mode */
191 	intr_mode = rte_eal_vfio_intr_mode();
192 
193 	/* get interrupt type from internal config (MSI-X by default, can be
194 	 * overridden from the command line
195 	 */
196 	switch (intr_mode) {
197 	case RTE_INTR_MODE_MSIX:
198 		intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
199 		break;
200 	case RTE_INTR_MODE_MSI:
201 		intr_idx = VFIO_PCI_MSI_IRQ_INDEX;
202 		break;
203 	case RTE_INTR_MODE_LEGACY:
204 		intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
205 		break;
206 	/* don't do anything if we want to automatically determine interrupt type */
207 	case RTE_INTR_MODE_NONE:
208 		break;
209 	default:
210 		RTE_LOG(ERR, EAL, "  unknown default interrupt type!\n");
211 		return -1;
212 	}
213 
214 	/* start from MSI-X interrupt type */
215 	for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
216 		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
217 		int fd = -1;
218 
219 		/* skip interrupt modes we don't want */
220 		if (intr_mode != RTE_INTR_MODE_NONE &&
221 				i != intr_idx)
222 			continue;
223 
224 		irq.index = i;
225 
226 		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
227 		if (ret < 0) {
228 			RTE_LOG(ERR, EAL, "  cannot get IRQ info, "
229 					"error %i (%s)\n", errno, strerror(errno));
230 			return -1;
231 		}
232 
233 		/* if this vector cannot be used with eventfd, fail if we explicitly
234 		 * specified interrupt type, otherwise continue */
235 		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
236 			if (intr_mode != RTE_INTR_MODE_NONE) {
237 				RTE_LOG(ERR, EAL,
238 						"  interrupt vector does not support eventfd!\n");
239 				return -1;
240 			} else
241 				continue;
242 		}
243 
244 		/* set up an eventfd for interrupts */
245 		fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
246 		if (fd < 0) {
247 			RTE_LOG(ERR, EAL, "  cannot set up eventfd, "
248 					"error %i (%s)\n", errno, strerror(errno));
249 			return -1;
250 		}
251 
252 		dev->intr_handle.fd = fd;
253 		dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
254 
255 		switch (i) {
256 		case VFIO_PCI_MSIX_IRQ_INDEX:
257 			intr_mode = RTE_INTR_MODE_MSIX;
258 			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
259 			break;
260 		case VFIO_PCI_MSI_IRQ_INDEX:
261 			intr_mode = RTE_INTR_MODE_MSI;
262 			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
263 			break;
264 		case VFIO_PCI_INTX_IRQ_INDEX:
265 			intr_mode = RTE_INTR_MODE_LEGACY;
266 			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
267 			break;
268 		default:
269 			RTE_LOG(ERR, EAL, "  unknown interrupt type!\n");
270 			return -1;
271 		}
272 
273 		return 0;
274 	}
275 
276 	/* if we're here, we haven't found a suitable interrupt vector */
277 	return -1;
278 }
279 
280 static int
281 pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index)
282 {
283 	uint32_t ioport_bar;
284 	int ret;
285 
286 	ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
287 			  VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
288 			  + PCI_BASE_ADDRESS_0 + bar_index*4);
289 	if (ret != sizeof(ioport_bar)) {
290 		RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n",
291 			PCI_BASE_ADDRESS_0 + bar_index*4);
292 		return -1;
293 	}
294 
295 	return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0;
296 }
297 
298 static int
299 pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
300 {
301 	if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
302 		RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
303 		return -1;
304 	}
305 
306 	/* set bus mastering for the device */
307 	if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
308 		RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
309 		return -1;
310 	}
311 
312 	/*
313 	 * Reset the device. If the device is not capable of resetting,
314 	 * then it updates errno as EINVAL.
315 	 */
316 	if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
317 		RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n",
318 				errno, strerror(errno));
319 		return -1;
320 	}
321 
322 	return 0;
323 }
324 
325 static int
326 pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
327 		int bar_index, int additional_flags)
328 {
329 	struct memreg {
330 		unsigned long offset, size;
331 	} memreg[2] = {};
332 	void *bar_addr;
333 	struct pci_msix_table *msix_table = &vfio_res->msix_table;
334 	struct pci_map *bar = &vfio_res->maps[bar_index];
335 
336 	if (bar->size == 0)
337 		/* Skip this BAR */
338 		return 0;
339 
340 	if (msix_table->bar_index == bar_index) {
341 		/*
342 		 * VFIO will not let us map the MSI-X table,
343 		 * but we can map around it.
344 		 */
345 		uint32_t table_start = msix_table->offset;
346 		uint32_t table_end = table_start + msix_table->size;
347 		table_end = (table_end + ~PAGE_MASK) & PAGE_MASK;
348 		table_start &= PAGE_MASK;
349 
350 		if (table_start == 0 && table_end >= bar->size) {
351 			/* Cannot map this BAR */
352 			RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index);
353 			bar->size = 0;
354 			bar->addr = 0;
355 			return 0;
356 		}
357 
358 		memreg[0].offset = bar->offset;
359 		memreg[0].size = table_start;
360 		memreg[1].offset = bar->offset + table_end;
361 		memreg[1].size = bar->size - table_end;
362 
363 		RTE_LOG(DEBUG, EAL,
364 			"Trying to map BAR%d that contains the MSI-X "
365 			"table. Trying offsets: "
366 			"0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", bar_index,
367 			memreg[0].offset, memreg[0].size,
368 			memreg[1].offset, memreg[1].size);
369 	} else {
370 		memreg[0].offset = bar->offset;
371 		memreg[0].size = bar->size;
372 	}
373 
374 	/* reserve the address using an inaccessible mapping */
375 	bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
376 			MAP_ANONYMOUS | additional_flags, -1, 0);
377 	if (bar_addr != MAP_FAILED) {
378 		void *map_addr = NULL;
379 		if (memreg[0].size) {
380 			/* actual map of first part */
381 			map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
382 							memreg[0].offset,
383 							memreg[0].size,
384 							MAP_FIXED);
385 		}
386 
387 		/* if there's a second part, try to map it */
388 		if (map_addr != MAP_FAILED
389 			&& memreg[1].offset && memreg[1].size) {
390 			void *second_addr = RTE_PTR_ADD(bar_addr,
391 							memreg[1].offset -
392 							(uintptr_t)bar->offset);
393 			map_addr = pci_map_resource(second_addr,
394 							vfio_dev_fd,
395 							memreg[1].offset,
396 							memreg[1].size,
397 							MAP_FIXED);
398 		}
399 
400 		if (map_addr == MAP_FAILED || !map_addr) {
401 			munmap(bar_addr, bar->size);
402 			bar_addr = MAP_FAILED;
403 			RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n",
404 					bar_index);
405 			return -1;
406 		}
407 	} else {
408 		RTE_LOG(ERR, EAL,
409 				"Failed to create inaccessible mapping for BAR%d\n",
410 				bar_index);
411 		return -1;
412 	}
413 
414 	bar->addr = bar_addr;
415 	return 0;
416 }
417 
418 /*
419  * region info may contain capability headers, so we need to keep reallocating
420  * the memory until we match allocated memory size with argsz.
421  */
422 static int
423 pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
424 		int region)
425 {
426 	struct vfio_region_info *ri;
427 	size_t argsz = sizeof(*ri);
428 	int ret;
429 
430 	ri = malloc(sizeof(*ri));
431 	if (ri == NULL) {
432 		RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
433 		return -1;
434 	}
435 again:
436 	memset(ri, 0, argsz);
437 	ri->argsz = argsz;
438 	ri->index = region;
439 
440 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
441 	if (ret < 0) {
442 		free(ri);
443 		return ret;
444 	}
445 	if (ri->argsz != argsz) {
446 		struct vfio_region_info *tmp;
447 
448 		argsz = ri->argsz;
449 		tmp = realloc(ri, argsz);
450 
451 		if (tmp == NULL) {
452 			/* realloc failed but the ri is still there */
453 			free(ri);
454 			RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
455 			return -1;
456 		}
457 		ri = tmp;
458 		goto again;
459 	}
460 	*info = ri;
461 
462 	return 0;
463 }
464 
465 static struct vfio_info_cap_header *
466 pci_vfio_info_cap(struct vfio_region_info *info, int cap)
467 {
468 	struct vfio_info_cap_header *h;
469 	size_t offset;
470 
471 	if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
472 		/* VFIO info does not advertise capabilities */
473 		return NULL;
474 	}
475 
476 	offset = VFIO_CAP_OFFSET(info);
477 	while (offset != 0) {
478 		h = RTE_PTR_ADD(info, offset);
479 		if (h->id == cap)
480 			return h;
481 		offset = h->next;
482 	}
483 	return NULL;
484 }
485 
486 static int
487 pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
488 {
489 	struct vfio_region_info *info;
490 	int ret;
491 
492 	ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
493 	if (ret < 0)
494 		return -1;
495 
496 	ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
497 
498 	/* cleanup */
499 	free(info);
500 
501 	return ret;
502 }
503 
504 
/*
 * Primary-process path for mapping a VFIO-bound PCI device: open the
 * device through the VFIO container, mmap all memory BARs (working
 * around the MSI-X table when the kernel refuses to map it), set up
 * interrupts and bus mastering, and record the mappings on the shared
 * tailq so secondary processes can replay them at the same addresses.
 * Returns 0 on success, <0 on failure (device fd closed, entry freed).
 */
static int
pci_vfio_map_resource_primary(struct rte_pci_device *dev)
{
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	char pci_addr[PATH_MAX] = {0};
	int vfio_dev_fd;
	struct rte_pci_addr *loc = &dev->addr;
	int i, ret;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

	struct pci_map *maps;

	/* no interrupt eventfd allocated yet */
	dev->intr_handle.fd = -1;

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
					&vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	/* allocate vfio_res and get region info */
	vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL,
			"%s(): cannot store uio mmap details\n", __func__);
		goto err_vfio_dev_fd;
	}
	memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));

	/* get number of registers (up to BAR5) */
	vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
			VFIO_PCI_BAR5_REGION_INDEX + 1);

	/* map BARs */
	maps = vfio_res->maps;

	/* -1 means "no MSI-X BAR to work around" */
	vfio_res->msix_table.bar_index = -1;
	/* get MSI-X BAR, if any (we have to know where it is because we can't
	 * easily mmap it when using VFIO)
	 */
	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
				pci_addr);
		goto err_vfio_res;
	}
	/* if we found our MSI-X BAR region, check if we can mmap it */
	if (vfio_res->msix_table.bar_index != -1) {
		int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
				vfio_res->msix_table.bar_index);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
			goto err_vfio_res;
		} else if (ret != 0) {
			/* we can map it, so we don't care where it is */
			RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
			vfio_res->msix_table.bar_index = -1;
		}
	}

	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
		struct vfio_region_info *reg = NULL;
		void *bar_addr;

		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "  %s cannot get device region info "
				"error %i (%s)\n", pci_addr, errno,
				strerror(errno));
			goto err_vfio_res;
		}

		/* chk for io port region */
		ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
		if (ret < 0) {
			free(reg);
			goto err_vfio_res;
		} else if (ret) {
			RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
					i);
			free(reg);
			continue;
		}

		/* skip non-mmapable BARs */
		if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
			free(reg);
			continue;
		}

		/* try mapping somewhere close to the end of hugepages */
		if (pci_map_addr == NULL)
			pci_map_addr = pci_find_max_end_va();

		bar_addr = pci_map_addr;
		/* advance the hint past this BAR for the next mapping */
		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);

		maps[i].addr = bar_addr;
		maps[i].offset = reg->offset;
		maps[i].size = reg->size;
		maps[i].path = NULL; /* vfio doesn't have per-resource paths */

		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
					pci_addr, i, strerror(errno));
			free(reg);
			goto err_vfio_res;
		}

		dev->mem_resource[i].addr = maps[i].addr;

		free(reg);
	}

	if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
		RTE_LOG(ERR, EAL, "  %s setup device failed\n", pci_addr);
		goto err_vfio_res;
	}

	/* publish the mappings for secondary processes */
	TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);

	return 0;
err_vfio_res:
	rte_free(vfio_res);
err_vfio_dev_fd:
	close(vfio_dev_fd);
	return -1;
}
639 
640 static int
641 pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
642 {
643 	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
644 	char pci_addr[PATH_MAX] = {0};
645 	int vfio_dev_fd;
646 	struct rte_pci_addr *loc = &dev->addr;
647 	int i, ret;
648 	struct mapped_pci_resource *vfio_res = NULL;
649 	struct mapped_pci_res_list *vfio_res_list =
650 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
651 
652 	struct pci_map *maps;
653 
654 	dev->intr_handle.fd = -1;
655 
656 	/* store PCI address string */
657 	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
658 			loc->domain, loc->bus, loc->devid, loc->function);
659 
660 	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
661 					&vfio_dev_fd, &device_info);
662 	if (ret)
663 		return ret;
664 
665 	/* if we're in a secondary process, just find our tailq entry */
666 	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
667 		if (rte_pci_addr_cmp(&vfio_res->pci_addr,
668 						 &dev->addr))
669 			continue;
670 		break;
671 	}
672 	/* if we haven't found our tailq entry, something's wrong */
673 	if (vfio_res == NULL) {
674 		RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI device!\n",
675 				pci_addr);
676 		goto err_vfio_dev_fd;
677 	}
678 
679 	/* map BARs */
680 	maps = vfio_res->maps;
681 
682 	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
683 		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
684 		if (ret < 0) {
685 			RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
686 					pci_addr, i, strerror(errno));
687 			goto err_vfio_dev_fd;
688 		}
689 
690 		dev->mem_resource[i].addr = maps[i].addr;
691 	}
692 
693 	/* we need save vfio_dev_fd, so it can be used during release */
694 	dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
695 
696 	return 0;
697 err_vfio_dev_fd:
698 	close(vfio_dev_fd);
699 	return -1;
700 }
701 
702 /*
703  * map the PCI resources of a PCI device in virtual memory (VFIO version).
704  * primary and secondary processes follow almost exactly the same path
705  */
706 int
707 pci_vfio_map_resource(struct rte_pci_device *dev)
708 {
709 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
710 		return pci_vfio_map_resource_primary(dev);
711 	else
712 		return pci_vfio_map_resource_secondary(dev);
713 }
714 
715 static struct mapped_pci_resource *
716 find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
717 			struct rte_pci_device *dev,
718 			const char *pci_addr)
719 {
720 	struct mapped_pci_resource *vfio_res = NULL;
721 	struct pci_map *maps;
722 	int i;
723 
724 	/* Get vfio_res */
725 	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
726 		if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr))
727 			continue;
728 		break;
729 	}
730 
731 	if  (vfio_res == NULL)
732 		return vfio_res;
733 
734 	RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n",
735 		pci_addr);
736 
737 	maps = vfio_res->maps;
738 	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
739 
740 		/*
741 		 * We do not need to be aware of MSI-X table BAR mappings as
742 		 * when mapping. Just using current maps array is enough
743 		 */
744 		if (maps[i].addr) {
745 			RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
746 				pci_addr, maps[i].addr);
747 			pci_unmap_resource(maps[i].addr, maps[i].size);
748 		}
749 	}
750 
751 	return vfio_res;
752 }
753 
754 static int
755 pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
756 {
757 	char pci_addr[PATH_MAX] = {0};
758 	struct rte_pci_addr *loc = &dev->addr;
759 	struct mapped_pci_resource *vfio_res = NULL;
760 	struct mapped_pci_res_list *vfio_res_list;
761 	int ret;
762 
763 	/* store PCI address string */
764 	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
765 			loc->domain, loc->bus, loc->devid, loc->function);
766 
767 	if (close(dev->intr_handle.fd) < 0) {
768 		RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n",
769 			pci_addr);
770 		return -1;
771 	}
772 
773 	if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) {
774 		RTE_LOG(ERR, EAL, "  %s cannot unset bus mastering for PCI device!\n",
775 				pci_addr);
776 		return -1;
777 	}
778 
779 	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
780 				  dev->intr_handle.vfio_dev_fd);
781 	if (ret < 0) {
782 		RTE_LOG(ERR, EAL,
783 			"%s(): cannot release device\n", __func__);
784 		return ret;
785 	}
786 
787 	vfio_res_list =
788 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
789 	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);
790 
791 	/* if we haven't found our tailq entry, something's wrong */
792 	if (vfio_res == NULL) {
793 		RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI device!\n",
794 				pci_addr);
795 		return -1;
796 	}
797 
798 	TAILQ_REMOVE(vfio_res_list, vfio_res, next);
799 
800 	return 0;
801 }
802 
803 static int
804 pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev)
805 {
806 	char pci_addr[PATH_MAX] = {0};
807 	struct rte_pci_addr *loc = &dev->addr;
808 	struct mapped_pci_resource *vfio_res = NULL;
809 	struct mapped_pci_res_list *vfio_res_list;
810 	int ret;
811 
812 	/* store PCI address string */
813 	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
814 			loc->domain, loc->bus, loc->devid, loc->function);
815 
816 	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
817 				  dev->intr_handle.vfio_dev_fd);
818 	if (ret < 0) {
819 		RTE_LOG(ERR, EAL,
820 			"%s(): cannot release device\n", __func__);
821 		return ret;
822 	}
823 
824 	vfio_res_list =
825 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
826 	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);
827 
828 	/* if we haven't found our tailq entry, something's wrong */
829 	if (vfio_res == NULL) {
830 		RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI device!\n",
831 				pci_addr);
832 		return -1;
833 	}
834 
835 	return 0;
836 }
837 
838 int
839 pci_vfio_unmap_resource(struct rte_pci_device *dev)
840 {
841 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
842 		return pci_vfio_unmap_resource_primary(dev);
843 	else
844 		return pci_vfio_unmap_resource_secondary(dev);
845 }
846 
847 int
848 pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
849 		    struct rte_pci_ioport *p)
850 {
851 	if (bar < VFIO_PCI_BAR0_REGION_INDEX ||
852 	    bar > VFIO_PCI_BAR5_REGION_INDEX) {
853 		RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar);
854 		return -1;
855 	}
856 
857 	p->dev = dev;
858 	p->base = VFIO_GET_REGION_ADDR(bar);
859 	return 0;
860 }
861 
862 void
863 pci_vfio_ioport_read(struct rte_pci_ioport *p,
864 		     void *data, size_t len, off_t offset)
865 {
866 	const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
867 
868 	if (pread64(intr_handle->vfio_dev_fd, data,
869 		    len, p->base + offset) <= 0)
870 		RTE_LOG(ERR, EAL,
871 			"Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n",
872 			VFIO_GET_REGION_IDX(p->base), (int)offset);
873 }
874 
875 void
876 pci_vfio_ioport_write(struct rte_pci_ioport *p,
877 		      const void *data, size_t len, off_t offset)
878 {
879 	const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
880 
881 	if (pwrite64(intr_handle->vfio_dev_fd, data,
882 		     len, p->base + offset) <= 0)
883 		RTE_LOG(ERR, EAL,
884 			"Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n",
885 			VFIO_GET_REGION_IDX(p->base), (int)offset);
886 }
887 
/* VFIO ioport access needs no per-port unmapping; not supported, always -1 */
int
pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
{
	RTE_SET_USED(p);
	return -1;
}
894 
/* report whether the vfio_pci kernel module is loaded and usable */
int
pci_vfio_is_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}
900 #endif
901