xref: /dpdk/drivers/bus/pci/linux/pci_vfio.c (revision 2490bb897182f57de80fd924dd3ae48dda819b8c)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4 
5 #include <string.h>
6 #include <fcntl.h>
7 #include <linux/pci_regs.h>
8 #include <sys/eventfd.h>
9 #include <sys/socket.h>
10 #include <sys/ioctl.h>
11 #include <sys/mman.h>
12 #include <stdbool.h>
13 
14 #include <rte_log.h>
15 #include <rte_pci.h>
16 #include <rte_bus_pci.h>
17 #include <rte_eal_paging.h>
18 #include <rte_malloc.h>
19 #include <rte_vfio.h>
20 #include <rte_eal.h>
21 #include <rte_bus.h>
22 #include <rte_spinlock.h>
23 #include <rte_tailq.h>
24 
25 #include "eal_filesystem.h"
26 
27 #include "pci_init.h"
28 #include "private.h"
29 
30 /**
31  * @file
32  * PCI probing using Linux VFIO.
33  *
34  * This code tries to determine if the PCI device is bound to VFIO driver,
35  * and initialize it (map BARs, set up interrupts) if that's the case.
36  *
37  */
38 
39 #ifdef VFIO_PRESENT
40 
41 static struct rte_tailq_elem rte_vfio_tailq = {
42 	.name = "VFIO_RESOURCE_LIST",
43 };
44 EAL_REGISTER_TAILQ(rte_vfio_tailq)
45 
46 int
47 pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
48 		    void *buf, size_t len, off_t offs)
49 {
50 	int vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
51 
52 	if (vfio_dev_fd < 0)
53 		return -1;
54 
55 	return pread64(vfio_dev_fd, buf, len,
56 	       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
57 }
58 
59 int
60 pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
61 		    const void *buf, size_t len, off_t offs)
62 {
63 	int vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
64 
65 	if (vfio_dev_fd < 0)
66 		return -1;
67 
68 	return pwrite64(vfio_dev_fd, buf, len,
69 	       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
70 }
71 
/* get PCI BAR number where MSI-X interrupts are */
static int
pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
{
	int ret;
	uint32_t reg;
	uint16_t flags;
	uint8_t cap_id, cap_offset;

	/* read PCI capability pointer from config space */
	ret = pread64(fd, &reg, sizeof(reg),
			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
			PCI_CAPABILITY_LIST);
	if (ret != sizeof(reg)) {
		RTE_LOG(ERR, EAL,
			"Cannot read capability pointer from PCI config space!\n");
		return -1;
	}

	/* we need first byte */
	cap_offset = reg & 0xFF;

	/* walk the capability linked list; an offset of 0 terminates it */
	while (cap_offset) {

		/* read PCI capability ID */
		ret = pread64(fd, &reg, sizeof(reg),
				VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
				cap_offset);
		if (ret != sizeof(reg)) {
			RTE_LOG(ERR, EAL,
				"Cannot read capability ID from PCI config space!\n");
			return -1;
		}

		/* we need first byte */
		cap_id = reg & 0xFF;

		/* if we haven't reached MSI-X, check next capability */
		if (cap_id != PCI_CAP_ID_MSIX) {
			/* re-read the header to get the "next" pointer */
			ret = pread64(fd, &reg, sizeof(reg),
					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
					cap_offset);
			if (ret != sizeof(reg)) {
				RTE_LOG(ERR, EAL,
					"Cannot read capability pointer from PCI config space!\n");
				return -1;
			}

			/* we need second byte */
			cap_offset = (reg & 0xFF00) >> 8;

			continue;
		}
		/* else, read table offset */
		else {
			/* table offset resides in the next 4 bytes */
			ret = pread64(fd, &reg, sizeof(reg),
					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
					cap_offset + 4);
			if (ret != sizeof(reg)) {
				RTE_LOG(ERR, EAL,
					"Cannot read table offset from PCI config space!\n");
				return -1;
			}

			/* message control word is 2 bytes into the capability */
			ret = pread64(fd, &flags, sizeof(flags),
					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
					cap_offset + 2);
			if (ret != sizeof(flags)) {
				RTE_LOG(ERR, EAL,
					"Cannot read table flags from PCI config space!\n");
				return -1;
			}

			/* low bits of the table dword select the BAR (BIR),
			 * the rest is the offset of the table within that BAR
			 */
			msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
			msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
			/* table size: 16 bytes per entry, QSIZE is N-1 encoded */
			msix_table->size =
				16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));

			return 0;
		}
	}
	/* no MSI-X capability found: success, msix_table left untouched
	 * (the caller pre-initializes bar_index to -1)
	 */
	return 0;
}
156 
157 /* enable PCI bus memory space */
158 static int
159 pci_vfio_enable_bus_memory(int dev_fd)
160 {
161 	uint16_t cmd;
162 	int ret;
163 
164 	ret = pread64(dev_fd, &cmd, sizeof(cmd),
165 		      VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
166 		      PCI_COMMAND);
167 
168 	if (ret != sizeof(cmd)) {
169 		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
170 		return -1;
171 	}
172 
173 	if (cmd & PCI_COMMAND_MEMORY)
174 		return 0;
175 
176 	cmd |= PCI_COMMAND_MEMORY;
177 	ret = pwrite64(dev_fd, &cmd, sizeof(cmd),
178 		       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
179 		       PCI_COMMAND);
180 
181 	if (ret != sizeof(cmd)) {
182 		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
183 		return -1;
184 	}
185 
186 	return 0;
187 }
188 
189 /* set PCI bus mastering */
190 static int
191 pci_vfio_set_bus_master(int dev_fd, bool op)
192 {
193 	uint16_t reg;
194 	int ret;
195 
196 	ret = pread64(dev_fd, &reg, sizeof(reg),
197 			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
198 			PCI_COMMAND);
199 	if (ret != sizeof(reg)) {
200 		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
201 		return -1;
202 	}
203 
204 	if (op)
205 		/* set the master bit */
206 		reg |= PCI_COMMAND_MASTER;
207 	else
208 		reg &= ~(PCI_COMMAND_MASTER);
209 
210 	ret = pwrite64(dev_fd, &reg, sizeof(reg),
211 			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
212 			PCI_COMMAND);
213 
214 	if (ret != sizeof(reg)) {
215 		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
216 		return -1;
217 	}
218 
219 	return 0;
220 }
221 
222 /* set up interrupt support (but not enable interrupts) */
223 static int
224 pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
225 {
226 	int i, ret, intr_idx;
227 	enum rte_intr_mode intr_mode;
228 
229 	/* default to invalid index */
230 	intr_idx = VFIO_PCI_NUM_IRQS;
231 
232 	/* Get default / configured intr_mode */
233 	intr_mode = rte_eal_vfio_intr_mode();
234 
235 	/* get interrupt type from internal config (MSI-X by default, can be
236 	 * overridden from the command line
237 	 */
238 	switch (intr_mode) {
239 	case RTE_INTR_MODE_MSIX:
240 		intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
241 		break;
242 	case RTE_INTR_MODE_MSI:
243 		intr_idx = VFIO_PCI_MSI_IRQ_INDEX;
244 		break;
245 	case RTE_INTR_MODE_LEGACY:
246 		intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
247 		break;
248 	/* don't do anything if we want to automatically determine interrupt type */
249 	case RTE_INTR_MODE_NONE:
250 		break;
251 	default:
252 		RTE_LOG(ERR, EAL, "Unknown default interrupt type!\n");
253 		return -1;
254 	}
255 
256 	/* start from MSI-X interrupt type */
257 	for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
258 		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
259 		int fd = -1;
260 
261 		/* skip interrupt modes we don't want */
262 		if (intr_mode != RTE_INTR_MODE_NONE &&
263 				i != intr_idx)
264 			continue;
265 
266 		irq.index = i;
267 
268 		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
269 		if (ret < 0) {
270 			RTE_LOG(ERR, EAL, "Cannot get VFIO IRQ info, error "
271 					"%i (%s)\n", errno, strerror(errno));
272 			return -1;
273 		}
274 
275 		/* if this vector cannot be used with eventfd, fail if we explicitly
276 		 * specified interrupt type, otherwise continue */
277 		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
278 			if (intr_mode != RTE_INTR_MODE_NONE) {
279 				RTE_LOG(ERR, EAL,
280 					"Interrupt vector does not support eventfd!\n");
281 				return -1;
282 			} else
283 				continue;
284 		}
285 
286 		/* Reallocate the efds and elist fields of intr_handle based
287 		 * on PCI device MSIX size.
288 		 */
289 		if (i == VFIO_PCI_MSIX_IRQ_INDEX &&
290 				(uint32_t)rte_intr_nb_intr_get(dev->intr_handle) < irq.count &&
291 				rte_intr_event_list_update(dev->intr_handle, irq.count))
292 			return -1;
293 
294 		/* set up an eventfd for interrupts */
295 		fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
296 		if (fd < 0) {
297 			RTE_LOG(ERR, EAL, "Cannot set up eventfd, error "
298 					"%i (%s)\n", errno, strerror(errno));
299 			return -1;
300 		}
301 
302 		if (rte_intr_fd_set(dev->intr_handle, fd))
303 			return -1;
304 
305 		if (rte_intr_dev_fd_set(dev->intr_handle, vfio_dev_fd))
306 			return -1;
307 
308 		switch (i) {
309 		case VFIO_PCI_MSIX_IRQ_INDEX:
310 			intr_mode = RTE_INTR_MODE_MSIX;
311 			rte_intr_type_set(dev->intr_handle,
312 						 RTE_INTR_HANDLE_VFIO_MSIX);
313 			break;
314 		case VFIO_PCI_MSI_IRQ_INDEX:
315 			intr_mode = RTE_INTR_MODE_MSI;
316 			rte_intr_type_set(dev->intr_handle,
317 						 RTE_INTR_HANDLE_VFIO_MSI);
318 			break;
319 		case VFIO_PCI_INTX_IRQ_INDEX:
320 			intr_mode = RTE_INTR_MODE_LEGACY;
321 			rte_intr_type_set(dev->intr_handle,
322 						 RTE_INTR_HANDLE_VFIO_LEGACY);
323 			break;
324 		default:
325 			RTE_LOG(ERR, EAL, "Unknown interrupt type!\n");
326 			return -1;
327 		}
328 
329 		return 0;
330 	}
331 
332 	/* if we're here, we haven't found a suitable interrupt vector */
333 	return -1;
334 }
335 
336 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
337 /*
338  * Spinlock for device hot-unplug failure handling.
339  * If it tries to access bus or device, such as handle sigbus on bus
340  * or handle memory failure for device, just need to use this lock.
341  * It could protect the bus and the device to avoid race condition.
342  */
343 static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
344 
345 static void
346 pci_vfio_req_handler(void *param)
347 {
348 	struct rte_bus *bus;
349 	int ret;
350 	struct rte_device *device = (struct rte_device *)param;
351 
352 	rte_spinlock_lock(&failure_handle_lock);
353 	bus = rte_bus_find_by_device(device);
354 	if (bus == NULL) {
355 		RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n",
356 			device->name);
357 		goto handle_end;
358 	}
359 
360 	/*
361 	 * vfio kernel module request user space to release allocated
362 	 * resources before device be deleted in kernel, so it can directly
363 	 * call the vfio bus hot-unplug handler to process it.
364 	 */
365 	ret = bus->hot_unplug_handler(device);
366 	if (ret)
367 		RTE_LOG(ERR, EAL,
368 			"Can not handle hot-unplug for device (%s)\n",
369 			device->name);
370 handle_end:
371 	rte_spinlock_unlock(&failure_handle_lock);
372 }
373 
374 /* enable notifier (only enable req now) */
375 static int
376 pci_vfio_enable_notifier(struct rte_pci_device *dev, int vfio_dev_fd)
377 {
378 	int ret;
379 	int fd = -1;
380 
381 	/* set up an eventfd for req notifier */
382 	fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
383 	if (fd < 0) {
384 		RTE_LOG(ERR, EAL, "Cannot set up eventfd, error %i (%s)\n",
385 			errno, strerror(errno));
386 		return -1;
387 	}
388 
389 	if (rte_intr_fd_set(dev->vfio_req_intr_handle, fd))
390 		return -1;
391 
392 	if (rte_intr_type_set(dev->vfio_req_intr_handle, RTE_INTR_HANDLE_VFIO_REQ))
393 		return -1;
394 
395 	if (rte_intr_dev_fd_set(dev->vfio_req_intr_handle, vfio_dev_fd))
396 		return -1;
397 
398 	ret = rte_intr_callback_register(dev->vfio_req_intr_handle,
399 					 pci_vfio_req_handler,
400 					 (void *)&dev->device);
401 	if (ret) {
402 		RTE_LOG(ERR, EAL, "Fail to register req notifier handler.\n");
403 		goto error;
404 	}
405 
406 	ret = rte_intr_enable(dev->vfio_req_intr_handle);
407 	if (ret) {
408 		RTE_LOG(ERR, EAL, "Fail to enable req notifier.\n");
409 		ret = rte_intr_callback_unregister(dev->vfio_req_intr_handle,
410 						 pci_vfio_req_handler,
411 						 (void *)&dev->device);
412 		if (ret < 0)
413 			RTE_LOG(ERR, EAL,
414 				"Fail to unregister req notifier handler.\n");
415 		goto error;
416 	}
417 
418 	return 0;
419 error:
420 	close(fd);
421 
422 	rte_intr_fd_set(dev->vfio_req_intr_handle, -1);
423 	rte_intr_type_set(dev->vfio_req_intr_handle, RTE_INTR_HANDLE_UNKNOWN);
424 	rte_intr_dev_fd_set(dev->vfio_req_intr_handle, -1);
425 
426 	return -1;
427 }
428 
429 /* disable notifier (only disable req now) */
430 static int
431 pci_vfio_disable_notifier(struct rte_pci_device *dev)
432 {
433 	int ret;
434 
435 	ret = rte_intr_disable(dev->vfio_req_intr_handle);
436 	if (ret) {
437 		RTE_LOG(ERR, EAL, "fail to disable req notifier.\n");
438 		return -1;
439 	}
440 
441 	ret = rte_intr_callback_unregister_sync(dev->vfio_req_intr_handle,
442 					   pci_vfio_req_handler,
443 					   (void *)&dev->device);
444 	if (ret < 0) {
445 		RTE_LOG(ERR, EAL,
446 			 "fail to unregister req notifier handler.\n");
447 		return -1;
448 	}
449 
450 	close(rte_intr_fd_get(dev->vfio_req_intr_handle));
451 
452 	rte_intr_fd_set(dev->vfio_req_intr_handle, -1);
453 	rte_intr_type_set(dev->vfio_req_intr_handle, RTE_INTR_HANDLE_UNKNOWN);
454 	rte_intr_dev_fd_set(dev->vfio_req_intr_handle, -1);
455 
456 	return 0;
457 }
458 #endif
459 
460 static int
461 pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index)
462 {
463 	uint32_t ioport_bar;
464 	int ret;
465 
466 	ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
467 			  VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
468 			  + PCI_BASE_ADDRESS_0 + bar_index*4);
469 	if (ret != sizeof(ioport_bar)) {
470 		RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n",
471 			PCI_BASE_ADDRESS_0 + bar_index*4);
472 		return -1;
473 	}
474 
475 	return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0;
476 }
477 
478 static int
479 pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
480 {
481 	if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
482 		RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
483 		return -1;
484 	}
485 
486 	if (pci_vfio_enable_bus_memory(vfio_dev_fd)) {
487 		RTE_LOG(ERR, EAL, "Cannot enable bus memory!\n");
488 		return -1;
489 	}
490 
491 	/* set bus mastering for the device */
492 	if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
493 		RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
494 		return -1;
495 	}
496 
497 	/*
498 	 * Reset the device. If the device is not capable of resetting,
499 	 * then it updates errno as EINVAL.
500 	 */
501 	if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
502 		RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n",
503 				errno, strerror(errno));
504 		return -1;
505 	}
506 
507 	return 0;
508 }
509 
/* Map one BAR of the device into this process.
 *
 * First reserves the whole BAR-sized range with an anonymous PROT_NONE
 * mapping (honoring @additional_flags, e.g. MAP_FIXED for secondary
 * processes), then maps the real BAR contents over it. If this BAR hosts
 * the MSI-X table, the table pages are skipped and the BAR is mapped as
 * up to two segments around the table. Returns 0 on success (including
 * "nothing to map"), -1 on mapping failure.
 */
static int
pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
		int bar_index, int additional_flags)
{
	/* up to two file-backed segments of the BAR to map */
	struct memreg {
		uint64_t offset;
		size_t   size;
	} memreg[2] = {};
	void *bar_addr;
	struct pci_msix_table *msix_table = &vfio_res->msix_table;
	struct pci_map *bar = &vfio_res->maps[bar_index];

	if (bar->size == 0) {
		RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index);
		return 0;
	}

	if (msix_table->bar_index == bar_index) {
		/*
		 * VFIO will not let us map the MSI-X table,
		 * but we can map around it.
		 */
		uint32_t table_start = msix_table->offset;
		uint32_t table_end = table_start + msix_table->size;
		/* expand the hole to whole pages, since mmap granularity
		 * is a page
		 */
		table_end = RTE_ALIGN(table_end, rte_mem_page_size());
		table_start = RTE_ALIGN_FLOOR(table_start, rte_mem_page_size());

		/* If page-aligned start of MSI-X table is less than the
		 * actual MSI-X table start address, reassign to the actual
		 * start address.
		 */
		if (table_start < msix_table->offset)
			table_start = msix_table->offset;

		if (table_start == 0 && table_end >= bar->size) {
			/* Cannot map this BAR */
			RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index);
			bar->size = 0;
			bar->addr = 0;
			return 0;
		}

		/* first segment: everything before the table */
		memreg[0].offset = bar->offset;
		memreg[0].size = table_start;
		if (bar->size < table_end) {
			/*
			 * If MSI-X table end is beyond BAR end, don't attempt
			 * to perform second mapping.
			 */
			memreg[1].offset = 0;
			memreg[1].size = 0;
		} else {
			/* second segment: everything after the table */
			memreg[1].offset = bar->offset + table_end;
			memreg[1].size = bar->size - table_end;
		}

		RTE_LOG(DEBUG, EAL,
			"Trying to map BAR%d that contains the MSI-X "
			"table. Trying offsets: "
			"0x%04" PRIx64 ":0x%04zx, 0x%04" PRIx64 ":0x%04zx\n",
			bar_index,
			memreg[0].offset, memreg[0].size,
			memreg[1].offset, memreg[1].size);
	} else {
		/* no MSI-X table in this BAR: map it in one piece */
		memreg[0].offset = bar->offset;
		memreg[0].size = bar->size;
	}

	/* reserve the address using an inaccessible mapping */
	bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
			MAP_ANONYMOUS | additional_flags, -1, 0);
	if (bar_addr != MAP_FAILED) {
		void *map_addr = NULL;
		if (memreg[0].size) {
			/* actual map of first part */
			map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
							memreg[0].offset,
							memreg[0].size,
							RTE_MAP_FORCE_ADDRESS);
		}

		/*
		 * Regarding "memreg[0].size == 0":
		 * If this BAR has MSI-X table, memreg[0].size (the
		 * first part or the part before the table) can
		 * legitimately be 0 for hardware using vector table
		 * offset 0 (i.e. first part does not exist).
		 *
		 * When memreg[0].size is 0, "mapping the first part"
		 * never happens, and map_addr is NULL at this
		 * point. So check that mapping has been actually
		 * attempted.
		 */
		/* if there's a second part, try to map it */
		if ((map_addr != NULL || memreg[0].size == 0)
			&& memreg[1].offset && memreg[1].size) {
			void *second_addr = RTE_PTR_ADD(bar_addr,
						(uintptr_t)(memreg[1].offset -
						bar->offset));
			map_addr = pci_map_resource(second_addr,
							vfio_dev_fd,
							memreg[1].offset,
							memreg[1].size,
							RTE_MAP_FORCE_ADDRESS);
		}

		if (map_addr == NULL) {
			/* release the whole reservation on failure */
			munmap(bar_addr, bar->size);
			bar_addr = MAP_FAILED;
			RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n",
					bar_index);
			return -1;
		}
	} else {
		RTE_LOG(ERR, EAL,
				"Failed to create inaccessible mapping for BAR%d\n",
				bar_index);
		return -1;
	}

	bar->addr = bar_addr;
	return 0;
}
633 
634 /*
635  * region info may contain capability headers, so we need to keep reallocating
636  * the memory until we match allocated memory size with argsz.
637  */
638 static int
639 pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
640 		int region)
641 {
642 	struct vfio_region_info *ri;
643 	size_t argsz = sizeof(*ri);
644 	int ret;
645 
646 	ri = malloc(sizeof(*ri));
647 	if (ri == NULL) {
648 		RTE_LOG(ERR, EAL,
649 			"Cannot allocate memory for VFIO region info\n");
650 		return -1;
651 	}
652 again:
653 	memset(ri, 0, argsz);
654 	ri->argsz = argsz;
655 	ri->index = region;
656 
657 	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
658 	if (ret < 0) {
659 		free(ri);
660 		return ret;
661 	}
662 	if (ri->argsz != argsz) {
663 		struct vfio_region_info *tmp;
664 
665 		argsz = ri->argsz;
666 		tmp = realloc(ri, argsz);
667 
668 		if (tmp == NULL) {
669 			/* realloc failed but the ri is still there */
670 			free(ri);
671 			RTE_LOG(ERR, EAL,
672 				"Cannot reallocate memory for VFIO region info\n");
673 			return -1;
674 		}
675 		ri = tmp;
676 		goto again;
677 	}
678 	*info = ri;
679 
680 	return 0;
681 }
682 
683 static struct vfio_info_cap_header *
684 pci_vfio_info_cap(struct vfio_region_info *info, int cap)
685 {
686 	struct vfio_info_cap_header *h;
687 	size_t offset;
688 
689 	if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
690 		/* VFIO info does not advertise capabilities */
691 		return NULL;
692 	}
693 
694 	offset = VFIO_CAP_OFFSET(info);
695 	while (offset != 0) {
696 		h = RTE_PTR_ADD(info, offset);
697 		if (h->id == cap)
698 			return h;
699 		offset = h->next;
700 	}
701 	return NULL;
702 }
703 
704 static int
705 pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
706 {
707 	struct vfio_region_info *info;
708 	int ret;
709 
710 	ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
711 	if (ret < 0)
712 		return -1;
713 
714 	ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
715 
716 	/* cleanup */
717 	free(info);
718 
719 	return ret;
720 }
721 
722 
/* Map a VFIO-bound PCI device in the primary process: open the device via
 * VFIO, discover its regions, locate/handle the MSI-X table, mmap all
 * mappable BARs, set the device up (interrupts, bus master, reset) and
 * record the result in the shared tailq for secondary processes.
 * Returns 0 on success, negative on failure.
 */
static int
pci_vfio_map_resource_primary(struct rte_pci_device *dev)
{
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	char pci_addr[PATH_MAX] = {0};
	int vfio_dev_fd;
	struct rte_pci_addr *loc = &dev->addr;
	int i, ret;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

	struct pci_map *maps;

	/* start from a clean interrupt state */
	if (rte_intr_fd_set(dev->intr_handle, -1))
		return -1;

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	if (rte_intr_fd_set(dev->vfio_req_intr_handle, -1))
		return -1;
#endif

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
					&vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	/* allocate vfio_res and get region info */
	vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL,
			"Cannot store VFIO mmap details\n");
		goto err_vfio_dev_fd;
	}
	memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));

	/* get number of registers (up to BAR5) */
	vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
			VFIO_PCI_BAR5_REGION_INDEX + 1);

	/* map BARs */
	maps = vfio_res->maps;

	/* -1 means "no MSI-X BAR found (yet)" */
	vfio_res->msix_table.bar_index = -1;
	/* get MSI-X BAR, if any (we have to know where it is because we can't
	 * easily mmap it when using VFIO)
	 */
	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "%s cannot get MSI-X BAR number!\n",
				pci_addr);
		goto err_vfio_res;
	}
	/* if we found our MSI-X BAR region, check if we can mmap it */
	if (vfio_res->msix_table.bar_index != -1) {
		int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
				vfio_res->msix_table.bar_index);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
			goto err_vfio_res;
		} else if (ret != 0) {
			/* we can map it, so we don't care where it is */
			RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
			vfio_res->msix_table.bar_index = -1;
		}
	}

	for (i = 0; i < vfio_res->nb_maps; i++) {
		struct vfio_region_info *reg = NULL;
		void *bar_addr;

		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
		if (ret < 0) {
			RTE_LOG(ERR, EAL,
				"%s cannot get device region info error "
				"%i (%s)\n", pci_addr, errno, strerror(errno));
			goto err_vfio_res;
		}

		/* chk for io port region */
		ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
		if (ret < 0) {
			free(reg);
			goto err_vfio_res;
		} else if (ret) {
			RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
					i);
			free(reg);
			continue;
		}

		/* skip non-mmapable BARs */
		if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
			free(reg);
			continue;
		}

		/* try mapping somewhere close to the end of hugepages */
		if (pci_map_addr == NULL)
			pci_map_addr = pci_find_max_end_va();

		/* advance the shared hint past this BAR, page-aligned */
		bar_addr = pci_map_addr;
		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);

		pci_map_addr = RTE_PTR_ALIGN(pci_map_addr,
					sysconf(_SC_PAGE_SIZE));

		maps[i].addr = bar_addr;
		maps[i].offset = reg->offset;
		maps[i].size = reg->size;
		maps[i].path = NULL; /* vfio doesn't have per-resource paths */

		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
					pci_addr, i, strerror(errno));
			free(reg);
			goto err_vfio_res;
		}

		dev->mem_resource[i].addr = maps[i].addr;

		free(reg);
	}

	if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
		RTE_LOG(ERR, EAL, "%s setup device failed\n", pci_addr);
		goto err_vfio_res;
	}

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	if (pci_vfio_enable_notifier(dev, vfio_dev_fd) != 0) {
		RTE_LOG(ERR, EAL, "Error setting up notifier!\n");
		goto err_vfio_res;
	}

#endif
	/* publish the mapping so secondary processes can find it */
	TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);

	return 0;
err_vfio_res:
	rte_free(vfio_res);
err_vfio_dev_fd:
	rte_vfio_release_device(rte_pci_get_sysfs_path(),
			pci_addr, vfio_dev_fd);
	return -1;
}
874 
/* Map a VFIO-bound PCI device in a secondary process: look up the
 * mapping details recorded by the primary in the shared tailq, re-open
 * the device via VFIO and replay the BAR mappings at the same addresses
 * (MAP_FIXED). Returns 0 on success, negative on failure.
 */
static int
pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
{
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	char pci_addr[PATH_MAX] = {0};
	int vfio_dev_fd;
	struct rte_pci_addr *loc = &dev->addr;
	int i, ret;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

	struct pci_map *maps;

	/* start from a clean interrupt state */
	if (rte_intr_fd_set(dev->intr_handle, -1))
		return -1;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	if (rte_intr_fd_set(dev->vfio_req_intr_handle, -1))
		return -1;
#endif

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	/* if we're in a secondary process, just find our tailq entry */
	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
		if (rte_pci_addr_cmp(&vfio_res->pci_addr,
						 &dev->addr))
			continue;
		break;
	}
	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL, "%s cannot find TAILQ entry for PCI device!\n",
				pci_addr);
		return -1;
	}

	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
					&vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	/* map BARs at the addresses the primary recorded (MAP_FIXED) */
	maps = vfio_res->maps;

	for (i = 0; i < vfio_res->nb_maps; i++) {
		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
					pci_addr, i, strerror(errno));
			goto err_vfio_dev_fd;
		}

		dev->mem_resource[i].addr = maps[i].addr;
	}

	/* we need save vfio_dev_fd, so it can be used during release */
	if (rte_intr_dev_fd_set(dev->intr_handle, vfio_dev_fd))
		goto err_vfio_dev_fd;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	if (rte_intr_dev_fd_set(dev->vfio_req_intr_handle, vfio_dev_fd))
		goto err_vfio_dev_fd;
#endif

	return 0;
err_vfio_dev_fd:
	rte_vfio_release_device(rte_pci_get_sysfs_path(),
			pci_addr, vfio_dev_fd);
	return -1;
}
947 
948 /*
949  * map the PCI resources of a PCI device in virtual memory (VFIO version).
950  * primary and secondary processes follow almost exactly the same path
951  */
952 int
953 pci_vfio_map_resource(struct rte_pci_device *dev)
954 {
955 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
956 		return pci_vfio_map_resource_primary(dev);
957 	else
958 		return pci_vfio_map_resource_secondary(dev);
959 }
960 
961 static struct mapped_pci_resource *
962 find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
963 			struct rte_pci_device *dev,
964 			const char *pci_addr)
965 {
966 	struct mapped_pci_resource *vfio_res = NULL;
967 	struct pci_map *maps;
968 	int i;
969 
970 	/* Get vfio_res */
971 	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
972 		if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr))
973 			continue;
974 		break;
975 	}
976 
977 	if  (vfio_res == NULL)
978 		return vfio_res;
979 
980 	RTE_LOG(INFO, EAL, "Releasing PCI mapped resource for %s\n",
981 		pci_addr);
982 
983 	maps = vfio_res->maps;
984 	for (i = 0; i < vfio_res->nb_maps; i++) {
985 
986 		/*
987 		 * We do not need to be aware of MSI-X table BAR mappings as
988 		 * when mapping. Just using current maps array is enough
989 		 */
990 		if (maps[i].addr) {
991 			RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
992 				pci_addr, maps[i].addr);
993 			pci_unmap_resource(maps[i].addr, maps[i].size);
994 		}
995 	}
996 
997 	return vfio_res;
998 }
999 
/* Release a VFIO device in the primary process: disable the hot-unplug
 * notifier, close the interrupt eventfd, clear bus mastering, release
 * the VFIO device fd, unmap all BARs and drop the tailq entry.
 * Returns 0 on success, negative on failure.
 */
static int
pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
{
	char pci_addr[PATH_MAX] = {0};
	struct rte_pci_addr *loc = &dev->addr;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list;
	int ret, vfio_dev_fd;

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	ret = pci_vfio_disable_notifier(dev);
	if (ret) {
		RTE_LOG(ERR, EAL, "fail to disable req notifier.\n");
		return -1;
	}

#endif
	if (rte_intr_fd_get(dev->intr_handle) < 0)
		return -1;

	/* close the interrupt eventfd created in setup_interrupts */
	if (close(rte_intr_fd_get(dev->intr_handle)) < 0) {
		RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n",
			pci_addr);
		return -1;
	}

	vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
	if (vfio_dev_fd < 0)
		return -1;

	/* quiesce the device before releasing it */
	if (pci_vfio_set_bus_master(vfio_dev_fd, false)) {
		RTE_LOG(ERR, EAL, "%s cannot unset bus mastering for PCI device!\n",
				pci_addr);
		return -1;
	}

	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
				      vfio_dev_fd);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "Cannot release VFIO device\n");
		return ret;
	}

	vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);

	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL, "%s cannot find TAILQ entry for PCI device!\n",
				pci_addr);
		return -1;
	}

	/* primary owns the shared list entry: remove and free it */
	TAILQ_REMOVE(vfio_res_list, vfio_res, next);
	rte_free(vfio_res);
	return 0;
}
1062 
1063 static int
1064 pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev)
1065 {
1066 	char pci_addr[PATH_MAX] = {0};
1067 	struct rte_pci_addr *loc = &dev->addr;
1068 	struct mapped_pci_resource *vfio_res = NULL;
1069 	struct mapped_pci_res_list *vfio_res_list;
1070 	int ret, vfio_dev_fd;
1071 
1072 	/* store PCI address string */
1073 	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
1074 			loc->domain, loc->bus, loc->devid, loc->function);
1075 
1076 	vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
1077 	if (vfio_dev_fd < 0)
1078 		return -1;
1079 
1080 	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
1081 				      vfio_dev_fd);
1082 	if (ret < 0) {
1083 		RTE_LOG(ERR, EAL, "Cannot release VFIO device\n");
1084 		return ret;
1085 	}
1086 
1087 	vfio_res_list =
1088 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
1089 	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);
1090 
1091 	/* if we haven't found our tailq entry, something's wrong */
1092 	if (vfio_res == NULL) {
1093 		RTE_LOG(ERR, EAL, "%s cannot find TAILQ entry for PCI device!\n",
1094 				pci_addr);
1095 		return -1;
1096 	}
1097 
1098 	return 0;
1099 }
1100 
1101 int
1102 pci_vfio_unmap_resource(struct rte_pci_device *dev)
1103 {
1104 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
1105 		return pci_vfio_unmap_resource_primary(dev);
1106 	else
1107 		return pci_vfio_unmap_resource_secondary(dev);
1108 }
1109 
1110 int
1111 pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
1112 		    struct rte_pci_ioport *p)
1113 {
1114 	if (bar < VFIO_PCI_BAR0_REGION_INDEX ||
1115 	    bar > VFIO_PCI_BAR5_REGION_INDEX) {
1116 		RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar);
1117 		return -1;
1118 	}
1119 
1120 	p->dev = dev;
1121 	p->base = VFIO_GET_REGION_ADDR(bar);
1122 	return 0;
1123 }
1124 
1125 void
1126 pci_vfio_ioport_read(struct rte_pci_ioport *p,
1127 		     void *data, size_t len, off_t offset)
1128 {
1129 	const struct rte_intr_handle *intr_handle = p->dev->intr_handle;
1130 	int vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
1131 
1132 	if (vfio_dev_fd < 0)
1133 		return;
1134 
1135 	if (pread64(vfio_dev_fd, data,
1136 		    len, p->base + offset) <= 0)
1137 		RTE_LOG(ERR, EAL,
1138 			"Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n",
1139 			VFIO_GET_REGION_IDX(p->base), (int)offset);
1140 }
1141 
1142 void
1143 pci_vfio_ioport_write(struct rte_pci_ioport *p,
1144 		      const void *data, size_t len, off_t offset)
1145 {
1146 	const struct rte_intr_handle *intr_handle = p->dev->intr_handle;
1147 	int vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
1148 
1149 	if (vfio_dev_fd < 0)
1150 		return;
1151 
1152 	if (pwrite64(vfio_dev_fd, data,
1153 		     len, p->base + offset) <= 0)
1154 		RTE_LOG(ERR, EAL,
1155 			"Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n",
1156 			VFIO_GET_REGION_IDX(p->base), (int)offset);
1157 }
1158 
/* ioport "mapping" holds no resources under VFIO, so there is nothing
 * to unmap; this operation is not supported and always returns -1.
 */
int
pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
{
	RTE_SET_USED(p);

	return -1;
}
1165 
/* Report whether the vfio-pci kernel driver is loaded and usable. */
int
pci_vfio_is_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}
1171 #endif
1172