/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <string.h>
#include <fcntl.h>
#include <linux/pci_regs.h>
#include <sys/eventfd.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdbool.h>

#include <rte_log.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_eal_memconfig.h>
#include <rte_malloc.h>
#include <rte_vfio.h>
#include <rte_eal.h>
#include <rte_bus.h>
#include <rte_spinlock.h>

#include "eal_filesystem.h"

#include "pci_init.h"
#include "private.h"

/**
 * @file
 * PCI probing under linux (VFIO version)
 *
 * This code tries to determine if the PCI device is bound to the VFIO driver,
 * and initializes it (maps BARs, sets up interrupts) if that is the case.
 *
 * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
 */

#ifdef VFIO_PRESENT

#ifndef PAGE_SIZE
#define PAGE_SIZE (sysconf(_SC_PAGESIZE))
#endif
#define PAGE_MASK (~(PAGE_SIZE - 1))

static struct rte_tailq_elem rte_vfio_tailq = {
	.name = "VFIO_RESOURCE_LIST",
};
EAL_REGISTER_TAILQ(rte_vfio_tailq)

int
pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
		    void *buf, size_t len, off_t offs)
{
	return pread64(intr_handle->vfio_dev_fd, buf, len,
	       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
}

int
pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
		    const void *buf, size_t len, off_t offs)
{
	return pwrite64(intr_handle->vfio_dev_fd, buf, len,
	       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
}
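/*
 * Usage sketch for the helpers above (illustrative only; error handling
 * trimmed): reading the 16-bit vendor ID at the start of config space,
 * assuming "dev" is an already set-up rte_pci_device:
 *
 *	uint16_t vendor;
 *	if (pci_vfio_read_config(&dev->intr_handle, &vendor,
 *			sizeof(vendor), PCI_VENDOR_ID) != sizeof(vendor))
 *		return -1;
 */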
/* get PCI BAR number where MSI-X interrupts are */
static int
pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
{
	int ret;
	uint32_t reg;
	uint16_t flags;
	uint8_t cap_id, cap_offset;

	/* read PCI capability pointer from config space */
	ret = pread64(fd, &reg, sizeof(reg),
			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
			PCI_CAPABILITY_LIST);
	if (ret != sizeof(reg)) {
		RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
				"config space!\n");
		return -1;
	}

	/* we need first byte */
	cap_offset = reg & 0xFF;

	while (cap_offset) {

		/* read PCI capability ID */
		ret = pread64(fd, &reg, sizeof(reg),
				VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
				cap_offset);
		if (ret != sizeof(reg)) {
			RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI "
					"config space!\n");
			return -1;
		}

		/* we need first byte */
		cap_id = reg & 0xFF;

		/* if we haven't reached MSI-X, check next capability */
		if (cap_id != PCI_CAP_ID_MSIX) {
			ret = pread64(fd, &reg, sizeof(reg),
					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
					cap_offset);
			if (ret != sizeof(reg)) {
				RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
						"config space!\n");
				return -1;
			}

			/* we need second byte */
			cap_offset = (reg & 0xFF00) >> 8;

			continue;
		}
		/* else, read table offset */
		else {
			/* table offset resides in the next 4 bytes */
			ret = pread64(fd, &reg, sizeof(reg),
					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
					cap_offset + 4);
			if (ret != sizeof(reg)) {
				RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config "
						"space!\n");
				return -1;
			}

			ret = pread64(fd, &flags, sizeof(flags),
					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
					cap_offset + 2);
			if (ret != sizeof(flags)) {
				RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config "
						"space!\n");
				return -1;
			}

			msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
			msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
			msix_table->size =
				16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));

			return 0;
		}
	}
	return 0;
}
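/*
 * For reference, the walker above relies on the standard MSI-X capability
 * layout in PCI config space:
 *
 *	cap_offset + 0: capability ID (PCI_CAP_ID_MSIX, 0x11)
 *	cap_offset + 1: next capability pointer
 *	cap_offset + 2: message control word (table size in the low bits)
 *	cap_offset + 4: table offset (upper bits) / BIR (low 3 bits)
 */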
/* set PCI bus mastering */
static int
pci_vfio_set_bus_master(int dev_fd, bool op)
{
	uint16_t reg;
	int ret;

	ret = pread64(dev_fd, &reg, sizeof(reg),
			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
			PCI_COMMAND);
	if (ret != sizeof(reg)) {
		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
		return -1;
	}

	if (op)
		/* set the master bit */
		reg |= PCI_COMMAND_MASTER;
	else
		reg &= ~(PCI_COMMAND_MASTER);

	ret = pwrite64(dev_fd, &reg, sizeof(reg),
			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
			PCI_COMMAND);

	if (ret != sizeof(reg)) {
		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
		return -1;
	}

	return 0;
}

/* set up interrupt support (but do not enable interrupts) */
static int
pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
{
	int i, ret, intr_idx;
	enum rte_intr_mode intr_mode;

	/* default to invalid index */
	intr_idx = VFIO_PCI_NUM_IRQS;

	/* Get default / configured intr_mode */
	intr_mode = rte_eal_vfio_intr_mode();

	/* get interrupt type from internal config (MSI-X by default, can be
	 * overridden from the command line)
	 */
	switch (intr_mode) {
	case RTE_INTR_MODE_MSIX:
		intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
		break;
	case RTE_INTR_MODE_MSI:
		intr_idx = VFIO_PCI_MSI_IRQ_INDEX;
		break;
	case RTE_INTR_MODE_LEGACY:
		intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
		break;
	/* don't do anything if we want to automatically determine interrupt type */
	case RTE_INTR_MODE_NONE:
		break;
	default:
		RTE_LOG(ERR, EAL, " unknown default interrupt type!\n");
		return -1;
	}

	/* start from MSI-X interrupt type */
	for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
		int fd = -1;

		/* skip interrupt modes we don't want */
		if (intr_mode != RTE_INTR_MODE_NONE &&
				i != intr_idx)
			continue;

		irq.index = i;

		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, " cannot get IRQ info, "
					"error %i (%s)\n", errno, strerror(errno));
			return -1;
		}

		/* if this vector cannot be used with eventfd, fail if we explicitly
		 * specified interrupt type, otherwise continue */
		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
			if (intr_mode != RTE_INTR_MODE_NONE) {
				RTE_LOG(ERR, EAL,
					" interrupt vector does not support eventfd!\n");
				return -1;
			} else
				continue;
		}

		/* set up an eventfd for interrupts */
		fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, " cannot set up eventfd, "
					"error %i (%s)\n", errno, strerror(errno));
			return -1;
		}

		dev->intr_handle.fd = fd;
		dev->intr_handle.vfio_dev_fd = vfio_dev_fd;

		switch (i) {
		case VFIO_PCI_MSIX_IRQ_INDEX:
			intr_mode = RTE_INTR_MODE_MSIX;
			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
			break;
		case VFIO_PCI_MSI_IRQ_INDEX:
			intr_mode = RTE_INTR_MODE_MSI;
			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
			break;
		case VFIO_PCI_INTX_IRQ_INDEX:
			intr_mode = RTE_INTR_MODE_LEGACY;
			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
			break;
		default:
			RTE_LOG(ERR, EAL, " unknown interrupt type!\n");
			return -1;
		}

		return 0;
	}

	/* if we're here, we haven't found a suitable interrupt vector */
	return -1;
}
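/*
 * The eventfd created above is what the EAL interrupt thread ends up
 * polling. A minimal consumer sketch (illustrative, not EAL code): each
 * successful read() returns an 8-byte count of interrupts signalled since
 * the previous read:
 *
 *	uint64_t count;
 *	if (read(dev->intr_handle.fd, &count, sizeof(count)) == sizeof(count))
 *		handle_interrupt(dev);	(hypothetical handler)
 */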
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
/*
 * Spinlock for device hot-unplug failure handling.
 * Anything that accesses the bus or the device (such as handling a sigbus
 * on the bus, or handling a memory failure for the device) just needs to
 * take this lock. It protects the bus and the device against race
 * conditions.
 */
static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;

static void
pci_vfio_req_handler(void *param)
{
	struct rte_bus *bus;
	int ret;
	struct rte_device *device = (struct rte_device *)param;

	rte_spinlock_lock(&failure_handle_lock);
	bus = rte_bus_find_by_device(device);
	if (bus == NULL) {
		RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n",
			device->name);
		goto handle_end;
	}

	/*
	 * The VFIO kernel module requests user space to release the
	 * allocated resources before the device is deleted in the kernel,
	 * so we can directly call the bus hot-unplug handler to process it.
	 */
	ret = bus->hot_unplug_handler(device);
	if (ret)
		RTE_LOG(ERR, EAL,
			"Can not handle hot-unplug for device (%s)\n",
			device->name);
handle_end:
	rte_spinlock_unlock(&failure_handle_lock);
}

/* enable notifier (only enable req now) */
static int
pci_vfio_enable_notifier(struct rte_pci_device *dev, int vfio_dev_fd)
{
	int ret;
	int fd = -1;

	/* set up an eventfd for req notifier */
	fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
	if (fd < 0) {
		RTE_LOG(ERR, EAL, "Cannot set up eventfd, error %i (%s)\n",
			errno, strerror(errno));
		return -1;
	}

	dev->vfio_req_intr_handle.fd = fd;
	dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_VFIO_REQ;
	dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd;

	ret = rte_intr_callback_register(&dev->vfio_req_intr_handle,
					 pci_vfio_req_handler,
					 (void *)&dev->device);
	if (ret) {
		RTE_LOG(ERR, EAL, "Failed to register req notifier handler.\n");
		goto error;
	}

	ret = rte_intr_enable(&dev->vfio_req_intr_handle);
	if (ret) {
		RTE_LOG(ERR, EAL, "Failed to enable req notifier.\n");
		ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle,
						   pci_vfio_req_handler,
						   (void *)&dev->device);
		if (ret < 0)
			RTE_LOG(ERR, EAL,
				"Failed to unregister req notifier handler.\n");
		goto error;
	}

	return 0;
error:
	close(fd);

	dev->vfio_req_intr_handle.fd = -1;
	dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
	dev->vfio_req_intr_handle.vfio_dev_fd = -1;

	return -1;
}

/* disable notifier (only disable req now) */
static int
pci_vfio_disable_notifier(struct rte_pci_device *dev)
{
	int ret;

	ret = rte_intr_disable(&dev->vfio_req_intr_handle);
	if (ret) {
		RTE_LOG(ERR, EAL, "Failed to disable req notifier.\n");
		return -1;
	}

	ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle,
					   pci_vfio_req_handler,
					   (void *)&dev->device);
	if (ret < 0) {
		RTE_LOG(ERR, EAL,
			"Failed to unregister req notifier handler.\n");
		return -1;
	}

	close(dev->vfio_req_intr_handle.fd);

	dev->vfio_req_intr_handle.fd = -1;
	dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
	dev->vfio_req_intr_handle.vfio_dev_fd = -1;

	return 0;
}
#endif
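/*
 * Under the hood, enabling the req notifier amounts to wiring the eventfd
 * to the device request IRQ via VFIO_DEVICE_SET_IRQS. A minimal sketch of
 * the ioctl the interrupt framework issues (illustrative, not the EAL
 * implementation itself):
 *
 *	char buf[sizeof(struct vfio_irq_set) + sizeof(int)];
 *	struct vfio_irq_set *irq_set = (struct vfio_irq_set *)buf;
 *
 *	irq_set->argsz = sizeof(buf);
 *	irq_set->count = 1;
 *	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 *			 VFIO_IRQ_SET_ACTION_TRIGGER;
 *	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
 *	irq_set->start = 0;
 *	memcpy(irq_set->data, &fd, sizeof(int));
 *	ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 */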
static int
pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index)
{
	uint32_t ioport_bar;
	int ret;

	ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
			  VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
			  + PCI_BASE_ADDRESS_0 + bar_index*4);
	if (ret != sizeof(ioport_bar)) {
		RTE_LOG(ERR, EAL, "Cannot read BAR (offset %x) from PCI config space!\n",
			PCI_BASE_ADDRESS_0 + bar_index*4);
		return -1;
	}

	return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0;
}

static int
pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
{
	if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
		RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
		return -1;
	}

	/* set bus mastering for the device */
	if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
		RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
		return -1;
	}

	/*
	 * Reset the device. If the device is not capable of being reset,
	 * the ioctl fails and sets errno to EINVAL.
	 */
	if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
		RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n",
				errno, strerror(errno));
		return -1;
	}

	return 0;
}

static int
pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
		int bar_index, int additional_flags)
{
	struct memreg {
		unsigned long offset, size;
	} memreg[2] = {};
	void *bar_addr;
	struct pci_msix_table *msix_table = &vfio_res->msix_table;
	struct pci_map *bar = &vfio_res->maps[bar_index];

	if (bar->size == 0)
		/* Skip this BAR */
		return 0;

	if (msix_table->bar_index == bar_index) {
		/*
		 * VFIO will not let us map the MSI-X table,
		 * but we can map around it.
		 */
		uint32_t table_start = msix_table->offset;
		uint32_t table_end = table_start + msix_table->size;
		table_end = (table_end + ~PAGE_MASK) & PAGE_MASK;
		table_start &= PAGE_MASK;

		if (table_start == 0 && table_end >= bar->size) {
			/* Cannot map this BAR */
			RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index);
			bar->size = 0;
			bar->addr = 0;
			return 0;
		}

		memreg[0].offset = bar->offset;
		memreg[0].size = table_start;
		memreg[1].offset = bar->offset + table_end;
		memreg[1].size = bar->size - table_end;

		RTE_LOG(DEBUG, EAL,
			"Trying to map BAR%d that contains the MSI-X "
			"table. Trying offsets: "
			"0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", bar_index,
			memreg[0].offset, memreg[0].size,
			memreg[1].offset, memreg[1].size);
	} else {
		memreg[0].offset = bar->offset;
		memreg[0].size = bar->size;
	}

	/* reserve the address using an inaccessible mapping */
	bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
			MAP_ANONYMOUS | additional_flags, -1, 0);
	if (bar_addr != MAP_FAILED) {
		void *map_addr = NULL;
		if (memreg[0].size) {
			/* actual map of first part */
			map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
							memreg[0].offset,
							memreg[0].size,
							MAP_FIXED);
		}

		/* if there's a second part, try to map it */
		if (map_addr != MAP_FAILED
			&& memreg[1].offset && memreg[1].size) {
			void *second_addr = RTE_PTR_ADD(bar_addr,
						memreg[1].offset -
						(uintptr_t)bar->offset);
			map_addr = pci_map_resource(second_addr,
							vfio_dev_fd,
							memreg[1].offset,
							memreg[1].size,
							MAP_FIXED);
		}

		if (map_addr == MAP_FAILED || !map_addr) {
			munmap(bar_addr, bar->size);
			bar_addr = MAP_FAILED;
			RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n",
					bar_index);
			return -1;
		}
	} else {
		RTE_LOG(ERR, EAL,
				"Failed to create inaccessible mapping for BAR%d\n",
				bar_index);
		return -1;
	}

	bar->addr = bar_addr;
	return 0;
}
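/*
 * Worked example of the carve-out above (illustrative numbers): with a
 * 16 KiB BAR, 4 KiB pages, and an MSI-X table at offset 0x2000 of size
 * 0x400, table_start rounds down to 0x2000 and table_end rounds up to
 * 0x3000, so the BAR is mapped as [0x0000, 0x2000) and [0x3000, 0x4000),
 * leaving the page containing the table unmapped.
 */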
/*
 * region info may contain capability headers, so we need to keep
 * reallocating the memory until the allocated size matches the argsz
 * reported by the kernel.
 */
static int
pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
		int region)
{
	struct vfio_region_info *ri;
	size_t argsz = sizeof(*ri);
	int ret;

	ri = malloc(sizeof(*ri));
	if (ri == NULL) {
		RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
		return -1;
	}
again:
	memset(ri, 0, argsz);
	ri->argsz = argsz;
	ri->index = region;

	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
	if (ret < 0) {
		free(ri);
		return ret;
	}
	if (ri->argsz != argsz) {
		struct vfio_region_info *tmp;

		argsz = ri->argsz;
		tmp = realloc(ri, argsz);

		if (tmp == NULL) {
			/* realloc failed but the ri is still there */
			free(ri);
			RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
			return -1;
		}
		ri = tmp;
		goto again;
	}
	*info = ri;

	return 0;
}

static struct vfio_info_cap_header *
pci_vfio_info_cap(struct vfio_region_info *info, int cap)
{
	struct vfio_info_cap_header *h;
	size_t offset;

	if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
		/* VFIO info does not advertise capabilities */
		return NULL;
	}

	offset = VFIO_CAP_OFFSET(info);
	while (offset != 0) {
		h = RTE_PTR_ADD(info, offset);
		if (h->id == cap)
			return h;
		offset = h->next;
	}
	return NULL;
}

static int
pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
{
	struct vfio_region_info *info;
	int ret;

	ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
	if (ret < 0)
		return -1;

	ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;

	/* cleanup */
	free(info);

	return ret;
}
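/*
 * On success, pci_vfio_get_region_info() transfers ownership of the buffer
 * to the caller, which must free() it, as both callers in this file do.
 * Illustrative usage:
 *
 *	struct vfio_region_info *info = NULL;
 *	if (pci_vfio_get_region_info(vfio_dev_fd, &info, i) == 0) {
 *		... use info->size, info->offset, info->flags ...
 *		free(info);
 *	}
 */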

static int
pci_vfio_map_resource_primary(struct rte_pci_device *dev)
{
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	char pci_addr[PATH_MAX] = {0};
	int vfio_dev_fd;
	struct rte_pci_addr *loc = &dev->addr;
	int i, ret;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

	struct pci_map *maps;

	dev->intr_handle.fd = -1;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	dev->vfio_req_intr_handle.fd = -1;
#endif

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
					&vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	/* allocate vfio_res and get region info */
	vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL,
			"%s(): cannot store VFIO mmap details\n", __func__);
		goto err_vfio_dev_fd;
	}
	memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));

	/* get number of regions (up to BAR5) */
	vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
			VFIO_PCI_BAR5_REGION_INDEX + 1);

	/* map BARs */
	maps = vfio_res->maps;

	vfio_res->msix_table.bar_index = -1;
	/* get MSI-X BAR, if any (we have to know where it is because we can't
	 * easily mmap it when using VFIO)
	 */
	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n",
				pci_addr);
		goto err_vfio_res;
	}
	/* if we found our MSI-X BAR region, check if we can mmap it */
	if (vfio_res->msix_table.bar_index != -1) {
		int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
				vfio_res->msix_table.bar_index);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
			goto err_vfio_res;
		} else if (ret != 0) {
			/* we can map it, so we don't care where it is */
			RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
			vfio_res->msix_table.bar_index = -1;
		}
	}

	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
		struct vfio_region_info *reg = NULL;
		void *bar_addr;

		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, " %s cannot get device region info "
				"error %i (%s)\n", pci_addr, errno,
				strerror(errno));
			goto err_vfio_res;
		}

		/* check for I/O port region */
		ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
		if (ret < 0) {
			free(reg);
			goto err_vfio_res;
		} else if (ret) {
			RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
					i);
			free(reg);
			continue;
		}

		/* skip non-mmappable BARs */
		if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
			free(reg);
			continue;
		}

		/* try mapping somewhere close to the end of hugepages */
		if (pci_map_addr == NULL)
			pci_map_addr = pci_find_max_end_va();

		bar_addr = pci_map_addr;
		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);

		maps[i].addr = bar_addr;
		maps[i].offset = reg->offset;
		maps[i].size = reg->size;
		maps[i].path = NULL; /* vfio doesn't have per-resource paths */

		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n",
					pci_addr, i, strerror(errno));
			free(reg);
			goto err_vfio_res;
		}

		dev->mem_resource[i].addr = maps[i].addr;

		free(reg);
	}

	if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
		RTE_LOG(ERR, EAL, " %s setup device failed\n", pci_addr);
		goto err_vfio_res;
	}

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	if (pci_vfio_enable_notifier(dev, vfio_dev_fd) != 0) {
		RTE_LOG(ERR, EAL, "Error setting up notifier!\n");
		goto err_vfio_res;
	}

#endif
	TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);

	return 0;
err_vfio_res:
	rte_free(vfio_res);
err_vfio_dev_fd:
	close(vfio_dev_fd);
	return -1;
}
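/*
 * Once mapping succeeds, a PMD typically accesses device registers through
 * dev->mem_resource[]. A minimal sketch (illustrative; REG_STATUS is a
 * hypothetical register offset, not something defined in this file):
 *
 *	volatile uint32_t *regs = dev->mem_resource[0].addr;
 *	uint32_t status = regs[REG_STATUS / sizeof(uint32_t)];
 */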
static int
pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
{
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	char pci_addr[PATH_MAX] = {0};
	int vfio_dev_fd;
	struct rte_pci_addr *loc = &dev->addr;
	int i, ret;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

	struct pci_map *maps;

	dev->intr_handle.fd = -1;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	dev->vfio_req_intr_handle.fd = -1;
#endif

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
					&vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	/* if we're in a secondary process, just find our tailq entry */
	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
		if (rte_pci_addr_cmp(&vfio_res->pci_addr,
						 &dev->addr))
			continue;
		break;
	}
	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
				pci_addr);
		goto err_vfio_dev_fd;
	}

	/* map BARs */
	maps = vfio_res->maps;

	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n",
					pci_addr, i, strerror(errno));
			goto err_vfio_dev_fd;
		}

		dev->mem_resource[i].addr = maps[i].addr;
	}

	/* we need to save vfio_dev_fd, so it can be used during release */
	dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd;
#endif

	return 0;
err_vfio_dev_fd:
	close(vfio_dev_fd);
	return -1;
}

/*
 * map the PCI resources of a PCI device in virtual memory (VFIO version).
 * primary and secondary processes follow almost exactly the same path
 */
int
pci_vfio_map_resource(struct rte_pci_device *dev)
{
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return pci_vfio_map_resource_primary(dev);
	else
		return pci_vfio_map_resource_secondary(dev);
}

static struct mapped_pci_resource *
find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
			struct rte_pci_device *dev,
			const char *pci_addr)
{
	struct mapped_pci_resource *vfio_res = NULL;
	struct pci_map *maps;
	int i;

	/* Get vfio_res */
	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
		if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr))
			continue;
		break;
	}

	if (vfio_res == NULL)
		return vfio_res;

	RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n",
		pci_addr);

	maps = vfio_res->maps;
	for (i = 0; i < (int) vfio_res->nb_maps; i++) {

		/*
		 * We do not need to be aware of MSI-X table BAR mappings,
		 * unlike when mapping; just using the current maps array is
		 * enough.
		 */
		if (maps[i].addr) {
			RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
				pci_addr, maps[i].addr);
			pci_unmap_resource(maps[i].addr, maps[i].size);
		}
	}

	return vfio_res;
}

static int
pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
{
	char pci_addr[PATH_MAX] = {0};
	struct rte_pci_addr *loc = &dev->addr;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list;
	int ret;

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	ret = pci_vfio_disable_notifier(dev);
	if (ret) {
		RTE_LOG(ERR, EAL, "Failed to disable req notifier.\n");
		return -1;
	}

#endif
	if (close(dev->intr_handle.fd) < 0) {
		RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n",
			pci_addr);
		return -1;
	}

	if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) {
		RTE_LOG(ERR, EAL, " %s cannot unset bus mastering for PCI device!\n",
				pci_addr);
		return -1;
	}

	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
				  dev->intr_handle.vfio_dev_fd);
	if (ret < 0) {
		RTE_LOG(ERR, EAL,
			"%s(): cannot release device\n", __func__);
		return ret;
	}

	vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);

	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
				pci_addr);
		return -1;
	}

	TAILQ_REMOVE(vfio_res_list, vfio_res, next);

	return 0;
}

static int
pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev)
{
	char pci_addr[PATH_MAX] = {0};
	struct rte_pci_addr *loc = &dev->addr;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list;
	int ret;

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
				  dev->intr_handle.vfio_dev_fd);
	if (ret < 0) {
		RTE_LOG(ERR, EAL,
			"%s(): cannot release device\n", __func__);
		return ret;
	}

	vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);

	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
				pci_addr);
		return -1;
	}

	return 0;
}
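/*
 * Note that the secondary-process paths above deliberately do less work:
 * mapping reuses the addresses the primary recorded in the shared tailq
 * (hence MAP_FIXED), and unmapping neither clears bus mastering nor
 * removes the tailq entry, since both are owned by the primary process.
 */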
int
pci_vfio_unmap_resource(struct rte_pci_device *dev)
{
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return pci_vfio_unmap_resource_primary(dev);
	else
		return pci_vfio_unmap_resource_secondary(dev);
}

int
pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
		    struct rte_pci_ioport *p)
{
	if (bar < VFIO_PCI_BAR0_REGION_INDEX ||
	    bar > VFIO_PCI_BAR5_REGION_INDEX) {
		RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar);
		return -1;
	}

	p->dev = dev;
	p->base = VFIO_GET_REGION_ADDR(bar);
	return 0;
}

void
pci_vfio_ioport_read(struct rte_pci_ioport *p,
		     void *data, size_t len, off_t offset)
{
	const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;

	if (pread64(intr_handle->vfio_dev_fd, data,
		    len, p->base + offset) <= 0)
		RTE_LOG(ERR, EAL,
			"Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n",
			VFIO_GET_REGION_IDX(p->base), (int)offset);
}

void
pci_vfio_ioport_write(struct rte_pci_ioport *p,
		      const void *data, size_t len, off_t offset)
{
	const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;

	if (pwrite64(intr_handle->vfio_dev_fd, data,
		     len, p->base + offset) <= 0)
		RTE_LOG(ERR, EAL,
			"Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n",
			VFIO_GET_REGION_IDX(p->base), (int)offset);
}

int
pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
{
	RTE_SET_USED(p);
	return -1;
}

int
pci_vfio_is_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}
#endif