1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2014 Intel Corporation 3 */ 4 5 #include <string.h> 6 #include <fcntl.h> 7 #include <linux/pci_regs.h> 8 #include <sys/eventfd.h> 9 #include <sys/socket.h> 10 #include <sys/ioctl.h> 11 #include <sys/mman.h> 12 #include <stdbool.h> 13 14 #include <rte_log.h> 15 #include <rte_pci.h> 16 #include <rte_bus_pci.h> 17 #include <rte_malloc.h> 18 #include <rte_vfio.h> 19 #include <rte_eal.h> 20 #include <rte_bus.h> 21 #include <rte_spinlock.h> 22 #include <rte_tailq.h> 23 24 #include "eal_filesystem.h" 25 26 #include "pci_init.h" 27 #include "private.h" 28 29 /** 30 * @file 31 * PCI probing under linux (VFIO version) 32 * 33 * This code tries to determine if the PCI device is bound to VFIO driver, 34 * and initialize it (map BARs, set up interrupts) if that's the case. 35 * 36 * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". 37 */ 38 39 #ifdef VFIO_PRESENT 40 41 #ifndef PAGE_SIZE 42 #define PAGE_SIZE (sysconf(_SC_PAGESIZE)) 43 #endif 44 #define PAGE_MASK (~(PAGE_SIZE - 1)) 45 46 static struct rte_tailq_elem rte_vfio_tailq = { 47 .name = "VFIO_RESOURCE_LIST", 48 }; 49 EAL_REGISTER_TAILQ(rte_vfio_tailq) 50 51 int 52 pci_vfio_read_config(const struct rte_intr_handle *intr_handle, 53 void *buf, size_t len, off_t offs) 54 { 55 return pread64(intr_handle->vfio_dev_fd, buf, len, 56 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); 57 } 58 59 int 60 pci_vfio_write_config(const struct rte_intr_handle *intr_handle, 61 const void *buf, size_t len, off_t offs) 62 { 63 return pwrite64(intr_handle->vfio_dev_fd, buf, len, 64 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); 65 } 66 67 /* get PCI BAR number where MSI-X interrupts are */ 68 static int 69 pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table) 70 { 71 int ret; 72 uint32_t reg; 73 uint16_t flags; 74 uint8_t cap_id, cap_offset; 75 76 /* read PCI capability pointer from config space */ 77 ret = pread64(fd, ®, sizeof(reg), 78 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 79 PCI_CAPABILITY_LIST); 80 if (ret != sizeof(reg)) { 81 RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI " 82 "config space!\n"); 83 return -1; 84 } 85 86 /* we need first byte */ 87 cap_offset = reg & 0xFF; 88 89 while (cap_offset) { 90 91 /* read PCI capability ID */ 92 ret = pread64(fd, ®, sizeof(reg), 93 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 94 cap_offset); 95 if (ret != sizeof(reg)) { 96 RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI " 97 "config space!\n"); 98 return -1; 99 } 100 101 /* we need first byte */ 102 cap_id = reg & 0xFF; 103 104 /* if we haven't reached MSI-X, check next capability */ 105 if (cap_id != PCI_CAP_ID_MSIX) { 106 ret = pread64(fd, ®, sizeof(reg), 107 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 108 cap_offset); 109 if (ret != sizeof(reg)) { 110 RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI " 111 "config space!\n"); 112 return -1; 113 } 114 115 /* we need second byte */ 116 cap_offset = (reg & 0xFF00) >> 8; 117 118 continue; 119 } 120 /* else, read table offset */ 121 else { 122 /* table offset resides in the next 4 bytes */ 123 ret = pread64(fd, ®, sizeof(reg), 124 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 125 cap_offset + 4); 126 if (ret != sizeof(reg)) { 127 RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config " 128 "space!\n"); 129 return -1; 130 } 131 132 ret = pread64(fd, &flags, sizeof(flags), 133 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 134 cap_offset + 2); 135 if (ret != sizeof(flags)) { 136 RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config " 137 "space!\n"); 138 return -1; 139 } 140 141 msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR; 142 msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; 143 msix_table->size = 144 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); 145 146 return 0; 147 } 148 } 149 return 0; 150 } 151 152 /* set PCI bus mastering */ 153 static int 154 pci_vfio_set_bus_master(int dev_fd, bool op) 155 { 156 uint16_t reg; 157 int ret; 158 159 ret = pread64(dev_fd, ®, sizeof(reg), 160 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 161 PCI_COMMAND); 162 if (ret != sizeof(reg)) { 163 RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n"); 164 return -1; 165 } 166 167 if (op) 168 /* set the master bit */ 169 reg |= PCI_COMMAND_MASTER; 170 else 171 reg &= ~(PCI_COMMAND_MASTER); 172 173 ret = pwrite64(dev_fd, ®, sizeof(reg), 174 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 175 PCI_COMMAND); 176 177 if (ret != sizeof(reg)) { 178 RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n"); 179 return -1; 180 } 181 182 return 0; 183 } 184 185 /* set up interrupt support (but not enable interrupts) */ 186 static int 187 pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) 188 { 189 int i, ret, intr_idx; 190 enum rte_intr_mode intr_mode; 191 192 /* default to invalid index */ 193 intr_idx = VFIO_PCI_NUM_IRQS; 194 195 /* Get default / configured intr_mode */ 196 intr_mode = rte_eal_vfio_intr_mode(); 197 198 /* get interrupt type from internal config (MSI-X by default, can be 199 * overridden from the command line 200 */ 201 switch (intr_mode) { 202 case RTE_INTR_MODE_MSIX: 203 intr_idx = VFIO_PCI_MSIX_IRQ_INDEX; 204 break; 205 case RTE_INTR_MODE_MSI: 206 intr_idx = VFIO_PCI_MSI_IRQ_INDEX; 207 break; 208 case RTE_INTR_MODE_LEGACY: 209 intr_idx = VFIO_PCI_INTX_IRQ_INDEX; 210 break; 211 /* don't do anything if we want to automatically determine interrupt type */ 212 case RTE_INTR_MODE_NONE: 213 break; 214 default: 215 RTE_LOG(ERR, EAL, " unknown default interrupt type!\n"); 216 return -1; 217 } 218 219 /* start from MSI-X interrupt type */ 220 for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) { 221 struct vfio_irq_info irq = { .argsz = sizeof(irq) }; 222 int fd = -1; 223 224 /* skip interrupt modes we don't want */ 225 if (intr_mode != RTE_INTR_MODE_NONE && 226 i != intr_idx) 227 continue; 228 229 irq.index = i; 230 231 ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); 232 if (ret < 0) { 233 RTE_LOG(ERR, EAL, " cannot get IRQ info, " 234 "error %i (%s)\n", errno, strerror(errno)); 235 return -1; 236 } 237 238 /* if this vector cannot be used with eventfd, fail if we explicitly 239 * specified interrupt type, otherwise continue */ 240 if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) { 241 if (intr_mode != RTE_INTR_MODE_NONE) { 242 RTE_LOG(ERR, EAL, 243 " interrupt vector does not support eventfd!\n"); 244 return -1; 245 } else 246 continue; 247 } 248 249 /* set up an eventfd for interrupts */ 250 fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); 251 if (fd < 0) { 252 RTE_LOG(ERR, EAL, " cannot set up eventfd, " 253 "error %i (%s)\n", errno, strerror(errno)); 254 return -1; 255 } 256 257 dev->intr_handle.fd = fd; 258 dev->intr_handle.vfio_dev_fd = vfio_dev_fd; 259 260 switch (i) { 261 case VFIO_PCI_MSIX_IRQ_INDEX: 262 intr_mode = RTE_INTR_MODE_MSIX; 263 dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX; 264 break; 265 case VFIO_PCI_MSI_IRQ_INDEX: 266 intr_mode = RTE_INTR_MODE_MSI; 267 dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI; 268 break; 269 case VFIO_PCI_INTX_IRQ_INDEX: 270 intr_mode = RTE_INTR_MODE_LEGACY; 271 dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY; 272 break; 273 default: 274 RTE_LOG(ERR, EAL, " unknown interrupt type!\n"); 275 return -1; 276 } 277 278 return 0; 279 } 280 281 /* if we're here, we haven't found a suitable interrupt vector */ 282 return -1; 283 } 284 285 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 286 /* 287 * Spinlock for device hot-unplug failure handling. 288 * If it tries to access bus or device, such as handle sigbus on bus 289 * or handle memory failure for device, just need to use this lock. 290 * It could protect the bus and the device to avoid race condition. 291 */ 292 static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER; 293 294 static void 295 pci_vfio_req_handler(void *param) 296 { 297 struct rte_bus *bus; 298 int ret; 299 struct rte_device *device = (struct rte_device *)param; 300 301 rte_spinlock_lock(&failure_handle_lock); 302 bus = rte_bus_find_by_device(device); 303 if (bus == NULL) { 304 RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n", 305 device->name); 306 goto handle_end; 307 } 308 309 /* 310 * vfio kernel module request user space to release allocated 311 * resources before device be deleted in kernel, so it can directly 312 * call the vfio bus hot-unplug handler to process it. 313 */ 314 ret = bus->hot_unplug_handler(device); 315 if (ret) 316 RTE_LOG(ERR, EAL, 317 "Can not handle hot-unplug for device (%s)\n", 318 device->name); 319 handle_end: 320 rte_spinlock_unlock(&failure_handle_lock); 321 } 322 323 /* enable notifier (only enable req now) */ 324 static int 325 pci_vfio_enable_notifier(struct rte_pci_device *dev, int vfio_dev_fd) 326 { 327 int ret; 328 int fd = -1; 329 330 /* set up an eventfd for req notifier */ 331 fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); 332 if (fd < 0) { 333 RTE_LOG(ERR, EAL, "Cannot set up eventfd, error %i (%s)\n", 334 errno, strerror(errno)); 335 return -1; 336 } 337 338 dev->vfio_req_intr_handle.fd = fd; 339 dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_VFIO_REQ; 340 dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd; 341 342 ret = rte_intr_callback_register(&dev->vfio_req_intr_handle, 343 pci_vfio_req_handler, 344 (void *)&dev->device); 345 if (ret) { 346 RTE_LOG(ERR, EAL, "Fail to register req notifier handler.\n"); 347 goto error; 348 } 349 350 ret = rte_intr_enable(&dev->vfio_req_intr_handle); 351 if (ret) { 352 RTE_LOG(ERR, EAL, "Fail to enable req notifier.\n"); 353 ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle, 354 pci_vfio_req_handler, 355 (void *)&dev->device); 356 if (ret < 0) 357 RTE_LOG(ERR, EAL, 358 "Fail to unregister req notifier handler.\n"); 359 goto error; 360 } 361 362 return 0; 363 error: 364 close(fd); 365 366 dev->vfio_req_intr_handle.fd = -1; 367 dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; 368 dev->vfio_req_intr_handle.vfio_dev_fd = -1; 369 370 return -1; 371 } 372 373 /* disable notifier (only disable req now) */ 374 static int 375 pci_vfio_disable_notifier(struct rte_pci_device *dev) 376 { 377 int ret; 378 379 ret = rte_intr_disable(&dev->vfio_req_intr_handle); 380 if (ret) { 381 RTE_LOG(ERR, EAL, "fail to disable req notifier.\n"); 382 return -1; 383 } 384 385 ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle, 386 pci_vfio_req_handler, 387 (void *)&dev->device); 388 if (ret < 0) { 389 RTE_LOG(ERR, EAL, 390 "fail to unregister req notifier handler.\n"); 391 return -1; 392 } 393 394 close(dev->vfio_req_intr_handle.fd); 395 396 dev->vfio_req_intr_handle.fd = -1; 397 dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; 398 dev->vfio_req_intr_handle.vfio_dev_fd = -1; 399 400 return 0; 401 } 402 #endif 403 404 static int 405 pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index) 406 { 407 uint32_t ioport_bar; 408 int ret; 409 410 ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar), 411 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) 412 + PCI_BASE_ADDRESS_0 + bar_index*4); 413 if (ret != sizeof(ioport_bar)) { 414 RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n", 415 PCI_BASE_ADDRESS_0 + bar_index*4); 416 return -1; 417 } 418 419 return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0; 420 } 421 422 static int 423 pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd) 424 { 425 if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) { 426 RTE_LOG(ERR, EAL, "Error setting up interrupts!\n"); 427 return -1; 428 } 429 430 /* set bus mastering for the device */ 431 if (pci_vfio_set_bus_master(vfio_dev_fd, true)) { 432 RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n"); 433 return -1; 434 } 435 436 /* 437 * Reset the device. If the device is not capable of resetting, 438 * then it updates errno as EINVAL. 439 */ 440 if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) { 441 RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n", 442 errno, strerror(errno)); 443 return -1; 444 } 445 446 return 0; 447 } 448 449 static int 450 pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res, 451 int bar_index, int additional_flags) 452 { 453 struct memreg { 454 uint64_t offset; 455 size_t size; 456 } memreg[2] = {}; 457 void *bar_addr; 458 struct pci_msix_table *msix_table = &vfio_res->msix_table; 459 struct pci_map *bar = &vfio_res->maps[bar_index]; 460 461 if (bar->size == 0) { 462 RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index); 463 return 0; 464 } 465 466 if (msix_table->bar_index == bar_index) { 467 /* 468 * VFIO will not let us map the MSI-X table, 469 * but we can map around it. 470 */ 471 uint32_t table_start = msix_table->offset; 472 uint32_t table_end = table_start + msix_table->size; 473 table_end = RTE_ALIGN(table_end, PAGE_SIZE); 474 table_start = RTE_ALIGN_FLOOR(table_start, PAGE_SIZE); 475 476 /* If page-aligned start of MSI-X table is less than the 477 * actual MSI-X table start address, reassign to the actual 478 * start address. 479 */ 480 if (table_start < msix_table->offset) 481 table_start = msix_table->offset; 482 483 if (table_start == 0 && table_end >= bar->size) { 484 /* Cannot map this BAR */ 485 RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index); 486 bar->size = 0; 487 bar->addr = 0; 488 return 0; 489 } 490 491 memreg[0].offset = bar->offset; 492 memreg[0].size = table_start; 493 if (bar->size < table_end) { 494 /* 495 * If MSI-X table end is beyond BAR end, don't attempt 496 * to perform second mapping. 497 */ 498 memreg[1].offset = 0; 499 memreg[1].size = 0; 500 } else { 501 memreg[1].offset = bar->offset + table_end; 502 memreg[1].size = bar->size - table_end; 503 } 504 505 RTE_LOG(DEBUG, EAL, 506 "Trying to map BAR%d that contains the MSI-X " 507 "table. Trying offsets: " 508 "0x%04" PRIx64 ":0x%04zx, 0x%04" PRIx64 ":0x%04zx\n", 509 bar_index, 510 memreg[0].offset, memreg[0].size, 511 memreg[1].offset, memreg[1].size); 512 } else { 513 memreg[0].offset = bar->offset; 514 memreg[0].size = bar->size; 515 } 516 517 /* reserve the address using an inaccessible mapping */ 518 bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE | 519 MAP_ANONYMOUS | additional_flags, -1, 0); 520 if (bar_addr != MAP_FAILED) { 521 void *map_addr = NULL; 522 if (memreg[0].size) { 523 /* actual map of first part */ 524 map_addr = pci_map_resource(bar_addr, vfio_dev_fd, 525 memreg[0].offset, 526 memreg[0].size, 527 MAP_FIXED); 528 } 529 530 /* if there's a second part, try to map it */ 531 if (map_addr != MAP_FAILED 532 && memreg[1].offset && memreg[1].size) { 533 void *second_addr = RTE_PTR_ADD(bar_addr, 534 (uintptr_t)(memreg[1].offset - 535 bar->offset)); 536 map_addr = pci_map_resource(second_addr, 537 vfio_dev_fd, 538 memreg[1].offset, 539 memreg[1].size, 540 MAP_FIXED); 541 } 542 543 if (map_addr == MAP_FAILED || !map_addr) { 544 munmap(bar_addr, bar->size); 545 bar_addr = MAP_FAILED; 546 RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n", 547 bar_index); 548 return -1; 549 } 550 } else { 551 RTE_LOG(ERR, EAL, 552 "Failed to create inaccessible mapping for BAR%d\n", 553 bar_index); 554 return -1; 555 } 556 557 bar->addr = bar_addr; 558 return 0; 559 } 560 561 /* 562 * region info may contain capability headers, so we need to keep reallocating 563 * the memory until we match allocated memory size with argsz. 564 */ 565 static int 566 pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info, 567 int region) 568 { 569 struct vfio_region_info *ri; 570 size_t argsz = sizeof(*ri); 571 int ret; 572 573 ri = malloc(sizeof(*ri)); 574 if (ri == NULL) { 575 RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n"); 576 return -1; 577 } 578 again: 579 memset(ri, 0, argsz); 580 ri->argsz = argsz; 581 ri->index = region; 582 583 ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri); 584 if (ret < 0) { 585 free(ri); 586 return ret; 587 } 588 if (ri->argsz != argsz) { 589 struct vfio_region_info *tmp; 590 591 argsz = ri->argsz; 592 tmp = realloc(ri, argsz); 593 594 if (tmp == NULL) { 595 /* realloc failed but the ri is still there */ 596 free(ri); 597 RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n"); 598 return -1; 599 } 600 ri = tmp; 601 goto again; 602 } 603 *info = ri; 604 605 return 0; 606 } 607 608 static struct vfio_info_cap_header * 609 pci_vfio_info_cap(struct vfio_region_info *info, int cap) 610 { 611 struct vfio_info_cap_header *h; 612 size_t offset; 613 614 if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) { 615 /* VFIO info does not advertise capabilities */ 616 return NULL; 617 } 618 619 offset = VFIO_CAP_OFFSET(info); 620 while (offset != 0) { 621 h = RTE_PTR_ADD(info, offset); 622 if (h->id == cap) 623 return h; 624 offset = h->next; 625 } 626 return NULL; 627 } 628 629 static int 630 pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region) 631 { 632 struct vfio_region_info *info; 633 int ret; 634 635 ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region); 636 if (ret < 0) 637 return -1; 638 639 ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL; 640 641 /* cleanup */ 642 free(info); 643 644 return ret; 645 } 646 647 648 static int 649 pci_vfio_map_resource_primary(struct rte_pci_device *dev) 650 { 651 struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; 652 char pci_addr[PATH_MAX] = {0}; 653 int vfio_dev_fd; 654 struct rte_pci_addr *loc = &dev->addr; 655 int i, ret; 656 struct mapped_pci_resource *vfio_res = NULL; 657 struct mapped_pci_res_list *vfio_res_list = 658 RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); 659 660 struct pci_map *maps; 661 662 dev->intr_handle.fd = -1; 663 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 664 dev->vfio_req_intr_handle.fd = -1; 665 #endif 666 667 /* store PCI address string */ 668 snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, 669 loc->domain, loc->bus, loc->devid, loc->function); 670 671 ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, 672 &vfio_dev_fd, &device_info); 673 if (ret) 674 return ret; 675 676 /* allocate vfio_res and get region info */ 677 vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0); 678 if (vfio_res == NULL) { 679 RTE_LOG(ERR, EAL, 680 "%s(): cannot store vfio mmap details\n", __func__); 681 goto err_vfio_dev_fd; 682 } 683 memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr)); 684 685 /* get number of registers (up to BAR5) */ 686 vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions, 687 VFIO_PCI_BAR5_REGION_INDEX + 1); 688 689 /* map BARs */ 690 maps = vfio_res->maps; 691 692 vfio_res->msix_table.bar_index = -1; 693 /* get MSI-X BAR, if any (we have to know where it is because we can't 694 * easily mmap it when using VFIO) 695 */ 696 ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table); 697 if (ret < 0) { 698 RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", 699 pci_addr); 700 goto err_vfio_res; 701 } 702 /* if we found our MSI-X BAR region, check if we can mmap it */ 703 if (vfio_res->msix_table.bar_index != -1) { 704 int ret = pci_vfio_msix_is_mappable(vfio_dev_fd, 705 vfio_res->msix_table.bar_index); 706 if (ret < 0) { 707 RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n"); 708 goto err_vfio_res; 709 } else if (ret != 0) { 710 /* we can map it, so we don't care where it is */ 711 RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n"); 712 vfio_res->msix_table.bar_index = -1; 713 } 714 } 715 716 for (i = 0; i < (int) vfio_res->nb_maps; i++) { 717 struct vfio_region_info *reg = NULL; 718 void *bar_addr; 719 720 ret = pci_vfio_get_region_info(vfio_dev_fd, ®, i); 721 if (ret < 0) { 722 RTE_LOG(ERR, EAL, " %s cannot get device region info " 723 "error %i (%s)\n", pci_addr, errno, 724 strerror(errno)); 725 goto err_vfio_res; 726 } 727 728 /* chk for io port region */ 729 ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i); 730 if (ret < 0) { 731 free(reg); 732 goto err_vfio_res; 733 } else if (ret) { 734 RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n", 735 i); 736 free(reg); 737 continue; 738 } 739 740 /* skip non-mmapable BARs */ 741 if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) { 742 free(reg); 743 continue; 744 } 745 746 /* try mapping somewhere close to the end of hugepages */ 747 if (pci_map_addr == NULL) 748 pci_map_addr = pci_find_max_end_va(); 749 750 bar_addr = pci_map_addr; 751 pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size); 752 753 pci_map_addr = RTE_PTR_ALIGN(pci_map_addr, 754 sysconf(_SC_PAGE_SIZE)); 755 756 maps[i].addr = bar_addr; 757 maps[i].offset = reg->offset; 758 maps[i].size = reg->size; 759 maps[i].path = NULL; /* vfio doesn't have per-resource paths */ 760 761 ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); 762 if (ret < 0) { 763 RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", 764 pci_addr, i, strerror(errno)); 765 free(reg); 766 goto err_vfio_res; 767 } 768 769 dev->mem_resource[i].addr = maps[i].addr; 770 771 free(reg); 772 } 773 774 if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) { 775 RTE_LOG(ERR, EAL, " %s setup device failed\n", pci_addr); 776 goto err_vfio_res; 777 } 778 779 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 780 if (pci_vfio_enable_notifier(dev, vfio_dev_fd) != 0) { 781 RTE_LOG(ERR, EAL, "Error setting up notifier!\n"); 782 goto err_vfio_res; 783 } 784 785 #endif 786 TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next); 787 788 return 0; 789 err_vfio_res: 790 rte_free(vfio_res); 791 err_vfio_dev_fd: 792 close(vfio_dev_fd); 793 return -1; 794 } 795 796 static int 797 pci_vfio_map_resource_secondary(struct rte_pci_device *dev) 798 { 799 struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; 800 char pci_addr[PATH_MAX] = {0}; 801 int vfio_dev_fd; 802 struct rte_pci_addr *loc = &dev->addr; 803 int i, ret; 804 struct mapped_pci_resource *vfio_res = NULL; 805 struct mapped_pci_res_list *vfio_res_list = 806 RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); 807 808 struct pci_map *maps; 809 810 dev->intr_handle.fd = -1; 811 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 812 dev->vfio_req_intr_handle.fd = -1; 813 #endif 814 815 /* store PCI address string */ 816 snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, 817 loc->domain, loc->bus, loc->devid, loc->function); 818 819 /* if we're in a secondary process, just find our tailq entry */ 820 TAILQ_FOREACH(vfio_res, vfio_res_list, next) { 821 if (rte_pci_addr_cmp(&vfio_res->pci_addr, 822 &dev->addr)) 823 continue; 824 break; 825 } 826 /* if we haven't found our tailq entry, something's wrong */ 827 if (vfio_res == NULL) { 828 RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", 829 pci_addr); 830 return -1; 831 } 832 833 ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, 834 &vfio_dev_fd, &device_info); 835 if (ret) 836 return ret; 837 838 /* map BARs */ 839 maps = vfio_res->maps; 840 841 for (i = 0; i < (int) vfio_res->nb_maps; i++) { 842 ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED); 843 if (ret < 0) { 844 RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", 845 pci_addr, i, strerror(errno)); 846 goto err_vfio_dev_fd; 847 } 848 849 dev->mem_resource[i].addr = maps[i].addr; 850 } 851 852 /* we need save vfio_dev_fd, so it can be used during release */ 853 dev->intr_handle.vfio_dev_fd = vfio_dev_fd; 854 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 855 dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd; 856 #endif 857 858 return 0; 859 err_vfio_dev_fd: 860 close(vfio_dev_fd); 861 return -1; 862 } 863 864 /* 865 * map the PCI resources of a PCI device in virtual memory (VFIO version). 866 * primary and secondary processes follow almost exactly the same path 867 */ 868 int 869 pci_vfio_map_resource(struct rte_pci_device *dev) 870 { 871 if (rte_eal_process_type() == RTE_PROC_PRIMARY) 872 return pci_vfio_map_resource_primary(dev); 873 else 874 return pci_vfio_map_resource_secondary(dev); 875 } 876 877 static struct mapped_pci_resource * 878 find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list, 879 struct rte_pci_device *dev, 880 const char *pci_addr) 881 { 882 struct mapped_pci_resource *vfio_res = NULL; 883 struct pci_map *maps; 884 int i; 885 886 /* Get vfio_res */ 887 TAILQ_FOREACH(vfio_res, vfio_res_list, next) { 888 if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr)) 889 continue; 890 break; 891 } 892 893 if (vfio_res == NULL) 894 return vfio_res; 895 896 RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n", 897 pci_addr); 898 899 maps = vfio_res->maps; 900 for (i = 0; i < (int) vfio_res->nb_maps; i++) { 901 902 /* 903 * We do not need to be aware of MSI-X table BAR mappings as 904 * when mapping. Just using current maps array is enough 905 */ 906 if (maps[i].addr) { 907 RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n", 908 pci_addr, maps[i].addr); 909 pci_unmap_resource(maps[i].addr, maps[i].size); 910 } 911 } 912 913 return vfio_res; 914 } 915 916 static int 917 pci_vfio_unmap_resource_primary(struct rte_pci_device *dev) 918 { 919 char pci_addr[PATH_MAX] = {0}; 920 struct rte_pci_addr *loc = &dev->addr; 921 struct mapped_pci_resource *vfio_res = NULL; 922 struct mapped_pci_res_list *vfio_res_list; 923 int ret; 924 925 /* store PCI address string */ 926 snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, 927 loc->domain, loc->bus, loc->devid, loc->function); 928 929 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 930 ret = pci_vfio_disable_notifier(dev); 931 if (ret) { 932 RTE_LOG(ERR, EAL, "fail to disable req notifier.\n"); 933 return -1; 934 } 935 936 #endif 937 if (close(dev->intr_handle.fd) < 0) { 938 RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n", 939 pci_addr); 940 return -1; 941 } 942 943 if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) { 944 RTE_LOG(ERR, EAL, " %s cannot unset bus mastering for PCI device!\n", 945 pci_addr); 946 return -1; 947 } 948 949 ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr, 950 dev->intr_handle.vfio_dev_fd); 951 if (ret < 0) { 952 RTE_LOG(ERR, EAL, 953 "%s(): cannot release device\n", __func__); 954 return ret; 955 } 956 957 vfio_res_list = 958 RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); 959 vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr); 960 961 /* if we haven't found our tailq entry, something's wrong */ 962 if (vfio_res == NULL) { 963 RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", 964 pci_addr); 965 return -1; 966 } 967 968 TAILQ_REMOVE(vfio_res_list, vfio_res, next); 969 970 return 0; 971 } 972 973 static int 974 pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev) 975 { 976 char pci_addr[PATH_MAX] = {0}; 977 struct rte_pci_addr *loc = &dev->addr; 978 struct mapped_pci_resource *vfio_res = NULL; 979 struct mapped_pci_res_list *vfio_res_list; 980 int ret; 981 982 /* store PCI address string */ 983 snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, 984 loc->domain, loc->bus, loc->devid, loc->function); 985 986 ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr, 987 dev->intr_handle.vfio_dev_fd); 988 if (ret < 0) { 989 RTE_LOG(ERR, EAL, 990 "%s(): cannot release device\n", __func__); 991 return ret; 992 } 993 994 vfio_res_list = 995 RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); 996 vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr); 997 998 /* if we haven't found our tailq entry, something's wrong */ 999 if (vfio_res == NULL) { 1000 RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", 1001 pci_addr); 1002 return -1; 1003 } 1004 1005 return 0; 1006 } 1007 1008 int 1009 pci_vfio_unmap_resource(struct rte_pci_device *dev) 1010 { 1011 if (rte_eal_process_type() == RTE_PROC_PRIMARY) 1012 return pci_vfio_unmap_resource_primary(dev); 1013 else 1014 return pci_vfio_unmap_resource_secondary(dev); 1015 } 1016 1017 int 1018 pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, 1019 struct rte_pci_ioport *p) 1020 { 1021 if (bar < VFIO_PCI_BAR0_REGION_INDEX || 1022 bar > VFIO_PCI_BAR5_REGION_INDEX) { 1023 RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar); 1024 return -1; 1025 } 1026 1027 p->dev = dev; 1028 p->base = VFIO_GET_REGION_ADDR(bar); 1029 return 0; 1030 } 1031 1032 void 1033 pci_vfio_ioport_read(struct rte_pci_ioport *p, 1034 void *data, size_t len, off_t offset) 1035 { 1036 const struct rte_intr_handle *intr_handle = &p->dev->intr_handle; 1037 1038 if (pread64(intr_handle->vfio_dev_fd, data, 1039 len, p->base + offset) <= 0) 1040 RTE_LOG(ERR, EAL, 1041 "Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n", 1042 VFIO_GET_REGION_IDX(p->base), (int)offset); 1043 } 1044 1045 void 1046 pci_vfio_ioport_write(struct rte_pci_ioport *p, 1047 const void *data, size_t len, off_t offset) 1048 { 1049 const struct rte_intr_handle *intr_handle = &p->dev->intr_handle; 1050 1051 if (pwrite64(intr_handle->vfio_dev_fd, data, 1052 len, p->base + offset) <= 0) 1053 RTE_LOG(ERR, EAL, 1054 "Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n", 1055 VFIO_GET_REGION_IDX(p->base), (int)offset); 1056 } 1057 1058 int 1059 pci_vfio_ioport_unmap(struct rte_pci_ioport *p) 1060 { 1061 RTE_SET_USED(p); 1062 return -1; 1063 } 1064 1065 int 1066 pci_vfio_is_enabled(void) 1067 { 1068 return rte_vfio_is_enabled("vfio_pci"); 1069 } 1070 #endif 1071