1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2014 Intel Corporation 3 */ 4 5 #include <string.h> 6 #include <fcntl.h> 7 #include <linux/pci_regs.h> 8 #include <sys/eventfd.h> 9 #include <sys/socket.h> 10 #include <sys/ioctl.h> 11 #include <sys/mman.h> 12 #include <stdbool.h> 13 14 #include <rte_log.h> 15 #include <rte_pci.h> 16 #include <rte_bus_pci.h> 17 #include <rte_malloc.h> 18 #include <rte_vfio.h> 19 #include <rte_eal.h> 20 #include <rte_bus.h> 21 #include <rte_spinlock.h> 22 #include <rte_tailq.h> 23 24 #include "eal_filesystem.h" 25 26 #include "pci_init.h" 27 #include "private.h" 28 29 /** 30 * @file 31 * PCI probing under linux (VFIO version) 32 * 33 * This code tries to determine if the PCI device is bound to VFIO driver, 34 * and initialize it (map BARs, set up interrupts) if that's the case. 35 * 36 * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". 37 */ 38 39 #ifdef VFIO_PRESENT 40 41 #ifndef PAGE_SIZE 42 #define PAGE_SIZE (sysconf(_SC_PAGESIZE)) 43 #endif 44 #define PAGE_MASK (~(PAGE_SIZE - 1)) 45 46 static struct rte_tailq_elem rte_vfio_tailq = { 47 .name = "VFIO_RESOURCE_LIST", 48 }; 49 EAL_REGISTER_TAILQ(rte_vfio_tailq) 50 51 int 52 pci_vfio_read_config(const struct rte_intr_handle *intr_handle, 53 void *buf, size_t len, off_t offs) 54 { 55 return pread64(intr_handle->vfio_dev_fd, buf, len, 56 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); 57 } 58 59 int 60 pci_vfio_write_config(const struct rte_intr_handle *intr_handle, 61 const void *buf, size_t len, off_t offs) 62 { 63 return pwrite64(intr_handle->vfio_dev_fd, buf, len, 64 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); 65 } 66 67 /* get PCI BAR number where MSI-X interrupts are */ 68 static int 69 pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table) 70 { 71 int ret; 72 uint32_t reg; 73 uint16_t flags; 74 uint8_t cap_id, cap_offset; 75 76 /* read PCI capability pointer from config space */ 77 ret = pread64(fd, ®, sizeof(reg), 78 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 79 PCI_CAPABILITY_LIST); 80 if (ret != sizeof(reg)) { 81 RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI " 82 "config space!\n"); 83 return -1; 84 } 85 86 /* we need first byte */ 87 cap_offset = reg & 0xFF; 88 89 while (cap_offset) { 90 91 /* read PCI capability ID */ 92 ret = pread64(fd, ®, sizeof(reg), 93 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 94 cap_offset); 95 if (ret != sizeof(reg)) { 96 RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI " 97 "config space!\n"); 98 return -1; 99 } 100 101 /* we need first byte */ 102 cap_id = reg & 0xFF; 103 104 /* if we haven't reached MSI-X, check next capability */ 105 if (cap_id != PCI_CAP_ID_MSIX) { 106 ret = pread64(fd, ®, sizeof(reg), 107 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 108 cap_offset); 109 if (ret != sizeof(reg)) { 110 RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI " 111 "config space!\n"); 112 return -1; 113 } 114 115 /* we need second byte */ 116 cap_offset = (reg & 0xFF00) >> 8; 117 118 continue; 119 } 120 /* else, read table offset */ 121 else { 122 /* table offset resides in the next 4 bytes */ 123 ret = pread64(fd, ®, sizeof(reg), 124 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 125 cap_offset + 4); 126 if (ret != sizeof(reg)) { 127 RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config " 128 "space!\n"); 129 return -1; 130 } 131 132 ret = pread64(fd, &flags, sizeof(flags), 133 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 134 cap_offset + 2); 135 if (ret != sizeof(flags)) { 136 RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config " 137 "space!\n"); 138 return -1; 139 } 140 141 msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR; 142 msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; 143 msix_table->size = 144 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); 145 146 return 0; 147 } 148 } 149 return 0; 150 } 151 152 /* set PCI bus mastering */ 153 static int 154 pci_vfio_set_bus_master(int dev_fd, bool op) 155 { 156 uint16_t reg; 157 int ret; 158 159 ret = pread64(dev_fd, ®, sizeof(reg), 160 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 161 PCI_COMMAND); 162 if (ret != sizeof(reg)) { 163 RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n"); 164 return -1; 165 } 166 167 if (op) 168 /* set the master bit */ 169 reg |= PCI_COMMAND_MASTER; 170 else 171 reg &= ~(PCI_COMMAND_MASTER); 172 173 ret = pwrite64(dev_fd, ®, sizeof(reg), 174 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + 175 PCI_COMMAND); 176 177 if (ret != sizeof(reg)) { 178 RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n"); 179 return -1; 180 } 181 182 return 0; 183 } 184 185 /* set up interrupt support (but not enable interrupts) */ 186 static int 187 pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) 188 { 189 int i, ret, intr_idx; 190 enum rte_intr_mode intr_mode; 191 192 /* default to invalid index */ 193 intr_idx = VFIO_PCI_NUM_IRQS; 194 195 /* Get default / configured intr_mode */ 196 intr_mode = rte_eal_vfio_intr_mode(); 197 198 /* get interrupt type from internal config (MSI-X by default, can be 199 * overridden from the command line 200 */ 201 switch (intr_mode) { 202 case RTE_INTR_MODE_MSIX: 203 intr_idx = VFIO_PCI_MSIX_IRQ_INDEX; 204 break; 205 case RTE_INTR_MODE_MSI: 206 intr_idx = VFIO_PCI_MSI_IRQ_INDEX; 207 break; 208 case RTE_INTR_MODE_LEGACY: 209 intr_idx = VFIO_PCI_INTX_IRQ_INDEX; 210 break; 211 /* don't do anything if we want to automatically determine interrupt type */ 212 case RTE_INTR_MODE_NONE: 213 break; 214 default: 215 RTE_LOG(ERR, EAL, " unknown default interrupt type!\n"); 216 return -1; 217 } 218 219 /* start from MSI-X interrupt type */ 220 for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) { 221 struct vfio_irq_info irq = { .argsz = sizeof(irq) }; 222 int fd = -1; 223 224 /* skip interrupt modes we don't want */ 225 if (intr_mode != RTE_INTR_MODE_NONE && 226 i != intr_idx) 227 continue; 228 229 irq.index = i; 230 231 ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); 232 if (ret < 0) { 233 RTE_LOG(ERR, EAL, " cannot get IRQ info, " 234 "error %i (%s)\n", errno, strerror(errno)); 235 return -1; 236 } 237 238 /* if this vector cannot be used with eventfd, fail if we explicitly 239 * specified interrupt type, otherwise continue */ 240 if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) { 241 if (intr_mode != RTE_INTR_MODE_NONE) { 242 RTE_LOG(ERR, EAL, 243 " interrupt vector does not support eventfd!\n"); 244 return -1; 245 } else 246 continue; 247 } 248 249 /* set up an eventfd for interrupts */ 250 fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); 251 if (fd < 0) { 252 RTE_LOG(ERR, EAL, " cannot set up eventfd, " 253 "error %i (%s)\n", errno, strerror(errno)); 254 return -1; 255 } 256 257 dev->intr_handle.fd = fd; 258 dev->intr_handle.vfio_dev_fd = vfio_dev_fd; 259 260 switch (i) { 261 case VFIO_PCI_MSIX_IRQ_INDEX: 262 intr_mode = RTE_INTR_MODE_MSIX; 263 dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX; 264 break; 265 case VFIO_PCI_MSI_IRQ_INDEX: 266 intr_mode = RTE_INTR_MODE_MSI; 267 dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI; 268 break; 269 case VFIO_PCI_INTX_IRQ_INDEX: 270 intr_mode = RTE_INTR_MODE_LEGACY; 271 dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY; 272 break; 273 default: 274 RTE_LOG(ERR, EAL, " unknown interrupt type!\n"); 275 return -1; 276 } 277 278 return 0; 279 } 280 281 /* if we're here, we haven't found a suitable interrupt vector */ 282 return -1; 283 } 284 285 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 286 /* 287 * Spinlock for device hot-unplug failure handling. 288 * If it tries to access bus or device, such as handle sigbus on bus 289 * or handle memory failure for device, just need to use this lock. 290 * It could protect the bus and the device to avoid race condition. 291 */ 292 static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER; 293 294 static void 295 pci_vfio_req_handler(void *param) 296 { 297 struct rte_bus *bus; 298 int ret; 299 struct rte_device *device = (struct rte_device *)param; 300 301 rte_spinlock_lock(&failure_handle_lock); 302 bus = rte_bus_find_by_device(device); 303 if (bus == NULL) { 304 RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n", 305 device->name); 306 goto handle_end; 307 } 308 309 /* 310 * vfio kernel module request user space to release allocated 311 * resources before device be deleted in kernel, so it can directly 312 * call the vfio bus hot-unplug handler to process it. 313 */ 314 ret = bus->hot_unplug_handler(device); 315 if (ret) 316 RTE_LOG(ERR, EAL, 317 "Can not handle hot-unplug for device (%s)\n", 318 device->name); 319 handle_end: 320 rte_spinlock_unlock(&failure_handle_lock); 321 } 322 323 /* enable notifier (only enable req now) */ 324 static int 325 pci_vfio_enable_notifier(struct rte_pci_device *dev, int vfio_dev_fd) 326 { 327 int ret; 328 int fd = -1; 329 330 /* set up an eventfd for req notifier */ 331 fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); 332 if (fd < 0) { 333 RTE_LOG(ERR, EAL, "Cannot set up eventfd, error %i (%s)\n", 334 errno, strerror(errno)); 335 return -1; 336 } 337 338 dev->vfio_req_intr_handle.fd = fd; 339 dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_VFIO_REQ; 340 dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd; 341 342 ret = rte_intr_callback_register(&dev->vfio_req_intr_handle, 343 pci_vfio_req_handler, 344 (void *)&dev->device); 345 if (ret) { 346 RTE_LOG(ERR, EAL, "Fail to register req notifier handler.\n"); 347 goto error; 348 } 349 350 ret = rte_intr_enable(&dev->vfio_req_intr_handle); 351 if (ret) { 352 RTE_LOG(ERR, EAL, "Fail to enable req notifier.\n"); 353 ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle, 354 pci_vfio_req_handler, 355 (void *)&dev->device); 356 if (ret < 0) 357 RTE_LOG(ERR, EAL, 358 "Fail to unregister req notifier handler.\n"); 359 goto error; 360 } 361 362 return 0; 363 error: 364 close(fd); 365 366 dev->vfio_req_intr_handle.fd = -1; 367 dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; 368 dev->vfio_req_intr_handle.vfio_dev_fd = -1; 369 370 return -1; 371 } 372 373 /* disable notifier (only disable req now) */ 374 static int 375 pci_vfio_disable_notifier(struct rte_pci_device *dev) 376 { 377 int ret; 378 379 ret = rte_intr_disable(&dev->vfio_req_intr_handle); 380 if (ret) { 381 RTE_LOG(ERR, EAL, "fail to disable req notifier.\n"); 382 return -1; 383 } 384 385 ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle, 386 pci_vfio_req_handler, 387 (void *)&dev->device); 388 if (ret < 0) { 389 RTE_LOG(ERR, EAL, 390 "fail to unregister req notifier handler.\n"); 391 return -1; 392 } 393 394 close(dev->vfio_req_intr_handle.fd); 395 396 dev->vfio_req_intr_handle.fd = -1; 397 dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; 398 dev->vfio_req_intr_handle.vfio_dev_fd = -1; 399 400 return 0; 401 } 402 #endif 403 404 static int 405 pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index) 406 { 407 uint32_t ioport_bar; 408 int ret; 409 410 ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar), 411 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) 412 + PCI_BASE_ADDRESS_0 + bar_index*4); 413 if (ret != sizeof(ioport_bar)) { 414 RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n", 415 PCI_BASE_ADDRESS_0 + bar_index*4); 416 return -1; 417 } 418 419 return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0; 420 } 421 422 static int 423 pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd) 424 { 425 if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) { 426 RTE_LOG(ERR, EAL, "Error setting up interrupts!\n"); 427 return -1; 428 } 429 430 /* set bus mastering for the device */ 431 if (pci_vfio_set_bus_master(vfio_dev_fd, true)) { 432 RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n"); 433 return -1; 434 } 435 436 /* 437 * Reset the device. If the device is not capable of resetting, 438 * then it updates errno as EINVAL. 439 */ 440 if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) { 441 RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n", 442 errno, strerror(errno)); 443 return -1; 444 } 445 446 return 0; 447 } 448 449 static int 450 pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res, 451 int bar_index, int additional_flags) 452 { 453 struct memreg { 454 unsigned long offset, size; 455 } memreg[2] = {}; 456 void *bar_addr; 457 struct pci_msix_table *msix_table = &vfio_res->msix_table; 458 struct pci_map *bar = &vfio_res->maps[bar_index]; 459 460 if (bar->size == 0) { 461 RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index); 462 return 0; 463 } 464 465 if (msix_table->bar_index == bar_index) { 466 /* 467 * VFIO will not let us map the MSI-X table, 468 * but we can map around it. 469 */ 470 uint32_t table_start = msix_table->offset; 471 uint32_t table_end = table_start + msix_table->size; 472 table_end = RTE_ALIGN(table_end, PAGE_SIZE); 473 table_start = RTE_ALIGN_FLOOR(table_start, PAGE_SIZE); 474 475 /* If page-aligned start of MSI-X table is less than the 476 * actual MSI-X table start address, reassign to the actual 477 * start address. 478 */ 479 if (table_start < msix_table->offset) 480 table_start = msix_table->offset; 481 482 if (table_start == 0 && table_end >= bar->size) { 483 /* Cannot map this BAR */ 484 RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index); 485 bar->size = 0; 486 bar->addr = 0; 487 return 0; 488 } 489 490 memreg[0].offset = bar->offset; 491 memreg[0].size = table_start; 492 if (bar->size < table_end) { 493 /* 494 * If MSI-X table end is beyond BAR end, don't attempt 495 * to perform second mapping. 496 */ 497 memreg[1].offset = 0; 498 memreg[1].size = 0; 499 } else { 500 memreg[1].offset = bar->offset + table_end; 501 memreg[1].size = bar->size - table_end; 502 } 503 504 RTE_LOG(DEBUG, EAL, 505 "Trying to map BAR%d that contains the MSI-X " 506 "table. Trying offsets: " 507 "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", bar_index, 508 memreg[0].offset, memreg[0].size, 509 memreg[1].offset, memreg[1].size); 510 } else { 511 memreg[0].offset = bar->offset; 512 memreg[0].size = bar->size; 513 } 514 515 /* reserve the address using an inaccessible mapping */ 516 bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE | 517 MAP_ANONYMOUS | additional_flags, -1, 0); 518 if (bar_addr != MAP_FAILED) { 519 void *map_addr = NULL; 520 if (memreg[0].size) { 521 /* actual map of first part */ 522 map_addr = pci_map_resource(bar_addr, vfio_dev_fd, 523 memreg[0].offset, 524 memreg[0].size, 525 MAP_FIXED); 526 } 527 528 /* if there's a second part, try to map it */ 529 if (map_addr != MAP_FAILED 530 && memreg[1].offset && memreg[1].size) { 531 void *second_addr = RTE_PTR_ADD(bar_addr, 532 memreg[1].offset - 533 (uintptr_t)bar->offset); 534 map_addr = pci_map_resource(second_addr, 535 vfio_dev_fd, 536 memreg[1].offset, 537 memreg[1].size, 538 MAP_FIXED); 539 } 540 541 if (map_addr == MAP_FAILED || !map_addr) { 542 munmap(bar_addr, bar->size); 543 bar_addr = MAP_FAILED; 544 RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n", 545 bar_index); 546 return -1; 547 } 548 } else { 549 RTE_LOG(ERR, EAL, 550 "Failed to create inaccessible mapping for BAR%d\n", 551 bar_index); 552 return -1; 553 } 554 555 bar->addr = bar_addr; 556 return 0; 557 } 558 559 /* 560 * region info may contain capability headers, so we need to keep reallocating 561 * the memory until we match allocated memory size with argsz. 562 */ 563 static int 564 pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info, 565 int region) 566 { 567 struct vfio_region_info *ri; 568 size_t argsz = sizeof(*ri); 569 int ret; 570 571 ri = malloc(sizeof(*ri)); 572 if (ri == NULL) { 573 RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n"); 574 return -1; 575 } 576 again: 577 memset(ri, 0, argsz); 578 ri->argsz = argsz; 579 ri->index = region; 580 581 ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri); 582 if (ret < 0) { 583 free(ri); 584 return ret; 585 } 586 if (ri->argsz != argsz) { 587 struct vfio_region_info *tmp; 588 589 argsz = ri->argsz; 590 tmp = realloc(ri, argsz); 591 592 if (tmp == NULL) { 593 /* realloc failed but the ri is still there */ 594 free(ri); 595 RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n"); 596 return -1; 597 } 598 ri = tmp; 599 goto again; 600 } 601 *info = ri; 602 603 return 0; 604 } 605 606 static struct vfio_info_cap_header * 607 pci_vfio_info_cap(struct vfio_region_info *info, int cap) 608 { 609 struct vfio_info_cap_header *h; 610 size_t offset; 611 612 if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) { 613 /* VFIO info does not advertise capabilities */ 614 return NULL; 615 } 616 617 offset = VFIO_CAP_OFFSET(info); 618 while (offset != 0) { 619 h = RTE_PTR_ADD(info, offset); 620 if (h->id == cap) 621 return h; 622 offset = h->next; 623 } 624 return NULL; 625 } 626 627 static int 628 pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region) 629 { 630 struct vfio_region_info *info; 631 int ret; 632 633 ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region); 634 if (ret < 0) 635 return -1; 636 637 ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL; 638 639 /* cleanup */ 640 free(info); 641 642 return ret; 643 } 644 645 646 static int 647 pci_vfio_map_resource_primary(struct rte_pci_device *dev) 648 { 649 struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; 650 char pci_addr[PATH_MAX] = {0}; 651 int vfio_dev_fd; 652 struct rte_pci_addr *loc = &dev->addr; 653 int i, ret; 654 struct mapped_pci_resource *vfio_res = NULL; 655 struct mapped_pci_res_list *vfio_res_list = 656 RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); 657 658 struct pci_map *maps; 659 660 dev->intr_handle.fd = -1; 661 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 662 dev->vfio_req_intr_handle.fd = -1; 663 #endif 664 665 /* store PCI address string */ 666 snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, 667 loc->domain, loc->bus, loc->devid, loc->function); 668 669 ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, 670 &vfio_dev_fd, &device_info); 671 if (ret) 672 return ret; 673 674 /* allocate vfio_res and get region info */ 675 vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0); 676 if (vfio_res == NULL) { 677 RTE_LOG(ERR, EAL, 678 "%s(): cannot store vfio mmap details\n", __func__); 679 goto err_vfio_dev_fd; 680 } 681 memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr)); 682 683 /* get number of registers (up to BAR5) */ 684 vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions, 685 VFIO_PCI_BAR5_REGION_INDEX + 1); 686 687 /* map BARs */ 688 maps = vfio_res->maps; 689 690 vfio_res->msix_table.bar_index = -1; 691 /* get MSI-X BAR, if any (we have to know where it is because we can't 692 * easily mmap it when using VFIO) 693 */ 694 ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table); 695 if (ret < 0) { 696 RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", 697 pci_addr); 698 goto err_vfio_res; 699 } 700 /* if we found our MSI-X BAR region, check if we can mmap it */ 701 if (vfio_res->msix_table.bar_index != -1) { 702 int ret = pci_vfio_msix_is_mappable(vfio_dev_fd, 703 vfio_res->msix_table.bar_index); 704 if (ret < 0) { 705 RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n"); 706 goto err_vfio_res; 707 } else if (ret != 0) { 708 /* we can map it, so we don't care where it is */ 709 RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n"); 710 vfio_res->msix_table.bar_index = -1; 711 } 712 } 713 714 for (i = 0; i < (int) vfio_res->nb_maps; i++) { 715 struct vfio_region_info *reg = NULL; 716 void *bar_addr; 717 718 ret = pci_vfio_get_region_info(vfio_dev_fd, ®, i); 719 if (ret < 0) { 720 RTE_LOG(ERR, EAL, " %s cannot get device region info " 721 "error %i (%s)\n", pci_addr, errno, 722 strerror(errno)); 723 goto err_vfio_res; 724 } 725 726 /* chk for io port region */ 727 ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i); 728 if (ret < 0) { 729 free(reg); 730 goto err_vfio_res; 731 } else if (ret) { 732 RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n", 733 i); 734 free(reg); 735 continue; 736 } 737 738 /* skip non-mmapable BARs */ 739 if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) { 740 free(reg); 741 continue; 742 } 743 744 /* try mapping somewhere close to the end of hugepages */ 745 if (pci_map_addr == NULL) 746 pci_map_addr = pci_find_max_end_va(); 747 748 bar_addr = pci_map_addr; 749 pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size); 750 751 maps[i].addr = bar_addr; 752 maps[i].offset = reg->offset; 753 maps[i].size = reg->size; 754 maps[i].path = NULL; /* vfio doesn't have per-resource paths */ 755 756 ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); 757 if (ret < 0) { 758 RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", 759 pci_addr, i, strerror(errno)); 760 free(reg); 761 goto err_vfio_res; 762 } 763 764 dev->mem_resource[i].addr = maps[i].addr; 765 766 free(reg); 767 } 768 769 if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) { 770 RTE_LOG(ERR, EAL, " %s setup device failed\n", pci_addr); 771 goto err_vfio_res; 772 } 773 774 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 775 if (pci_vfio_enable_notifier(dev, vfio_dev_fd) != 0) { 776 RTE_LOG(ERR, EAL, "Error setting up notifier!\n"); 777 goto err_vfio_res; 778 } 779 780 #endif 781 TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next); 782 783 return 0; 784 err_vfio_res: 785 rte_free(vfio_res); 786 err_vfio_dev_fd: 787 close(vfio_dev_fd); 788 return -1; 789 } 790 791 static int 792 pci_vfio_map_resource_secondary(struct rte_pci_device *dev) 793 { 794 struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; 795 char pci_addr[PATH_MAX] = {0}; 796 int vfio_dev_fd; 797 struct rte_pci_addr *loc = &dev->addr; 798 int i, ret; 799 struct mapped_pci_resource *vfio_res = NULL; 800 struct mapped_pci_res_list *vfio_res_list = 801 RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); 802 803 struct pci_map *maps; 804 805 dev->intr_handle.fd = -1; 806 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 807 dev->vfio_req_intr_handle.fd = -1; 808 #endif 809 810 /* store PCI address string */ 811 snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, 812 loc->domain, loc->bus, loc->devid, loc->function); 813 814 /* if we're in a secondary process, just find our tailq entry */ 815 TAILQ_FOREACH(vfio_res, vfio_res_list, next) { 816 if (rte_pci_addr_cmp(&vfio_res->pci_addr, 817 &dev->addr)) 818 continue; 819 break; 820 } 821 /* if we haven't found our tailq entry, something's wrong */ 822 if (vfio_res == NULL) { 823 RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", 824 pci_addr); 825 return -1; 826 } 827 828 ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, 829 &vfio_dev_fd, &device_info); 830 if (ret) 831 return ret; 832 833 /* map BARs */ 834 maps = vfio_res->maps; 835 836 for (i = 0; i < (int) vfio_res->nb_maps; i++) { 837 ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED); 838 if (ret < 0) { 839 RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", 840 pci_addr, i, strerror(errno)); 841 goto err_vfio_dev_fd; 842 } 843 844 dev->mem_resource[i].addr = maps[i].addr; 845 } 846 847 /* we need save vfio_dev_fd, so it can be used during release */ 848 dev->intr_handle.vfio_dev_fd = vfio_dev_fd; 849 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 850 dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd; 851 #endif 852 853 return 0; 854 err_vfio_dev_fd: 855 close(vfio_dev_fd); 856 return -1; 857 } 858 859 /* 860 * map the PCI resources of a PCI device in virtual memory (VFIO version). 861 * primary and secondary processes follow almost exactly the same path 862 */ 863 int 864 pci_vfio_map_resource(struct rte_pci_device *dev) 865 { 866 if (rte_eal_process_type() == RTE_PROC_PRIMARY) 867 return pci_vfio_map_resource_primary(dev); 868 else 869 return pci_vfio_map_resource_secondary(dev); 870 } 871 872 static struct mapped_pci_resource * 873 find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list, 874 struct rte_pci_device *dev, 875 const char *pci_addr) 876 { 877 struct mapped_pci_resource *vfio_res = NULL; 878 struct pci_map *maps; 879 int i; 880 881 /* Get vfio_res */ 882 TAILQ_FOREACH(vfio_res, vfio_res_list, next) { 883 if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr)) 884 continue; 885 break; 886 } 887 888 if (vfio_res == NULL) 889 return vfio_res; 890 891 RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n", 892 pci_addr); 893 894 maps = vfio_res->maps; 895 for (i = 0; i < (int) vfio_res->nb_maps; i++) { 896 897 /* 898 * We do not need to be aware of MSI-X table BAR mappings as 899 * when mapping. Just using current maps array is enough 900 */ 901 if (maps[i].addr) { 902 RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n", 903 pci_addr, maps[i].addr); 904 pci_unmap_resource(maps[i].addr, maps[i].size); 905 } 906 } 907 908 return vfio_res; 909 } 910 911 static int 912 pci_vfio_unmap_resource_primary(struct rte_pci_device *dev) 913 { 914 char pci_addr[PATH_MAX] = {0}; 915 struct rte_pci_addr *loc = &dev->addr; 916 struct mapped_pci_resource *vfio_res = NULL; 917 struct mapped_pci_res_list *vfio_res_list; 918 int ret; 919 920 /* store PCI address string */ 921 snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, 922 loc->domain, loc->bus, loc->devid, loc->function); 923 924 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 925 ret = pci_vfio_disable_notifier(dev); 926 if (ret) { 927 RTE_LOG(ERR, EAL, "fail to disable req notifier.\n"); 928 return -1; 929 } 930 931 #endif 932 if (close(dev->intr_handle.fd) < 0) { 933 RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n", 934 pci_addr); 935 return -1; 936 } 937 938 if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) { 939 RTE_LOG(ERR, EAL, " %s cannot unset bus mastering for PCI device!\n", 940 pci_addr); 941 return -1; 942 } 943 944 ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr, 945 dev->intr_handle.vfio_dev_fd); 946 if (ret < 0) { 947 RTE_LOG(ERR, EAL, 948 "%s(): cannot release device\n", __func__); 949 return ret; 950 } 951 952 vfio_res_list = 953 RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); 954 vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr); 955 956 /* if we haven't found our tailq entry, something's wrong */ 957 if (vfio_res == NULL) { 958 RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", 959 pci_addr); 960 return -1; 961 } 962 963 TAILQ_REMOVE(vfio_res_list, vfio_res, next); 964 965 return 0; 966 } 967 968 static int 969 pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev) 970 { 971 char pci_addr[PATH_MAX] = {0}; 972 struct rte_pci_addr *loc = &dev->addr; 973 struct mapped_pci_resource *vfio_res = NULL; 974 struct mapped_pci_res_list *vfio_res_list; 975 int ret; 976 977 /* store PCI address string */ 978 snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, 979 loc->domain, loc->bus, loc->devid, loc->function); 980 981 ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr, 982 dev->intr_handle.vfio_dev_fd); 983 if (ret < 0) { 984 RTE_LOG(ERR, EAL, 985 "%s(): cannot release device\n", __func__); 986 return ret; 987 } 988 989 vfio_res_list = 990 RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); 991 vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr); 992 993 /* if we haven't found our tailq entry, something's wrong */ 994 if (vfio_res == NULL) { 995 RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", 996 pci_addr); 997 return -1; 998 } 999 1000 return 0; 1001 } 1002 1003 int 1004 pci_vfio_unmap_resource(struct rte_pci_device *dev) 1005 { 1006 if (rte_eal_process_type() == RTE_PROC_PRIMARY) 1007 return pci_vfio_unmap_resource_primary(dev); 1008 else 1009 return pci_vfio_unmap_resource_secondary(dev); 1010 } 1011 1012 int 1013 pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, 1014 struct rte_pci_ioport *p) 1015 { 1016 if (bar < VFIO_PCI_BAR0_REGION_INDEX || 1017 bar > VFIO_PCI_BAR5_REGION_INDEX) { 1018 RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar); 1019 return -1; 1020 } 1021 1022 p->dev = dev; 1023 p->base = VFIO_GET_REGION_ADDR(bar); 1024 return 0; 1025 } 1026 1027 void 1028 pci_vfio_ioport_read(struct rte_pci_ioport *p, 1029 void *data, size_t len, off_t offset) 1030 { 1031 const struct rte_intr_handle *intr_handle = &p->dev->intr_handle; 1032 1033 if (pread64(intr_handle->vfio_dev_fd, data, 1034 len, p->base + offset) <= 0) 1035 RTE_LOG(ERR, EAL, 1036 "Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n", 1037 VFIO_GET_REGION_IDX(p->base), (int)offset); 1038 } 1039 1040 void 1041 pci_vfio_ioport_write(struct rte_pci_ioport *p, 1042 const void *data, size_t len, off_t offset) 1043 { 1044 const struct rte_intr_handle *intr_handle = &p->dev->intr_handle; 1045 1046 if (pwrite64(intr_handle->vfio_dev_fd, data, 1047 len, p->base + offset) <= 0) 1048 RTE_LOG(ERR, EAL, 1049 "Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n", 1050 VFIO_GET_REGION_IDX(p->base), (int)offset); 1051 } 1052 1053 int 1054 pci_vfio_ioport_unmap(struct rte_pci_ioport *p) 1055 { 1056 RTE_SET_USED(p); 1057 return -1; 1058 } 1059 1060 int 1061 pci_vfio_is_enabled(void) 1062 { 1063 return rte_vfio_is_enabled("vfio_pci"); 1064 } 1065 #endif 1066