/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <string.h>
#include <fcntl.h>
#include <linux/pci_regs.h>
#include <sys/eventfd.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdbool.h>

#include <rte_log.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_eal_paging.h>
#include <rte_malloc.h>
#include <rte_vfio.h>
#include <rte_eal.h>
#include <rte_bus.h>
#include <rte_spinlock.h>
#include <rte_tailq.h>

#include "eal_filesystem.h"

#include "pci_init.h"
#include "private.h"

/**
 * @file
 * PCI probing under Linux (VFIO version)
 *
 * This code determines whether a PCI device is bound to the VFIO driver
 * and, if so, initializes it (maps BARs, sets up interrupts).
 */

#ifdef VFIO_PRESENT

#ifndef PAGE_SIZE
#define PAGE_SIZE (sysconf(_SC_PAGESIZE))
#endif
#define PAGE_MASK (~(PAGE_SIZE - 1))

static struct rte_tailq_elem rte_vfio_tailq = {
	.name = "VFIO_RESOURCE_LIST",
};
EAL_REGISTER_TAILQ(rte_vfio_tailq)

int
pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
		    void *buf, size_t len, off_t offs)
{
	return pread64(intr_handle->vfio_dev_fd, buf, len,
	       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
}

int
pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
		    const void *buf, size_t len, off_t offs)
{
	return pwrite64(intr_handle->vfio_dev_fd, buf, len,
	       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
}
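/*
 * Usage sketch for the two helpers above (illustrative only, not part of
 * this file): read the 16-bit vendor ID of a probed device through the
 * VFIO config space region. "dev" is assumed to be a struct
 * rte_pci_device * obtained from a PCI probe callback.
 *
 *	uint16_t vendor_id;
 *
 *	if (pci_vfio_read_config(&dev->intr_handle, &vendor_id,
 *			sizeof(vendor_id), PCI_VENDOR_ID) != sizeof(vendor_id))
 *		return -1;
 */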
/* get the PCI BAR number where MSI-X interrupts are located */
static int
pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
{
	int ret;
	uint32_t reg;
	uint16_t flags;
	uint8_t cap_id, cap_offset;

	/* read PCI capability pointer from config space */
	ret = pread64(fd, &reg, sizeof(reg),
			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
			PCI_CAPABILITY_LIST);
	if (ret != sizeof(reg)) {
		RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
				"config space!\n");
		return -1;
	}

	/* we need the first byte */
	cap_offset = reg & 0xFF;

	while (cap_offset) {

		/* read PCI capability ID */
		ret = pread64(fd, &reg, sizeof(reg),
				VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
				cap_offset);
		if (ret != sizeof(reg)) {
			RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI "
					"config space!\n");
			return -1;
		}

		/* we need the first byte */
		cap_id = reg & 0xFF;

		/* if we haven't reached MSI-X, check the next capability */
		if (cap_id != PCI_CAP_ID_MSIX) {
			ret = pread64(fd, &reg, sizeof(reg),
					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
					cap_offset);
			if (ret != sizeof(reg)) {
				RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
						"config space!\n");
				return -1;
			}

			/* we need the second byte */
			cap_offset = (reg & 0xFF00) >> 8;

			continue;
		}

		/* we have reached MSI-X: the table offset resides in the
		 * next 4 bytes
		 */
		ret = pread64(fd, &reg, sizeof(reg),
				VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
				cap_offset + 4);
		if (ret != sizeof(reg)) {
			RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config "
					"space!\n");
			return -1;
		}

		ret = pread64(fd, &flags, sizeof(flags),
				VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
				cap_offset + 2);
		if (ret != sizeof(flags)) {
			RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config "
					"space!\n");
			return -1;
		}

		msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
		msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
		msix_table->size =
			16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));

		return 0;
	}
	return 0;
}
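/*
 * For reference, the MSI-X capability layout walked above (per the PCI
 * specification):
 *
 *	cap_offset + 0:	capability ID (PCI_CAP_ID_MSIX) and next-capability
 *			pointer
 *	cap_offset + 2:	message control word; RTE_PCI_MSIX_FLAGS_QSIZE masks
 *			the table size minus one, hence the computation
 *			above: a QSIZE field of 63 means 64 vectors and a
 *			16 * (1 + 63) = 1024-byte table
 *	cap_offset + 4:	table offset/BIR dword; RTE_PCI_MSIX_TABLE_BIR masks
 *			the BAR index, RTE_PCI_MSIX_TABLE_OFFSET the offset
 *			of the table within that BAR
 */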
/* enable PCI bus memory space */
static int
pci_vfio_enable_bus_memory(int dev_fd)
{
	uint16_t cmd;
	int ret;

	ret = pread64(dev_fd, &cmd, sizeof(cmd),
		      VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
		      PCI_COMMAND);

	if (ret != sizeof(cmd)) {
		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
		return -1;
	}

	if (cmd & PCI_COMMAND_MEMORY)
		return 0;

	cmd |= PCI_COMMAND_MEMORY;
	ret = pwrite64(dev_fd, &cmd, sizeof(cmd),
		       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
		       PCI_COMMAND);

	if (ret != sizeof(cmd)) {
		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
		return -1;
	}

	return 0;
}

/* enable or disable PCI bus mastering */
static int
pci_vfio_set_bus_master(int dev_fd, bool op)
{
	uint16_t reg;
	int ret;

	ret = pread64(dev_fd, &reg, sizeof(reg),
			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
			PCI_COMMAND);
	if (ret != sizeof(reg)) {
		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
		return -1;
	}

	if (op)
		/* set the master bit */
		reg |= PCI_COMMAND_MASTER;
	else
		reg &= ~(PCI_COMMAND_MASTER);

	ret = pwrite64(dev_fd, &reg, sizeof(reg),
			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
			PCI_COMMAND);

	if (ret != sizeof(reg)) {
		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
		return -1;
	}

	return 0;
}
/* set up interrupt support (but do not enable interrupts) */
static int
pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
{
	int i, ret, intr_idx;
	enum rte_intr_mode intr_mode;

	/* default to invalid index */
	intr_idx = VFIO_PCI_NUM_IRQS;

	/* get default / configured intr_mode */
	intr_mode = rte_eal_vfio_intr_mode();

	/* get interrupt type from internal config (MSI-X by default, can be
	 * overridden from the command line)
	 */
	switch (intr_mode) {
	case RTE_INTR_MODE_MSIX:
		intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
		break;
	case RTE_INTR_MODE_MSI:
		intr_idx = VFIO_PCI_MSI_IRQ_INDEX;
		break;
	case RTE_INTR_MODE_LEGACY:
		intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
		break;
	/* don't do anything if we want to automatically determine interrupt type */
	case RTE_INTR_MODE_NONE:
		break;
	default:
		RTE_LOG(ERR, EAL, " unknown default interrupt type!\n");
		return -1;
	}

	/* start from the MSI-X interrupt type and work our way down */
	for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
		int fd = -1;

		/* skip interrupt modes we don't want */
		if (intr_mode != RTE_INTR_MODE_NONE &&
				i != intr_idx)
			continue;

		irq.index = i;

		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, " cannot get IRQ info, "
					"error %i (%s)\n", errno, strerror(errno));
			return -1;
		}

		/* if this vector cannot be used with eventfd, fail if we
		 * explicitly specified an interrupt type, otherwise continue
		 */
		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
			if (intr_mode != RTE_INTR_MODE_NONE) {
				RTE_LOG(ERR, EAL,
					" interrupt vector does not support eventfd!\n");
				return -1;
			} else
				continue;
		}

		/* set up an eventfd for interrupts */
		fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, " cannot set up eventfd, "
					"error %i (%s)\n", errno, strerror(errno));
			return -1;
		}

		dev->intr_handle.fd = fd;
		dev->intr_handle.vfio_dev_fd = vfio_dev_fd;

		switch (i) {
		case VFIO_PCI_MSIX_IRQ_INDEX:
			intr_mode = RTE_INTR_MODE_MSIX;
			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
			break;
		case VFIO_PCI_MSI_IRQ_INDEX:
			intr_mode = RTE_INTR_MODE_MSI;
			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
			break;
		case VFIO_PCI_INTX_IRQ_INDEX:
			intr_mode = RTE_INTR_MODE_LEGACY;
			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
			break;
		default:
			RTE_LOG(ERR, EAL, " unknown interrupt type!\n");
			return -1;
		}

		return 0;
	}

	/* if we're here, we haven't found a suitable interrupt vector */
	return -1;
}
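/*
 * Illustrative sketch (not part of this file): once the eventfd set up
 * above is enabled and wired to the device IRQ, interrupt events are
 * consumed from intr_handle.fd with a plain 8-byte read, e.g.:
 *
 *	uint64_t count;
 *
 *	if (read(dev->intr_handle.fd, &count, sizeof(count)) ==
 *			sizeof(count))
 *		handle_interrupt(dev);	(hypothetical handler)
 *
 * In DPDK this read is normally performed by the EAL interrupt thread on
 * behalf of registered callbacks, not by driver code directly.
 */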
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
/*
 * Spinlock for device hot-unplug failure handling.
 * Any code path that accesses the bus or a device (e.g. handling SIGBUS
 * on the bus, or handling a memory failure for a device) needs to take
 * this lock; it protects the bus and the device from race conditions.
 */
static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;

static void
pci_vfio_req_handler(void *param)
{
	struct rte_bus *bus;
	int ret;
	struct rte_device *device = (struct rte_device *)param;

	rte_spinlock_lock(&failure_handle_lock);
	bus = rte_bus_find_by_device(device);
	if (bus == NULL) {
		RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n",
			device->name);
		goto handle_end;
	}

	/*
	 * The VFIO kernel module requests that user space release the
	 * allocated resources before the device is deleted from the kernel,
	 * so we can directly call the VFIO bus hot-unplug handler here.
	 */
	ret = bus->hot_unplug_handler(device);
	if (ret)
		RTE_LOG(ERR, EAL,
			"Cannot handle hot-unplug for device (%s)\n",
			device->name);
handle_end:
	rte_spinlock_unlock(&failure_handle_lock);
}

/* enable notifiers (only the req notifier for now) */
static int
pci_vfio_enable_notifier(struct rte_pci_device *dev, int vfio_dev_fd)
{
	int ret;
	int fd = -1;

	/* set up an eventfd for the req notifier */
	fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
	if (fd < 0) {
		RTE_LOG(ERR, EAL, "Cannot set up eventfd, error %i (%s)\n",
			errno, strerror(errno));
		return -1;
	}

	dev->vfio_req_intr_handle.fd = fd;
	dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_VFIO_REQ;
	dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd;

	ret = rte_intr_callback_register(&dev->vfio_req_intr_handle,
					 pci_vfio_req_handler,
					 (void *)&dev->device);
	if (ret) {
		RTE_LOG(ERR, EAL, "Failed to register req notifier handler.\n");
		goto error;
	}

	ret = rte_intr_enable(&dev->vfio_req_intr_handle);
	if (ret) {
		RTE_LOG(ERR, EAL, "Failed to enable req notifier.\n");
		ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle,
						   pci_vfio_req_handler,
						   (void *)&dev->device);
		if (ret < 0)
			RTE_LOG(ERR, EAL,
				"Failed to unregister req notifier handler.\n");
		goto error;
	}

	return 0;
error:
	close(fd);

	dev->vfio_req_intr_handle.fd = -1;
	dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
	dev->vfio_req_intr_handle.vfio_dev_fd = -1;

	return -1;
}

/* disable notifiers (only the req notifier for now) */
static int
pci_vfio_disable_notifier(struct rte_pci_device *dev)
{
	int ret;

	ret = rte_intr_disable(&dev->vfio_req_intr_handle);
	if (ret) {
		RTE_LOG(ERR, EAL, "Failed to disable req notifier.\n");
		return -1;
	}

	ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle,
					   pci_vfio_req_handler,
					   (void *)&dev->device);
	if (ret < 0) {
		RTE_LOG(ERR, EAL,
			"Failed to unregister req notifier handler.\n");
		return -1;
	}

	close(dev->vfio_req_intr_handle.fd);

	dev->vfio_req_intr_handle.fd = -1;
	dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
	dev->vfio_req_intr_handle.vfio_dev_fd = -1;

	return 0;
}
#endif
/* check whether a BAR is an I/O port BAR */
static int
pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index)
{
	uint32_t ioport_bar;
	int ret;

	ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
		      VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
		      + PCI_BASE_ADDRESS_0 + bar_index*4);
	if (ret != sizeof(ioport_bar)) {
		RTE_LOG(ERR, EAL, "Cannot read BAR (offset 0x%x) from PCI config space!\n",
			PCI_BASE_ADDRESS_0 + bar_index*4);
		return -1;
	}

	return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0;
}
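/*
 * For reference, the check above relies on the PCI BAR register layout:
 * bit 0 of a BAR is hardwired to 1 for I/O port BARs and to 0 for memory
 * BARs (PCI_BASE_ADDRESS_SPACE_IO is that bit), so reading the raw BAR
 * dword from config space is enough to classify the region.
 */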
static int
pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
{
	if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
		RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
		return -1;
	}

	if (pci_vfio_enable_bus_memory(vfio_dev_fd)) {
		RTE_LOG(ERR, EAL, "Cannot enable bus memory!\n");
		return -1;
	}

	/* set bus mastering for the device */
	if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
		RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
		return -1;
	}

	/*
	 * Reset the device. If the device does not support reset,
	 * the ioctl fails and sets errno to EINVAL.
	 */
	if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
		RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n",
				errno, strerror(errno));
		return -1;
	}

	return 0;
}

static int
pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
		int bar_index, int additional_flags)
{
	struct memreg {
		uint64_t offset;
		size_t   size;
	} memreg[2] = {};
	void *bar_addr;
	struct pci_msix_table *msix_table = &vfio_res->msix_table;
	struct pci_map *bar = &vfio_res->maps[bar_index];

	if (bar->size == 0) {
		RTE_LOG(DEBUG, EAL, "BAR size is 0, skipping BAR%d\n", bar_index);
		return 0;
	}

	if (msix_table->bar_index == bar_index) {
		/*
		 * VFIO will not let us map the MSI-X table,
		 * but we can map around it.
		 */
		uint32_t table_start = msix_table->offset;
		uint32_t table_end = table_start + msix_table->size;
		table_end = RTE_ALIGN(table_end, PAGE_SIZE);
		table_start = RTE_ALIGN_FLOOR(table_start, PAGE_SIZE);

		/* If the page-aligned start of the MSI-X table is less than
		 * the actual MSI-X table start address, reassign to the
		 * actual start address.
		 */
		if (table_start < msix_table->offset)
			table_start = msix_table->offset;

		if (table_start == 0 && table_end >= bar->size) {
			/* Cannot map this BAR */
			RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index);
			bar->size = 0;
			bar->addr = 0;
			return 0;
		}

		memreg[0].offset = bar->offset;
		memreg[0].size = table_start;
		if (bar->size < table_end) {
			/*
			 * If the MSI-X table end is beyond the BAR end,
			 * don't attempt the second mapping.
			 */
			memreg[1].offset = 0;
			memreg[1].size = 0;
		} else {
			memreg[1].offset = bar->offset + table_end;
			memreg[1].size = bar->size - table_end;
		}

		RTE_LOG(DEBUG, EAL,
			"Trying to map BAR%d that contains the MSI-X "
			"table. Trying offsets: "
			"0x%04" PRIx64 ":0x%04zx, 0x%04" PRIx64 ":0x%04zx\n",
			bar_index,
			memreg[0].offset, memreg[0].size,
			memreg[1].offset, memreg[1].size);
	} else {
		memreg[0].offset = bar->offset;
		memreg[0].size = bar->size;
	}

	/* reserve the address range using an inaccessible mapping */
	bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
			MAP_ANONYMOUS | additional_flags, -1, 0);
	if (bar_addr != MAP_FAILED) {
		void *map_addr = NULL;
		if (memreg[0].size) {
			/* actual map of first part */
			map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
							memreg[0].offset,
							memreg[0].size,
							RTE_MAP_FORCE_ADDRESS);
		}

		/*
		 * Regarding "memreg[0].size == 0": if this BAR has an MSI-X
		 * table, memreg[0].size (the part before the table) can
		 * legitimately be 0 when the hardware places the vector
		 * table at offset 0 (i.e. the first part does not exist).
		 *
		 * When memreg[0].size is 0, mapping the first part never
		 * happens and map_addr is still NULL at this point, so
		 * check whether the first mapping was actually attempted.
		 */
		/* if there's a second part, try to map it */
		if ((map_addr != NULL || memreg[0].size == 0)
			&& memreg[1].offset && memreg[1].size) {
			void *second_addr = RTE_PTR_ADD(bar_addr,
						(uintptr_t)(memreg[1].offset -
						bar->offset));
			map_addr = pci_map_resource(second_addr,
							vfio_dev_fd,
							memreg[1].offset,
							memreg[1].size,
							RTE_MAP_FORCE_ADDRESS);
		}

		if (map_addr == NULL) {
			munmap(bar_addr, bar->size);
			bar_addr = MAP_FAILED;
			RTE_LOG(ERR, EAL, "Failed to map PCI BAR%d\n",
					bar_index);
			return -1;
		}
	} else {
		RTE_LOG(ERR, EAL,
				"Failed to create inaccessible mapping for BAR%d\n",
				bar_index);
		return -1;
	}

	bar->addr = bar_addr;
	return 0;
}
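/*
 * Worked example for the hole-punching above, with illustrative numbers:
 * a 16 KiB BAR at device offset 0, an MSI-X table at BAR offset 0x2000 of
 * size 0x100, and 4 KiB pages give
 *
 *	table_start = RTE_ALIGN_FLOOR(0x2000, 0x1000)   = 0x2000
 *	table_end   = RTE_ALIGN(0x2000 + 0x100, 0x1000) = 0x3000
 *
 * so memreg[0] covers [0x0000, 0x2000) and memreg[1] covers
 * [0x3000, 0x4000); the page containing the table itself stays unmapped.
 */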
/*
 * Region info may contain capability headers, so we need to keep
 * reallocating the memory until the allocated size matches argsz.
 */
static int
pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
		int region)
{
	struct vfio_region_info *ri;
	size_t argsz = sizeof(*ri);
	int ret;

	ri = malloc(sizeof(*ri));
	if (ri == NULL) {
		RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
		return -1;
	}
again:
	memset(ri, 0, argsz);
	ri->argsz = argsz;
	ri->index = region;

	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
	if (ret < 0) {
		free(ri);
		return ret;
	}
	if (ri->argsz != argsz) {
		struct vfio_region_info *tmp;

		argsz = ri->argsz;
		tmp = realloc(ri, argsz);

		if (tmp == NULL) {
			/* realloc failed, but ri is still allocated */
			free(ri);
			RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
			return -1;
		}
		ri = tmp;
		goto again;
	}
	*info = ri;

	return 0;
}

static struct vfio_info_cap_header *
pci_vfio_info_cap(struct vfio_region_info *info, int cap)
{
	struct vfio_info_cap_header *h;
	size_t offset;

	if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
		/* VFIO info does not advertise capabilities */
		return NULL;
	}

	/* walk the capability chain until we find the requested ID */
	offset = VFIO_CAP_OFFSET(info);
	while (offset != 0) {
		h = RTE_PTR_ADD(info, offset);
		if (h->id == cap)
			return h;
		offset = h->next;
	}
	return NULL;
}

static int
pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
{
	struct vfio_region_info *info;
	int ret;

	ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
	if (ret < 0)
		return -1;

	ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;

	/* cleanup */
	free(info);

	return ret;
}
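/*
 * Usage sketch for pci_vfio_get_region_info() (illustrative only): query
 * the size and flags of BAR0 and release the buffer afterwards. The
 * vfio_dev_fd is assumed to come from rte_vfio_setup_device().
 *
 *	struct vfio_region_info *info = NULL;
 *
 *	if (pci_vfio_get_region_info(vfio_dev_fd, &info,
 *			VFIO_PCI_BAR0_REGION_INDEX) < 0)
 *		return -1;
 *	if (info->flags & VFIO_REGION_INFO_FLAG_MMAP)
 *		map_the_bar(info->offset, info->size);	(hypothetical)
 *	free(info);
 */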
static int
pci_vfio_map_resource_primary(struct rte_pci_device *dev)
{
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	char pci_addr[PATH_MAX] = {0};
	int vfio_dev_fd;
	struct rte_pci_addr *loc = &dev->addr;
	int i, ret;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

	struct pci_map *maps;

	dev->intr_handle.fd = -1;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	dev->vfio_req_intr_handle.fd = -1;
#endif

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
					&vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	/* allocate vfio_res and get region info */
	vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL,
			"%s(): cannot store vfio mmap details\n", __func__);
		goto err_vfio_dev_fd;
	}
	memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));

	/* get number of regions (up to BAR5) */
	vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
			VFIO_PCI_BAR5_REGION_INDEX + 1);

	/* map BARs */
	maps = vfio_res->maps;

	vfio_res->msix_table.bar_index = -1;
	/* get the MSI-X BAR, if any (we have to know where it is because we
	 * cannot easily mmap it when using VFIO)
	 */
	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n",
				pci_addr);
		goto err_vfio_res;
	}
	/* if we found our MSI-X BAR region, check if we can mmap it */
	if (vfio_res->msix_table.bar_index != -1) {
		int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
				vfio_res->msix_table.bar_index);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
			goto err_vfio_res;
		} else if (ret != 0) {
			/* we can map it, so we don't care where it is */
			RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
			vfio_res->msix_table.bar_index = -1;
		}
	}

	for (i = 0; i < vfio_res->nb_maps; i++) {
		struct vfio_region_info *reg = NULL;
		void *bar_addr;

		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, " %s cannot get device region info "
					"error %i (%s)\n", pci_addr, errno,
					strerror(errno));
			goto err_vfio_res;
		}

		/* check for I/O port region */
		ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
		if (ret < 0) {
			free(reg);
			goto err_vfio_res;
		} else if (ret) {
			RTE_LOG(INFO, EAL, "Ignoring I/O port BAR (%d)\n",
					i);
			free(reg);
			continue;
		}

		/* skip non-mappable BARs */
		if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
			free(reg);
			continue;
		}

		/* try mapping somewhere close to the end of hugepages */
		if (pci_map_addr == NULL)
			pci_map_addr = pci_find_max_end_va();

		bar_addr = pci_map_addr;
		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);

		pci_map_addr = RTE_PTR_ALIGN(pci_map_addr,
					sysconf(_SC_PAGE_SIZE));

		maps[i].addr = bar_addr;
		maps[i].offset = reg->offset;
		maps[i].size = reg->size;
		maps[i].path = NULL; /* vfio doesn't have per-resource paths */

		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n",
					pci_addr, i, strerror(errno));
			free(reg);
			goto err_vfio_res;
		}

		dev->mem_resource[i].addr = maps[i].addr;

		free(reg);
	}

	if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
		RTE_LOG(ERR, EAL, " %s setup device failed\n", pci_addr);
		goto err_vfio_res;
	}

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	if (pci_vfio_enable_notifier(dev, vfio_dev_fd) != 0) {
		RTE_LOG(ERR, EAL, "Error setting up notifier!\n");
		goto err_vfio_res;
	}

#endif
	TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);

	return 0;
err_vfio_res:
	rte_free(vfio_res);
err_vfio_dev_fd:
	rte_vfio_release_device(rte_pci_get_sysfs_path(),
			pci_addr, vfio_dev_fd);
	return -1;
}

static int
pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
{
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	char pci_addr[PATH_MAX] = {0};
	int vfio_dev_fd;
	struct rte_pci_addr *loc = &dev->addr;
	int i, ret;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

	struct pci_map *maps;

	dev->intr_handle.fd = -1;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	dev->vfio_req_intr_handle.fd = -1;
#endif

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	/* we're in a secondary process, so just find our tailq entry */
	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
		if (rte_pci_addr_cmp(&vfio_res->pci_addr,
						 &dev->addr))
			continue;
		break;
	}
	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
				pci_addr);
		return -1;
	}

	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
					&vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	/* map BARs at the addresses recorded by the primary process, so
	 * that pointers into device memory remain valid across processes
	 */
	maps = vfio_res->maps;

	for (i = 0; i < vfio_res->nb_maps; i++) {
		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n",
					pci_addr, i, strerror(errno));
			goto err_vfio_dev_fd;
		}

		dev->mem_resource[i].addr = maps[i].addr;
	}

	/* we need to save vfio_dev_fd, so it can be used during release */
	dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd;
#endif

	return 0;
err_vfio_dev_fd:
	rte_vfio_release_device(rte_pci_get_sysfs_path(),
			pci_addr, vfio_dev_fd);
	return -1;
}

/*
 * Map the PCI resources of a PCI device in virtual memory (VFIO version).
 * Primary and secondary processes follow almost exactly the same path.
 */
int
pci_vfio_map_resource(struct rte_pci_device *dev)
{
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return pci_vfio_map_resource_primary(dev);
	else
		return pci_vfio_map_resource_secondary(dev);
}
static struct mapped_pci_resource *
find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
			struct rte_pci_device *dev,
			const char *pci_addr)
{
	struct mapped_pci_resource *vfio_res = NULL;
	struct pci_map *maps;
	int i;

	/* Get vfio_res */
	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
		if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr))
			continue;
		break;
	}

	if (vfio_res == NULL)
		return vfio_res;

	RTE_LOG(INFO, EAL, "Releasing PCI mapped resource for %s\n",
		pci_addr);

	maps = vfio_res->maps;
	for (i = 0; i < vfio_res->nb_maps; i++) {

		/*
		 * We do not need to be aware of MSI-X table BAR mappings
		 * here, as we were when mapping: using the current maps
		 * array is enough.
		 */
		if (maps[i].addr) {
			RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
				pci_addr, maps[i].addr);
			pci_unmap_resource(maps[i].addr, maps[i].size);
		}
	}

	return vfio_res;
}

static int
pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
{
	char pci_addr[PATH_MAX] = {0};
	struct rte_pci_addr *loc = &dev->addr;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list;
	int ret;

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	ret = pci_vfio_disable_notifier(dev);
	if (ret) {
		RTE_LOG(ERR, EAL, "Failed to disable req notifier.\n");
		return -1;
	}

#endif
	if (close(dev->intr_handle.fd) < 0) {
		RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n",
			pci_addr);
		return -1;
	}

	if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) {
		RTE_LOG(ERR, EAL, " %s cannot unset bus mastering for PCI device!\n",
				pci_addr);
		return -1;
	}

	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
				      dev->intr_handle.vfio_dev_fd);
	if (ret < 0) {
		RTE_LOG(ERR, EAL,
			"%s(): cannot release device\n", __func__);
		return ret;
	}

	vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);

	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
				pci_addr);
		return -1;
	}

	TAILQ_REMOVE(vfio_res_list, vfio_res, next);
	rte_free(vfio_res);
	return 0;
}

static int
pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev)
{
	char pci_addr[PATH_MAX] = {0};
	struct rte_pci_addr *loc = &dev->addr;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list;
	int ret;

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
				      dev->intr_handle.vfio_dev_fd);
	if (ret < 0) {
		RTE_LOG(ERR, EAL,
			"%s(): cannot release device\n", __func__);
		return ret;
	}

	vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);

	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
				pci_addr);
		return -1;
	}

	return 0;
}

int
pci_vfio_unmap_resource(struct rte_pci_device *dev)
{
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return pci_vfio_unmap_resource_primary(dev);
	else
		return pci_vfio_unmap_resource_secondary(dev);
}

int
pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
		    struct rte_pci_ioport *p)
{
	if (bar < VFIO_PCI_BAR0_REGION_INDEX ||
	    bar > VFIO_PCI_BAR5_REGION_INDEX) {
		RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar);
		return -1;
	}

	p->dev = dev;
	p->base = VFIO_GET_REGION_ADDR(bar);
	return 0;
}

void
pci_vfio_ioport_read(struct rte_pci_ioport *p,
		     void *data, size_t len, off_t offset)
{
	const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;

	if (pread64(intr_handle->vfio_dev_fd, data,
		    len, p->base + offset) <= 0)
		RTE_LOG(ERR, EAL,
			"Cannot read from PCI BAR (%" PRIu64 ") : offset (%x)\n",
			VFIO_GET_REGION_IDX(p->base), (int)offset);
}

void
pci_vfio_ioport_write(struct rte_pci_ioport *p,
		      const void *data, size_t len, off_t offset)
{
	const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;

	if (pwrite64(intr_handle->vfio_dev_fd, data,
		     len, p->base + offset) <= 0)
		RTE_LOG(ERR, EAL,
			"Cannot write to PCI BAR (%" PRIu64 ") : offset (%x)\n",
			VFIO_GET_REGION_IDX(p->base), (int)offset);
}

int
pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
{
	RTE_SET_USED(p);
	return -1;
}

int
pci_vfio_is_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}
#endif