1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2014 Intel Corporation. 3 * Copyright 2013-2014 6WIND S.A. 4 */ 5 6 #include <string.h> 7 #include <inttypes.h> 8 #include <stdint.h> 9 #include <stdbool.h> 10 #include <stdlib.h> 11 #include <stdio.h> 12 #include <sys/queue.h> 13 #include <sys/mman.h> 14 15 #include <rte_errno.h> 16 #include <rte_interrupts.h> 17 #include <rte_log.h> 18 #include <rte_bus.h> 19 #include <rte_pci.h> 20 #include <rte_bus_pci.h> 21 #include <rte_per_lcore.h> 22 #include <rte_memory.h> 23 #include <rte_eal.h> 24 #include <rte_string_fns.h> 25 #include <rte_common.h> 26 #include <rte_devargs.h> 27 #include <rte_vfio.h> 28 29 #include "private.h" 30 31 32 #define SYSFS_PCI_DEVICES "/sys/bus/pci/devices" 33 34 const char *rte_pci_get_sysfs_path(void) 35 { 36 const char *path = NULL; 37 38 path = getenv("SYSFS_PCI_DEVICES"); 39 if (path == NULL) 40 return SYSFS_PCI_DEVICES; 41 42 return path; 43 } 44 45 static struct rte_devargs * 46 pci_devargs_lookup(const struct rte_pci_addr *pci_addr) 47 { 48 struct rte_devargs *devargs; 49 struct rte_pci_addr addr; 50 51 RTE_EAL_DEVARGS_FOREACH("pci", devargs) { 52 devargs->bus->parse(devargs->name, &addr); 53 if (!rte_pci_addr_cmp(pci_addr, &addr)) 54 return devargs; 55 } 56 return NULL; 57 } 58 59 void 60 pci_name_set(struct rte_pci_device *dev) 61 { 62 struct rte_devargs *devargs; 63 64 /* Each device has its internal, canonical name set. */ 65 rte_pci_device_name(&dev->addr, 66 dev->name, sizeof(dev->name)); 67 devargs = pci_devargs_lookup(&dev->addr); 68 dev->device.devargs = devargs; 69 /* In blacklist mode, if the device is not blacklisted, no 70 * rte_devargs exists for it. 71 */ 72 if (devargs != NULL) 73 /* If an rte_devargs exists, the generic rte_device uses the 74 * given name as its name. 75 */ 76 dev->device.name = dev->device.devargs->name; 77 else 78 /* Otherwise, it uses the internal, canonical form. */ 79 dev->device.name = dev->name; 80 } 81 82 /* 83 * Match the PCI Driver and Device using the ID Table 84 */ 85 int 86 rte_pci_match(const struct rte_pci_driver *pci_drv, 87 const struct rte_pci_device *pci_dev) 88 { 89 const struct rte_pci_id *id_table; 90 91 for (id_table = pci_drv->id_table; id_table->vendor_id != 0; 92 id_table++) { 93 /* check if device's identifiers match the driver's ones */ 94 if (id_table->vendor_id != pci_dev->id.vendor_id && 95 id_table->vendor_id != PCI_ANY_ID) 96 continue; 97 if (id_table->device_id != pci_dev->id.device_id && 98 id_table->device_id != PCI_ANY_ID) 99 continue; 100 if (id_table->subsystem_vendor_id != 101 pci_dev->id.subsystem_vendor_id && 102 id_table->subsystem_vendor_id != PCI_ANY_ID) 103 continue; 104 if (id_table->subsystem_device_id != 105 pci_dev->id.subsystem_device_id && 106 id_table->subsystem_device_id != PCI_ANY_ID) 107 continue; 108 if (id_table->class_id != pci_dev->id.class_id && 109 id_table->class_id != RTE_CLASS_ANY_ID) 110 continue; 111 112 return 1; 113 } 114 115 return 0; 116 } 117 118 /* 119 * If vendor/device ID match, call the probe() function of the 120 * driver. 121 */ 122 static int 123 rte_pci_probe_one_driver(struct rte_pci_driver *dr, 124 struct rte_pci_device *dev) 125 { 126 int ret; 127 bool already_probed; 128 struct rte_pci_addr *loc; 129 130 if ((dr == NULL) || (dev == NULL)) 131 return -EINVAL; 132 133 loc = &dev->addr; 134 135 /* The device is not blacklisted; Check if driver supports it */ 136 if (!rte_pci_match(dr, dev)) 137 /* Match of device and driver failed */ 138 return 1; 139 140 RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n", 141 loc->domain, loc->bus, loc->devid, loc->function, 142 dev->device.numa_node); 143 144 /* no initialization when blacklisted, return without error */ 145 if (dev->device.devargs != NULL && 146 dev->device.devargs->policy == 147 RTE_DEV_BLACKLISTED) { 148 RTE_LOG(INFO, EAL, " Device is blacklisted, not" 149 " initializing\n"); 150 return 1; 151 } 152 153 if (dev->device.numa_node < 0) { 154 RTE_LOG(WARNING, EAL, " Invalid NUMA socket, default to 0\n"); 155 dev->device.numa_node = 0; 156 } 157 158 already_probed = rte_dev_is_probed(&dev->device); 159 if (already_probed && !(dr->drv_flags & RTE_PCI_DRV_PROBE_AGAIN)) { 160 RTE_LOG(DEBUG, EAL, "Device %s is already probed\n", 161 dev->device.name); 162 return -EEXIST; 163 } 164 165 RTE_LOG(DEBUG, EAL, " probe driver: %x:%x %s\n", dev->id.vendor_id, 166 dev->id.device_id, dr->driver.name); 167 168 /* 169 * reference driver structure 170 * This needs to be before rte_pci_map_device(), as it enables to use 171 * driver flags for adjusting configuration. 172 */ 173 if (!already_probed) { 174 enum rte_iova_mode dev_iova_mode; 175 enum rte_iova_mode iova_mode; 176 177 dev_iova_mode = pci_device_iova_mode(dr, dev); 178 iova_mode = rte_eal_iova_mode(); 179 if (dev_iova_mode != RTE_IOVA_DC && 180 dev_iova_mode != iova_mode) { 181 RTE_LOG(ERR, EAL, " Expecting '%s' IOVA mode but current mode is '%s', not initializing\n", 182 dev_iova_mode == RTE_IOVA_PA ? "PA" : "VA", 183 iova_mode == RTE_IOVA_PA ? "PA" : "VA"); 184 return -EINVAL; 185 } 186 187 dev->driver = dr; 188 } 189 190 if (!already_probed && (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING)) { 191 /* map resources for devices that use igb_uio */ 192 ret = rte_pci_map_device(dev); 193 if (ret != 0) { 194 dev->driver = NULL; 195 return ret; 196 } 197 } 198 199 RTE_LOG(INFO, EAL, "Probe PCI driver: %s (%x:%x) device: "PCI_PRI_FMT" (socket %i)\n", 200 dr->driver.name, dev->id.vendor_id, dev->id.device_id, 201 loc->domain, loc->bus, loc->devid, loc->function, 202 dev->device.numa_node); 203 /* call the driver probe() function */ 204 ret = dr->probe(dr, dev); 205 if (already_probed) 206 return ret; /* no rollback if already succeeded earlier */ 207 if (ret) { 208 dev->driver = NULL; 209 if ((dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING) && 210 /* Don't unmap if device is unsupported and 211 * driver needs mapped resources. 212 */ 213 !(ret > 0 && 214 (dr->drv_flags & RTE_PCI_DRV_KEEP_MAPPED_RES))) 215 rte_pci_unmap_device(dev); 216 } else { 217 dev->device.driver = &dr->driver; 218 } 219 220 return ret; 221 } 222 223 /* 224 * If vendor/device ID match, call the remove() function of the 225 * driver. 226 */ 227 static int 228 rte_pci_detach_dev(struct rte_pci_device *dev) 229 { 230 struct rte_pci_addr *loc; 231 struct rte_pci_driver *dr; 232 int ret = 0; 233 234 if (dev == NULL) 235 return -EINVAL; 236 237 dr = dev->driver; 238 loc = &dev->addr; 239 240 RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n", 241 loc->domain, loc->bus, loc->devid, 242 loc->function, dev->device.numa_node); 243 244 RTE_LOG(DEBUG, EAL, " remove driver: %x:%x %s\n", dev->id.vendor_id, 245 dev->id.device_id, dr->driver.name); 246 247 if (dr->remove) { 248 ret = dr->remove(dev); 249 if (ret < 0) 250 return ret; 251 } 252 253 /* clear driver structure */ 254 dev->driver = NULL; 255 dev->device.driver = NULL; 256 257 if (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING) 258 /* unmap resources for devices that use igb_uio */ 259 rte_pci_unmap_device(dev); 260 261 return 0; 262 } 263 264 /* 265 * If vendor/device ID match, call the probe() function of all 266 * registered driver for the given device. Return < 0 if initialization 267 * failed, return 1 if no driver is found for this device. 268 */ 269 static int 270 pci_probe_all_drivers(struct rte_pci_device *dev) 271 { 272 struct rte_pci_driver *dr = NULL; 273 int rc = 0; 274 275 if (dev == NULL) 276 return -EINVAL; 277 278 FOREACH_DRIVER_ON_PCIBUS(dr) { 279 rc = rte_pci_probe_one_driver(dr, dev); 280 if (rc < 0) 281 /* negative value is an error */ 282 return rc; 283 if (rc > 0) 284 /* positive value means driver doesn't support it */ 285 continue; 286 return 0; 287 } 288 return 1; 289 } 290 291 /* 292 * Scan the content of the PCI bus, and call the probe() function for 293 * all registered drivers that have a matching entry in its id_table 294 * for discovered devices. 295 */ 296 static int 297 pci_probe(void) 298 { 299 struct rte_pci_device *dev = NULL; 300 size_t probed = 0, failed = 0; 301 int ret = 0; 302 303 FOREACH_DEVICE_ON_PCIBUS(dev) { 304 probed++; 305 306 ret = pci_probe_all_drivers(dev); 307 if (ret < 0) { 308 if (ret != -EEXIST) { 309 RTE_LOG(ERR, EAL, "Requested device " 310 PCI_PRI_FMT " cannot be used\n", 311 dev->addr.domain, dev->addr.bus, 312 dev->addr.devid, dev->addr.function); 313 rte_errno = errno; 314 failed++; 315 } 316 ret = 0; 317 } 318 } 319 320 return (probed && probed == failed) ? -1 : 0; 321 } 322 323 /* dump one device */ 324 static int 325 pci_dump_one_device(FILE *f, struct rte_pci_device *dev) 326 { 327 int i; 328 329 fprintf(f, PCI_PRI_FMT, dev->addr.domain, dev->addr.bus, 330 dev->addr.devid, dev->addr.function); 331 fprintf(f, " - vendor:%x device:%x\n", dev->id.vendor_id, 332 dev->id.device_id); 333 334 for (i = 0; i != sizeof(dev->mem_resource) / 335 sizeof(dev->mem_resource[0]); i++) { 336 fprintf(f, " %16.16"PRIx64" %16.16"PRIx64"\n", 337 dev->mem_resource[i].phys_addr, 338 dev->mem_resource[i].len); 339 } 340 return 0; 341 } 342 343 /* dump devices on the bus */ 344 void 345 rte_pci_dump(FILE *f) 346 { 347 struct rte_pci_device *dev = NULL; 348 349 FOREACH_DEVICE_ON_PCIBUS(dev) { 350 pci_dump_one_device(f, dev); 351 } 352 } 353 354 static int 355 pci_parse(const char *name, void *addr) 356 { 357 struct rte_pci_addr *out = addr; 358 struct rte_pci_addr pci_addr; 359 bool parse; 360 361 parse = (rte_pci_addr_parse(name, &pci_addr) == 0); 362 if (parse && addr != NULL) 363 *out = pci_addr; 364 return parse == false; 365 } 366 367 /* register a driver */ 368 void 369 rte_pci_register(struct rte_pci_driver *driver) 370 { 371 TAILQ_INSERT_TAIL(&rte_pci_bus.driver_list, driver, next); 372 driver->bus = &rte_pci_bus; 373 } 374 375 /* unregister a driver */ 376 void 377 rte_pci_unregister(struct rte_pci_driver *driver) 378 { 379 TAILQ_REMOVE(&rte_pci_bus.driver_list, driver, next); 380 driver->bus = NULL; 381 } 382 383 /* Add a device to PCI bus */ 384 void 385 rte_pci_add_device(struct rte_pci_device *pci_dev) 386 { 387 TAILQ_INSERT_TAIL(&rte_pci_bus.device_list, pci_dev, next); 388 } 389 390 /* Insert a device into a predefined position in PCI bus */ 391 void 392 rte_pci_insert_device(struct rte_pci_device *exist_pci_dev, 393 struct rte_pci_device *new_pci_dev) 394 { 395 TAILQ_INSERT_BEFORE(exist_pci_dev, new_pci_dev, next); 396 } 397 398 /* Remove a device from PCI bus */ 399 static void 400 rte_pci_remove_device(struct rte_pci_device *pci_dev) 401 { 402 TAILQ_REMOVE(&rte_pci_bus.device_list, pci_dev, next); 403 } 404 405 static struct rte_device * 406 pci_find_device(const struct rte_device *start, rte_dev_cmp_t cmp, 407 const void *data) 408 { 409 const struct rte_pci_device *pstart; 410 struct rte_pci_device *pdev; 411 412 if (start != NULL) { 413 pstart = RTE_DEV_TO_PCI_CONST(start); 414 pdev = TAILQ_NEXT(pstart, next); 415 } else { 416 pdev = TAILQ_FIRST(&rte_pci_bus.device_list); 417 } 418 while (pdev != NULL) { 419 if (cmp(&pdev->device, data) == 0) 420 return &pdev->device; 421 pdev = TAILQ_NEXT(pdev, next); 422 } 423 return NULL; 424 } 425 426 /* 427 * find the device which encounter the failure, by iterate over all device on 428 * PCI bus to check if the memory failure address is located in the range 429 * of the BARs of the device. 430 */ 431 static struct rte_pci_device * 432 pci_find_device_by_addr(const void *failure_addr) 433 { 434 struct rte_pci_device *pdev = NULL; 435 uint64_t check_point, start, end, len; 436 int i; 437 438 check_point = (uint64_t)(uintptr_t)failure_addr; 439 440 FOREACH_DEVICE_ON_PCIBUS(pdev) { 441 for (i = 0; i != RTE_DIM(pdev->mem_resource); i++) { 442 start = (uint64_t)(uintptr_t)pdev->mem_resource[i].addr; 443 len = pdev->mem_resource[i].len; 444 end = start + len; 445 if (check_point >= start && check_point < end) { 446 RTE_LOG(DEBUG, EAL, "Failure address %16.16" 447 PRIx64" belongs to device %s!\n", 448 check_point, pdev->device.name); 449 return pdev; 450 } 451 } 452 } 453 return NULL; 454 } 455 456 static int 457 pci_hot_unplug_handler(struct rte_device *dev) 458 { 459 struct rte_pci_device *pdev = NULL; 460 int ret = 0; 461 462 pdev = RTE_DEV_TO_PCI(dev); 463 if (!pdev) 464 return -1; 465 466 switch (pdev->kdrv) { 467 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 468 case RTE_KDRV_VFIO: 469 /* 470 * vfio kernel module guaranty the pci device would not be 471 * deleted until the user space release the resource, so no 472 * need to remap BARs resource here, just directly notify 473 * the req event to the user space to handle it. 474 */ 475 rte_dev_event_callback_process(dev->name, 476 RTE_DEV_EVENT_REMOVE); 477 break; 478 #endif 479 case RTE_KDRV_IGB_UIO: 480 case RTE_KDRV_UIO_GENERIC: 481 case RTE_KDRV_NIC_UIO: 482 /* BARs resource is invalid, remap it to be safe. */ 483 ret = pci_uio_remap_resource(pdev); 484 break; 485 default: 486 RTE_LOG(DEBUG, EAL, 487 "Not managed by a supported kernel driver, skipped\n"); 488 ret = -1; 489 break; 490 } 491 492 return ret; 493 } 494 495 static int 496 pci_sigbus_handler(const void *failure_addr) 497 { 498 struct rte_pci_device *pdev = NULL; 499 int ret = 0; 500 501 pdev = pci_find_device_by_addr(failure_addr); 502 if (!pdev) { 503 /* It is a generic sigbus error, no bus would handle it. */ 504 ret = 1; 505 } else { 506 /* The sigbus error is caused of hot-unplug. */ 507 ret = pci_hot_unplug_handler(&pdev->device); 508 if (ret) { 509 RTE_LOG(ERR, EAL, 510 "Failed to handle hot-unplug for device %s", 511 pdev->name); 512 ret = -1; 513 } 514 } 515 return ret; 516 } 517 518 static int 519 pci_plug(struct rte_device *dev) 520 { 521 return pci_probe_all_drivers(RTE_DEV_TO_PCI(dev)); 522 } 523 524 static int 525 pci_unplug(struct rte_device *dev) 526 { 527 struct rte_pci_device *pdev; 528 int ret; 529 530 pdev = RTE_DEV_TO_PCI(dev); 531 ret = rte_pci_detach_dev(pdev); 532 if (ret == 0) { 533 rte_pci_remove_device(pdev); 534 rte_devargs_remove(dev->devargs); 535 free(pdev); 536 } 537 return ret; 538 } 539 540 static int 541 pci_dma_map(struct rte_device *dev, void *addr, uint64_t iova, size_t len) 542 { 543 struct rte_pci_device *pdev = RTE_DEV_TO_PCI(dev); 544 545 if (!pdev || !pdev->driver) { 546 rte_errno = EINVAL; 547 return -1; 548 } 549 if (pdev->driver->dma_map) 550 return pdev->driver->dma_map(pdev, addr, iova, len); 551 /** 552 * In case driver don't provides any specific mapping 553 * try fallback to VFIO. 554 */ 555 if (pdev->kdrv == RTE_KDRV_VFIO) 556 return rte_vfio_container_dma_map 557 (RTE_VFIO_DEFAULT_CONTAINER_FD, (uintptr_t)addr, 558 iova, len); 559 rte_errno = ENOTSUP; 560 return -1; 561 } 562 563 static int 564 pci_dma_unmap(struct rte_device *dev, void *addr, uint64_t iova, size_t len) 565 { 566 struct rte_pci_device *pdev = RTE_DEV_TO_PCI(dev); 567 568 if (!pdev || !pdev->driver) { 569 rte_errno = EINVAL; 570 return -1; 571 } 572 if (pdev->driver->dma_unmap) 573 return pdev->driver->dma_unmap(pdev, addr, iova, len); 574 /** 575 * In case driver don't provides any specific mapping 576 * try fallback to VFIO. 577 */ 578 if (pdev->kdrv == RTE_KDRV_VFIO) 579 return rte_vfio_container_dma_unmap 580 (RTE_VFIO_DEFAULT_CONTAINER_FD, (uintptr_t)addr, 581 iova, len); 582 rte_errno = ENOTSUP; 583 return -1; 584 } 585 586 bool 587 rte_pci_ignore_device(const struct rte_pci_addr *pci_addr) 588 { 589 struct rte_devargs *devargs = pci_devargs_lookup(pci_addr); 590 591 switch (rte_pci_bus.bus.conf.scan_mode) { 592 case RTE_BUS_SCAN_WHITELIST: 593 if (devargs && devargs->policy == RTE_DEV_WHITELISTED) 594 return false; 595 break; 596 case RTE_BUS_SCAN_UNDEFINED: 597 case RTE_BUS_SCAN_BLACKLIST: 598 if (devargs == NULL || 599 devargs->policy != RTE_DEV_BLACKLISTED) 600 return false; 601 break; 602 } 603 return true; 604 } 605 606 enum rte_iova_mode 607 rte_pci_get_iommu_class(void) 608 { 609 enum rte_iova_mode iova_mode = RTE_IOVA_DC; 610 const struct rte_pci_device *dev; 611 const struct rte_pci_driver *drv; 612 bool devices_want_va = false; 613 bool devices_want_pa = false; 614 int iommu_no_va = -1; 615 616 FOREACH_DEVICE_ON_PCIBUS(dev) { 617 /* 618 * We can check this only once, because the IOMMU hardware is 619 * the same for all of them. 620 */ 621 if (iommu_no_va == -1) 622 iommu_no_va = pci_device_iommu_support_va(dev) 623 ? 0 : 1; 624 625 if (dev->kdrv == RTE_KDRV_UNKNOWN || 626 dev->kdrv == RTE_KDRV_NONE) 627 continue; 628 FOREACH_DRIVER_ON_PCIBUS(drv) { 629 enum rte_iova_mode dev_iova_mode; 630 631 if (!rte_pci_match(drv, dev)) 632 continue; 633 634 dev_iova_mode = pci_device_iova_mode(drv, dev); 635 RTE_LOG(DEBUG, EAL, "PCI driver %s for device " 636 PCI_PRI_FMT " wants IOVA as '%s'\n", 637 drv->driver.name, 638 dev->addr.domain, dev->addr.bus, 639 dev->addr.devid, dev->addr.function, 640 dev_iova_mode == RTE_IOVA_DC ? "DC" : 641 (dev_iova_mode == RTE_IOVA_PA ? "PA" : "VA")); 642 if (dev_iova_mode == RTE_IOVA_PA) 643 devices_want_pa = true; 644 else if (dev_iova_mode == RTE_IOVA_VA) 645 devices_want_va = true; 646 } 647 } 648 if (iommu_no_va == 1) { 649 iova_mode = RTE_IOVA_PA; 650 if (devices_want_va) { 651 RTE_LOG(WARNING, EAL, "Some devices want 'VA' but IOMMU does not support 'VA'.\n"); 652 RTE_LOG(WARNING, EAL, "The devices that want 'VA' won't initialize.\n"); 653 } 654 } else if (devices_want_va && !devices_want_pa) { 655 iova_mode = RTE_IOVA_VA; 656 } else if (devices_want_pa && !devices_want_va) { 657 iova_mode = RTE_IOVA_PA; 658 } else { 659 iova_mode = RTE_IOVA_DC; 660 if (devices_want_va) { 661 RTE_LOG(WARNING, EAL, "Some devices want 'VA' but forcing 'DC' because other devices want 'PA'.\n"); 662 RTE_LOG(WARNING, EAL, "Depending on the final decision by the EAL, not all devices may be able to initialize.\n"); 663 } 664 } 665 return iova_mode; 666 } 667 668 struct rte_pci_bus rte_pci_bus = { 669 .bus = { 670 .scan = rte_pci_scan, 671 .probe = pci_probe, 672 .find_device = pci_find_device, 673 .plug = pci_plug, 674 .unplug = pci_unplug, 675 .parse = pci_parse, 676 .dma_map = pci_dma_map, 677 .dma_unmap = pci_dma_unmap, 678 .get_iommu_class = rte_pci_get_iommu_class, 679 .dev_iterate = rte_pci_dev_iterate, 680 .hot_unplug_handler = pci_hot_unplug_handler, 681 .sigbus_handler = pci_sigbus_handler, 682 }, 683 .device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list), 684 .driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list), 685 }; 686 687 RTE_REGISTER_BUS(pci, rte_pci_bus.bus); 688