1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2014 Intel Corporation. 3 * Copyright 2013-2014 6WIND S.A. 4 */ 5 6 #include <string.h> 7 #include <inttypes.h> 8 #include <stdint.h> 9 #include <stdbool.h> 10 #include <stdlib.h> 11 #include <stdio.h> 12 #include <sys/queue.h> 13 #include <sys/mman.h> 14 15 #include <rte_errno.h> 16 #include <rte_interrupts.h> 17 #include <rte_log.h> 18 #include <rte_bus.h> 19 #include <rte_pci.h> 20 #include <rte_bus_pci.h> 21 #include <rte_per_lcore.h> 22 #include <rte_memory.h> 23 #include <rte_eal.h> 24 #include <rte_string_fns.h> 25 #include <rte_common.h> 26 #include <rte_devargs.h> 27 #include <rte_vfio.h> 28 29 #include "private.h" 30 31 32 #define SYSFS_PCI_DEVICES "/sys/bus/pci/devices" 33 34 const char *rte_pci_get_sysfs_path(void) 35 { 36 const char *path = NULL; 37 38 path = getenv("SYSFS_PCI_DEVICES"); 39 if (path == NULL) 40 return SYSFS_PCI_DEVICES; 41 42 return path; 43 } 44 45 static struct rte_devargs *pci_devargs_lookup(struct rte_pci_device *dev) 46 { 47 struct rte_devargs *devargs; 48 struct rte_pci_addr addr; 49 50 RTE_EAL_DEVARGS_FOREACH("pci", devargs) { 51 devargs->bus->parse(devargs->name, &addr); 52 if (!rte_pci_addr_cmp(&dev->addr, &addr)) 53 return devargs; 54 } 55 return NULL; 56 } 57 58 void 59 pci_name_set(struct rte_pci_device *dev) 60 { 61 struct rte_devargs *devargs; 62 63 /* Each device has its internal, canonical name set. */ 64 rte_pci_device_name(&dev->addr, 65 dev->name, sizeof(dev->name)); 66 devargs = pci_devargs_lookup(dev); 67 dev->device.devargs = devargs; 68 /* In blacklist mode, if the device is not blacklisted, no 69 * rte_devargs exists for it. 70 */ 71 if (devargs != NULL) 72 /* If an rte_devargs exists, the generic rte_device uses the 73 * given name as its name. 74 */ 75 dev->device.name = dev->device.devargs->name; 76 else 77 /* Otherwise, it uses the internal, canonical form. */ 78 dev->device.name = dev->name; 79 } 80 81 /* 82 * Match the PCI Driver and Device using the ID Table 83 */ 84 int 85 rte_pci_match(const struct rte_pci_driver *pci_drv, 86 const struct rte_pci_device *pci_dev) 87 { 88 const struct rte_pci_id *id_table; 89 90 for (id_table = pci_drv->id_table; id_table->vendor_id != 0; 91 id_table++) { 92 /* check if device's identifiers match the driver's ones */ 93 if (id_table->vendor_id != pci_dev->id.vendor_id && 94 id_table->vendor_id != PCI_ANY_ID) 95 continue; 96 if (id_table->device_id != pci_dev->id.device_id && 97 id_table->device_id != PCI_ANY_ID) 98 continue; 99 if (id_table->subsystem_vendor_id != 100 pci_dev->id.subsystem_vendor_id && 101 id_table->subsystem_vendor_id != PCI_ANY_ID) 102 continue; 103 if (id_table->subsystem_device_id != 104 pci_dev->id.subsystem_device_id && 105 id_table->subsystem_device_id != PCI_ANY_ID) 106 continue; 107 if (id_table->class_id != pci_dev->id.class_id && 108 id_table->class_id != RTE_CLASS_ANY_ID) 109 continue; 110 111 return 1; 112 } 113 114 return 0; 115 } 116 117 /* 118 * If vendor/device ID match, call the probe() function of the 119 * driver. 120 */ 121 static int 122 rte_pci_probe_one_driver(struct rte_pci_driver *dr, 123 struct rte_pci_device *dev) 124 { 125 int ret; 126 bool already_probed; 127 struct rte_pci_addr *loc; 128 129 if ((dr == NULL) || (dev == NULL)) 130 return -EINVAL; 131 132 loc = &dev->addr; 133 134 /* The device is not blacklisted; Check if driver supports it */ 135 if (!rte_pci_match(dr, dev)) 136 /* Match of device and driver failed */ 137 return 1; 138 139 RTE_LOG(INFO, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n", 140 loc->domain, loc->bus, loc->devid, loc->function, 141 dev->device.numa_node); 142 143 /* no initialization when blacklisted, return without error */ 144 if (dev->device.devargs != NULL && 145 dev->device.devargs->policy == 146 RTE_DEV_BLACKLISTED) { 147 RTE_LOG(INFO, EAL, " Device is blacklisted, not" 148 " initializing\n"); 149 return 1; 150 } 151 152 if (dev->device.numa_node < 0) { 153 RTE_LOG(WARNING, EAL, " Invalid NUMA socket, default to 0\n"); 154 dev->device.numa_node = 0; 155 } 156 157 already_probed = rte_dev_is_probed(&dev->device); 158 if (already_probed && !(dr->drv_flags & RTE_PCI_DRV_PROBE_AGAIN)) { 159 RTE_LOG(DEBUG, EAL, "Device %s is already probed\n", 160 dev->device.name); 161 return -EEXIST; 162 } 163 164 RTE_LOG(INFO, EAL, " probe driver: %x:%x %s\n", dev->id.vendor_id, 165 dev->id.device_id, dr->driver.name); 166 167 /* 168 * reference driver structure 169 * This needs to be before rte_pci_map_device(), as it enables to use 170 * driver flags for adjusting configuration. 171 */ 172 if (!already_probed) { 173 enum rte_iova_mode dev_iova_mode; 174 enum rte_iova_mode iova_mode; 175 176 dev_iova_mode = pci_device_iova_mode(dr, dev); 177 iova_mode = rte_eal_iova_mode(); 178 if (dev_iova_mode != RTE_IOVA_DC && 179 dev_iova_mode != iova_mode) { 180 RTE_LOG(ERR, EAL, " Expecting '%s' IOVA mode but current mode is '%s', not initializing\n", 181 dev_iova_mode == RTE_IOVA_PA ? "PA" : "VA", 182 iova_mode == RTE_IOVA_PA ? "PA" : "VA"); 183 return -EINVAL; 184 } 185 186 dev->driver = dr; 187 } 188 189 if (!already_probed && (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING)) { 190 /* map resources for devices that use igb_uio */ 191 ret = rte_pci_map_device(dev); 192 if (ret != 0) { 193 dev->driver = NULL; 194 return ret; 195 } 196 } 197 198 /* call the driver probe() function */ 199 ret = dr->probe(dr, dev); 200 if (already_probed) 201 return ret; /* no rollback if already succeeded earlier */ 202 if (ret) { 203 dev->driver = NULL; 204 if ((dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING) && 205 /* Don't unmap if device is unsupported and 206 * driver needs mapped resources. 207 */ 208 !(ret > 0 && 209 (dr->drv_flags & RTE_PCI_DRV_KEEP_MAPPED_RES))) 210 rte_pci_unmap_device(dev); 211 } else { 212 dev->device.driver = &dr->driver; 213 } 214 215 return ret; 216 } 217 218 /* 219 * If vendor/device ID match, call the remove() function of the 220 * driver. 221 */ 222 static int 223 rte_pci_detach_dev(struct rte_pci_device *dev) 224 { 225 struct rte_pci_addr *loc; 226 struct rte_pci_driver *dr; 227 int ret = 0; 228 229 if (dev == NULL) 230 return -EINVAL; 231 232 dr = dev->driver; 233 loc = &dev->addr; 234 235 RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n", 236 loc->domain, loc->bus, loc->devid, 237 loc->function, dev->device.numa_node); 238 239 RTE_LOG(DEBUG, EAL, " remove driver: %x:%x %s\n", dev->id.vendor_id, 240 dev->id.device_id, dr->driver.name); 241 242 if (dr->remove) { 243 ret = dr->remove(dev); 244 if (ret < 0) 245 return ret; 246 } 247 248 /* clear driver structure */ 249 dev->driver = NULL; 250 251 if (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING) 252 /* unmap resources for devices that use igb_uio */ 253 rte_pci_unmap_device(dev); 254 255 return 0; 256 } 257 258 /* 259 * If vendor/device ID match, call the probe() function of all 260 * registered driver for the given device. Return < 0 if initialization 261 * failed, return 1 if no driver is found for this device. 262 */ 263 static int 264 pci_probe_all_drivers(struct rte_pci_device *dev) 265 { 266 struct rte_pci_driver *dr = NULL; 267 int rc = 0; 268 269 if (dev == NULL) 270 return -EINVAL; 271 272 FOREACH_DRIVER_ON_PCIBUS(dr) { 273 rc = rte_pci_probe_one_driver(dr, dev); 274 if (rc < 0) 275 /* negative value is an error */ 276 return rc; 277 if (rc > 0) 278 /* positive value means driver doesn't support it */ 279 continue; 280 return 0; 281 } 282 return 1; 283 } 284 285 /* 286 * Scan the content of the PCI bus, and call the probe() function for 287 * all registered drivers that have a matching entry in its id_table 288 * for discovered devices. 289 */ 290 int 291 rte_pci_probe(void) 292 { 293 struct rte_pci_device *dev = NULL; 294 size_t probed = 0, failed = 0; 295 struct rte_devargs *devargs; 296 int probe_all = 0; 297 int ret = 0; 298 299 if (rte_pci_bus.bus.conf.scan_mode != RTE_BUS_SCAN_WHITELIST) 300 probe_all = 1; 301 302 FOREACH_DEVICE_ON_PCIBUS(dev) { 303 probed++; 304 305 devargs = dev->device.devargs; 306 /* probe all or only whitelisted devices */ 307 if (probe_all) 308 ret = pci_probe_all_drivers(dev); 309 else if (devargs != NULL && 310 devargs->policy == RTE_DEV_WHITELISTED) 311 ret = pci_probe_all_drivers(dev); 312 if (ret < 0) { 313 if (ret != -EEXIST) { 314 RTE_LOG(ERR, EAL, "Requested device " 315 PCI_PRI_FMT " cannot be used\n", 316 dev->addr.domain, dev->addr.bus, 317 dev->addr.devid, dev->addr.function); 318 rte_errno = errno; 319 failed++; 320 } 321 ret = 0; 322 } 323 } 324 325 return (probed && probed == failed) ? -1 : 0; 326 } 327 328 /* dump one device */ 329 static int 330 pci_dump_one_device(FILE *f, struct rte_pci_device *dev) 331 { 332 int i; 333 334 fprintf(f, PCI_PRI_FMT, dev->addr.domain, dev->addr.bus, 335 dev->addr.devid, dev->addr.function); 336 fprintf(f, " - vendor:%x device:%x\n", dev->id.vendor_id, 337 dev->id.device_id); 338 339 for (i = 0; i != sizeof(dev->mem_resource) / 340 sizeof(dev->mem_resource[0]); i++) { 341 fprintf(f, " %16.16"PRIx64" %16.16"PRIx64"\n", 342 dev->mem_resource[i].phys_addr, 343 dev->mem_resource[i].len); 344 } 345 return 0; 346 } 347 348 /* dump devices on the bus */ 349 void 350 rte_pci_dump(FILE *f) 351 { 352 struct rte_pci_device *dev = NULL; 353 354 FOREACH_DEVICE_ON_PCIBUS(dev) { 355 pci_dump_one_device(f, dev); 356 } 357 } 358 359 static int 360 pci_parse(const char *name, void *addr) 361 { 362 struct rte_pci_addr *out = addr; 363 struct rte_pci_addr pci_addr; 364 bool parse; 365 366 parse = (rte_pci_addr_parse(name, &pci_addr) == 0); 367 if (parse && addr != NULL) 368 *out = pci_addr; 369 return parse == false; 370 } 371 372 /* register a driver */ 373 void 374 rte_pci_register(struct rte_pci_driver *driver) 375 { 376 TAILQ_INSERT_TAIL(&rte_pci_bus.driver_list, driver, next); 377 driver->bus = &rte_pci_bus; 378 } 379 380 /* unregister a driver */ 381 void 382 rte_pci_unregister(struct rte_pci_driver *driver) 383 { 384 TAILQ_REMOVE(&rte_pci_bus.driver_list, driver, next); 385 driver->bus = NULL; 386 } 387 388 /* Add a device to PCI bus */ 389 void 390 rte_pci_add_device(struct rte_pci_device *pci_dev) 391 { 392 TAILQ_INSERT_TAIL(&rte_pci_bus.device_list, pci_dev, next); 393 } 394 395 /* Insert a device into a predefined position in PCI bus */ 396 void 397 rte_pci_insert_device(struct rte_pci_device *exist_pci_dev, 398 struct rte_pci_device *new_pci_dev) 399 { 400 TAILQ_INSERT_BEFORE(exist_pci_dev, new_pci_dev, next); 401 } 402 403 /* Remove a device from PCI bus */ 404 static void 405 rte_pci_remove_device(struct rte_pci_device *pci_dev) 406 { 407 TAILQ_REMOVE(&rte_pci_bus.device_list, pci_dev, next); 408 } 409 410 static struct rte_device * 411 pci_find_device(const struct rte_device *start, rte_dev_cmp_t cmp, 412 const void *data) 413 { 414 const struct rte_pci_device *pstart; 415 struct rte_pci_device *pdev; 416 417 if (start != NULL) { 418 pstart = RTE_DEV_TO_PCI_CONST(start); 419 pdev = TAILQ_NEXT(pstart, next); 420 } else { 421 pdev = TAILQ_FIRST(&rte_pci_bus.device_list); 422 } 423 while (pdev != NULL) { 424 if (cmp(&pdev->device, data) == 0) 425 return &pdev->device; 426 pdev = TAILQ_NEXT(pdev, next); 427 } 428 return NULL; 429 } 430 431 /* 432 * find the device which encounter the failure, by iterate over all device on 433 * PCI bus to check if the memory failure address is located in the range 434 * of the BARs of the device. 435 */ 436 static struct rte_pci_device * 437 pci_find_device_by_addr(const void *failure_addr) 438 { 439 struct rte_pci_device *pdev = NULL; 440 uint64_t check_point, start, end, len; 441 int i; 442 443 check_point = (uint64_t)(uintptr_t)failure_addr; 444 445 FOREACH_DEVICE_ON_PCIBUS(pdev) { 446 for (i = 0; i != RTE_DIM(pdev->mem_resource); i++) { 447 start = (uint64_t)(uintptr_t)pdev->mem_resource[i].addr; 448 len = pdev->mem_resource[i].len; 449 end = start + len; 450 if (check_point >= start && check_point < end) { 451 RTE_LOG(DEBUG, EAL, "Failure address %16.16" 452 PRIx64" belongs to device %s!\n", 453 check_point, pdev->device.name); 454 return pdev; 455 } 456 } 457 } 458 return NULL; 459 } 460 461 static int 462 pci_hot_unplug_handler(struct rte_device *dev) 463 { 464 struct rte_pci_device *pdev = NULL; 465 int ret = 0; 466 467 pdev = RTE_DEV_TO_PCI(dev); 468 if (!pdev) 469 return -1; 470 471 switch (pdev->kdrv) { 472 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE 473 case RTE_KDRV_VFIO: 474 /* 475 * vfio kernel module guaranty the pci device would not be 476 * deleted until the user space release the resource, so no 477 * need to remap BARs resource here, just directly notify 478 * the req event to the user space to handle it. 479 */ 480 rte_dev_event_callback_process(dev->name, 481 RTE_DEV_EVENT_REMOVE); 482 break; 483 #endif 484 case RTE_KDRV_IGB_UIO: 485 case RTE_KDRV_UIO_GENERIC: 486 case RTE_KDRV_NIC_UIO: 487 /* BARs resource is invalid, remap it to be safe. */ 488 ret = pci_uio_remap_resource(pdev); 489 break; 490 default: 491 RTE_LOG(DEBUG, EAL, 492 "Not managed by a supported kernel driver, skipped\n"); 493 ret = -1; 494 break; 495 } 496 497 return ret; 498 } 499 500 static int 501 pci_sigbus_handler(const void *failure_addr) 502 { 503 struct rte_pci_device *pdev = NULL; 504 int ret = 0; 505 506 pdev = pci_find_device_by_addr(failure_addr); 507 if (!pdev) { 508 /* It is a generic sigbus error, no bus would handle it. */ 509 ret = 1; 510 } else { 511 /* The sigbus error is caused of hot-unplug. */ 512 ret = pci_hot_unplug_handler(&pdev->device); 513 if (ret) { 514 RTE_LOG(ERR, EAL, 515 "Failed to handle hot-unplug for device %s", 516 pdev->name); 517 ret = -1; 518 } 519 } 520 return ret; 521 } 522 523 static int 524 pci_plug(struct rte_device *dev) 525 { 526 return pci_probe_all_drivers(RTE_DEV_TO_PCI(dev)); 527 } 528 529 static int 530 pci_unplug(struct rte_device *dev) 531 { 532 struct rte_pci_device *pdev; 533 int ret; 534 535 pdev = RTE_DEV_TO_PCI(dev); 536 ret = rte_pci_detach_dev(pdev); 537 if (ret == 0) { 538 rte_pci_remove_device(pdev); 539 rte_devargs_remove(dev->devargs); 540 free(pdev); 541 } 542 return ret; 543 } 544 545 static int 546 pci_dma_map(struct rte_device *dev, void *addr, uint64_t iova, size_t len) 547 { 548 struct rte_pci_device *pdev = RTE_DEV_TO_PCI(dev); 549 550 if (!pdev || !pdev->driver) { 551 rte_errno = EINVAL; 552 return -1; 553 } 554 if (pdev->driver->dma_map) 555 return pdev->driver->dma_map(pdev, addr, iova, len); 556 /** 557 * In case driver don't provides any specific mapping 558 * try fallback to VFIO. 559 */ 560 if (pdev->kdrv == RTE_KDRV_VFIO) 561 return rte_vfio_container_dma_map 562 (RTE_VFIO_DEFAULT_CONTAINER_FD, (uintptr_t)addr, 563 iova, len); 564 rte_errno = ENOTSUP; 565 return -1; 566 } 567 568 static int 569 pci_dma_unmap(struct rte_device *dev, void *addr, uint64_t iova, size_t len) 570 { 571 struct rte_pci_device *pdev = RTE_DEV_TO_PCI(dev); 572 573 if (!pdev || !pdev->driver) { 574 rte_errno = EINVAL; 575 return -1; 576 } 577 if (pdev->driver->dma_unmap) 578 return pdev->driver->dma_unmap(pdev, addr, iova, len); 579 /** 580 * In case driver don't provides any specific mapping 581 * try fallback to VFIO. 582 */ 583 if (pdev->kdrv == RTE_KDRV_VFIO) 584 return rte_vfio_container_dma_unmap 585 (RTE_VFIO_DEFAULT_CONTAINER_FD, (uintptr_t)addr, 586 iova, len); 587 rte_errno = ENOTSUP; 588 return -1; 589 } 590 591 static bool 592 pci_ignore_device(const struct rte_pci_device *dev) 593 { 594 struct rte_devargs *devargs = dev->device.devargs; 595 596 switch (rte_pci_bus.bus.conf.scan_mode) { 597 case RTE_BUS_SCAN_WHITELIST: 598 if (devargs && devargs->policy == RTE_DEV_WHITELISTED) 599 return false; 600 break; 601 case RTE_BUS_SCAN_UNDEFINED: 602 case RTE_BUS_SCAN_BLACKLIST: 603 if (devargs == NULL || 604 devargs->policy != RTE_DEV_BLACKLISTED) 605 return false; 606 break; 607 } 608 return true; 609 } 610 611 enum rte_iova_mode 612 rte_pci_get_iommu_class(void) 613 { 614 enum rte_iova_mode iova_mode = RTE_IOVA_DC; 615 const struct rte_pci_device *dev; 616 const struct rte_pci_driver *drv; 617 bool devices_want_va = false; 618 bool devices_want_pa = false; 619 int iommu_no_va = -1; 620 621 FOREACH_DEVICE_ON_PCIBUS(dev) { 622 /* 623 * We can check this only once, because the IOMMU hardware is 624 * the same for all of them. 625 */ 626 if (iommu_no_va == -1) 627 iommu_no_va = pci_device_iommu_support_va(dev) 628 ? 0 : 1; 629 if (pci_ignore_device(dev)) 630 continue; 631 if (dev->kdrv == RTE_KDRV_UNKNOWN || 632 dev->kdrv == RTE_KDRV_NONE) 633 continue; 634 FOREACH_DRIVER_ON_PCIBUS(drv) { 635 enum rte_iova_mode dev_iova_mode; 636 637 if (!rte_pci_match(drv, dev)) 638 continue; 639 640 dev_iova_mode = pci_device_iova_mode(drv, dev); 641 RTE_LOG(DEBUG, EAL, "PCI driver %s for device " 642 PCI_PRI_FMT " wants IOVA as '%s'\n", 643 drv->driver.name, 644 dev->addr.domain, dev->addr.bus, 645 dev->addr.devid, dev->addr.function, 646 dev_iova_mode == RTE_IOVA_DC ? "DC" : 647 (dev_iova_mode == RTE_IOVA_PA ? "PA" : "VA")); 648 if (dev_iova_mode == RTE_IOVA_PA) 649 devices_want_pa = true; 650 else if (dev_iova_mode == RTE_IOVA_VA) 651 devices_want_va = true; 652 } 653 } 654 if (iommu_no_va == 1) { 655 iova_mode = RTE_IOVA_PA; 656 if (devices_want_va) { 657 RTE_LOG(WARNING, EAL, "Some devices want 'VA' but IOMMU does not support 'VA'.\n"); 658 RTE_LOG(WARNING, EAL, "The devices that want 'VA' won't initialize.\n"); 659 } 660 } else if (devices_want_va && !devices_want_pa) { 661 iova_mode = RTE_IOVA_VA; 662 } else if (devices_want_pa && !devices_want_va) { 663 iova_mode = RTE_IOVA_PA; 664 } else { 665 iova_mode = RTE_IOVA_DC; 666 if (devices_want_va) { 667 RTE_LOG(WARNING, EAL, "Some devices want 'VA' but forcing 'DC' because other devices want 'PA'.\n"); 668 RTE_LOG(WARNING, EAL, "Depending on the final decision by the EAL, not all devices may be able to initialize.\n"); 669 } 670 } 671 return iova_mode; 672 } 673 674 struct rte_pci_bus rte_pci_bus = { 675 .bus = { 676 .scan = rte_pci_scan, 677 .probe = rte_pci_probe, 678 .find_device = pci_find_device, 679 .plug = pci_plug, 680 .unplug = pci_unplug, 681 .parse = pci_parse, 682 .dma_map = pci_dma_map, 683 .dma_unmap = pci_dma_unmap, 684 .get_iommu_class = rte_pci_get_iommu_class, 685 .dev_iterate = rte_pci_dev_iterate, 686 .hot_unplug_handler = pci_hot_unplug_handler, 687 .sigbus_handler = pci_sigbus_handler, 688 }, 689 .device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list), 690 .driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list), 691 }; 692 693 RTE_REGISTER_BUS(pci, rte_pci_bus.bus); 694