/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2015 Intel Corporation.
 * All rights reserved.
 */

#include "env_internal.h"
#include "pci_dpdk.h"

#include <rte_alarm.h>
#include <rte_devargs.h>
#include <rte_pci.h>
#include "spdk/env.h"
#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/memory.h"

#define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers"

/* Size of the standard PCI config space; extended capabilities live beyond it. */
#define PCI_CFG_SIZE 256
/* PCIe extended capability ID of the Device Serial Number capability. */
#define PCI_EXT_CAP_ID_SN 0x03

/* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
 * might cause the internal IPC to misbehave. Just retry in such case.
 */
#define DPDK_HOTPLUG_RETRY_COUNT 4

/* DPDK alarm/interrupt thread */
static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
/* devices hotplugged on a dpdk thread */
static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
	TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
static TAILQ_HEAD(, spdk_pci_device_provider) g_pci_device_providers =
	TAILQ_HEAD_INITIALIZER(g_pci_device_providers);

int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
int pci_device_fini(struct rte_pci_device *device);

/* Per-device record of when a device becomes eligible for probing.
 * scan_pci_bus() uses this to keep freshly hotplugged devices blocked
 * for a grace period before allowing DPDK to probe them.
 */
struct env_devargs {
	struct rte_bus *bus;
	char name[128];
	uint64_t allowed_at;	/* tick value at which the device may be probed */
	TAILQ_ENTRY(env_devargs) link;
};
static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs);

/* Find the env_devargs entry matching (bus, name), or NULL if none exists. */
static struct env_devargs *
find_env_devargs(struct rte_bus *bus, const char *name)
{
	struct env_devargs *da;

	TAILQ_FOREACH(da, &g_env_devargs, link) {
		if (bus == da->bus && !strcmp(name, da->name)) {
			return da;
		}
	}

	return NULL;
}

/* map_bar callback for DPDK-managed devices. DPDK has already mapped the
 * BAR, so simply report the existing mapping. Always returns 0.
 */
static int
map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
	    void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	struct rte_mem_resource *res;

	res = dpdk_pci_device_get_mem_resource(device->dev_handle, bar);
	*mapped_addr = res->addr;
	*phys_addr = (uint64_t)res->phys_addr;
	*size = (uint64_t)res->len;

	return 0;
}

/* unmap_bar callback: the mapping is owned by DPDK, so there is nothing
 * to undo here.
 */
static int
unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
{
	return 0;
}

/* cfg_read callback: read len bytes of PCI config space at offset. */
static int
cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_read_config(dev->dev_handle, value, len, offset);
}

/* cfg_write callback: write len bytes of PCI config space at offset. */
static int
cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dpdk_pci_device_write_config(dev->dev_handle, value, len, offset);
}

/* Hot-remove a device from DPDK, retrying on -ENOMSG up to
 * DPDK_HOTPLUG_RETRY_COUNT times (see comment on that macro).
 */
static void
remove_rte_dev(struct rte_pci_device *rte_dev)
{
	char bdf[32];
	int i = 0, rc;

	snprintf(bdf, sizeof(bdf), "%s", dpdk_pci_device_get_name(rte_dev));
	do {
		rc = rte_eal_hotplug_remove("pci", bdf);
	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
}

/* rte_eal_alarm callback; runs remove_rte_dev() on the DPDK alarm thread. */
static void
detach_rte_cb(void *_dev)
{
	remove_rte_dev(_dev);
}

/* if it's a physical device we need to deal with DPDK on
 * a different process and we can't just unset one flag
 * here. We also want to stop using any device resources
 * so that the device isn't "in use" by the userspace driver
 * once we detach it. This would allow attaching the device
 * to a different process, or to a kernel driver like nvme.
 */
static void
detach_rte(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int i;
	bool removed;

	/* Only the primary process performs the actual DPDK hot-remove. */
	if (!spdk_process_is_primary()) {
		return;
	}

	pthread_mutex_lock(&g_pci_mutex);
	dev->internal.attached = false;
	/* prevent the hotremove notification from removing this device */
	dev->internal.pending_removal = true;
	pthread_mutex_unlock(&g_pci_mutex);

	/* Schedule the removal on the DPDK alarm thread; it cannot be done
	 * inline from this context.
	 */
	rte_eal_alarm_set(1, detach_rte_cb, rte_dev);

	/* wait up to 2s for the cb to execute */
	for (i = 2000; i > 0; i--) {

		spdk_delay_us(1000);
		pthread_mutex_lock(&g_pci_mutex);
		removed = dev->internal.removed;
		pthread_mutex_unlock(&g_pci_mutex);

		if (removed) {
			break;
		}
	}

	/* besides checking the removed flag, we also need to wait
	 * for the dpdk detach function to unwind, as it's doing some
	 * operations even after calling our detach callback. Simply
	 * cancel the alarm - if it started executing already, this
	 * call will block and wait for it to finish.
	 */
	rte_eal_alarm_cancel(detach_rte_cb, rte_dev);

	/* the device could have been finally removed, so just check
	 * it again.
	 */
	pthread_mutex_lock(&g_pci_mutex);
	removed = dev->internal.removed;
	pthread_mutex_unlock(&g_pci_mutex);
	if (!removed) {
		SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n",
			    dpdk_pci_device_get_name(rte_dev));
		/* If we reach this state, then the device couldn't be removed and most likely
		   a subsequent hot add of a device in the same BDF will fail */
	}
}

/* Register an SPDK PCI driver (e.g. "nvme") under the given name with the
 * supplied PCI ID match table and flags. The driver is appended to the
 * global driver list, which pci_env_init() later registers with DPDK.
 * Note: allocation failure is silently ignored (no better option here).
 */
void
spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
{
	struct spdk_pci_driver *driver;

	driver = calloc(1, sizeof(*driver));
	if (!driver) {
		/* we can't do any better than bailing atm */
		return;
	}

	driver->name = name;
	driver->id_table = id_table;
	driver->drv_flags = flags;
	/* The DPDK-facing rte_pci_driver lives in a buffer embedded in the
	 * SPDK driver struct.
	 */
	driver->driver = (struct rte_pci_driver *)driver->driver_buf;
	TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
}

/* Convenience accessor for the registered "nvme" driver. */
struct spdk_pci_driver *
spdk_pci_nvme_get_driver(void)
{
	return spdk_pci_get_driver("nvme");
}

/* Find a registered SPDK PCI driver by name, or NULL if not registered. */
struct spdk_pci_driver *
spdk_pci_get_driver(const char *name)
{
	struct spdk_pci_driver *driver;

	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
		if (strcmp(driver->name, name) == 0) {
			return driver;
		}
	}

	return NULL;
}

/* DPDK device-event callback registered in _pci_env_init(). Handles
 * hot-remove notifications: marks the matching device pending_removal and,
 * if no SPDK user currently has it attached, schedules an immediate
 * deferred removal on the alarm thread.
 */
static void
pci_device_rte_dev_event(const char *device_name,
			 enum rte_dev_event_type event,
			 void *cb_arg)
{
	struct spdk_pci_device *dev;
	bool can_detach = false;

	switch (event) {
	default:
	case RTE_DEV_EVENT_ADD:
		/* Nothing to do here yet. */
		break;
	case RTE_DEV_EVENT_REMOVE:
		pthread_mutex_lock(&g_pci_mutex);
		TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
			struct rte_pci_device *rte_dev = dev->dev_handle;

			if (strcmp(dpdk_pci_device_get_name(rte_dev), device_name)) {
				continue;
			}

			/* Note: these ERRLOGs are useful for triaging issue #2983.
			 */
			if (dev->internal.pending_removal || dev->internal.removed) {
				SPDK_ERRLOG("Received event for device SPDK already tried to remove\n");
				SPDK_ERRLOG("pending_removal=%d removed=%d\n", dev->internal.pending_removal,
					    dev->internal.removed);
			}

			if (!dev->internal.pending_removal) {
				can_detach = !dev->internal.attached;
				/* prevent any further attaches */
				dev->internal.pending_removal = true;
				break;
			}
		}
		pthread_mutex_unlock(&g_pci_mutex);

		if (can_detach) {
			/* if device is not attached we can remove it right away.
			 * Otherwise it will be removed at detach.
			 *
			 * Because the user's callback is invoked in eal interrupt
			 * callback, the interrupt callback need to be finished before
			 * it can be unregistered when detaching device. So finish
			 * callback soon and use a deferred removal to detach device
			 * is need. It is a workaround, once the device detaching be
			 * moved into the eal in the future, the deferred removal could
			 * be deleted.
			 */
			assert(dev != NULL);
			rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle);
		}
		break;
	}
}

/* Reconcile the global device list: free devices flagged as removed
 * (notifying vtophys) and promote devices from the hotplugged staging
 * list into g_pci_devices (registering them with vtophys).
 */
static void
cleanup_pci_devices(void)
{
	struct spdk_pci_device *dev, *tmp;

	pthread_mutex_lock(&g_pci_mutex);
	/* cleanup removed devices */
	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
		if (!dev->internal.removed) {
			continue;
		}

		vtophys_pci_device_removed(dev->dev_handle);
		TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
		free(dev);
	}

	/* add newly-attached devices */
	TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
		TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
		TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
		vtophys_pci_device_added(dev->dev_handle);
	}
	pthread_mutex_unlock(&g_pci_mutex);
}

static int scan_pci_bus(bool delay_init);

/* Common part of pci_env_init() and pci_env_reinit(): scan the bus once
 * (without delaying any devices) and, in the primary process, register the
 * global hot-remove callback.
 */
static inline void
_pci_env_init(void)
{
	/* We assume devices were present on the bus for more than 2 seconds
	 * before initializing SPDK and there's no need to wait more. We scan
	 * the bus, but we don't block any devices.
	 */
	scan_pci_bus(false);

	/* Register a single hotremove callback for all devices. */
	if (spdk_process_is_primary()) {
		rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL);
	}
}

/* One-time PCI environment initialization: set up the DPDK PCI layer,
 * register all SPDK drivers with DPDK, then run the common init.
 * Returns 0 on success or the dpdk_pci_init() error code.
 */
int
pci_env_init(void)
{
	struct spdk_pci_driver *driver;
	int rc;

	rc = dpdk_pci_init();
	if (rc) {
		return rc;
	}

	TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
		dpdk_pci_driver_register(driver, pci_device_init, pci_device_fini);
	}

	_pci_env_init();
	return 0;
}

/* Re-initialize after an env re-init (e.g. DPDK restart). */
void
pci_env_reinit(void)
{
	/* There is no need to register pci drivers again, since they were
	 * already pre-registered in pci_env_init.
	 */

	_pci_env_init();
}

/* PCI environment teardown: flush pending device adds/removes, warn about
 * devices still attached, and unregister the hot-remove callback.
 * NOTE(review): g_pci_devices is traversed here without g_pci_mutex -
 * presumably callers guarantee single-threaded shutdown; confirm.
 */
void
pci_env_fini(void)
{
	struct spdk_pci_device *dev;
	char bdf[32];

	cleanup_pci_devices();
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached) {
			spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
			SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf);
		}
	}

	if (spdk_process_is_primary()) {
		rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL);
	}
}

/* DPDK probe callback: wrap a freshly probed rte_pci_device in an
 * spdk_pci_device, fill in address/ID/NUMA info and the rte-backed ops,
 * invoke the driver's enumerate callback (if any), and stage the device
 * on the hotplugged list for cleanup_pci_devices() to publish.
 * Returns 0 on success, -1 on allocation failure, or the callback's
 * non-zero return (in which case the device is freed).
 */
int
pci_device_init(struct rte_pci_driver *_drv,
		struct rte_pci_device *_dev)
{
	struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
	struct spdk_pci_device *dev;
	struct rte_pci_addr *addr;
	struct rte_pci_id *id;
	int rc;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		return -1;
	}

	dev->dev_handle = _dev;

	addr = dpdk_pci_device_get_addr(_dev);
	dev->addr.domain = addr->domain;
	dev->addr.bus = addr->bus;
	dev->addr.dev = addr->devid;
	dev->addr.func = addr->function;

	id = dpdk_pci_device_get_id(_dev);
	dev->id.class_id = id->class_id;
	dev->id.vendor_id = id->vendor_id;
	dev->id.device_id = id->device_id;
	dev->id.subvendor_id = id->subsystem_vendor_id;
	dev->id.subdevice_id = id->subsystem_device_id;

	dev->numa_id = dpdk_pci_device_get_numa_node(_dev);
	dev->type = "pci";

	/* Wire up the rte-backed device operations. */
	dev->map_bar = map_bar_rte;
	dev->unmap_bar = unmap_bar_rte;
	dev->cfg_read = cfg_read_rte;
	dev->cfg_write = cfg_write_rte;

	dev->internal.driver = driver;
	dev->internal.claim_fd = -1;

	/* cb_fn is set only while spdk_pci_device_attach()/enumerate() is
	 * driving a probe; invoke it so the caller can claim the device.
	 */
	if (driver->cb_fn != NULL) {
		rc = driver->cb_fn(driver->cb_arg, dev);
		if (rc != 0) {
			free(dev);
			return rc;
		}
		dev->internal.attached = true;
	}

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;
}

/* Record (or update) the tick value at which the device described by
 * rte_da becomes eligible for probing. Allocates a tracking entry on
 * first use; logs and returns silently on allocation failure.
 */
static void
set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc)
{
	struct env_devargs *env_da;

	env_da = find_env_devargs(rte_da->bus, rte_da->name);
	if (env_da == NULL) {
		env_da = calloc(1, sizeof(*env_da));
		if (env_da == NULL) {
			SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name);
			return;
		}
		env_da->bus = rte_da->bus;
		spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0);
		TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link);
	}

	env_da->allowed_at = tsc;
}

/* Return the recorded allowed_at tick for the device, or 0 if the device
 * has never been seen by set_allowed_at().
 */
static uint64_t
get_allowed_at(struct rte_devargs *rte_da)
{
	struct env_devargs *env_da;

	env_da = find_env_devargs(rte_da->bus, rte_da->name);
	if (env_da) {
		return env_da->allowed_at;
	} else {
		return 0;
	}
}

/* DPDK remove callback: mark the matching spdk_pci_device as removed so
 * cleanup_pci_devices() can free it. Returns -EBUSY if the device is
 * unknown or still attached (still referenced somewhere in SPDK).
 */
int
pci_device_fini(struct rte_pci_device *_dev)
{
	struct spdk_pci_device *dev;

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->dev_handle == _dev) {
			break;
		}
	}

	if (dev == NULL || dev->internal.attached) {
		/* The device might be still referenced somewhere in SPDK. */
		pthread_mutex_unlock(&g_pci_mutex);
		return -EBUSY;
	}

	/* remove our allowed_at option */
	if (dpdk_pci_device_get_devargs(_dev)) {
		set_allowed_at(dpdk_pci_device_get_devargs(_dev), 0);
	}

	/* It is possible that removed flag was already set when there is a race
	 * between the remove notification for this process, and another process
	 * that is also detaching from this same device (for example, when using
	 * nvme driver in multi-process mode. So do not assert here. See
	 * #2456 for additional details.
	 */
	dev->internal.removed = true;
	pthread_mutex_unlock(&g_pci_mutex);
	return 0;

}

/* Detach a device from its SPDK user: release any claim lock, invoke the
 * provider's detach callback (e.g. detach_rte for "pci" devices), and
 * reconcile the device lists.
 */
void
spdk_pci_device_detach(struct spdk_pci_device *dev)
{
	struct spdk_pci_device_provider *provider;

	assert(dev->internal.attached);

	if (dev->internal.claim_fd >= 0) {
		spdk_pci_device_unclaim(dev);
	}

	/* Look up the provider that owns this device's type string. */
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		if (strcmp(dev->type, provider->name) == 0) {
			break;
		}
	}

	assert(provider != NULL);
	dev->internal.attached = false;
	provider->detach_cb(dev);

	cleanup_pci_devices();
}

/* Scan the PCI bus and adjust each device's devargs policy. Devices never
 * seen before get a devargs entry created. If delay_init is true, newly
 * seen devices are blocked for ~2 seconds (to let the kernel/other apps
 * settle) before being allowed; otherwise they are allowed immediately.
 * Permanently blocked devices are never overridden. Returns 0 on success,
 * -1 on allocation/parse failure.
 */
static int
scan_pci_bus(bool delay_init)
{
	struct rte_dev_iterator it;
	struct rte_device *rte_dev;
	uint64_t now;

	dpdk_bus_scan();
	now = spdk_get_ticks();

	/* No drivers registered - nothing will be probed anyway. */
	if (!TAILQ_FIRST(&g_pci_drivers)) {
		return 0;
	}

	RTE_DEV_FOREACH(rte_dev, "bus=pci", &it) {
		struct rte_devargs *da;

		da = dpdk_device_get_devargs(rte_dev);
		if (!da) {
			char devargs_str[128];

			/* the device was never blocked or allowed */
			da = calloc(1, sizeof(*da));
			if (!da) {
				return -1;
			}

			snprintf(devargs_str, sizeof(devargs_str), "pci:%s", dpdk_device_get_name(rte_dev));
			if (rte_devargs_parse(da, devargs_str) != 0) {
				free(da);
				return -1;
			}

			/* DPDK takes ownership of da after insertion. */
			rte_devargs_insert(&da);
			dpdk_device_set_devargs(rte_dev, da);
		}

		if (get_allowed_at(da)) {
			uint64_t allowed_at = get_allowed_at(da);

			/* this device was seen by spdk before...
			 */
			if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) {
				da->policy = RTE_DEV_ALLOWED;
			}
		} else if ((dpdk_device_scan_allowed(rte_dev) && da->policy == RTE_DEV_ALLOWED) ||
			   da->policy != RTE_DEV_BLOCKED) {
			/* override the policy only if not permanently blocked */

			if (delay_init) {
				da->policy = RTE_DEV_BLOCKED;
				set_allowed_at(da, now + 2 * spdk_get_ticks_hz());
			} else {
				da->policy = RTE_DEV_ALLOWED;
				set_allowed_at(da, now);
			}
		}
	}

	return 0;
}

/* attach_cb for the "pci" provider: hot-add the device at the given
 * address via DPDK, retrying on -ENOMSG (flaky hotplug IPC). A retried
 * request that returns -EEXIST means an earlier attempt actually
 * succeeded, so treat it as success.
 */
static int
pci_attach_rte(const struct spdk_pci_addr *addr)
{
	char bdf[32];
	int rc, i = 0;

	spdk_pci_addr_fmt(bdf, sizeof(bdf), addr);

	do {
		rc = rte_eal_hotplug_add("pci", bdf, "");
	} while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);

	if (i > 1 && rc == -EEXIST) {
		/* Even though the previous request timed out, the device
		 * was attached successfully.
		 */
		rc = 0;
	}

	return rc;
}

/* Default device provider backed by DPDK's PCI bus. */
static struct spdk_pci_device_provider g_pci_rte_provider = {
	.name = "pci",
	.attach_cb = pci_attach_rte,
	.detach_cb = detach_rte,
};

SPDK_PCI_REGISTER_DEVICE_PROVIDER(pci, &g_pci_rte_provider);

/* Attach a single device at pci_address to the given driver. If the device
 * is already known and owned by this driver, just invoke enum_cb directly;
 * otherwise try each registered provider's attach_cb, which will cause
 * pci_device_init() to run enum_cb during the probe. An explicit attach
 * also re-allows a device that scan_pci_bus() had temporarily blocked.
 * Returns 0 on success, -1 on failure, or enum_cb's non-zero return.
 */
int
spdk_pci_device_attach(struct spdk_pci_driver *driver,
		       spdk_pci_enum_cb enum_cb,
		       void *enum_ctx, struct spdk_pci_addr *pci_address)
{
	struct spdk_pci_device *dev;
	struct spdk_pci_device_provider *provider;
	struct rte_pci_device *rte_dev;
	struct rte_devargs *da;
	int rc;

	cleanup_pci_devices();

	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}

	if (dev != NULL && dev->internal.driver == driver) {
		pthread_mutex_lock(&g_pci_mutex);
		if (dev->internal.attached || dev->internal.pending_removal) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		}
		pthread_mutex_unlock(&g_pci_mutex);
		return rc;
	}

	/* Expose the callback to pci_device_init() for the duration of
	 * the probe.
	 */
	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	rc = -ENODEV;
	TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) {
		rc = provider->attach_cb(pci_address);
		if (rc == 0) {
			break;
		}
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();

	if (rc != 0) {
		return -1;
	}

	/* explicit attach ignores the allowlist, so if we blocked this
	 * device before let's enable it now - just for clarity.
	 */
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
			break;
		}
	}
	assert(dev != NULL);

	rte_dev = dev->dev_handle;
	if (rte_dev != NULL) {
		da = dpdk_pci_device_get_devargs(rte_dev);
		if (da && get_allowed_at(da)) {
			set_allowed_at(da, spdk_get_ticks());
			da->policy = RTE_DEV_ALLOWED;
		}
	}

	return 0;
}

/* Note: You can call spdk_pci_enumerate from more than one thread
 * simultaneously safely, but you cannot call spdk_pci_enumerate
 * and rte_eal_pci_probe simultaneously.
 */
int
spdk_pci_enumerate(struct spdk_pci_driver *driver,
		   spdk_pci_enum_cb enum_cb,
		   void *enum_ctx)
{
	struct spdk_pci_device *dev;
	int rc;

	cleanup_pci_devices();

	/* First offer already-known, unattached devices of this driver to
	 * the callback.
	 */
	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
		if (dev->internal.attached ||
		    dev->internal.driver != driver ||
		    dev->internal.pending_removal) {
			continue;
		}

		rc = enum_cb(enum_ctx, dev);
		if (rc == 0) {
			dev->internal.attached = true;
		} else if (rc < 0) {
			pthread_mutex_unlock(&g_pci_mutex);
			return -1;
		}
	}
	pthread_mutex_unlock(&g_pci_mutex);

	/* Then scan for new devices (delaying fresh hotplugs) and probe. */
	if (scan_pci_bus(true) != 0) {
		return -1;
	}

	driver->cb_fn = enum_cb;
	driver->cb_arg = enum_ctx;

	if (dpdk_bus_probe() != 0) {
		driver->cb_arg = NULL;
		driver->cb_fn = NULL;
		return -1;
	}

	driver->cb_arg = NULL;
	driver->cb_fn = NULL;

	cleanup_pci_devices();
	return 0;
}

/* Invoke fn on every known PCI device while holding g_pci_mutex.
 * The safe iterator tolerates fn removing the current device.
 */
void
spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev))
{
	struct spdk_pci_device *dev, *tmp;

	pthread_mutex_lock(&g_pci_mutex);
	TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
		fn(ctx, dev);
	}
	pthread_mutex_unlock(&g_pci_mutex);
}

/* Map a BAR through the device's map_bar op and, when vfio/IOMMU is
 * active, also program a DMA mapping for it. With IOVA=VA mode the
 * virtual address doubles as the iova (and is reported as phys_addr);
 * otherwise the physical address is used as the iova, matching DPDK.
 * Returns 0 on success, the map_bar error, or -EFAULT on IOMMU failure.
 */
int
spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	int rc;

	rc = dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
	if (rc) {
		return rc;
	}

#if VFIO_ENABLED
	/* Automatically map the BAR to the IOMMU */
	if (!spdk_iommu_is_enabled()) {
		return 0;
	}

	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		/* We'll use the virtual address as the iova to match DPDK.
		 */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), (uint64_t) * mapped_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}

		*phys_addr = (uint64_t)(*mapped_addr);
	} else {
		/* We'll use the physical address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), *phys_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}
	}
#endif
	return rc;
}

/* Undo spdk_pci_device_map_bar(): tear down the IOMMU DMA mapping (if one
 * was created) and then call the device's unmap_bar op.
 */
int
spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
{
#if VFIO_ENABLED
	int rc;

	if (spdk_iommu_is_enabled()) {
		rc = vtophys_iommu_unmap_dma_bar((uint64_t)addr);
		if (rc) {
			return -EFAULT;
		}
	}
#endif

	return dev->unmap_bar(dev, bar, addr);
}

/* Enable the device's (single) interrupt via DPDK. */
int
spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_enable_interrupt(dev->dev_handle);
}

/* Disable the device's (single) interrupt via DPDK. */
int
spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_disable_interrupt(dev->dev_handle);
}

/* Return the eventfd backing the device's default interrupt. */
int
spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev)
{
	return dpdk_pci_device_get_interrupt_efd(dev->dev_handle);
}

/* Enable efd_count interrupt vectors on a device with MSI-X capability:
 * create the eventfds, then bind them to the interrupt vectors. On enable
 * failure the created eventfds are cleaned up. Returns 0 on success,
 * -EINVAL for efd_count == 0, -ENOTSUP without MSI-X, or the DPDK error.
 */
int
spdk_pci_device_enable_interrupts(struct spdk_pci_device *dev, uint32_t efd_count)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int rc;

	if (efd_count == 0) {
		SPDK_ERRLOG("Invalid efd_count (%u)\n", efd_count);
		return -EINVAL;
	}

	/* Detect if device has MSI-X capability */
	if (dpdk_pci_device_interrupt_cap_multi(rte_dev) != 1) {
		SPDK_ERRLOG("VFIO MSI-X capability not present for device %s\n",
			    dpdk_pci_device_get_name(rte_dev));
		return -ENOTSUP;
	}

	/* Create event file descriptors */
	rc = dpdk_pci_device_create_interrupt_efds(rte_dev, efd_count);
	if (rc) {
		SPDK_ERRLOG("Can't setup eventfd (%u)\n", efd_count);
		return rc;
	}

	/* Bind each event fd to each interrupt vector */
	rc = dpdk_pci_device_enable_interrupt(rte_dev);
	if (rc) {
		SPDK_ERRLOG("Failed to enable interrupt for PCI device %s\n",
			    dpdk_pci_device_get_name(rte_dev));
		dpdk_pci_device_delete_interrupt_efds(rte_dev);
		return rc;
	}

	return 0;
}

/* Disable multi-vector interrupts and release the associated eventfds.
 * The eventfds are only deleted once the disable succeeds.
 */
int
spdk_pci_device_disable_interrupts(struct spdk_pci_device *dev)
{
	struct rte_pci_device *rte_dev = dev->dev_handle;
	int rc;

	rc = dpdk_pci_device_disable_interrupt(rte_dev);
	if (rc) {
		SPDK_ERRLOG("Failed to disable interrupt for PCI device %s\n",
			    dpdk_pci_device_get_name(rte_dev));
		return rc;
	}

	dpdk_pci_device_delete_interrupt_efds(rte_dev);

	return 0;
}

/* Return the eventfd for a given interrupt vector index. Index 0 is the
 * device's default interrupt efd.
 */
int
spdk_pci_device_get_interrupt_efd_by_index(struct spdk_pci_device *dev, uint32_t index)
{
	if (index == 0) {
		return dpdk_pci_device_get_interrupt_efd(dev->dev_handle);
	} else {
		/* Note: The interrupt vector offset starts from 1, and in DPDK these
		 * are mapped to efd index 0 onwards.
		 */
		return dpdk_pci_device_get_interrupt_efd_by_index(dev->dev_handle, index - 1);
	}
}

/* Simple field accessors for the device's BDF address and PCI IDs. */

uint32_t
spdk_pci_device_get_domain(struct spdk_pci_device *dev)
{
	return dev->addr.domain;
}

uint8_t
spdk_pci_device_get_bus(struct spdk_pci_device *dev)
{
	return dev->addr.bus;
}

uint8_t
spdk_pci_device_get_dev(struct spdk_pci_device *dev)
{
	return dev->addr.dev;
}

uint8_t
spdk_pci_device_get_func(struct spdk_pci_device *dev)
{
	return dev->addr.func;
}

uint16_t
spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
{
	return dev->id.vendor_id;
}

uint16_t
spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
{
	return dev->id.device_id;
}

uint16_t
spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
{
	return dev->id.subvendor_id;
}

uint16_t
spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
{
	return dev->id.subdevice_id;
}

/* Return a copy of the full PCI ID structure. */
struct spdk_pci_id
spdk_pci_device_get_id(struct spdk_pci_device *dev)
{
	return dev->id;
}

/* Return the NUMA node the device is attached to. */
int
spdk_pci_device_get_numa_id(struct spdk_pci_device *dev)
{
	return dev->numa_id;
}

SPDK_LOG_DEPRECATION_REGISTER(pci_device_socket_id, "spdk_pci_device_get_socket_id", "v25.05", 0);

/* Deprecated alias for spdk_pci_device_get_numa_id(). */
int
spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
{
	SPDK_LOG_DEPRECATED(pci_device_socket_id);
	return spdk_pci_device_get_numa_id(dev);
}

/* Config-space access through the device's cfg ops, plus fixed-width
 * convenience wrappers (8/16/32-bit).
 */

int
spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_read(dev, value, len, offset);
}

int
spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
{
	return dev->cfg_write(dev, value, len, offset);
}

int
spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 1, offset);
}

int
spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 1, offset);
}

int
spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 2, offset);
}

int
spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 2, offset);
}

int
spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
{
	return spdk_pci_device_cfg_read(dev, value, 4, offset);
}

int
spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
{
	return spdk_pci_device_cfg_write(dev, &value, 4, offset);
}

/* Read the PCIe Device Serial Number extended capability and format it as
 * a 16-hex-digit string into sn. Walks the extended capability list
 * starting at PCI_CFG_SIZE; each header encodes the capability ID in the
 * low 16 bits and the next-capability offset in bits 20..31. Returns 0 on
 * success, -1 if len < 17, on a config read error, or if the capability
 * is absent.
 */
int
spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
{
	int err;
	uint32_t pos, header = 0;
	uint32_t i, buf[2];

	/* 16 hex digits + NUL terminator. */
	if (len < 17) {
		return -1;
	}

	err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
	if (err || !header) {
		return -1;
	}

	pos = PCI_CFG_SIZE;
	while (1) {
		if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
			if (pos) {
				/* skip the header */
				pos += 4;
				/* Serial number is two consecutive dwords:
				 * buf[0] = low, buf[1] = high.
				 */
				for (i = 0; i < 2; i++) {
					err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
					if (err) {
						return -1;
					}
				}
				snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
				return 0;
			}
		}
		pos = (header >> 20) & 0xffc;
		/* 0 if no other items exist */
		if (pos < PCI_CFG_SIZE) {
			return -1;
		}
		err = spdk_pci_device_cfg_read32(dev, &header, pos);
		if (err) {
			return -1;
		}
	}
	return -1;
}

/* Return a copy of the device's PCI address. */
struct spdk_pci_addr
spdk_pci_device_get_addr(struct spdk_pci_device *dev)
{
	return dev->addr;
}

/* True if the device has been flagged for removal (hot-remove pending). */
bool
spdk_pci_device_is_removed(struct spdk_pci_device *dev)
{
	return dev->internal.pending_removal;
}

/* Lexicographic comparison of two PCI addresses by domain, bus, dev, func.
 * Returns -1, 0, or 1 (strcmp-style).
 */
int
spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
{
	if (a1->domain > a2->domain) {
		return 1;
	} else if (a1->domain < a2->domain) {
		return -1;
	} else if (a1->bus > a2->bus) {
		return 1;
	} else if (a1->bus < a2->bus) {
		return -1;
	} else if (a1->dev > a2->dev) {
		return 1;
	} else if (a1->dev < a2->dev) {
		return -1;
	} else if (a1->func > a2->func) {
		return 1;
	} else if (a1->func < a2->func) {
		return -1;
	}

	return 0;
}

#ifdef __linux__
/* Claim exclusive cross-process ownership of a device using an fcntl write
 * lock on a per-BDF file under /var/tmp. The claiming process's PID is
 * stored in the file (via mmap) so a failed claim can report who holds it.
 * The fd is kept open in dev->internal.claim_fd to maintain the lock.
 * Returns 0 on success, -EACCES if another process holds the claim, or
 * -errno for file setup failures.
 * NOTE(review): the -errno returns after close()/munmap() may report those
 * calls' errno rather than the original failure - confirm intent.
 */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	int dev_fd;
	char dev_name[64];
	int pid;
	void *dev_map;
	struct flock pcidev_lock = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,
	};

	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);

	dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	if (dev_fd == -1) {
		SPDK_ERRLOG("could not open %s\n", dev_name);
		return -errno;
	}

	if (ftruncate(dev_fd, sizeof(int)) != 0) {
		SPDK_ERRLOG("could not truncate %s\n", dev_name);
		close(dev_fd);
		return -errno;
	}

	dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
		       MAP_SHARED, dev_fd, 0);
	if (dev_map == MAP_FAILED) {
		SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno);
		close(dev_fd);
		return -errno;
	}

	if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
		/* Lock held elsewhere; the file contains the holder's PID. */
		pid = *(int *)dev_map;
		SPDK_ERRLOG("Cannot create lock on device %s, probably"
			    " process %d has claimed it\n", dev_name, pid);
		munmap(dev_map, sizeof(int));
		close(dev_fd);
		/* F_SETLK returns unspecified errnos, normalize them */
		return -EACCES;
	}

	*(int *)dev_map = (int)getpid();
	munmap(dev_map, sizeof(int));
	dev->internal.claim_fd = dev_fd;
	/* Keep dev_fd open to maintain the lock. */
	return 0;
}

/* Release a claim taken by spdk_pci_device_claim(): closing the fd drops
 * the fcntl lock, then the lock file is removed.
 */
void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	char dev_name[64];

	snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
		 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);

	close(dev->internal.claim_fd);
	dev->internal.claim_fd = -1;
	unlink(dev_name);
}
#else /* !__linux__ */
/* Claiming is not implemented on non-Linux platforms. */
int
spdk_pci_device_claim(struct spdk_pci_device *dev)
{
	/* TODO */
	return 0;
}

void
spdk_pci_device_unclaim(struct spdk_pci_device *dev)
{
	/* TODO */
}
#endif /* __linux__ */

/* Parse a BDF string into addr. Accepted forms (hex fields):
 * DDDD:BB:DD.F, DDDD.BB.DD.F, DDDD:BB:DD (func=0), BB:DD.F / BB.DD.F
 * (domain=0), and BB:DD / BB.DD (domain=0, func=0). Validates field
 * ranges (bus <= 0xFF, dev <= 0x1F, func <= 7). Returns 0 on success,
 * -EINVAL on NULL arguments, unrecognized format, or out-of-range values.
 */
int
spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
{
	unsigned domain, bus, dev, func;

	if (addr == NULL || bdf == NULL) {
		return -EINVAL;
	}

	if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
	    (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
		/* Matched a full address - all variables are initialized */
	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
		func = 0;
	} else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
		   (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
		domain = 0;
	} else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
		   (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
		domain = 0;
		func = 0;
	} else {
		return -EINVAL;
	}

	if (bus > 0xFF || dev > 0x1F || func > 7) {
		return -EINVAL;
	}

	addr->domain = domain;
	addr->bus = bus;
	addr->dev = dev;
	addr->func = func;

	return 0;
}

/* Format addr into bdf as "DDDD:BB:DD.F". Returns 0 on success, -1 if the
 * buffer is too small or snprintf fails.
 */
int
spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
{
	int rc;

	rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
		      addr->domain, addr->bus,
		      addr->dev, addr->func);

	if (rc > 0 && (size_t)rc < sz) {
		return 0;
	}

	return -1;
}

/* Insert an externally created device (non-DPDK provider) into the global
 * device list under the given driver. The device must already have its
 * BAR-map and config ops populated. If the driver has an active enumerate
 * callback it is invoked first; a non-zero return cancels the hook.
 * Returns 0 on success or -ECANCELED if the callback rejected the device.
 */
int
spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
{
	int rc;

	assert(dev->map_bar != NULL);
	assert(dev->unmap_bar != NULL);
	assert(dev->cfg_read != NULL);
	assert(dev->cfg_write != NULL);
	dev->internal.driver = drv;

	if (drv->cb_fn != NULL) {
		rc = drv->cb_fn(drv->cb_arg, dev);
		if (rc != 0) {
			return -ECANCELED;
		}

		dev->internal.attached = true;
	}

	TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);

	return 0;
}

/* Remove a hooked device from the global list. The device must have been
 * detached first.
 */
void
spdk_pci_unhook_device(struct spdk_pci_device *dev)
{
	assert(!dev->internal.attached);
	TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
}

/* Register an additional device provider (attach/detach backend). */
void
spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider)
{
	TAILQ_INSERT_TAIL(&g_pci_device_providers, provider, tailq);
}

/* Return the device's provider type string (e.g. "pci"). */
const char *
spdk_pci_device_get_type(const struct spdk_pci_device *dev)
{
	return dev->type;
}

/* Permanently allow the device at pci_addr by inserting an RTE_DEV_ALLOWED
 * devargs entry for it. Returns 0 on success, -ENOMEM on allocation
 * failure, or -EINVAL if devargs parsing/insertion fails.
 */
int
spdk_pci_device_allow(struct spdk_pci_addr *pci_addr)
{
	struct rte_devargs *da;
	char devargs_str[128];

	da = calloc(1, sizeof(*da));
	if (da == NULL) {
		SPDK_ERRLOG("could not allocate rte_devargs\n");
		return -ENOMEM;
	}

	snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x",
		 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func);
	if (rte_devargs_parse(da, devargs_str) != 0) {
		SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}
	da->policy = RTE_DEV_ALLOWED;
	/* Note: if a devargs already exists for this device address, it just gets
	 * overridden. So we do not need to check if the devargs already exists.
	 * DPDK will take care of memory management for the devargs structure after
	 * it has been inserted, so there's nothing SPDK needs to track.
	 */
	if (rte_devargs_insert(&da) != 0) {
		SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str);
		free(da);
		return -EINVAL;
	}

	return 0;
}