1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2015 Intel Corporation. 3 * All rights reserved. 4 */ 5 6 #include "env_internal.h" 7 #include "pci_dpdk.h" 8 9 #include <rte_alarm.h> 10 #include <rte_devargs.h> 11 #include <rte_pci.h> 12 #include "spdk/env.h" 13 #include "spdk/log.h" 14 #include "spdk/string.h" 15 #include "spdk/memory.h" 16 17 #define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers" 18 19 #define PCI_CFG_SIZE 256 20 #define PCI_EXT_CAP_ID_SN 0x03 21 22 /* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time 23 * might cause the internal IPC to misbehave. Just retry in such case. 24 */ 25 #define DPDK_HOTPLUG_RETRY_COUNT 4 26 27 /* DPDK alarm/interrupt thread */ 28 static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER; 29 static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices); 30 /* devices hotplugged on a dpdk thread */ 31 static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices = 32 TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices); 33 static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers); 34 static TAILQ_HEAD(, spdk_pci_device_provider) g_pci_device_providers = 35 TAILQ_HEAD_INITIALIZER(g_pci_device_providers); 36 37 int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device); 38 int pci_device_fini(struct rte_pci_device *device); 39 40 struct env_devargs { 41 struct rte_bus *bus; 42 char name[128]; 43 uint64_t allowed_at; 44 TAILQ_ENTRY(env_devargs) link; 45 }; 46 static TAILQ_HEAD(, env_devargs) g_env_devargs = TAILQ_HEAD_INITIALIZER(g_env_devargs); 47 48 static struct env_devargs * 49 find_env_devargs(struct rte_bus *bus, const char *name) 50 { 51 struct env_devargs *da; 52 53 TAILQ_FOREACH(da, &g_env_devargs, link) { 54 if (bus == da->bus && !strcmp(name, da->name)) { 55 return da; 56 } 57 } 58 59 return NULL; 60 } 61 62 static int 63 map_bar_rte(struct spdk_pci_device *device, uint32_t bar, 64 void **mapped_addr, uint64_t *phys_addr, uint64_t *size) 65 { 66 struct rte_mem_resource *res; 67 68 res = dpdk_pci_device_get_mem_resource(device->dev_handle, bar); 69 *mapped_addr = res->addr; 70 *phys_addr = (uint64_t)res->phys_addr; 71 *size = (uint64_t)res->len; 72 73 return 0; 74 } 75 76 static int 77 unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr) 78 { 79 return 0; 80 } 81 82 static int 83 cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) 84 { 85 return dpdk_pci_device_read_config(dev->dev_handle, value, len, offset); 86 } 87 88 static int 89 cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) 90 { 91 return dpdk_pci_device_write_config(dev->dev_handle, value, len, offset); 92 } 93 94 static void 95 remove_rte_dev(struct rte_pci_device *rte_dev) 96 { 97 char bdf[32]; 98 int i = 0, rc; 99 100 snprintf(bdf, sizeof(bdf), "%s", dpdk_pci_device_get_name(rte_dev)); 101 do { 102 rc = rte_eal_hotplug_remove("pci", bdf); 103 } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT); 104 } 105 106 static void 107 detach_rte_cb(void *_dev) 108 { 109 remove_rte_dev(_dev); 110 } 111 112 /* if it's a physical device we need to deal with DPDK on 113 * a different process and we can't just unset one flag 114 * here. We also want to stop using any device resources 115 * so that the device isn't "in use" by the userspace driver 116 * once we detach it. This would allow attaching the device 117 * to a different process, or to a kernel driver like nvme. 118 */ 119 static void 120 detach_rte(struct spdk_pci_device *dev) 121 { 122 struct rte_pci_device *rte_dev = dev->dev_handle; 123 int i; 124 bool removed; 125 126 if (!spdk_process_is_primary()) { 127 return; 128 } 129 130 pthread_mutex_lock(&g_pci_mutex); 131 dev->internal.attached = false; 132 /* prevent the hotremove notification from removing this device */ 133 dev->internal.pending_removal = true; 134 pthread_mutex_unlock(&g_pci_mutex); 135 136 rte_eal_alarm_set(1, detach_rte_cb, rte_dev); 137 138 /* wait up to 2s for the cb to execute */ 139 for (i = 2000; i > 0; i--) { 140 141 spdk_delay_us(1000); 142 pthread_mutex_lock(&g_pci_mutex); 143 removed = dev->internal.removed; 144 pthread_mutex_unlock(&g_pci_mutex); 145 146 if (removed) { 147 break; 148 } 149 } 150 151 /* besides checking the removed flag, we also need to wait 152 * for the dpdk detach function to unwind, as it's doing some 153 * operations even after calling our detach callback. Simply 154 * cancel the alarm - if it started executing already, this 155 * call will block and wait for it to finish. 156 */ 157 rte_eal_alarm_cancel(detach_rte_cb, rte_dev); 158 159 /* the device could have been finally removed, so just check 160 * it again. 161 */ 162 pthread_mutex_lock(&g_pci_mutex); 163 removed = dev->internal.removed; 164 pthread_mutex_unlock(&g_pci_mutex); 165 if (!removed) { 166 SPDK_ERRLOG("Timeout waiting for DPDK to remove PCI device %s.\n", 167 dpdk_pci_device_get_name(rte_dev)); 168 /* If we reach this state, then the device couldn't be removed and most likely 169 a subsequent hot add of a device in the same BDF will fail */ 170 } 171 } 172 173 void 174 spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags) 175 { 176 struct spdk_pci_driver *driver; 177 178 driver = calloc(1, sizeof(*driver)); 179 if (!driver) { 180 /* we can't do any better than bailing atm */ 181 return; 182 } 183 184 driver->name = name; 185 driver->id_table = id_table; 186 driver->drv_flags = flags; 187 driver->driver = (struct rte_pci_driver *)driver->driver_buf; 188 TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq); 189 } 190 191 struct spdk_pci_driver * 192 spdk_pci_nvme_get_driver(void) 193 { 194 return spdk_pci_get_driver("nvme"); 195 } 196 197 struct spdk_pci_driver * 198 spdk_pci_get_driver(const char *name) 199 { 200 struct spdk_pci_driver *driver; 201 202 TAILQ_FOREACH(driver, &g_pci_drivers, tailq) { 203 if (strcmp(driver->name, name) == 0) { 204 return driver; 205 } 206 } 207 208 return NULL; 209 } 210 211 static void 212 pci_device_rte_dev_event(const char *device_name, 213 enum rte_dev_event_type event, 214 void *cb_arg) 215 { 216 struct spdk_pci_device *dev; 217 bool can_detach = false; 218 219 switch (event) { 220 default: 221 case RTE_DEV_EVENT_ADD: 222 /* Nothing to do here yet. */ 223 break; 224 case RTE_DEV_EVENT_REMOVE: 225 pthread_mutex_lock(&g_pci_mutex); 226 TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { 227 struct rte_pci_device *rte_dev = dev->dev_handle; 228 229 /* Note: these ERRLOGs are useful for triaging issue #2983. */ 230 if (dev->internal.pending_removal || dev->internal.removed) { 231 SPDK_ERRLOG("Received event for device SPDK already tried to remove\n"); 232 SPDK_ERRLOG("pending_removal=%d removed=%d\n", dev->internal.pending_removal, 233 dev->internal.removed); 234 } 235 236 if (strcmp(dpdk_pci_device_get_name(rte_dev), device_name) == 0 && 237 !dev->internal.pending_removal) { 238 can_detach = !dev->internal.attached; 239 /* prevent any further attaches */ 240 dev->internal.pending_removal = true; 241 break; 242 } 243 } 244 pthread_mutex_unlock(&g_pci_mutex); 245 246 if (dev != NULL && can_detach) { 247 /* if device is not attached we can remove it right away. 248 * Otherwise it will be removed at detach. 249 * 250 * Because the user's callback is invoked in eal interrupt 251 * callback, the interrupt callback need to be finished before 252 * it can be unregistered when detaching device. So finish 253 * callback soon and use a deferred removal to detach device 254 * is need. It is a workaround, once the device detaching be 255 * moved into the eal in the future, the deferred removal could 256 * be deleted. 257 */ 258 rte_eal_alarm_set(1, detach_rte_cb, dev->dev_handle); 259 } 260 break; 261 } 262 } 263 264 static void 265 cleanup_pci_devices(void) 266 { 267 struct spdk_pci_device *dev, *tmp; 268 269 pthread_mutex_lock(&g_pci_mutex); 270 /* cleanup removed devices */ 271 TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) { 272 if (!dev->internal.removed) { 273 continue; 274 } 275 276 vtophys_pci_device_removed(dev->dev_handle); 277 TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq); 278 free(dev); 279 } 280 281 /* add newly-attached devices */ 282 TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) { 283 TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq); 284 TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq); 285 vtophys_pci_device_added(dev->dev_handle); 286 } 287 pthread_mutex_unlock(&g_pci_mutex); 288 } 289 290 static int scan_pci_bus(bool delay_init); 291 292 static inline void 293 _pci_env_init(void) 294 { 295 /* We assume devices were present on the bus for more than 2 seconds 296 * before initializing SPDK and there's no need to wait more. We scan 297 * the bus, but we don't block any devices. 298 */ 299 scan_pci_bus(false); 300 301 /* Register a single hotremove callback for all devices. */ 302 if (spdk_process_is_primary()) { 303 rte_dev_event_callback_register(NULL, pci_device_rte_dev_event, NULL); 304 } 305 } 306 307 int 308 pci_env_init(void) 309 { 310 struct spdk_pci_driver *driver; 311 int rc; 312 313 rc = dpdk_pci_init(); 314 if (rc) { 315 return rc; 316 } 317 318 TAILQ_FOREACH(driver, &g_pci_drivers, tailq) { 319 dpdk_pci_driver_register(driver, pci_device_init, pci_device_fini); 320 } 321 322 _pci_env_init(); 323 return 0; 324 } 325 326 void 327 pci_env_reinit(void) 328 { 329 /* There is no need to register pci drivers again, since they were 330 * already pre-registered in pci_env_init. 331 */ 332 333 _pci_env_init(); 334 } 335 336 void 337 pci_env_fini(void) 338 { 339 struct spdk_pci_device *dev; 340 char bdf[32]; 341 342 cleanup_pci_devices(); 343 TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { 344 if (dev->internal.attached) { 345 spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr); 346 SPDK_ERRLOG("Device %s is still attached at shutdown!\n", bdf); 347 } 348 } 349 350 if (spdk_process_is_primary()) { 351 rte_dev_event_callback_unregister(NULL, pci_device_rte_dev_event, NULL); 352 } 353 } 354 355 int 356 pci_device_init(struct rte_pci_driver *_drv, 357 struct rte_pci_device *_dev) 358 { 359 struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv; 360 struct spdk_pci_device *dev; 361 struct rte_pci_addr *addr; 362 struct rte_pci_id *id; 363 int rc; 364 365 dev = calloc(1, sizeof(*dev)); 366 if (dev == NULL) { 367 return -1; 368 } 369 370 dev->dev_handle = _dev; 371 372 addr = dpdk_pci_device_get_addr(_dev); 373 dev->addr.domain = addr->domain; 374 dev->addr.bus = addr->bus; 375 dev->addr.dev = addr->devid; 376 dev->addr.func = addr->function; 377 378 id = dpdk_pci_device_get_id(_dev); 379 dev->id.class_id = id->class_id; 380 dev->id.vendor_id = id->vendor_id; 381 dev->id.device_id = id->device_id; 382 dev->id.subvendor_id = id->subsystem_vendor_id; 383 dev->id.subdevice_id = id->subsystem_device_id; 384 385 dev->socket_id = dpdk_pci_device_get_numa_node(_dev); 386 dev->type = "pci"; 387 388 dev->map_bar = map_bar_rte; 389 dev->unmap_bar = unmap_bar_rte; 390 dev->cfg_read = cfg_read_rte; 391 dev->cfg_write = cfg_write_rte; 392 393 dev->internal.driver = driver; 394 dev->internal.claim_fd = -1; 395 396 if (driver->cb_fn != NULL) { 397 rc = driver->cb_fn(driver->cb_arg, dev); 398 if (rc != 0) { 399 free(dev); 400 return rc; 401 } 402 dev->internal.attached = true; 403 } 404 405 pthread_mutex_lock(&g_pci_mutex); 406 TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq); 407 pthread_mutex_unlock(&g_pci_mutex); 408 return 0; 409 } 410 411 static void 412 set_allowed_at(struct rte_devargs *rte_da, uint64_t tsc) 413 { 414 struct env_devargs *env_da; 415 416 env_da = find_env_devargs(rte_da->bus, rte_da->name); 417 if (env_da == NULL) { 418 env_da = calloc(1, sizeof(*env_da)); 419 if (env_da == NULL) { 420 SPDK_ERRLOG("could not set_allowed_at for device %s\n", rte_da->name); 421 return; 422 } 423 env_da->bus = rte_da->bus; 424 spdk_strcpy_pad(env_da->name, rte_da->name, sizeof(env_da->name), 0); 425 TAILQ_INSERT_TAIL(&g_env_devargs, env_da, link); 426 } 427 428 env_da->allowed_at = tsc; 429 } 430 431 static uint64_t 432 get_allowed_at(struct rte_devargs *rte_da) 433 { 434 struct env_devargs *env_da; 435 436 env_da = find_env_devargs(rte_da->bus, rte_da->name); 437 if (env_da) { 438 return env_da->allowed_at; 439 } else { 440 return 0; 441 } 442 } 443 444 int 445 pci_device_fini(struct rte_pci_device *_dev) 446 { 447 struct spdk_pci_device *dev; 448 449 pthread_mutex_lock(&g_pci_mutex); 450 TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { 451 if (dev->dev_handle == _dev) { 452 break; 453 } 454 } 455 456 if (dev == NULL || dev->internal.attached) { 457 /* The device might be still referenced somewhere in SPDK. */ 458 pthread_mutex_unlock(&g_pci_mutex); 459 return -EBUSY; 460 } 461 462 /* remove our allowed_at option */ 463 if (dpdk_pci_device_get_devargs(_dev)) { 464 set_allowed_at(dpdk_pci_device_get_devargs(_dev), 0); 465 } 466 467 /* It is possible that removed flag was already set when there is a race 468 * between the remove notification for this process, and another process 469 * that is also detaching from this same device (for example, when using 470 * nvme driver in multi-process mode. So do not assert here. See 471 * #2456 for additional details. 472 */ 473 dev->internal.removed = true; 474 pthread_mutex_unlock(&g_pci_mutex); 475 return 0; 476 477 } 478 479 void 480 spdk_pci_device_detach(struct spdk_pci_device *dev) 481 { 482 struct spdk_pci_device_provider *provider; 483 484 assert(dev->internal.attached); 485 486 if (dev->internal.claim_fd >= 0) { 487 spdk_pci_device_unclaim(dev); 488 } 489 490 TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) { 491 if (strcmp(dev->type, provider->name) == 0) { 492 break; 493 } 494 } 495 496 assert(provider != NULL); 497 dev->internal.attached = false; 498 provider->detach_cb(dev); 499 500 cleanup_pci_devices(); 501 } 502 503 static int 504 scan_pci_bus(bool delay_init) 505 { 506 struct rte_dev_iterator it; 507 struct rte_device *rte_dev; 508 uint64_t now; 509 510 dpdk_bus_scan(); 511 now = spdk_get_ticks(); 512 513 if (!TAILQ_FIRST(&g_pci_drivers)) { 514 return 0; 515 } 516 517 RTE_DEV_FOREACH(rte_dev, "bus=pci", &it) { 518 struct rte_devargs *da; 519 520 da = dpdk_device_get_devargs(rte_dev); 521 if (!da) { 522 char devargs_str[128]; 523 524 /* the device was never blocked or allowed */ 525 da = calloc(1, sizeof(*da)); 526 if (!da) { 527 return -1; 528 } 529 530 snprintf(devargs_str, sizeof(devargs_str), "pci:%s", dpdk_device_get_name(rte_dev)); 531 if (rte_devargs_parse(da, devargs_str) != 0) { 532 free(da); 533 return -1; 534 } 535 536 rte_devargs_insert(&da); 537 dpdk_device_set_devargs(rte_dev, da); 538 } 539 540 if (get_allowed_at(da)) { 541 uint64_t allowed_at = get_allowed_at(da); 542 543 /* this device was seen by spdk before... */ 544 if (da->policy == RTE_DEV_BLOCKED && allowed_at <= now) { 545 da->policy = RTE_DEV_ALLOWED; 546 } 547 } else if ((dpdk_device_scan_allowed(rte_dev) && da->policy == RTE_DEV_ALLOWED) || 548 da->policy != RTE_DEV_BLOCKED) { 549 /* override the policy only if not permanently blocked */ 550 551 if (delay_init) { 552 da->policy = RTE_DEV_BLOCKED; 553 set_allowed_at(da, now + 2 * spdk_get_ticks_hz()); 554 } else { 555 da->policy = RTE_DEV_ALLOWED; 556 set_allowed_at(da, now); 557 } 558 } 559 } 560 561 return 0; 562 } 563 564 static int 565 pci_attach_rte(const struct spdk_pci_addr *addr) 566 { 567 char bdf[32]; 568 int rc, i = 0; 569 570 spdk_pci_addr_fmt(bdf, sizeof(bdf), addr); 571 572 do { 573 rc = rte_eal_hotplug_add("pci", bdf, ""); 574 } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT); 575 576 if (i > 1 && rc == -EEXIST) { 577 /* Even though the previous request timed out, the device 578 * was attached successfully. 579 */ 580 rc = 0; 581 } 582 583 return rc; 584 } 585 586 static struct spdk_pci_device_provider g_pci_rte_provider = { 587 .name = "pci", 588 .attach_cb = pci_attach_rte, 589 .detach_cb = detach_rte, 590 }; 591 592 SPDK_PCI_REGISTER_DEVICE_PROVIDER(pci, &g_pci_rte_provider); 593 594 int 595 spdk_pci_device_attach(struct spdk_pci_driver *driver, 596 spdk_pci_enum_cb enum_cb, 597 void *enum_ctx, struct spdk_pci_addr *pci_address) 598 { 599 struct spdk_pci_device *dev; 600 struct spdk_pci_device_provider *provider; 601 struct rte_pci_device *rte_dev; 602 struct rte_devargs *da; 603 int rc; 604 605 cleanup_pci_devices(); 606 607 TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { 608 if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) { 609 break; 610 } 611 } 612 613 if (dev != NULL && dev->internal.driver == driver) { 614 pthread_mutex_lock(&g_pci_mutex); 615 if (dev->internal.attached || dev->internal.pending_removal) { 616 pthread_mutex_unlock(&g_pci_mutex); 617 return -1; 618 } 619 620 rc = enum_cb(enum_ctx, dev); 621 if (rc == 0) { 622 dev->internal.attached = true; 623 } 624 pthread_mutex_unlock(&g_pci_mutex); 625 return rc; 626 } 627 628 driver->cb_fn = enum_cb; 629 driver->cb_arg = enum_ctx; 630 631 rc = -ENODEV; 632 TAILQ_FOREACH(provider, &g_pci_device_providers, tailq) { 633 rc = provider->attach_cb(pci_address); 634 if (rc == 0) { 635 break; 636 } 637 } 638 639 driver->cb_arg = NULL; 640 driver->cb_fn = NULL; 641 642 cleanup_pci_devices(); 643 644 if (rc != 0) { 645 return -1; 646 } 647 648 /* explicit attach ignores the allowlist, so if we blocked this 649 * device before let's enable it now - just for clarity. 650 */ 651 TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { 652 if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) { 653 break; 654 } 655 } 656 assert(dev != NULL); 657 658 rte_dev = dev->dev_handle; 659 if (rte_dev != NULL) { 660 da = dpdk_pci_device_get_devargs(rte_dev); 661 if (da && get_allowed_at(da)) { 662 set_allowed_at(da, spdk_get_ticks()); 663 da->policy = RTE_DEV_ALLOWED; 664 } 665 } 666 667 return 0; 668 } 669 670 /* Note: You can call spdk_pci_enumerate from more than one thread 671 * simultaneously safely, but you cannot call spdk_pci_enumerate 672 * and rte_eal_pci_probe simultaneously. 673 */ 674 int 675 spdk_pci_enumerate(struct spdk_pci_driver *driver, 676 spdk_pci_enum_cb enum_cb, 677 void *enum_ctx) 678 { 679 struct spdk_pci_device *dev; 680 int rc; 681 682 cleanup_pci_devices(); 683 684 pthread_mutex_lock(&g_pci_mutex); 685 TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { 686 if (dev->internal.attached || 687 dev->internal.driver != driver || 688 dev->internal.pending_removal) { 689 continue; 690 } 691 692 rc = enum_cb(enum_ctx, dev); 693 if (rc == 0) { 694 dev->internal.attached = true; 695 } else if (rc < 0) { 696 pthread_mutex_unlock(&g_pci_mutex); 697 return -1; 698 } 699 } 700 pthread_mutex_unlock(&g_pci_mutex); 701 702 if (scan_pci_bus(true) != 0) { 703 return -1; 704 } 705 706 driver->cb_fn = enum_cb; 707 driver->cb_arg = enum_ctx; 708 709 if (dpdk_bus_probe() != 0) { 710 driver->cb_arg = NULL; 711 driver->cb_fn = NULL; 712 return -1; 713 } 714 715 driver->cb_arg = NULL; 716 driver->cb_fn = NULL; 717 718 cleanup_pci_devices(); 719 return 0; 720 } 721 722 void 723 spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev)) 724 { 725 struct spdk_pci_device *dev, *tmp; 726 727 pthread_mutex_lock(&g_pci_mutex); 728 TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) { 729 fn(ctx, dev); 730 } 731 pthread_mutex_unlock(&g_pci_mutex); 732 } 733 734 int 735 spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar, 736 void **mapped_addr, uint64_t *phys_addr, uint64_t *size) 737 { 738 int rc; 739 740 rc = dev->map_bar(dev, bar, mapped_addr, phys_addr, size); 741 if (rc) { 742 return rc; 743 } 744 745 #if VFIO_ENABLED 746 /* Automatically map the BAR to the IOMMU */ 747 if (!spdk_iommu_is_enabled()) { 748 return 0; 749 } 750 751 if (rte_eal_iova_mode() == RTE_IOVA_VA) { 752 /* We'll use the virtual address as the iova to match DPDK. */ 753 rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), (uint64_t) * mapped_addr, *size); 754 if (rc) { 755 dev->unmap_bar(dev, bar, *mapped_addr); 756 return -EFAULT; 757 } 758 759 *phys_addr = (uint64_t)(*mapped_addr); 760 } else { 761 /* We'll use the physical address as the iova to match DPDK. */ 762 rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), *phys_addr, *size); 763 if (rc) { 764 dev->unmap_bar(dev, bar, *mapped_addr); 765 return -EFAULT; 766 } 767 } 768 #endif 769 return rc; 770 } 771 772 int 773 spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr) 774 { 775 #if VFIO_ENABLED 776 int rc; 777 778 if (spdk_iommu_is_enabled()) { 779 rc = vtophys_iommu_unmap_dma_bar((uint64_t)addr); 780 if (rc) { 781 return -EFAULT; 782 } 783 } 784 #endif 785 786 return dev->unmap_bar(dev, bar, addr); 787 } 788 789 int 790 spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev) 791 { 792 return dpdk_pci_device_enable_interrupt(dev->dev_handle); 793 } 794 795 int 796 spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev) 797 { 798 return dpdk_pci_device_disable_interrupt(dev->dev_handle); 799 } 800 801 int 802 spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev) 803 { 804 return dpdk_pci_device_get_interrupt_efd(dev->dev_handle); 805 } 806 807 uint32_t 808 spdk_pci_device_get_domain(struct spdk_pci_device *dev) 809 { 810 return dev->addr.domain; 811 } 812 813 uint8_t 814 spdk_pci_device_get_bus(struct spdk_pci_device *dev) 815 { 816 return dev->addr.bus; 817 } 818 819 uint8_t 820 spdk_pci_device_get_dev(struct spdk_pci_device *dev) 821 { 822 return dev->addr.dev; 823 } 824 825 uint8_t 826 spdk_pci_device_get_func(struct spdk_pci_device *dev) 827 { 828 return dev->addr.func; 829 } 830 831 uint16_t 832 spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev) 833 { 834 return dev->id.vendor_id; 835 } 836 837 uint16_t 838 spdk_pci_device_get_device_id(struct spdk_pci_device *dev) 839 { 840 return dev->id.device_id; 841 } 842 843 uint16_t 844 spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev) 845 { 846 return dev->id.subvendor_id; 847 } 848 849 uint16_t 850 spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev) 851 { 852 return dev->id.subdevice_id; 853 } 854 855 struct spdk_pci_id 856 spdk_pci_device_get_id(struct spdk_pci_device *dev) 857 { 858 return dev->id; 859 } 860 861 int 862 spdk_pci_device_get_socket_id(struct spdk_pci_device *dev) 863 { 864 return dev->socket_id; 865 } 866 867 int 868 spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) 869 { 870 return dev->cfg_read(dev, value, len, offset); 871 } 872 873 int 874 spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) 875 { 876 return dev->cfg_write(dev, value, len, offset); 877 } 878 879 int 880 spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset) 881 { 882 return spdk_pci_device_cfg_read(dev, value, 1, offset); 883 } 884 885 int 886 spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset) 887 { 888 return spdk_pci_device_cfg_write(dev, &value, 1, offset); 889 } 890 891 int 892 spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset) 893 { 894 return spdk_pci_device_cfg_read(dev, value, 2, offset); 895 } 896 897 int 898 spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset) 899 { 900 return spdk_pci_device_cfg_write(dev, &value, 2, offset); 901 } 902 903 int 904 spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset) 905 { 906 return spdk_pci_device_cfg_read(dev, value, 4, offset); 907 } 908 909 int 910 spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset) 911 { 912 return spdk_pci_device_cfg_write(dev, &value, 4, offset); 913 } 914 915 int 916 spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len) 917 { 918 int err; 919 uint32_t pos, header = 0; 920 uint32_t i, buf[2]; 921 922 if (len < 17) { 923 return -1; 924 } 925 926 err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE); 927 if (err || !header) { 928 return -1; 929 } 930 931 pos = PCI_CFG_SIZE; 932 while (1) { 933 if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) { 934 if (pos) { 935 /* skip the header */ 936 pos += 4; 937 for (i = 0; i < 2; i++) { 938 err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i); 939 if (err) { 940 return -1; 941 } 942 } 943 snprintf(sn, len, "%08x%08x", buf[1], buf[0]); 944 return 0; 945 } 946 } 947 pos = (header >> 20) & 0xffc; 948 /* 0 if no other items exist */ 949 if (pos < PCI_CFG_SIZE) { 950 return -1; 951 } 952 err = spdk_pci_device_cfg_read32(dev, &header, pos); 953 if (err) { 954 return -1; 955 } 956 } 957 return -1; 958 } 959 960 struct spdk_pci_addr 961 spdk_pci_device_get_addr(struct spdk_pci_device *dev) 962 { 963 return dev->addr; 964 } 965 966 bool 967 spdk_pci_device_is_removed(struct spdk_pci_device *dev) 968 { 969 return dev->internal.pending_removal; 970 } 971 972 int 973 spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2) 974 { 975 if (a1->domain > a2->domain) { 976 return 1; 977 } else if (a1->domain < a2->domain) { 978 return -1; 979 } else if (a1->bus > a2->bus) { 980 return 1; 981 } else if (a1->bus < a2->bus) { 982 return -1; 983 } else if (a1->dev > a2->dev) { 984 return 1; 985 } else if (a1->dev < a2->dev) { 986 return -1; 987 } else if (a1->func > a2->func) { 988 return 1; 989 } else if (a1->func < a2->func) { 990 return -1; 991 } 992 993 return 0; 994 } 995 996 #ifdef __linux__ 997 int 998 spdk_pci_device_claim(struct spdk_pci_device *dev) 999 { 1000 int dev_fd; 1001 char dev_name[64]; 1002 int pid; 1003 void *dev_map; 1004 struct flock pcidev_lock = { 1005 .l_type = F_WRLCK, 1006 .l_whence = SEEK_SET, 1007 .l_start = 0, 1008 .l_len = 0, 1009 }; 1010 1011 snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x", 1012 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func); 1013 1014 dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 1015 if (dev_fd == -1) { 1016 SPDK_ERRLOG("could not open %s\n", dev_name); 1017 return -errno; 1018 } 1019 1020 if (ftruncate(dev_fd, sizeof(int)) != 0) { 1021 SPDK_ERRLOG("could not truncate %s\n", dev_name); 1022 close(dev_fd); 1023 return -errno; 1024 } 1025 1026 dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, 1027 MAP_SHARED, dev_fd, 0); 1028 if (dev_map == MAP_FAILED) { 1029 SPDK_ERRLOG("could not mmap dev %s (%d)\n", dev_name, errno); 1030 close(dev_fd); 1031 return -errno; 1032 } 1033 1034 if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) { 1035 pid = *(int *)dev_map; 1036 SPDK_ERRLOG("Cannot create lock on device %s, probably" 1037 " process %d has claimed it\n", dev_name, pid); 1038 munmap(dev_map, sizeof(int)); 1039 close(dev_fd); 1040 /* F_SETLK returns unspecified errnos, normalize them */ 1041 return -EACCES; 1042 } 1043 1044 *(int *)dev_map = (int)getpid(); 1045 munmap(dev_map, sizeof(int)); 1046 dev->internal.claim_fd = dev_fd; 1047 /* Keep dev_fd open to maintain the lock. */ 1048 return 0; 1049 } 1050 1051 void 1052 spdk_pci_device_unclaim(struct spdk_pci_device *dev) 1053 { 1054 char dev_name[64]; 1055 1056 snprintf(dev_name, sizeof(dev_name), "/var/tmp/spdk_pci_lock_%04x:%02x:%02x.%x", 1057 dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func); 1058 1059 close(dev->internal.claim_fd); 1060 dev->internal.claim_fd = -1; 1061 unlink(dev_name); 1062 } 1063 #else /* !__linux__ */ 1064 int 1065 spdk_pci_device_claim(struct spdk_pci_device *dev) 1066 { 1067 /* TODO */ 1068 return 0; 1069 } 1070 1071 void 1072 spdk_pci_device_unclaim(struct spdk_pci_device *dev) 1073 { 1074 /* TODO */ 1075 } 1076 #endif /* __linux__ */ 1077 1078 int 1079 spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf) 1080 { 1081 unsigned domain, bus, dev, func; 1082 1083 if (addr == NULL || bdf == NULL) { 1084 return -EINVAL; 1085 } 1086 1087 if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) || 1088 (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) { 1089 /* Matched a full address - all variables are initialized */ 1090 } else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) { 1091 func = 0; 1092 } else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) || 1093 (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) { 1094 domain = 0; 1095 } else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) || 1096 (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) { 1097 domain = 0; 1098 func = 0; 1099 } else { 1100 return -EINVAL; 1101 } 1102 1103 if (bus > 0xFF || dev > 0x1F || func > 7) { 1104 return -EINVAL; 1105 } 1106 1107 addr->domain = domain; 1108 addr->bus = bus; 1109 addr->dev = dev; 1110 addr->func = func; 1111 1112 return 0; 1113 } 1114 1115 int 1116 spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr) 1117 { 1118 int rc; 1119 1120 rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x", 1121 addr->domain, addr->bus, 1122 addr->dev, addr->func); 1123 1124 if (rc > 0 && (size_t)rc < sz) { 1125 return 0; 1126 } 1127 1128 return -1; 1129 } 1130 1131 int 1132 spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev) 1133 { 1134 int rc; 1135 1136 assert(dev->map_bar != NULL); 1137 assert(dev->unmap_bar != NULL); 1138 assert(dev->cfg_read != NULL); 1139 assert(dev->cfg_write != NULL); 1140 dev->internal.driver = drv; 1141 1142 if (drv->cb_fn != NULL) { 1143 rc = drv->cb_fn(drv->cb_arg, dev); 1144 if (rc != 0) { 1145 return -ECANCELED; 1146 } 1147 1148 dev->internal.attached = true; 1149 } 1150 1151 TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq); 1152 1153 return 0; 1154 } 1155 1156 void 1157 spdk_pci_unhook_device(struct spdk_pci_device *dev) 1158 { 1159 assert(!dev->internal.attached); 1160 TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq); 1161 } 1162 1163 void 1164 spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider) 1165 { 1166 TAILQ_INSERT_TAIL(&g_pci_device_providers, provider, tailq); 1167 } 1168 1169 const char * 1170 spdk_pci_device_get_type(const struct spdk_pci_device *dev) 1171 { 1172 return dev->type; 1173 } 1174 1175 int 1176 spdk_pci_device_allow(struct spdk_pci_addr *pci_addr) 1177 { 1178 struct rte_devargs *da; 1179 char devargs_str[128]; 1180 1181 da = calloc(1, sizeof(*da)); 1182 if (da == NULL) { 1183 SPDK_ERRLOG("could not allocate rte_devargs\n"); 1184 return -ENOMEM; 1185 } 1186 1187 snprintf(devargs_str, sizeof(devargs_str), "pci:%04x:%02x:%02x.%x", 1188 pci_addr->domain, pci_addr->bus, pci_addr->dev, pci_addr->func); 1189 if (rte_devargs_parse(da, devargs_str) != 0) { 1190 SPDK_ERRLOG("rte_devargs_parse() failed on '%s'\n", devargs_str); 1191 free(da); 1192 return -EINVAL; 1193 } 1194 da->policy = RTE_DEV_ALLOWED; 1195 /* Note: if a devargs already exists for this device address, it just gets 1196 * overridden. So we do not need to check if the devargs already exists. 1197 * DPDK will take care of memory management for the devargs structure after 1198 * it has been inserted, so there's nothing SPDK needs to track. 1199 */ 1200 if (rte_devargs_insert(&da) != 0) { 1201 SPDK_ERRLOG("rte_devargs_insert() failed on '%s'\n", devargs_str); 1202 free(da); 1203 return -EINVAL; 1204 } 1205 1206 return 0; 1207 } 1208