1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2020 Mellanox Technologies, Ltd 3 */ 4 5 #include <unistd.h> 6 #include <string.h> 7 #include <stdio.h> 8 #ifdef RTE_IBVERBS_LINK_DLOPEN 9 #include <dlfcn.h> 10 #endif 11 #include <dirent.h> 12 #include <net/if.h> 13 14 #include <rte_errno.h> 15 #include <rte_string_fns.h> 16 #include <rte_bus_pci.h> 17 #include <rte_bus_auxiliary.h> 18 19 #include "mlx5_common.h" 20 #include "mlx5_nl.h" 21 #include "mlx5_common_log.h" 22 #include "mlx5_common_private.h" 23 #include "mlx5_common_defs.h" 24 #include "mlx5_common_os.h" 25 #include "mlx5_glue.h" 26 27 #ifdef MLX5_GLUE 28 const struct mlx5_glue *mlx5_glue; 29 #endif 30 31 int 32 mlx5_get_pci_addr(const char *dev_path, struct rte_pci_addr *pci_addr) 33 { 34 FILE *file; 35 char line[32]; 36 int rc = -ENOENT; 37 MKSTR(path, "%s/device/uevent", dev_path); 38 39 file = fopen(path, "rb"); 40 if (file == NULL) { 41 rte_errno = errno; 42 return -rte_errno; 43 } 44 while (fgets(line, sizeof(line), file) == line) { 45 size_t len = strlen(line); 46 47 /* Truncate long lines. */ 48 if (len == (sizeof(line) - 1)) { 49 while (line[(len - 1)] != '\n') { 50 int ret = fgetc(file); 51 52 if (ret == EOF) 53 goto exit; 54 line[(len - 1)] = ret; 55 } 56 /* No match for long lines. */ 57 continue; 58 } 59 /* Extract information. */ 60 if (sscanf(line, 61 "PCI_SLOT_NAME=" 62 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 63 &pci_addr->domain, 64 &pci_addr->bus, 65 &pci_addr->devid, 66 &pci_addr->function) == 4) { 67 rc = 0; 68 break; 69 } 70 } 71 exit: 72 fclose(file); 73 if (rc) 74 rte_errno = -rc; 75 return rc; 76 } 77 78 /** 79 * Extract port name, as a number, from sysfs or netlink information. 80 * 81 * @param[in] port_name_in 82 * String representing the port name. 83 * @param[out] port_info_out 84 * Port information, including port name as a number and port name 85 * type if recognized 86 * 87 * @return 88 * port_name field set according to recognized name format. 89 */ 90 void 91 mlx5_translate_port_name(const char *port_name_in, 92 struct mlx5_switch_info *port_info_out) 93 { 94 char ctrl = 0, pf_c1, pf_c2, vf_c1, vf_c2, eol; 95 char *end; 96 int sc_items; 97 98 sc_items = sscanf(port_name_in, "%c%d", 99 &ctrl, &port_info_out->ctrl_num); 100 if (sc_items == 2 && ctrl == 'c') { 101 port_name_in++; /* 'c' */ 102 port_name_in += snprintf(NULL, 0, "%d", 103 port_info_out->ctrl_num); 104 } 105 /* Check for port-name as a string of the form pf0vf0 or pf0sf0 */ 106 sc_items = sscanf(port_name_in, "%c%c%d%c%c%d%c", 107 &pf_c1, &pf_c2, &port_info_out->pf_num, 108 &vf_c1, &vf_c2, &port_info_out->port_name, &eol); 109 if (sc_items == 6 && pf_c1 == 'p' && pf_c2 == 'f') { 110 if (vf_c1 == 'v' && vf_c2 == 'f') { 111 /* Kernel ver >= 5.0 or OFED ver >= 4.6 */ 112 port_info_out->name_type = 113 MLX5_PHYS_PORT_NAME_TYPE_PFVF; 114 return; 115 } 116 if (vf_c1 == 's' && vf_c2 == 'f') { 117 /* Kernel ver >= 5.11 or OFED ver >= 5.1 */ 118 port_info_out->name_type = 119 MLX5_PHYS_PORT_NAME_TYPE_PFSF; 120 return; 121 } 122 } 123 /* 124 * Check for port-name as a string of the form p0 125 * (support kernel ver >= 5.0, or OFED ver >= 4.6). 126 */ 127 sc_items = sscanf(port_name_in, "%c%d%c", 128 &pf_c1, &port_info_out->port_name, &eol); 129 if (sc_items == 2 && pf_c1 == 'p') { 130 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK; 131 return; 132 } 133 /* 134 * Check for port-name as a string of the form pf0 135 * (support kernel ver >= 5.7 for HPF representor on BF). 136 */ 137 sc_items = sscanf(port_name_in, "%c%c%d%c", 138 &pf_c1, &pf_c2, &port_info_out->pf_num, &eol); 139 if (sc_items == 3 && pf_c1 == 'p' && pf_c2 == 'f') { 140 port_info_out->port_name = -1; 141 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFHPF; 142 return; 143 } 144 /* Check for port-name as a number (support kernel ver < 5.0 */ 145 errno = 0; 146 port_info_out->port_name = strtol(port_name_in, &end, 0); 147 if (!errno && 148 (size_t)(end - port_name_in) == strlen(port_name_in)) { 149 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY; 150 return; 151 } 152 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN; 153 } 154 155 int 156 mlx5_get_ifname_sysfs(const char *ibdev_path, char *ifname) 157 { 158 DIR *dir; 159 struct dirent *dent; 160 unsigned int dev_type = 0; 161 unsigned int dev_port_prev = ~0u; 162 char match[IF_NAMESIZE] = ""; 163 164 MLX5_ASSERT(ibdev_path); 165 { 166 MKSTR(path, "%s/device/net", ibdev_path); 167 168 dir = opendir(path); 169 if (dir == NULL) { 170 rte_errno = errno; 171 return -rte_errno; 172 } 173 } 174 while ((dent = readdir(dir)) != NULL) { 175 char *name = dent->d_name; 176 FILE *file; 177 unsigned int dev_port; 178 int r; 179 180 if ((name[0] == '.') && 181 ((name[1] == '\0') || 182 ((name[1] == '.') && (name[2] == '\0')))) 183 continue; 184 185 MKSTR(path, "%s/device/net/%s/%s", 186 ibdev_path, name, 187 (dev_type ? "dev_id" : "dev_port")); 188 189 file = fopen(path, "rb"); 190 if (file == NULL) { 191 if (errno != ENOENT) 192 continue; 193 /* 194 * Switch to dev_id when dev_port does not exist as 195 * is the case with Linux kernel versions < 3.15. 196 */ 197 try_dev_id: 198 match[0] = '\0'; 199 if (dev_type) 200 break; 201 dev_type = 1; 202 dev_port_prev = ~0u; 203 rewinddir(dir); 204 continue; 205 } 206 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 207 fclose(file); 208 if (r != 1) 209 continue; 210 /* 211 * Switch to dev_id when dev_port returns the same value for 212 * all ports. May happen when using a MOFED release older than 213 * 3.0 with a Linux kernel >= 3.15. 214 */ 215 if (dev_port == dev_port_prev) 216 goto try_dev_id; 217 dev_port_prev = dev_port; 218 if (dev_port == 0) 219 strlcpy(match, name, IF_NAMESIZE); 220 } 221 closedir(dir); 222 if (match[0] == '\0') { 223 rte_errno = ENOENT; 224 return -rte_errno; 225 } 226 strncpy(ifname, match, IF_NAMESIZE); 227 return 0; 228 } 229 230 #ifdef MLX5_GLUE 231 232 /** 233 * Suffix RTE_EAL_PMD_PATH with "-glue". 234 * 235 * This function performs a sanity check on RTE_EAL_PMD_PATH before 236 * suffixing its last component. 237 * 238 * @param buf[out] 239 * Output buffer, should be large enough otherwise NULL is returned. 240 * @param size 241 * Size of @p out. 242 * 243 * @return 244 * Pointer to @p buf or @p NULL in case suffix cannot be appended. 245 */ 246 static char * 247 mlx5_glue_path(char *buf, size_t size) 248 { 249 static const char *const bad[] = { "/", ".", "..", NULL }; 250 const char *path = RTE_EAL_PMD_PATH; 251 size_t len = strlen(path); 252 size_t off; 253 int i; 254 255 while (len && path[len - 1] == '/') 256 --len; 257 for (off = len; off && path[off - 1] != '/'; --off) 258 ; 259 for (i = 0; bad[i]; ++i) 260 if (!strncmp(path + off, bad[i], (int)(len - off))) 261 goto error; 262 i = snprintf(buf, size, "%.*s-glue", (int)len, path); 263 if (i == -1 || (size_t)i >= size) 264 goto error; 265 return buf; 266 error: 267 RTE_LOG(ERR, PMD, "unable to append \"-glue\" to last component of" 268 " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"), please" 269 " re-configure DPDK"); 270 return NULL; 271 } 272 273 static int 274 mlx5_glue_dlopen(void) 275 { 276 char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")]; 277 void *handle = NULL; 278 279 char const *path[] = { 280 /* 281 * A basic security check is necessary before trusting 282 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH. 283 */ 284 (geteuid() == getuid() && getegid() == getgid() ? 285 getenv("MLX5_GLUE_PATH") : NULL), 286 /* 287 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed 288 * variant, otherwise let dlopen() look up libraries on its 289 * own. 290 */ 291 (*RTE_EAL_PMD_PATH ? 292 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""), 293 }; 294 unsigned int i = 0; 295 void **sym; 296 const char *dlmsg; 297 298 while (!handle && i != RTE_DIM(path)) { 299 const char *end; 300 size_t len; 301 int ret; 302 303 if (!path[i]) { 304 ++i; 305 continue; 306 } 307 end = strpbrk(path[i], ":;"); 308 if (!end) 309 end = path[i] + strlen(path[i]); 310 len = end - path[i]; 311 ret = 0; 312 do { 313 char name[ret + 1]; 314 315 ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE, 316 (int)len, path[i], 317 (!len || *(end - 1) == '/') ? "" : "/"); 318 if (ret == -1) 319 break; 320 if (sizeof(name) != (size_t)ret + 1) 321 continue; 322 DRV_LOG(DEBUG, "Looking for rdma-core glue as " 323 "\"%s\"", name); 324 handle = dlopen(name, RTLD_LAZY); 325 break; 326 } while (1); 327 path[i] = end + 1; 328 if (!*end) 329 ++i; 330 } 331 if (!handle) { 332 rte_errno = EINVAL; 333 dlmsg = dlerror(); 334 if (dlmsg) 335 DRV_LOG(WARNING, "Cannot load glue library: %s", dlmsg); 336 goto glue_error; 337 } 338 sym = dlsym(handle, "mlx5_glue"); 339 if (!sym || !*sym) { 340 rte_errno = EINVAL; 341 dlmsg = dlerror(); 342 if (dlmsg) 343 DRV_LOG(ERR, "Cannot resolve glue symbol: %s", dlmsg); 344 goto glue_error; 345 } 346 mlx5_glue = *sym; 347 return 0; 348 349 glue_error: 350 if (handle) 351 dlclose(handle); 352 return -1; 353 } 354 355 #endif 356 357 /** 358 * Initialization routine for run-time dependency on rdma-core. 359 */ 360 void 361 mlx5_glue_constructor(void) 362 { 363 /* 364 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use 365 * huge pages. Calling ibv_fork_init() during init allows 366 * applications to use fork() safely for purposes other than 367 * using this PMD, which is not supported in forked processes. 368 */ 369 setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); 370 /* Match the size of Rx completion entry to the size of a cacheline. */ 371 if (RTE_CACHE_LINE_SIZE == 128) 372 setenv("MLX5_CQE_SIZE", "128", 0); 373 /* 374 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to 375 * cleanup all the Verbs resources even when the device was removed. 376 */ 377 setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1); 378 379 #ifdef MLX5_GLUE 380 if (mlx5_glue_dlopen() != 0) 381 goto glue_error; 382 #endif 383 384 #ifdef RTE_LIBRTE_MLX5_DEBUG 385 /* Glue structure must not contain any NULL pointers. */ 386 { 387 unsigned int i; 388 389 for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i) 390 MLX5_ASSERT(((const void *const *)mlx5_glue)[i]); 391 } 392 #endif 393 if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) { 394 rte_errno = EINVAL; 395 DRV_LOG(ERR, "rdma-core glue \"%s\" mismatch: \"%s\" is " 396 "required", mlx5_glue->version, MLX5_GLUE_VERSION); 397 goto glue_error; 398 } 399 mlx5_glue->fork_init(); 400 return; 401 402 glue_error: 403 DRV_LOG(WARNING, "Cannot initialize MLX5 common due to missing" 404 " run-time dependency on rdma-core libraries (libibverbs," 405 " libmlx5)"); 406 mlx5_glue = NULL; 407 } 408 409 static struct ibv_device * 410 mlx5_os_get_ibv_device(const struct rte_pci_addr *addr) 411 { 412 int n; 413 struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n); 414 struct ibv_device *ibv_match = NULL; 415 416 if (ibv_list == NULL) { 417 rte_errno = ENOSYS; 418 return NULL; 419 } 420 while (n-- > 0) { 421 struct rte_pci_addr paddr; 422 423 DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name); 424 if (mlx5_get_pci_addr(ibv_list[n]->ibdev_path, &paddr) != 0) 425 continue; 426 if (rte_pci_addr_cmp(addr, &paddr) != 0) 427 continue; 428 ibv_match = ibv_list[n]; 429 break; 430 } 431 if (ibv_match == NULL) { 432 DRV_LOG(WARNING, 433 "No Verbs device matches PCI device " PCI_PRI_FMT "," 434 " are kernel drivers loaded?", 435 addr->domain, addr->bus, addr->devid, addr->function); 436 rte_errno = ENOENT; 437 } 438 mlx5_glue->free_device_list(ibv_list); 439 return ibv_match; 440 } 441 442 /* Try to disable ROCE by Netlink\Devlink. */ 443 static int 444 mlx5_nl_roce_disable(const char *addr) 445 { 446 int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC); 447 int devlink_id; 448 int enable; 449 int ret; 450 451 if (nlsk_fd < 0) 452 return nlsk_fd; 453 devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd); 454 if (devlink_id < 0) { 455 ret = devlink_id; 456 DRV_LOG(DEBUG, 457 "Failed to get devlink id for ROCE operations by Netlink."); 458 goto close; 459 } 460 ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable); 461 if (ret) { 462 DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.", 463 ret); 464 goto close; 465 } else if (!enable) { 466 DRV_LOG(INFO, "ROCE has already disabled(Netlink)."); 467 goto close; 468 } 469 ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0); 470 if (ret) 471 DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret); 472 else 473 DRV_LOG(INFO, "ROCE is disabled by Netlink successfully."); 474 close: 475 close(nlsk_fd); 476 return ret; 477 } 478 479 /* Try to disable ROCE by sysfs. */ 480 static int 481 mlx5_sys_roce_disable(const char *addr) 482 { 483 FILE *file_o; 484 int enable; 485 int ret; 486 487 MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr); 488 file_o = fopen(file_p, "rb"); 489 if (!file_o) { 490 rte_errno = ENOTSUP; 491 return -ENOTSUP; 492 } 493 ret = fscanf(file_o, "%d", &enable); 494 if (ret != 1) { 495 rte_errno = EINVAL; 496 ret = EINVAL; 497 goto close; 498 } else if (!enable) { 499 ret = 0; 500 DRV_LOG(INFO, "ROCE has already disabled(sysfs)."); 501 goto close; 502 } 503 fclose(file_o); 504 file_o = fopen(file_p, "wb"); 505 if (!file_o) { 506 rte_errno = ENOTSUP; 507 return -ENOTSUP; 508 } 509 fprintf(file_o, "0\n"); 510 ret = 0; 511 close: 512 if (ret) 513 DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret); 514 else 515 DRV_LOG(INFO, "ROCE is disabled by sysfs successfully."); 516 fclose(file_o); 517 return ret; 518 } 519 520 static int 521 mlx5_roce_disable(const struct rte_device *dev) 522 { 523 char pci_addr[PCI_PRI_STR_SIZE] = { 0 }; 524 525 if (mlx5_dev_to_pci_str(dev, pci_addr, sizeof(pci_addr)) < 0) 526 return -rte_errno; 527 /* Firstly try to disable ROCE by Netlink and fallback to sysfs. */ 528 if (mlx5_nl_roce_disable(pci_addr) != 0 && 529 mlx5_sys_roce_disable(pci_addr) != 0) 530 return -rte_errno; 531 return 0; 532 } 533 534 static struct ibv_device * 535 mlx5_os_get_ibv_dev(const struct rte_device *dev) 536 { 537 struct ibv_device *ibv; 538 539 if (mlx5_dev_is_pci(dev)) 540 ibv = mlx5_os_get_ibv_device(&RTE_DEV_TO_PCI_CONST(dev)->addr); 541 else 542 ibv = mlx5_get_aux_ibv_device(RTE_DEV_TO_AUXILIARY_CONST(dev)); 543 if (ibv == NULL) { 544 rte_errno = ENODEV; 545 DRV_LOG(ERR, "Verbs device not found: %s", dev->name); 546 } 547 return ibv; 548 } 549 550 static struct ibv_device * 551 mlx5_vdpa_get_ibv_dev(const struct rte_device *dev) 552 { 553 struct ibv_device *ibv; 554 int retry; 555 556 if (mlx5_roce_disable(dev) != 0) { 557 DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".", 558 dev->name); 559 return NULL; 560 } 561 /* Wait for the IB device to appear again after reload. */ 562 for (retry = MLX5_VDPA_MAX_RETRIES; retry > 0; --retry) { 563 ibv = mlx5_os_get_ibv_dev(dev); 564 if (ibv != NULL) 565 return ibv; 566 usleep(MLX5_VDPA_USEC); 567 } 568 DRV_LOG(ERR, 569 "Cannot get IB device after disabling RoCE for \"%s\", retries exceed %d.", 570 dev->name, MLX5_VDPA_MAX_RETRIES); 571 rte_errno = EAGAIN; 572 return NULL; 573 } 574 575 static int 576 mlx5_config_doorbell_mapping_env(int dbnc) 577 { 578 char *env; 579 int value; 580 581 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 582 /* Get environment variable to store. */ 583 env = getenv(MLX5_SHUT_UP_BF); 584 value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET; 585 if (dbnc == MLX5_ARG_UNSET) 586 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1); 587 else 588 setenv(MLX5_SHUT_UP_BF, 589 dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1); 590 return value; 591 } 592 593 static void 594 mlx5_restore_doorbell_mapping_env(int value) 595 { 596 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 597 /* Restore the original environment variable state. */ 598 if (value == MLX5_ARG_UNSET) 599 unsetenv(MLX5_SHUT_UP_BF); 600 else 601 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); 602 } 603 604 /** 605 * Function API to open IB device. 606 * 607 * 608 * @param cdev 609 * Pointer to the mlx5 device. 610 * @param classes 611 * Chosen classes come from device arguments. 612 * 613 * @return 614 * 0 on success, a negative errno value otherwise and rte_errno is set. 615 */ 616 int 617 mlx5_os_open_device(struct mlx5_common_device *cdev, uint32_t classes) 618 { 619 struct ibv_device *ibv; 620 struct ibv_context *ctx = NULL; 621 int dbmap_env; 622 623 if (classes & MLX5_CLASS_VDPA) 624 ibv = mlx5_vdpa_get_ibv_dev(cdev->dev); 625 else 626 ibv = mlx5_os_get_ibv_dev(cdev->dev); 627 if (!ibv) 628 return -rte_errno; 629 DRV_LOG(INFO, "Dev information matches for device \"%s\".", ibv->name); 630 /* 631 * Configure environment variable "MLX5_BF_SHUT_UP" before the device 632 * creation. The rdma_core library checks the variable at device 633 * creation and stores the result internally. 634 */ 635 dbmap_env = mlx5_config_doorbell_mapping_env(cdev->config.dbnc); 636 /* Try to open IB device with DV first, then usual Verbs. */ 637 errno = 0; 638 ctx = mlx5_glue->dv_open_device(ibv); 639 if (ctx) { 640 cdev->config.devx = 1; 641 DRV_LOG(DEBUG, "DevX is supported."); 642 } else if (classes == MLX5_CLASS_ETH) { 643 /* The environment variable is still configured. */ 644 ctx = mlx5_glue->open_device(ibv); 645 if (ctx == NULL) 646 goto error; 647 DRV_LOG(DEBUG, "DevX is NOT supported."); 648 } else { 649 goto error; 650 } 651 /* The device is created, no need for environment. */ 652 mlx5_restore_doorbell_mapping_env(dbmap_env); 653 /* Hint libmlx5 to use PMD allocator for data plane resources */ 654 mlx5_set_context_attr(cdev->dev, ctx); 655 cdev->ctx = ctx; 656 return 0; 657 error: 658 rte_errno = errno ? errno : ENODEV; 659 /* The device creation is failed, no need for environment. */ 660 mlx5_restore_doorbell_mapping_env(dbmap_env); 661 DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name); 662 return -rte_errno; 663 } 664