1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2020 Mellanox Technologies, Ltd 3 */ 4 5 #include <sys/types.h> 6 #include <unistd.h> 7 #include <string.h> 8 #include <stdio.h> 9 #ifdef RTE_IBVERBS_LINK_DLOPEN 10 #include <dlfcn.h> 11 #endif 12 #include <dirent.h> 13 #include <net/if.h> 14 #include <fcntl.h> 15 16 #include <rte_errno.h> 17 #include <rte_string_fns.h> 18 #include <rte_bus_pci.h> 19 #include <rte_bus_auxiliary.h> 20 21 #include "mlx5_common.h" 22 #include "mlx5_nl.h" 23 #include "mlx5_common_log.h" 24 #include "mlx5_common_private.h" 25 #include "mlx5_common_defs.h" 26 #include "mlx5_common_os.h" 27 #include "mlx5_glue.h" 28 29 #ifdef MLX5_GLUE 30 const struct mlx5_glue *mlx5_glue; 31 #endif 32 33 int 34 mlx5_get_pci_addr(const char *dev_path, struct rte_pci_addr *pci_addr) 35 { 36 FILE *file; 37 char line[32]; 38 int rc = -ENOENT; 39 MKSTR(path, "%s/device/uevent", dev_path); 40 41 file = fopen(path, "rb"); 42 if (file == NULL) { 43 rte_errno = errno; 44 return -rte_errno; 45 } 46 while (fgets(line, sizeof(line), file) == line) { 47 size_t len = strlen(line); 48 49 /* Truncate long lines. */ 50 if (len == (sizeof(line) - 1)) { 51 while (line[(len - 1)] != '\n') { 52 int ret = fgetc(file); 53 54 if (ret == EOF) 55 goto exit; 56 line[(len - 1)] = ret; 57 } 58 /* No match for long lines. */ 59 continue; 60 } 61 /* Extract information. */ 62 if (sscanf(line, 63 "PCI_SLOT_NAME=" 64 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 65 &pci_addr->domain, 66 &pci_addr->bus, 67 &pci_addr->devid, 68 &pci_addr->function) == 4) { 69 rc = 0; 70 break; 71 } 72 } 73 exit: 74 fclose(file); 75 if (rc) 76 rte_errno = -rc; 77 return rc; 78 } 79 80 /** 81 * Extract port name, as a number, from sysfs or netlink information. 82 * 83 * @param[in] port_name_in 84 * String representing the port name. 
* @param[out] port_info_out
 *   Port information, including port name as a number and port name
 *   type if recognized. The name_type field is always assigned; it is
 *   MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN when no known format matches.
 *
 * @note
 *   The function returns nothing; every result is reported through
 *   @p port_info_out.
 */
void
mlx5_translate_port_name(const char *port_name_in,
			 struct mlx5_switch_info *port_info_out)
{
	char ctrl = 0, pf_c1, pf_c2, vf_c1, vf_c2, eol;
	char *end;
	int sc_items;

	/* Optional controller prefix "c<N>": when present, step over it. */
	sc_items = sscanf(port_name_in, "%c%d",
			  &ctrl, &port_info_out->ctrl_num);
	if (sc_items == 2 && ctrl == 'c') {
		port_name_in++; /* 'c' */
		/*
		 * snprintf(NULL, 0, ...) yields the number of characters
		 * needed to print ctrl_num, used here as the prefix length.
		 * NOTE(review): this differs from the consumed length if the
		 * number was written with leading zeros or a sign - confirm
		 * such names cannot occur.
		 */
		port_name_in += snprintf(NULL, 0, "%d",
					 port_info_out->ctrl_num);
	}
	/*
	 * Check for port-name as a string of the form pf0vf0 or pf0sf0.
	 * The trailing %c (eol) only converts when extra characters follow,
	 * so exactly 6 conversions means the whole string was consumed.
	 */
	sc_items = sscanf(port_name_in, "%c%c%d%c%c%d%c",
			  &pf_c1, &pf_c2, &port_info_out->pf_num,
			  &vf_c1, &vf_c2, &port_info_out->port_name, &eol);
	if (sc_items == 6 && pf_c1 == 'p' && pf_c2 == 'f') {
		if (vf_c1 == 'v' && vf_c2 == 'f') {
			/* Kernel ver >= 5.0 or OFED ver >= 4.6 */
			port_info_out->name_type =
					MLX5_PHYS_PORT_NAME_TYPE_PFVF;
			return;
		}
		if (vf_c1 == 's' && vf_c2 == 'f') {
			/* Kernel ver >= 5.11 or OFED ver >= 5.1 */
			port_info_out->name_type =
					MLX5_PHYS_PORT_NAME_TYPE_PFSF;
			return;
		}
	}
	/*
	 * Check for port-name as a string of the form p0
	 * (support kernel ver >= 5.0, or OFED ver >= 4.6).
	 */
	sc_items = sscanf(port_name_in, "%c%d%c",
			  &pf_c1, &port_info_out->port_name, &eol);
	if (sc_items == 2 && pf_c1 == 'p') {
		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
		return;
	}
	/*
	 * Check for port-name as a string of the form pf0
	 * (support kernel ver >= 5.7 for HPF representor on BF).
	 */
	sc_items = sscanf(port_name_in, "%c%c%d%c",
			  &pf_c1, &pf_c2, &port_info_out->pf_num, &eol);
	if (sc_items == 3 && pf_c1 == 'p' && pf_c2 == 'f') {
		/* No port number exists in this format. */
		port_info_out->port_name = -1;
		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFHPF;
		return;
	}
	/* Check for port-name as a number (support kernel ver < 5.0). */
	errno = 0;
	port_info_out->port_name = strtol(port_name_in, &end, 0);
	/* Accept only when strtol() succeeded and consumed the whole string. */
	if (!errno &&
	    (size_t)(end - port_name_in) == strlen(port_name_in)) {
		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
		return;
	}
	port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
}

/**
 * Get the network interface name of an IB device from sysfs.
 *
 * Every entry of "<ibdev_path>/device/net" is inspected and the one whose
 * "dev_port" attribute (or "dev_id" as a fallback) reads 0 is selected.
 *
 * @param[in] ibdev_path
 *   Sysfs directory of the IB device.
 * @param[out] ifname
 *   Receives the interface name; IF_NAMESIZE bytes are written.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set
 *   (-ENOENT when no interface was selected).
 */
int
mlx5_get_ifname_sysfs(const char *ibdev_path, char *ifname)
{
	DIR *dir;
	struct dirent *dent;
	/* 0: read "dev_port" (decimal), 1: read "dev_id" (hexadecimal). */
	unsigned int dev_type = 0;
	unsigned int dev_port_prev = ~0u;
	char match[IF_NAMESIZE] = "";

	MLX5_ASSERT(ibdev_path);
	{
		MKSTR(path, "%s/device/net", ibdev_path);

		dir = opendir(path);
		if (dir == NULL) {
			rte_errno = errno;
			return -rte_errno;
		}
	}
	while ((dent = readdir(dir)) != NULL) {
		char *name = dent->d_name;
		FILE *file;
		unsigned int dev_port;
		int r;

		/* Skip the "." and ".." entries. */
		if ((name[0] == '.') &&
		    ((name[1] == '\0') ||
		     ((name[1] == '.') && (name[2] == '\0'))))
			continue;

		MKSTR(path, "%s/device/net/%s/%s",
		      ibdev_path, name,
		      (dev_type ? "dev_id" : "dev_port"));

		file = fopen(path, "rb");
		if (file == NULL) {
			/* Errors other than a missing file just skip the entry. */
			if (errno != ENOENT)
				continue;
			/*
			 * Switch to dev_id when dev_port does not exist as
			 * is the case with Linux kernel versions < 3.15.
			 */
try_dev_id:
			/*
			 * Also reached by goto from below. Drop any match
			 * found so far; give up if dev_id was already in
			 * use, otherwise restart the scan using dev_id.
			 */
			match[0] = '\0';
			if (dev_type)
				break;
			dev_type = 1;
			dev_port_prev = ~0u;
			rewinddir(dir);
			continue;
		}
		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
		fclose(file);
		if (r != 1)
			continue;
		/*
		 * Switch to dev_id when dev_port returns the same value for
		 * all ports. May happen when using a MOFED release older than
		 * 3.0 with a Linux kernel >= 3.15.
		 */
		if (dev_port == dev_port_prev)
			goto try_dev_id;
		dev_port_prev = dev_port;
		if (dev_port == 0)
			strlcpy(match, name, IF_NAMESIZE);
	}
	closedir(dir);
	if (match[0] == '\0') {
		rte_errno = ENOENT;
		return -rte_errno;
	}
	/* match is NUL-terminated within IF_NAMESIZE bytes (see strlcpy). */
	strncpy(ifname, match, IF_NAMESIZE);
	return 0;
}

#ifdef MLX5_GLUE

/**
 * Suffix RTE_EAL_PMD_PATH with "-glue".
 *
 * This function performs a sanity check on RTE_EAL_PMD_PATH before
 * suffixing its last component.
 *
 * @param[out] buf
 *   Output buffer, should be large enough otherwise NULL is returned.
 * @param size
 *   Size of @p buf.
 *
 * @return
 *   Pointer to @p buf or @p NULL in case suffix cannot be appended.
 */
static char *
mlx5_glue_path(char *buf, size_t size)
{
	static const char *const bad[] = { "/", ".", "..", NULL };
	const char *path = RTE_EAL_PMD_PATH;
	size_t len = strlen(path);
	size_t off;
	int i;

	/* Ignore trailing slashes. */
	while (len && path[len - 1] == '/')
		--len;
	/* Find where the last path component starts. */
	for (off = len; off && path[off - 1] != '/'; --off)
		;
	/*
	 * The comparison is limited to the component length, so a component
	 * that is a prefix of a bad entry (including an empty component) is
	 * rejected as well.
	 */
	for (i = 0; bad[i]; ++i)
		if (!strncmp(path + off, bad[i], (int)(len - off)))
			goto error;
	i = snprintf(buf, size, "%.*s-glue", (int)len, path);
	/* Fail on formatting error or when the result was truncated. */
	if (i == -1 || (size_t)i >= size)
		goto error;
	return buf;
error:
	RTE_LOG(ERR, PMD, "unable to append \"-glue\" to last component of"
		" RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"), please"
		" re-configure DPDK");
	return NULL;
}

/**
 * Load the rdma-core glue library and resolve its "mlx5_glue" symbol.
 *
 * Candidate directories are taken first from the MLX5_GLUE_PATH environment
 * variable (only when effective and real user/group IDs match), then from
 * the glue-suffixed RTE_EAL_PMD_PATH, or from the dlopen() default lookup
 * when RTE_EAL_PMD_PATH is empty. A candidate may hold several directories
 * separated by ':' or ';'.
 *
 * @return
 *   0 on success with mlx5_glue assigned, -1 otherwise and rte_errno is set
 *   to EINVAL.
 */
static int
mlx5_glue_dlopen(void)
{
	char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
	void *handle = NULL;

	char const *path[] = {
		/*
		 * A basic security check is necessary before trusting
		 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
		 */
		(geteuid() == getuid() && getegid() == getgid() ?
		 getenv("MLX5_GLUE_PATH") : NULL),
		/*
		 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
		 * variant, otherwise let dlopen() look up libraries on its
		 * own.
		 */
		(*RTE_EAL_PMD_PATH ?
		 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
	};
	unsigned int i = 0;
	void **sym;
	const char *dlmsg;

	while (!handle && i != RTE_DIM(path)) {
		const char *end;
		size_t len;
		int ret;

		/* Skip candidates that are not available. */
		if (!path[i]) {
			++i;
			continue;
		}
		/* Isolate the next directory of a ':' or ';' separated list. */
		end = strpbrk(path[i], ":;");
		if (!end)
			end = path[i] + strlen(path[i]);
		len = end - path[i];
		ret = 0;
		/*
		 * Two-pass formatting: the first pass uses a 1-byte buffer
		 * only to learn the required length; "continue" then
		 * re-enters the body with a buffer of exactly that size.
		 */
		do {
			char name[ret + 1];

			ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
				       (int)len, path[i],
				       (!len || *(end - 1) == '/') ? "" : "/");
			if (ret == -1)
				break;
			if (sizeof(name) != (size_t)ret + 1)
				continue;
			DRV_LOG(DEBUG, "Looking for rdma-core glue as "
				"\"%s\"", name);
			handle = dlopen(name, RTLD_LAZY);
			break;
		} while (1);
		/* Advance within the list, or to the next candidate at its end. */
		path[i] = end + 1;
		if (!*end)
			++i;
	}
	if (!handle) {
		rte_errno = EINVAL;
		dlmsg = dlerror();
		if (dlmsg)
			DRV_LOG(WARNING, "Cannot load glue library: %s", dlmsg);
		goto glue_error;
	}
	sym = dlsym(handle, "mlx5_glue");
	if (!sym || !*sym) {
		rte_errno = EINVAL;
		dlmsg = dlerror();
		if (dlmsg)
			DRV_LOG(ERR, "Cannot resolve glue symbol: %s", dlmsg);
		goto glue_error;
	}
	mlx5_glue = *sym;
	return 0;

glue_error:
	if (handle)
		dlclose(handle);
	return -1;
}

#endif

/**
 * Initialization routine for run-time dependency on rdma-core.
 *
 * Sets the environment variables consumed by rdma-core, loads the glue
 * library when MLX5_GLUE is defined, verifies the glue version and calls
 * the glue fork_init() hook. On failure mlx5_glue is set to NULL.
 */
void
mlx5_glue_constructor(void)
{
	/*
	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
	 * huge pages. Calling ibv_fork_init() during init allows
	 * applications to use fork() safely for purposes other than
	 * using this PMD, which is not supported in forked processes.
	 */
	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
	/*
	 * Match the size of Rx completion entry to the size of a cacheline.
	 * The last argument 0 keeps a value already present in the
	 * environment.
	 */
	if (RTE_CACHE_LINE_SIZE == 128)
		setenv("MLX5_CQE_SIZE", "128", 0);
	/*
	 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to
	 * cleanup all the Verbs resources even when the device was removed.
	 */
	setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1);

#ifdef MLX5_GLUE
	if (mlx5_glue_dlopen() != 0)
		goto glue_error;
#endif

#ifdef RTE_LIBRTE_MLX5_DEBUG
	/* Glue structure must not contain any NULL pointers. */
	{
		unsigned int i;

		/* The structure is walked as an array of pointer-sized slots. */
		for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
			MLX5_ASSERT(((const void *const *)mlx5_glue)[i]);
	}
#endif
	if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
		rte_errno = EINVAL;
		DRV_LOG(ERR, "rdma-core glue \"%s\" mismatch: \"%s\" is "
			"required", mlx5_glue->version, MLX5_GLUE_VERSION);
		goto glue_error;
	}
	mlx5_glue->fork_init();
	return;

glue_error:
	DRV_LOG(WARNING, "Cannot initialize MLX5 common due to missing"
		" run-time dependency on rdma-core libraries (libibverbs,"
		" libmlx5)");
	mlx5_glue = NULL;
}

/**
 * Validate user arguments for remote PD and CTX.
 *
 * @param config
 *   Pointer to device configuration structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
419 */ 420 int 421 mlx5_os_remote_pd_and_ctx_validate(struct mlx5_common_dev_config *config) 422 { 423 int device_fd = config->device_fd; 424 int pd_handle = config->pd_handle; 425 426 #ifdef HAVE_MLX5_IBV_IMPORT_CTX_PD_AND_MR 427 if (device_fd == MLX5_ARG_UNSET && pd_handle != MLX5_ARG_UNSET) { 428 DRV_LOG(ERR, "Remote PD without CTX is not supported."); 429 rte_errno = EINVAL; 430 return -rte_errno; 431 } 432 if (device_fd != MLX5_ARG_UNSET && pd_handle == MLX5_ARG_UNSET) { 433 DRV_LOG(ERR, "Remote CTX without PD is not supported."); 434 rte_errno = EINVAL; 435 return -rte_errno; 436 } 437 DRV_LOG(DEBUG, "Remote PD and CTX is supported: (cmd_fd=%d, " 438 "pd_handle=%d).", device_fd, pd_handle); 439 #else 440 if (pd_handle != MLX5_ARG_UNSET || device_fd != MLX5_ARG_UNSET) { 441 DRV_LOG(ERR, 442 "Remote PD and CTX is not supported - maybe old rdma-core version?"); 443 rte_errno = ENOTSUP; 444 return -rte_errno; 445 } 446 #endif 447 return 0; 448 } 449 450 /** 451 * Release Protection Domain object. 452 * 453 * @param[out] cdev 454 * Pointer to the mlx5 device. 455 * 456 * @return 457 * 0 on success, a negative errno value otherwise. 458 */ 459 int 460 mlx5_os_pd_release(struct mlx5_common_device *cdev) 461 { 462 if (cdev->config.pd_handle == MLX5_ARG_UNSET) 463 return mlx5_glue->dealloc_pd(cdev->pd); 464 else 465 return mlx5_glue->unimport_pd(cdev->pd); 466 } 467 468 /** 469 * Allocate Protection Domain object. 470 * 471 * @param[out] cdev 472 * Pointer to the mlx5 device. 473 * 474 * @return 475 * 0 on success, a negative errno value otherwise. 476 */ 477 static int 478 mlx5_os_pd_create(struct mlx5_common_device *cdev) 479 { 480 cdev->pd = mlx5_glue->alloc_pd(cdev->ctx); 481 if (cdev->pd == NULL) { 482 DRV_LOG(ERR, "Failed to allocate PD: %s", rte_strerror(errno)); 483 return errno ? -errno : -ENOMEM; 484 } 485 return 0; 486 } 487 488 /** 489 * Import Protection Domain object according to given PD handle. 
490 * 491 * @param[out] cdev 492 * Pointer to the mlx5 device. 493 * 494 * @return 495 * 0 on success, a negative errno value otherwise. 496 */ 497 static int 498 mlx5_os_pd_import(struct mlx5_common_device *cdev) 499 { 500 cdev->pd = mlx5_glue->import_pd(cdev->ctx, cdev->config.pd_handle); 501 if (cdev->pd == NULL) { 502 DRV_LOG(ERR, "Failed to import PD using handle=%d: %s", 503 cdev->config.pd_handle, rte_strerror(errno)); 504 return errno ? -errno : -ENOMEM; 505 } 506 return 0; 507 } 508 509 /** 510 * Prepare Protection Domain object and extract its pdn using DV API. 511 * 512 * @param[out] cdev 513 * Pointer to the mlx5 device. 514 * 515 * @return 516 * 0 on success, a negative errno value otherwise and rte_errno is set. 517 */ 518 int 519 mlx5_os_pd_prepare(struct mlx5_common_device *cdev) 520 { 521 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 522 struct mlx5dv_obj obj; 523 struct mlx5dv_pd pd_info; 524 #endif 525 int ret; 526 527 if (cdev->config.pd_handle == MLX5_ARG_UNSET) 528 ret = mlx5_os_pd_create(cdev); 529 else 530 ret = mlx5_os_pd_import(cdev); 531 if (ret) { 532 rte_errno = -ret; 533 return ret; 534 } 535 if (cdev->config.devx == 0) 536 return 0; 537 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 538 obj.pd.in = cdev->pd; 539 obj.pd.out = &pd_info; 540 ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD); 541 if (ret != 0) { 542 DRV_LOG(ERR, "Fail to get PD object info."); 543 rte_errno = errno; 544 claim_zero(mlx5_os_pd_release(cdev)); 545 cdev->pd = NULL; 546 return -rte_errno; 547 } 548 cdev->pdn = pd_info.pdn; 549 return 0; 550 #else 551 DRV_LOG(ERR, "Cannot get pdn - no DV support."); 552 rte_errno = ENOTSUP; 553 return -rte_errno; 554 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */ 555 } 556 557 static struct ibv_device * 558 mlx5_os_get_ibv_device(const struct rte_pci_addr *addr) 559 { 560 int n; 561 struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n); 562 struct ibv_device *ibv_match = NULL; 563 uint8_t guid1[32] = {0}; 564 uint8_t guid2[32] = {0}; 565 int ret1, ret2 = 
-1; 566 struct rte_pci_addr paddr; 567 568 if (ibv_list == NULL || !n) { 569 rte_errno = ENOSYS; 570 if (ibv_list) 571 mlx5_glue->free_device_list(ibv_list); 572 return NULL; 573 } 574 ret1 = mlx5_get_device_guid(addr, guid1, sizeof(guid1)); 575 while (n-- > 0) { 576 DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name); 577 if (mlx5_get_pci_addr(ibv_list[n]->ibdev_path, &paddr) != 0) 578 continue; 579 if (ret1 > 0) 580 ret2 = mlx5_get_device_guid(&paddr, guid2, sizeof(guid2)); 581 /* Bond device can bond secondary PCIe */ 582 if ((strstr(ibv_list[n]->name, "bond") && 583 ((ret1 > 0 && ret2 > 0 && !memcmp(guid1, guid2, sizeof(guid1))) || 584 (addr->domain == paddr.domain && addr->bus == paddr.bus && 585 addr->devid == paddr.devid))) || 586 !rte_pci_addr_cmp(addr, &paddr)) { 587 ibv_match = ibv_list[n]; 588 break; 589 } 590 } 591 if (ibv_match == NULL) { 592 DRV_LOG(WARNING, 593 "No Verbs device matches PCI device " PCI_PRI_FMT "," 594 " are kernel drivers loaded?", 595 addr->domain, addr->bus, addr->devid, addr->function); 596 rte_errno = ENOENT; 597 } 598 mlx5_glue->free_device_list(ibv_list); 599 return ibv_match; 600 } 601 602 /* Try to disable ROCE by Netlink\Devlink. 
*/
static int
mlx5_nl_roce_disable(const char *addr)
{
	const int sock = mlx5_nl_init(NETLINK_GENERIC, 0);
	int roce_on;
	int family;
	int rc;

	if (sock < 0)
		return sock;
	family = mlx5_nl_devlink_family_id_get(sock);
	if (family < 0) {
		/* The negative family id doubles as the error code. */
		rc = family;
		DRV_LOG(DEBUG,
			"Failed to get devlink id for ROCE operations by Netlink.");
	} else {
		rc = mlx5_nl_enable_roce_get(sock, family, addr, &roce_on);
		if (rc != 0) {
			DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
				rc);
		} else if (roce_on == 0) {
			/* Nothing to change, rc is already 0. */
			DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
		} else {
			rc = mlx5_nl_enable_roce_set(sock, family, addr, 0);
			if (rc != 0)
				DRV_LOG(DEBUG,
					"Failed to disable ROCE by Netlink: %d.", rc);
			else
				DRV_LOG(INFO,
					"ROCE is disabled by Netlink successfully.");
		}
	}
	/* The socket is released on every path past its creation. */
	close(sock);
	return rc;
}

/* Try to disable ROCE by sysfs.
*/ 640 static int 641 mlx5_sys_roce_disable(const char *addr) 642 { 643 FILE *file_o; 644 int enable; 645 int ret; 646 647 MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr); 648 file_o = fopen(file_p, "rb"); 649 if (!file_o) { 650 rte_errno = ENOTSUP; 651 return -ENOTSUP; 652 } 653 ret = fscanf(file_o, "%d", &enable); 654 if (ret != 1) { 655 rte_errno = EINVAL; 656 ret = EINVAL; 657 goto close; 658 } else if (!enable) { 659 ret = 0; 660 DRV_LOG(INFO, "ROCE has already disabled(sysfs)."); 661 goto close; 662 } 663 fclose(file_o); 664 file_o = fopen(file_p, "wb"); 665 if (!file_o) { 666 rte_errno = ENOTSUP; 667 return -ENOTSUP; 668 } 669 fprintf(file_o, "0\n"); 670 ret = 0; 671 close: 672 if (ret) 673 DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret); 674 else 675 DRV_LOG(INFO, "ROCE is disabled by sysfs successfully."); 676 fclose(file_o); 677 return ret; 678 } 679 680 static int 681 mlx5_roce_disable(const struct rte_device *dev) 682 { 683 char pci_addr[PCI_PRI_STR_SIZE] = { 0 }; 684 685 if (mlx5_dev_to_pci_str(dev, pci_addr, sizeof(pci_addr)) < 0) 686 return -rte_errno; 687 /* Firstly try to disable ROCE by Netlink and fallback to sysfs. 
*/ 688 if (mlx5_nl_roce_disable(pci_addr) != 0 && 689 mlx5_sys_roce_disable(pci_addr) != 0) 690 return -rte_errno; 691 return 0; 692 } 693 694 static struct ibv_device * 695 mlx5_os_get_ibv_dev(const struct rte_device *dev) 696 { 697 struct ibv_device *ibv; 698 699 if (mlx5_dev_is_pci(dev)) 700 ibv = mlx5_os_get_ibv_device(&RTE_DEV_TO_PCI_CONST(dev)->addr); 701 else 702 ibv = mlx5_get_aux_ibv_device(RTE_DEV_TO_AUXILIARY_CONST(dev)); 703 if (ibv == NULL) { 704 rte_errno = ENODEV; 705 DRV_LOG(ERR, "Verbs device not found: %s", dev->name); 706 } 707 return ibv; 708 } 709 710 static struct ibv_device * 711 mlx5_vdpa_get_ibv_dev(const struct rte_device *dev) 712 { 713 struct ibv_device *ibv; 714 int retry; 715 716 if (mlx5_roce_disable(dev) != 0) { 717 DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".", 718 dev->name); 719 return NULL; 720 } 721 /* Wait for the IB device to appear again after reload. */ 722 for (retry = MLX5_VDPA_MAX_RETRIES; retry > 0; --retry) { 723 ibv = mlx5_os_get_ibv_dev(dev); 724 if (ibv != NULL) 725 return ibv; 726 usleep(MLX5_VDPA_USEC); 727 } 728 DRV_LOG(ERR, 729 "Cannot get IB device after disabling RoCE for \"%s\", retries exceed %d.", 730 dev->name, MLX5_VDPA_MAX_RETRIES); 731 rte_errno = EAGAIN; 732 return NULL; 733 } 734 735 static int 736 mlx5_config_doorbell_mapping_env(int dbnc) 737 { 738 char *env; 739 int value; 740 741 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 742 /* Get environment variable to store. */ 743 env = getenv(MLX5_SHUT_UP_BF); 744 value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET; 745 if (dbnc == MLX5_ARG_UNSET) 746 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1); 747 else 748 setenv(MLX5_SHUT_UP_BF, 749 dbnc == MLX5_SQ_DB_NCACHED ? "1" : "0", 1); 750 return value; 751 } 752 753 static void 754 mlx5_restore_doorbell_mapping_env(int value) 755 { 756 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 757 /* Restore the original environment variable state. 
*/ 758 if (value == MLX5_ARG_UNSET) 759 unsetenv(MLX5_SHUT_UP_BF); 760 else 761 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); 762 } 763 764 /** 765 * Function API to open IB device. 766 * 767 * @param cdev 768 * Pointer to the mlx5 device. 769 * @param classes 770 * Chosen classes come from device arguments. 771 * 772 * @return 773 * Pointer to ibv_context on success, NULL otherwise and rte_errno is set. 774 */ 775 static struct ibv_context * 776 mlx5_open_device(struct mlx5_common_device *cdev, uint32_t classes) 777 { 778 struct ibv_device *ibv; 779 struct ibv_context *ctx = NULL; 780 int dbmap_env; 781 782 MLX5_ASSERT(cdev->config.device_fd == MLX5_ARG_UNSET); 783 if (classes & MLX5_CLASS_VDPA) 784 ibv = mlx5_vdpa_get_ibv_dev(cdev->dev); 785 else 786 ibv = mlx5_os_get_ibv_dev(cdev->dev); 787 if (!ibv) 788 return NULL; 789 DRV_LOG(INFO, "Dev information matches for device \"%s\".", ibv->name); 790 /* 791 * Configure environment variable "MLX5_BF_SHUT_UP" before the device 792 * creation. The rdma_core library checks the variable at device 793 * creation and stores the result internally. 794 */ 795 dbmap_env = mlx5_config_doorbell_mapping_env(cdev->config.dbnc); 796 /* Try to open IB device with DV first, then usual Verbs. */ 797 errno = 0; 798 ctx = mlx5_glue->dv_open_device(ibv); 799 if (ctx) { 800 cdev->config.devx = 1; 801 } else if (classes == MLX5_CLASS_ETH) { 802 /* The environment variable is still configured. */ 803 ctx = mlx5_glue->open_device(ibv); 804 if (ctx == NULL) 805 goto error; 806 } else { 807 goto error; 808 } 809 /* The device is created, no need for environment. */ 810 mlx5_restore_doorbell_mapping_env(dbmap_env); 811 return ctx; 812 error: 813 rte_errno = errno ? errno : ENODEV; 814 /* The device creation is failed, no need for environment. */ 815 mlx5_restore_doorbell_mapping_env(dbmap_env); 816 DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name); 817 return NULL; 818 } 819 820 /** 821 * Function API to import IB device. 
822 * 823 * @param cdev 824 * Pointer to the mlx5 device. 825 * 826 * @return 827 * Pointer to ibv_context on success, NULL otherwise and rte_errno is set. 828 */ 829 static struct ibv_context * 830 mlx5_import_device(struct mlx5_common_device *cdev) 831 { 832 struct ibv_context *ctx = NULL; 833 834 MLX5_ASSERT(cdev->config.device_fd != MLX5_ARG_UNSET); 835 ctx = mlx5_glue->import_device(cdev->config.device_fd); 836 if (!ctx) { 837 DRV_LOG(ERR, "Failed to import device for fd=%d: %s", 838 cdev->config.device_fd, rte_strerror(errno)); 839 rte_errno = errno; 840 } 841 return ctx; 842 } 843 844 /** 845 * Function API to prepare IB device. 846 * 847 * @param cdev 848 * Pointer to the mlx5 device. 849 * @param classes 850 * Chosen classes come from device arguments. 851 * 852 * @return 853 * 0 on success, a negative errno value otherwise and rte_errno is set. 854 */ 855 int 856 mlx5_os_open_device(struct mlx5_common_device *cdev, uint32_t classes) 857 { 858 859 struct ibv_context *ctx = NULL; 860 861 if (cdev->config.device_fd == MLX5_ARG_UNSET) 862 ctx = mlx5_open_device(cdev, classes); 863 else 864 ctx = mlx5_import_device(cdev); 865 if (ctx == NULL) 866 return -rte_errno; 867 /* Hint libmlx5 to use PMD allocator for data plane resources */ 868 mlx5_set_context_attr(cdev->dev, ctx); 869 cdev->ctx = ctx; 870 return 0; 871 } 872 873 int 874 mlx5_get_device_guid(const struct rte_pci_addr *dev, uint8_t *guid, size_t len) 875 { 876 char tmp[512]; 877 char cur_ifname[IF_NAMESIZE + 1]; 878 FILE *id_file; 879 DIR *dir; 880 struct dirent *ptr; 881 int ret; 882 883 if (guid == NULL || len < sizeof(u_int64_t) + 1) 884 return -1; 885 memset(guid, 0, len); 886 snprintf(tmp, sizeof(tmp), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/net", 887 dev->domain, dev->bus, dev->devid, dev->function); 888 dir = opendir(tmp); 889 if (dir == NULL) 890 return -1; 891 /* Traverse to identify PF interface */ 892 do { 893 ptr = readdir(dir); 894 if (ptr == NULL || ptr->d_type != DT_DIR) { 895 
closedir(dir); 896 return -1; 897 } 898 } while (strchr(ptr->d_name, '.') || strchr(ptr->d_name, '_') || 899 strchr(ptr->d_name, 'v')); 900 snprintf(cur_ifname, sizeof(cur_ifname), "%s", ptr->d_name); 901 closedir(dir); 902 snprintf(tmp + strlen(tmp), sizeof(tmp) - strlen(tmp), 903 "/%s/phys_switch_id", cur_ifname); 904 /* Older OFED like 5.3 doesn't support read */ 905 id_file = fopen(tmp, "r"); 906 if (!id_file) 907 return 0; 908 ret = fscanf(id_file, "%16s", guid); 909 fclose(id_file); 910 return ret; 911 } 912 913 /* 914 * Create direct mkey using the kernel ibv_reg_mr API and wrap it with a new 915 * indirect mkey created by the DevX API. 916 * This mkey should be used for DevX commands requesting mkey as a parameter. 917 */ 918 int 919 mlx5_os_wrapped_mkey_create(void *ctx, void *pd, uint32_t pdn, void *addr, 920 size_t length, struct mlx5_pmd_wrapped_mr *pmd_mr) 921 { 922 struct mlx5_klm klm = { 923 .byte_count = length, 924 .address = (uintptr_t)addr, 925 }; 926 struct mlx5_devx_mkey_attr mkey_attr = { 927 .pd = pdn, 928 .klm_array = &klm, 929 .klm_num = 1, 930 }; 931 struct mlx5_devx_obj *mkey; 932 struct ibv_mr *ibv_mr = mlx5_glue->reg_mr(pd, addr, length, 933 IBV_ACCESS_LOCAL_WRITE | 934 (haswell_broadwell_cpu ? 
0 : 935 IBV_ACCESS_RELAXED_ORDERING)); 936 937 if (!ibv_mr) { 938 rte_errno = errno; 939 return -rte_errno; 940 } 941 klm.mkey = ibv_mr->lkey; 942 mkey_attr.addr = (uintptr_t)addr; 943 mkey_attr.size = length; 944 mkey = mlx5_devx_cmd_mkey_create(ctx, &mkey_attr); 945 if (!mkey) { 946 claim_zero(mlx5_glue->dereg_mr(ibv_mr)); 947 return -rte_errno; 948 } 949 pmd_mr->addr = addr; 950 pmd_mr->len = length; 951 pmd_mr->obj = (void *)ibv_mr; 952 pmd_mr->imkey = mkey; 953 pmd_mr->lkey = mkey->id; 954 return 0; 955 } 956 957 void 958 mlx5_os_wrapped_mkey_destroy(struct mlx5_pmd_wrapped_mr *pmd_mr) 959 { 960 if (!pmd_mr) 961 return; 962 if (pmd_mr->imkey) 963 claim_zero(mlx5_devx_cmd_destroy(pmd_mr->imkey)); 964 if (pmd_mr->obj) 965 claim_zero(mlx5_glue->dereg_mr(pmd_mr->obj)); 966 memset(pmd_mr, 0, sizeof(*pmd_mr)); 967 } 968 969 /** 970 * Rte_intr_handle create and init helper. 971 * 972 * @param[in] mode 973 * interrupt instance can be shared between primary and secondary 974 * processes or not. 975 * @param[in] set_fd_nonblock 976 * Whether to set fd to O_NONBLOCK. 977 * @param[in] fd 978 * Fd to set in created intr_handle. 979 * @param[in] cb 980 * Callback to register for intr_handle. 981 * @param[in] cb_arg 982 * Callback argument for cb. 983 * 984 * @return 985 * - Interrupt handle on success. 986 * - NULL on failure, with rte_errno set. 
987 */ 988 struct rte_intr_handle * 989 mlx5_os_interrupt_handler_create(int mode, bool set_fd_nonblock, int fd, 990 rte_intr_callback_fn cb, void *cb_arg) 991 { 992 struct rte_intr_handle *tmp_intr_handle; 993 int ret, flags; 994 995 tmp_intr_handle = rte_intr_instance_alloc(mode); 996 if (!tmp_intr_handle) { 997 rte_errno = ENOMEM; 998 goto err; 999 } 1000 if (set_fd_nonblock) { 1001 flags = fcntl(fd, F_GETFL); 1002 ret = fcntl(fd, F_SETFL, flags | O_NONBLOCK); 1003 if (ret) { 1004 rte_errno = errno; 1005 goto err; 1006 } 1007 } 1008 ret = rte_intr_fd_set(tmp_intr_handle, fd); 1009 if (ret) 1010 goto err; 1011 ret = rte_intr_type_set(tmp_intr_handle, RTE_INTR_HANDLE_EXT); 1012 if (ret) 1013 goto err; 1014 ret = rte_intr_callback_register(tmp_intr_handle, cb, cb_arg); 1015 if (ret) { 1016 rte_errno = -ret; 1017 goto err; 1018 } 1019 return tmp_intr_handle; 1020 err: 1021 rte_intr_instance_free(tmp_intr_handle); 1022 return NULL; 1023 } 1024 1025 /* Safe unregistration for interrupt callback. */ 1026 static void 1027 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle, 1028 rte_intr_callback_fn cb_fn, void *cb_arg) 1029 { 1030 uint64_t twait = 0; 1031 uint64_t start = 0; 1032 1033 do { 1034 int ret; 1035 1036 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg); 1037 if (ret >= 0) 1038 return; 1039 if (ret != -EAGAIN) { 1040 DRV_LOG(INFO, "failed to unregister interrupt" 1041 " handler (error: %d)", ret); 1042 MLX5_ASSERT(false); 1043 return; 1044 } 1045 if (twait) { 1046 struct timespec onems; 1047 1048 /* Wait one millisecond and try again. */ 1049 onems.tv_sec = 0; 1050 onems.tv_nsec = NS_PER_S / MS_PER_S; 1051 nanosleep(&onems, 0); 1052 /* Check whether one second elapsed. */ 1053 if ((rte_get_timer_cycles() - start) <= twait) 1054 continue; 1055 } else { 1056 /* 1057 * We get the amount of timer ticks for one second. 1058 * If this amount elapsed it means we spent one 1059 * second in waiting. 
This branch is executed once 1060 * on first iteration. 1061 */ 1062 twait = rte_get_timer_hz(); 1063 MLX5_ASSERT(twait); 1064 } 1065 /* 1066 * Timeout elapsed, show message (once a second) and retry. 1067 * We have no other acceptable option here, if we ignore 1068 * the unregistering return code the handler will not 1069 * be unregistered, fd will be closed and we may get the 1070 * crush. Hanging and messaging in the loop seems not to be 1071 * the worst choice. 1072 */ 1073 DRV_LOG(INFO, "Retrying to unregister interrupt handler"); 1074 start = rte_get_timer_cycles(); 1075 } while (true); 1076 } 1077 1078 /** 1079 * Rte_intr_handle destroy helper. 1080 * 1081 * @param[in] intr_handle 1082 * Rte_intr_handle to destroy. 1083 * @param[in] cb 1084 * Callback which is registered to intr_handle. 1085 * @param[in] cb_arg 1086 * Callback argument for cb. 1087 * 1088 */ 1089 void 1090 mlx5_os_interrupt_handler_destroy(struct rte_intr_handle *intr_handle, 1091 rte_intr_callback_fn cb, void *cb_arg) 1092 { 1093 if (rte_intr_fd_get(intr_handle) >= 0) 1094 mlx5_intr_callback_unregister(intr_handle, cb, cb_arg); 1095 rte_intr_instance_free(intr_handle); 1096 } 1097