1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2020 Mellanox Technologies, Ltd 3 */ 4 5 #include <sys/types.h> 6 #include <unistd.h> 7 #include <string.h> 8 #include <stdio.h> 9 #ifdef RTE_IBVERBS_LINK_DLOPEN 10 #include <dlfcn.h> 11 #endif 12 #include <dirent.h> 13 #include <net/if.h> 14 #include <fcntl.h> 15 16 #include <rte_errno.h> 17 #include <rte_string_fns.h> 18 #include <bus_pci_driver.h> 19 #include <bus_auxiliary_driver.h> 20 21 #include "mlx5_common.h" 22 #include "mlx5_nl.h" 23 #include "mlx5_common_log.h" 24 #include "mlx5_common_private.h" 25 #include "mlx5_common_defs.h" 26 #include "mlx5_common_os.h" 27 #include "mlx5_glue.h" 28 29 #ifdef MLX5_GLUE 30 const struct mlx5_glue *mlx5_glue; 31 #endif 32 33 int 34 mlx5_get_pci_addr(const char *dev_path, struct rte_pci_addr *pci_addr) 35 { 36 FILE *file; 37 char line[32]; 38 int rc = -ENOENT; 39 MKSTR(path, "%s/device/uevent", dev_path); 40 41 file = fopen(path, "rb"); 42 if (file == NULL) { 43 rte_errno = errno; 44 return -rte_errno; 45 } 46 while (fgets(line, sizeof(line), file) == line) { 47 size_t len = strlen(line); 48 49 /* Truncate long lines. */ 50 if (len == (sizeof(line) - 1)) { 51 while (line[(len - 1)] != '\n') { 52 int ret = fgetc(file); 53 54 if (ret == EOF) 55 goto exit; 56 line[(len - 1)] = ret; 57 } 58 /* No match for long lines. */ 59 continue; 60 } 61 /* Extract information. */ 62 if (sscanf(line, 63 "PCI_SLOT_NAME=" 64 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 65 &pci_addr->domain, 66 &pci_addr->bus, 67 &pci_addr->devid, 68 &pci_addr->function) == 4) { 69 rc = 0; 70 break; 71 } 72 } 73 exit: 74 fclose(file); 75 if (rc) 76 rte_errno = -rc; 77 return rc; 78 } 79 80 /** 81 * Extract port name, as a number, from sysfs or netlink information. 82 * 83 * @param[in] port_name_in 84 * String representing the port name. 85 * @param[out] port_info_out 86 * Port information, including port name as a number and port name 87 * type if recognized 88 * 89 * @return 90 * port_name field set according to recognized name format. 91 */ 92 void 93 mlx5_translate_port_name(const char *port_name_in, 94 struct mlx5_switch_info *port_info_out) 95 { 96 char ctrl = 0, pf_c1, pf_c2, vf_c1, vf_c2, eol; 97 char *end; 98 int sc_items; 99 100 sc_items = sscanf(port_name_in, "%c%d", 101 &ctrl, &port_info_out->ctrl_num); 102 if (sc_items == 2 && ctrl == 'c') { 103 port_name_in++; /* 'c' */ 104 port_name_in += snprintf(NULL, 0, "%d", 105 port_info_out->ctrl_num); 106 } 107 /* Check for port-name as a string of the form pf0vf0 or pf0sf0 */ 108 sc_items = sscanf(port_name_in, "%c%c%d%c%c%d%c", 109 &pf_c1, &pf_c2, &port_info_out->pf_num, 110 &vf_c1, &vf_c2, &port_info_out->port_name, &eol); 111 if (sc_items == 6 && pf_c1 == 'p' && pf_c2 == 'f') { 112 if (vf_c1 == 'v' && vf_c2 == 'f') { 113 /* Kernel ver >= 5.0 or OFED ver >= 4.6 */ 114 port_info_out->name_type = 115 MLX5_PHYS_PORT_NAME_TYPE_PFVF; 116 return; 117 } 118 if (vf_c1 == 's' && vf_c2 == 'f') { 119 /* Kernel ver >= 5.11 or OFED ver >= 5.1 */ 120 port_info_out->name_type = 121 MLX5_PHYS_PORT_NAME_TYPE_PFSF; 122 return; 123 } 124 } 125 /* 126 * Check for port-name as a string of the form p0 127 * (support kernel ver >= 5.0, or OFED ver >= 4.6). 128 */ 129 sc_items = sscanf(port_name_in, "%c%d%c", 130 &pf_c1, &port_info_out->port_name, &eol); 131 if (sc_items == 2 && pf_c1 == 'p') { 132 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK; 133 return; 134 } 135 /* 136 * Check for port-name as a string of the form pf0 137 * (support kernel ver >= 5.7 for HPF representor on BF). 138 */ 139 sc_items = sscanf(port_name_in, "%c%c%d%c", 140 &pf_c1, &pf_c2, &port_info_out->pf_num, &eol); 141 if (sc_items == 3 && pf_c1 == 'p' && pf_c2 == 'f') { 142 port_info_out->port_name = -1; 143 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFHPF; 144 return; 145 } 146 /* Check for port-name as a number (support kernel ver < 5.0 */ 147 errno = 0; 148 port_info_out->port_name = strtol(port_name_in, &end, 0); 149 if (!errno && 150 (size_t)(end - port_name_in) == strlen(port_name_in)) { 151 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY; 152 return; 153 } 154 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN; 155 } 156 157 int 158 mlx5_get_ifname_sysfs(const char *ibdev_path, char *ifname) 159 { 160 DIR *dir; 161 struct dirent *dent; 162 unsigned int dev_type = 0; 163 unsigned int dev_port_prev = ~0u; 164 char match[IF_NAMESIZE] = ""; 165 166 MLX5_ASSERT(ibdev_path); 167 { 168 MKSTR(path, "%s/device/net", ibdev_path); 169 170 dir = opendir(path); 171 if (dir == NULL) { 172 rte_errno = errno; 173 return -rte_errno; 174 } 175 } 176 while ((dent = readdir(dir)) != NULL) { 177 char *name = dent->d_name; 178 FILE *file; 179 unsigned int dev_port; 180 int r; 181 182 if ((name[0] == '.') && 183 ((name[1] == '\0') || 184 ((name[1] == '.') && (name[2] == '\0')))) 185 continue; 186 187 MKSTR(path, "%s/device/net/%s/%s", 188 ibdev_path, name, 189 (dev_type ? "dev_id" : "dev_port")); 190 191 file = fopen(path, "rb"); 192 if (file == NULL) { 193 if (errno != ENOENT) 194 continue; 195 /* 196 * Switch to dev_id when dev_port does not exist as 197 * is the case with Linux kernel versions < 3.15. 198 */ 199 try_dev_id: 200 match[0] = '\0'; 201 if (dev_type) 202 break; 203 dev_type = 1; 204 dev_port_prev = ~0u; 205 rewinddir(dir); 206 continue; 207 } 208 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 209 fclose(file); 210 if (r != 1) 211 continue; 212 /* 213 * Switch to dev_id when dev_port returns the same value for 214 * all ports. May happen when using a MOFED release older than 215 * 3.0 with a Linux kernel >= 3.15. 216 */ 217 if (dev_port == dev_port_prev) 218 goto try_dev_id; 219 dev_port_prev = dev_port; 220 if (dev_port == 0) 221 strlcpy(match, name, IF_NAMESIZE); 222 } 223 closedir(dir); 224 if (match[0] == '\0') { 225 rte_errno = ENOENT; 226 return -rte_errno; 227 } 228 strncpy(ifname, match, IF_NAMESIZE); 229 return 0; 230 } 231 232 #ifdef MLX5_GLUE 233 234 /** 235 * Suffix RTE_EAL_PMD_PATH with "-glue". 236 * 237 * This function performs a sanity check on RTE_EAL_PMD_PATH before 238 * suffixing its last component. 239 * 240 * @param buf[out] 241 * Output buffer, should be large enough otherwise NULL is returned. 242 * @param size 243 * Size of @p out. 244 * 245 * @return 246 * Pointer to @p buf or @p NULL in case suffix cannot be appended. 247 */ 248 static char * 249 mlx5_glue_path(char *buf, size_t size) 250 { 251 static const char *const bad[] = { "/", ".", "..", NULL }; 252 const char *path = RTE_EAL_PMD_PATH; 253 size_t len = strlen(path); 254 size_t off; 255 int i; 256 257 while (len && path[len - 1] == '/') 258 --len; 259 for (off = len; off && path[off - 1] != '/'; --off) 260 ; 261 for (i = 0; bad[i]; ++i) 262 if (!strncmp(path + off, bad[i], (int)(len - off))) 263 goto error; 264 i = snprintf(buf, size, "%.*s-glue", (int)len, path); 265 if (i == -1 || (size_t)i >= size) 266 goto error; 267 return buf; 268 error: 269 RTE_LOG(ERR, PMD, "unable to append \"-glue\" to last component of" 270 " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"), please" 271 " re-configure DPDK"); 272 return NULL; 273 } 274 275 static int 276 mlx5_glue_dlopen(void) 277 { 278 char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")]; 279 void *handle = NULL; 280 281 char const *path[] = { 282 /* 283 * A basic security check is necessary before trusting 284 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH. 285 */ 286 (geteuid() == getuid() && getegid() == getgid() ? 287 getenv("MLX5_GLUE_PATH") : NULL), 288 /* 289 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed 290 * variant, otherwise let dlopen() look up libraries on its 291 * own. 292 */ 293 (*RTE_EAL_PMD_PATH ? 294 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""), 295 }; 296 unsigned int i = 0; 297 void **sym; 298 const char *dlmsg; 299 300 while (!handle && i != RTE_DIM(path)) { 301 const char *end; 302 size_t len; 303 int ret; 304 305 if (!path[i]) { 306 ++i; 307 continue; 308 } 309 end = strpbrk(path[i], ":;"); 310 if (!end) 311 end = path[i] + strlen(path[i]); 312 len = end - path[i]; 313 ret = 0; 314 do { 315 char name[ret + 1]; 316 317 ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE, 318 (int)len, path[i], 319 (!len || *(end - 1) == '/') ? "" : "/"); 320 if (ret == -1) 321 break; 322 if (sizeof(name) != (size_t)ret + 1) 323 continue; 324 DRV_LOG(DEBUG, "Looking for rdma-core glue as " 325 "\"%s\"", name); 326 handle = dlopen(name, RTLD_LAZY); 327 break; 328 } while (1); 329 path[i] = end + 1; 330 if (!*end) 331 ++i; 332 } 333 if (!handle) { 334 rte_errno = EINVAL; 335 dlmsg = dlerror(); 336 if (dlmsg) 337 DRV_LOG(WARNING, "Cannot load glue library: %s", dlmsg); 338 goto glue_error; 339 } 340 sym = dlsym(handle, "mlx5_glue"); 341 if (!sym || !*sym) { 342 rte_errno = EINVAL; 343 dlmsg = dlerror(); 344 if (dlmsg) 345 DRV_LOG(ERR, "Cannot resolve glue symbol: %s", dlmsg); 346 goto glue_error; 347 } 348 mlx5_glue = *sym; 349 return 0; 350 351 glue_error: 352 if (handle) 353 dlclose(handle); 354 return -1; 355 } 356 357 #endif 358 359 /** 360 * Initialization routine for run-time dependency on rdma-core. 361 */ 362 void 363 mlx5_glue_constructor(void) 364 { 365 /* 366 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use 367 * huge pages. Calling ibv_fork_init() during init allows 368 * applications to use fork() safely for purposes other than 369 * using this PMD, which is not supported in forked processes. 370 */ 371 setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); 372 /* Match the size of Rx completion entry to the size of a cacheline. */ 373 if (RTE_CACHE_LINE_SIZE == 128) 374 setenv("MLX5_CQE_SIZE", "128", 0); 375 /* 376 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to 377 * cleanup all the Verbs resources even when the device was removed. 378 */ 379 setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1); 380 381 #ifdef MLX5_GLUE 382 if (mlx5_glue_dlopen() != 0) 383 goto glue_error; 384 #endif 385 386 #ifdef RTE_LIBRTE_MLX5_DEBUG 387 /* Glue structure must not contain any NULL pointers. */ 388 { 389 unsigned int i; 390 391 for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i) 392 MLX5_ASSERT(((const void *const *)mlx5_glue)[i]); 393 } 394 #endif 395 if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) { 396 rte_errno = EINVAL; 397 DRV_LOG(ERR, "rdma-core glue \"%s\" mismatch: \"%s\" is " 398 "required", mlx5_glue->version, MLX5_GLUE_VERSION); 399 goto glue_error; 400 } 401 mlx5_glue->fork_init(); 402 return; 403 404 glue_error: 405 DRV_LOG(WARNING, "Cannot initialize MLX5 common due to missing" 406 " run-time dependency on rdma-core libraries (libibverbs," 407 " libmlx5)"); 408 mlx5_glue = NULL; 409 } 410 411 /** 412 * Validate user arguments for remote PD and CTX. 413 * 414 * @param config 415 * Pointer to device configuration structure. 416 * 417 * @return 418 * 0 on success, a negative errno value otherwise and rte_errno is set. 419 */ 420 int 421 mlx5_os_remote_pd_and_ctx_validate(struct mlx5_common_dev_config *config) 422 { 423 int device_fd = config->device_fd; 424 int pd_handle = config->pd_handle; 425 426 #ifdef HAVE_MLX5_IBV_IMPORT_CTX_PD_AND_MR 427 if (device_fd == MLX5_ARG_UNSET && pd_handle != MLX5_ARG_UNSET) { 428 DRV_LOG(ERR, "Remote PD without CTX is not supported."); 429 rte_errno = EINVAL; 430 return -rte_errno; 431 } 432 if (device_fd != MLX5_ARG_UNSET && pd_handle == MLX5_ARG_UNSET) { 433 DRV_LOG(ERR, "Remote CTX without PD is not supported."); 434 rte_errno = EINVAL; 435 return -rte_errno; 436 } 437 DRV_LOG(DEBUG, "Remote PD and CTX is supported: (cmd_fd=%d, " 438 "pd_handle=%d).", device_fd, pd_handle); 439 #else 440 if (pd_handle != MLX5_ARG_UNSET || device_fd != MLX5_ARG_UNSET) { 441 DRV_LOG(ERR, 442 "Remote PD and CTX is not supported - maybe old rdma-core version?"); 443 rte_errno = ENOTSUP; 444 return -rte_errno; 445 } 446 #endif 447 return 0; 448 } 449 450 /** 451 * Release Protection Domain object. 452 * 453 * @param[out] cdev 454 * Pointer to the mlx5 device. 455 * 456 * @return 457 * 0 on success, a negative errno value otherwise. 458 */ 459 int 460 mlx5_os_pd_release(struct mlx5_common_device *cdev) 461 { 462 if (cdev->config.pd_handle == MLX5_ARG_UNSET) 463 return mlx5_glue->dealloc_pd(cdev->pd); 464 else 465 return mlx5_glue->unimport_pd(cdev->pd); 466 } 467 468 /** 469 * Allocate Protection Domain object. 470 * 471 * @param[out] cdev 472 * Pointer to the mlx5 device. 473 * 474 * @return 475 * 0 on success, a negative errno value otherwise. 476 */ 477 static int 478 mlx5_os_pd_create(struct mlx5_common_device *cdev) 479 { 480 cdev->pd = mlx5_glue->alloc_pd(cdev->ctx); 481 if (cdev->pd == NULL) { 482 DRV_LOG(ERR, "Failed to allocate PD: %s", rte_strerror(errno)); 483 return errno ? -errno : -ENOMEM; 484 } 485 return 0; 486 } 487 488 /** 489 * Import Protection Domain object according to given PD handle. 490 * 491 * @param[out] cdev 492 * Pointer to the mlx5 device. 493 * 494 * @return 495 * 0 on success, a negative errno value otherwise. 496 */ 497 static int 498 mlx5_os_pd_import(struct mlx5_common_device *cdev) 499 { 500 cdev->pd = mlx5_glue->import_pd(cdev->ctx, cdev->config.pd_handle); 501 if (cdev->pd == NULL) { 502 DRV_LOG(ERR, "Failed to import PD using handle=%d: %s", 503 cdev->config.pd_handle, rte_strerror(errno)); 504 return errno ? -errno : -ENOMEM; 505 } 506 return 0; 507 } 508 509 /** 510 * Prepare Protection Domain object and extract its pdn using DV API. 511 * 512 * @param[out] cdev 513 * Pointer to the mlx5 device. 514 * 515 * @return 516 * 0 on success, a negative errno value otherwise and rte_errno is set. 517 */ 518 int 519 mlx5_os_pd_prepare(struct mlx5_common_device *cdev) 520 { 521 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 522 struct mlx5dv_obj obj; 523 struct mlx5dv_pd pd_info; 524 #endif 525 int ret; 526 527 if (cdev->config.pd_handle == MLX5_ARG_UNSET) 528 ret = mlx5_os_pd_create(cdev); 529 else 530 ret = mlx5_os_pd_import(cdev); 531 if (ret) { 532 rte_errno = -ret; 533 return ret; 534 } 535 if (cdev->config.devx == 0) 536 return 0; 537 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 538 obj.pd.in = cdev->pd; 539 obj.pd.out = &pd_info; 540 ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD); 541 if (ret != 0) { 542 DRV_LOG(ERR, "Fail to get PD object info."); 543 rte_errno = errno; 544 claim_zero(mlx5_os_pd_release(cdev)); 545 cdev->pd = NULL; 546 return -rte_errno; 547 } 548 cdev->pdn = pd_info.pdn; 549 return 0; 550 #else 551 DRV_LOG(ERR, "Cannot get pdn - no DV support."); 552 rte_errno = ENOTSUP; 553 return -rte_errno; 554 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */ 555 } 556 557 static struct ibv_device * 558 mlx5_os_get_ibv_device(const struct rte_pci_device *pci_dev) 559 { 560 int n; 561 struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n); 562 struct ibv_device *ibv_match = NULL; 563 uint8_t guid1[32] = {0}; 564 uint8_t guid2[32] = {0}; 565 int ret1, ret2 = -1; 566 struct rte_pci_addr paddr; 567 const struct rte_pci_addr *addr = &pci_dev->addr; 568 bool is_vf_dev = mlx5_dev_is_vf_pci(pci_dev); 569 570 if (ibv_list == NULL || !n) { 571 rte_errno = ENOSYS; 572 if (ibv_list) 573 mlx5_glue->free_device_list(ibv_list); 574 return NULL; 575 } 576 ret1 = mlx5_get_device_guid(addr, guid1, sizeof(guid1)); 577 while (n-- > 0) { 578 DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name); 579 if (mlx5_get_pci_addr(ibv_list[n]->ibdev_path, &paddr) != 0) 580 continue; 581 if (ret1 > 0) 582 ret2 = mlx5_get_device_guid(&paddr, guid2, sizeof(guid2)); 583 /* Bond device can bond secondary PCIe */ 584 if ((strstr(ibv_list[n]->name, "bond") && !is_vf_dev && 585 ((ret1 > 0 && ret2 > 0 && !memcmp(guid1, guid2, sizeof(guid1))) || 586 (addr->domain == paddr.domain && addr->bus == paddr.bus && 587 addr->devid == paddr.devid))) || 588 !rte_pci_addr_cmp(addr, &paddr)) { 589 ibv_match = ibv_list[n]; 590 break; 591 } 592 } 593 if (ibv_match == NULL) { 594 DRV_LOG(WARNING, 595 "No Verbs device matches PCI device " PCI_PRI_FMT "," 596 " are kernel drivers loaded?", 597 addr->domain, addr->bus, addr->devid, addr->function); 598 rte_errno = ENOENT; 599 } 600 mlx5_glue->free_device_list(ibv_list); 601 return ibv_match; 602 } 603 604 /* Try to disable ROCE by Netlink\Devlink. */ 605 static int 606 mlx5_nl_roce_disable(const char *addr) 607 { 608 int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC, 0); 609 int devlink_id; 610 int enable; 611 int ret; 612 613 if (nlsk_fd < 0) 614 return nlsk_fd; 615 devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd); 616 if (devlink_id < 0) { 617 ret = devlink_id; 618 DRV_LOG(DEBUG, 619 "Failed to get devlink id for ROCE operations by Netlink."); 620 goto close; 621 } 622 ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable); 623 if (ret) { 624 DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.", 625 ret); 626 goto close; 627 } else if (!enable) { 628 DRV_LOG(INFO, "ROCE has already disabled(Netlink)."); 629 goto close; 630 } 631 ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0); 632 if (ret) 633 DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret); 634 else 635 DRV_LOG(INFO, "ROCE is disabled by Netlink successfully."); 636 close: 637 close(nlsk_fd); 638 return ret; 639 } 640 641 /* Try to disable ROCE by sysfs. */ 642 static int 643 mlx5_sys_roce_disable(const char *addr) 644 { 645 FILE *file_o; 646 int enable; 647 int ret; 648 649 MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr); 650 file_o = fopen(file_p, "rb"); 651 if (!file_o) { 652 rte_errno = ENOTSUP; 653 return -ENOTSUP; 654 } 655 ret = fscanf(file_o, "%d", &enable); 656 if (ret != 1) { 657 rte_errno = EINVAL; 658 ret = EINVAL; 659 goto close; 660 } else if (!enable) { 661 ret = 0; 662 DRV_LOG(INFO, "ROCE has already disabled(sysfs)."); 663 goto close; 664 } 665 fclose(file_o); 666 file_o = fopen(file_p, "wb"); 667 if (!file_o) { 668 rte_errno = ENOTSUP; 669 return -ENOTSUP; 670 } 671 fprintf(file_o, "0\n"); 672 ret = 0; 673 close: 674 if (ret) 675 DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret); 676 else 677 DRV_LOG(INFO, "ROCE is disabled by sysfs successfully."); 678 fclose(file_o); 679 return ret; 680 } 681 682 static int 683 mlx5_roce_disable(const struct rte_device *dev) 684 { 685 char pci_addr[PCI_PRI_STR_SIZE] = { 0 }; 686 687 if (mlx5_dev_to_pci_str(dev, pci_addr, sizeof(pci_addr)) < 0) 688 return -rte_errno; 689 /* Firstly try to disable ROCE by Netlink and fallback to sysfs. */ 690 if (mlx5_nl_roce_disable(pci_addr) != 0 && 691 mlx5_sys_roce_disable(pci_addr) != 0) 692 return -rte_errno; 693 return 0; 694 } 695 696 static struct ibv_device * 697 mlx5_os_get_ibv_dev(const struct rte_device *dev) 698 { 699 struct ibv_device *ibv; 700 701 if (mlx5_dev_is_pci(dev)) 702 ibv = mlx5_os_get_ibv_device(RTE_DEV_TO_PCI_CONST(dev)); 703 else 704 ibv = mlx5_get_aux_ibv_device(RTE_DEV_TO_AUXILIARY_CONST(dev)); 705 if (ibv == NULL) { 706 rte_errno = ENODEV; 707 DRV_LOG(ERR, "Verbs device not found: %s", dev->name); 708 } 709 return ibv; 710 } 711 712 static struct ibv_device * 713 mlx5_vdpa_get_ibv_dev(const struct rte_device *dev) 714 { 715 struct ibv_device *ibv; 716 int retry; 717 718 if (mlx5_roce_disable(dev) != 0) { 719 DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".", 720 dev->name); 721 return NULL; 722 } 723 /* Wait for the IB device to appear again after reload. */ 724 for (retry = MLX5_VDPA_MAX_RETRIES; retry > 0; --retry) { 725 ibv = mlx5_os_get_ibv_dev(dev); 726 if (ibv != NULL) 727 return ibv; 728 usleep(MLX5_VDPA_USEC); 729 } 730 DRV_LOG(ERR, 731 "Cannot get IB device after disabling RoCE for \"%s\", retries exceed %d.", 732 dev->name, MLX5_VDPA_MAX_RETRIES); 733 rte_errno = EAGAIN; 734 return NULL; 735 } 736 737 static int 738 mlx5_config_doorbell_mapping_env(int dbnc) 739 { 740 char *env; 741 int value; 742 743 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 744 /* Get environment variable to store. */ 745 env = getenv(MLX5_SHUT_UP_BF); 746 value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET; 747 if (dbnc == MLX5_ARG_UNSET) 748 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1); 749 else 750 setenv(MLX5_SHUT_UP_BF, 751 dbnc == MLX5_SQ_DB_NCACHED ? "1" : "0", 1); 752 return value; 753 } 754 755 static void 756 mlx5_restore_doorbell_mapping_env(int value) 757 { 758 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 759 /* Restore the original environment variable state. */ 760 if (value == MLX5_ARG_UNSET) 761 unsetenv(MLX5_SHUT_UP_BF); 762 else 763 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); 764 } 765 766 /** 767 * Function API to open IB device. 768 * 769 * @param cdev 770 * Pointer to the mlx5 device. 771 * @param classes 772 * Chosen classes come from device arguments. 773 * 774 * @return 775 * Pointer to ibv_context on success, NULL otherwise and rte_errno is set. 776 */ 777 static struct ibv_context * 778 mlx5_open_device(struct mlx5_common_device *cdev, uint32_t classes) 779 { 780 struct ibv_device *ibv; 781 struct ibv_context *ctx = NULL; 782 int dbmap_env; 783 784 MLX5_ASSERT(cdev->config.device_fd == MLX5_ARG_UNSET); 785 if (classes & MLX5_CLASS_VDPA) 786 ibv = mlx5_vdpa_get_ibv_dev(cdev->dev); 787 else 788 ibv = mlx5_os_get_ibv_dev(cdev->dev); 789 if (!ibv) 790 return NULL; 791 DRV_LOG(INFO, "Dev information matches for device \"%s\".", ibv->name); 792 /* 793 * Configure environment variable "MLX5_BF_SHUT_UP" before the device 794 * creation. The rdma_core library checks the variable at device 795 * creation and stores the result internally. 796 */ 797 dbmap_env = mlx5_config_doorbell_mapping_env(cdev->config.dbnc); 798 /* Try to open IB device with DV first, then usual Verbs. */ 799 errno = 0; 800 ctx = mlx5_glue->dv_open_device(ibv); 801 if (ctx) { 802 cdev->config.devx = 1; 803 } else if (classes == MLX5_CLASS_ETH) { 804 /* The environment variable is still configured. */ 805 ctx = mlx5_glue->open_device(ibv); 806 if (ctx == NULL) 807 goto error; 808 } else { 809 goto error; 810 } 811 /* The device is created, no need for environment. */ 812 mlx5_restore_doorbell_mapping_env(dbmap_env); 813 return ctx; 814 error: 815 rte_errno = errno ? errno : ENODEV; 816 /* The device creation is failed, no need for environment. */ 817 mlx5_restore_doorbell_mapping_env(dbmap_env); 818 DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name); 819 return NULL; 820 } 821 822 /** 823 * Function API to import IB device. 824 * 825 * @param cdev 826 * Pointer to the mlx5 device. 827 * 828 * @return 829 * Pointer to ibv_context on success, NULL otherwise and rte_errno is set. 830 */ 831 static struct ibv_context * 832 mlx5_import_device(struct mlx5_common_device *cdev) 833 { 834 struct ibv_context *ctx = NULL; 835 836 MLX5_ASSERT(cdev->config.device_fd != MLX5_ARG_UNSET); 837 ctx = mlx5_glue->import_device(cdev->config.device_fd); 838 if (!ctx) { 839 DRV_LOG(ERR, "Failed to import device for fd=%d: %s", 840 cdev->config.device_fd, rte_strerror(errno)); 841 rte_errno = errno; 842 } 843 return ctx; 844 } 845 846 /** 847 * Function API to prepare IB device. 848 * 849 * @param cdev 850 * Pointer to the mlx5 device. 851 * @param classes 852 * Chosen classes come from device arguments. 853 * 854 * @return 855 * 0 on success, a negative errno value otherwise and rte_errno is set. 856 */ 857 int 858 mlx5_os_open_device(struct mlx5_common_device *cdev, uint32_t classes) 859 { 860 861 struct ibv_context *ctx = NULL; 862 863 if (cdev->config.device_fd == MLX5_ARG_UNSET) 864 ctx = mlx5_open_device(cdev, classes); 865 else 866 ctx = mlx5_import_device(cdev); 867 if (ctx == NULL) 868 return -rte_errno; 869 /* Hint libmlx5 to use PMD allocator for data plane resources */ 870 mlx5_set_context_attr(cdev->dev, ctx); 871 cdev->ctx = ctx; 872 return 0; 873 } 874 875 int 876 mlx5_get_device_guid(const struct rte_pci_addr *dev, uint8_t *guid, size_t len) 877 { 878 char tmp[512]; 879 char cur_ifname[IF_NAMESIZE + 1]; 880 FILE *id_file; 881 DIR *dir; 882 struct dirent *ptr; 883 int ret; 884 885 if (guid == NULL || len < sizeof(u_int64_t) + 1) 886 return -1; 887 memset(guid, 0, len); 888 snprintf(tmp, sizeof(tmp), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/net", 889 dev->domain, dev->bus, dev->devid, dev->function); 890 dir = opendir(tmp); 891 if (dir == NULL) 892 return -1; 893 /* Traverse to identify PF interface */ 894 do { 895 ptr = readdir(dir); 896 if (ptr == NULL || ptr->d_type != DT_DIR) { 897 closedir(dir); 898 return -1; 899 } 900 } while (strchr(ptr->d_name, '.') || strchr(ptr->d_name, '_') || 901 strchr(ptr->d_name, 'v')); 902 snprintf(cur_ifname, sizeof(cur_ifname), "%s", ptr->d_name); 903 closedir(dir); 904 snprintf(tmp + strlen(tmp), sizeof(tmp) - strlen(tmp), 905 "/%s/phys_switch_id", cur_ifname); 906 /* Older OFED like 5.3 doesn't support read */ 907 id_file = fopen(tmp, "r"); 908 if (!id_file) 909 return 0; 910 ret = fscanf(id_file, "%16s", guid); 911 fclose(id_file); 912 return ret; 913 } 914 915 /* 916 * Create direct mkey using the kernel ibv_reg_mr API and wrap it with a new 917 * indirect mkey created by the DevX API. 918 * This mkey should be used for DevX commands requesting mkey as a parameter. 919 */ 920 int 921 mlx5_os_wrapped_mkey_create(void *ctx, void *pd, uint32_t pdn, void *addr, 922 size_t length, struct mlx5_pmd_wrapped_mr *pmd_mr) 923 { 924 struct mlx5_klm klm = { 925 .byte_count = length, 926 .address = (uintptr_t)addr, 927 }; 928 struct mlx5_devx_mkey_attr mkey_attr = { 929 .pd = pdn, 930 .klm_array = &klm, 931 .klm_num = 1, 932 }; 933 struct mlx5_devx_obj *mkey; 934 struct ibv_mr *ibv_mr = mlx5_glue->reg_mr(pd, addr, length, 935 IBV_ACCESS_LOCAL_WRITE | 936 (haswell_broadwell_cpu ? 0 : 937 IBV_ACCESS_RELAXED_ORDERING)); 938 939 if (!ibv_mr) { 940 rte_errno = errno; 941 return -rte_errno; 942 } 943 klm.mkey = ibv_mr->lkey; 944 mkey_attr.addr = (uintptr_t)addr; 945 mkey_attr.size = length; 946 mkey = mlx5_devx_cmd_mkey_create(ctx, &mkey_attr); 947 if (!mkey) { 948 claim_zero(mlx5_glue->dereg_mr(ibv_mr)); 949 return -rte_errno; 950 } 951 pmd_mr->addr = addr; 952 pmd_mr->len = length; 953 pmd_mr->obj = (void *)ibv_mr; 954 pmd_mr->imkey = mkey; 955 pmd_mr->lkey = mkey->id; 956 return 0; 957 } 958 959 void 960 mlx5_os_wrapped_mkey_destroy(struct mlx5_pmd_wrapped_mr *pmd_mr) 961 { 962 if (!pmd_mr) 963 return; 964 if (pmd_mr->imkey) 965 claim_zero(mlx5_devx_cmd_destroy(pmd_mr->imkey)); 966 if (pmd_mr->obj) 967 claim_zero(mlx5_glue->dereg_mr(pmd_mr->obj)); 968 memset(pmd_mr, 0, sizeof(*pmd_mr)); 969 } 970 971 /** 972 * Rte_intr_handle create and init helper. 973 * 974 * @param[in] mode 975 * interrupt instance can be shared between primary and secondary 976 * processes or not. 977 * @param[in] set_fd_nonblock 978 * Whether to set fd to O_NONBLOCK. 979 * @param[in] fd 980 * Fd to set in created intr_handle. 981 * @param[in] cb 982 * Callback to register for intr_handle. 983 * @param[in] cb_arg 984 * Callback argument for cb. 985 * 986 * @return 987 * - Interrupt handle on success. 988 * - NULL on failure, with rte_errno set. 989 */ 990 struct rte_intr_handle * 991 mlx5_os_interrupt_handler_create(int mode, bool set_fd_nonblock, int fd, 992 rte_intr_callback_fn cb, void *cb_arg) 993 { 994 struct rte_intr_handle *tmp_intr_handle; 995 int ret, flags; 996 997 tmp_intr_handle = rte_intr_instance_alloc(mode); 998 if (!tmp_intr_handle) { 999 rte_errno = ENOMEM; 1000 goto err; 1001 } 1002 if (set_fd_nonblock) { 1003 flags = fcntl(fd, F_GETFL); 1004 ret = fcntl(fd, F_SETFL, flags | O_NONBLOCK); 1005 if (ret) { 1006 rte_errno = errno; 1007 goto err; 1008 } 1009 } 1010 ret = rte_intr_fd_set(tmp_intr_handle, fd); 1011 if (ret) 1012 goto err; 1013 ret = rte_intr_type_set(tmp_intr_handle, RTE_INTR_HANDLE_EXT); 1014 if (ret) 1015 goto err; 1016 ret = rte_intr_callback_register(tmp_intr_handle, cb, cb_arg); 1017 if (ret) { 1018 rte_errno = -ret; 1019 goto err; 1020 } 1021 return tmp_intr_handle; 1022 err: 1023 rte_intr_instance_free(tmp_intr_handle); 1024 return NULL; 1025 } 1026 1027 /* Safe unregistration for interrupt callback. */ 1028 static void 1029 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle, 1030 rte_intr_callback_fn cb_fn, void *cb_arg) 1031 { 1032 uint64_t twait = 0; 1033 uint64_t start = 0; 1034 1035 do { 1036 int ret; 1037 1038 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg); 1039 if (ret >= 0) 1040 return; 1041 if (ret != -EAGAIN) { 1042 DRV_LOG(INFO, "failed to unregister interrupt" 1043 " handler (error: %d)", ret); 1044 MLX5_ASSERT(false); 1045 return; 1046 } 1047 if (twait) { 1048 struct timespec onems; 1049 1050 /* Wait one millisecond and try again. */ 1051 onems.tv_sec = 0; 1052 onems.tv_nsec = NS_PER_S / MS_PER_S; 1053 nanosleep(&onems, 0); 1054 /* Check whether one second elapsed. */ 1055 if ((rte_get_timer_cycles() - start) <= twait) 1056 continue; 1057 } else { 1058 /* 1059 * We get the amount of timer ticks for one second. 1060 * If this amount elapsed it means we spent one 1061 * second in waiting. This branch is executed once 1062 * on first iteration. 1063 */ 1064 twait = rte_get_timer_hz(); 1065 MLX5_ASSERT(twait); 1066 } 1067 /* 1068 * Timeout elapsed, show message (once a second) and retry. 1069 * We have no other acceptable option here, if we ignore 1070 * the unregistering return code the handler will not 1071 * be unregistered, fd will be closed and we may get the 1072 * crush. Hanging and messaging in the loop seems not to be 1073 * the worst choice. 1074 */ 1075 DRV_LOG(INFO, "Retrying to unregister interrupt handler"); 1076 start = rte_get_timer_cycles(); 1077 } while (true); 1078 } 1079 1080 /** 1081 * Rte_intr_handle destroy helper. 1082 * 1083 * @param[in] intr_handle 1084 * Rte_intr_handle to destroy. 1085 * @param[in] cb 1086 * Callback which is registered to intr_handle. 1087 * @param[in] cb_arg 1088 * Callback argument for cb. 1089 * 1090 */ 1091 void 1092 mlx5_os_interrupt_handler_destroy(struct rte_intr_handle *intr_handle, 1093 rte_intr_callback_fn cb, void *cb_arg) 1094 { 1095 if (rte_intr_fd_get(intr_handle) >= 0) 1096 mlx5_intr_callback_unregister(intr_handle, cb, cb_arg); 1097 rte_intr_instance_free(intr_handle); 1098 } 1099