1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2020 Mellanox Technologies, Ltd 3 */ 4 5 #include <sys/types.h> 6 #include <unistd.h> 7 #include <string.h> 8 #include <stdio.h> 9 #ifdef RTE_IBVERBS_LINK_DLOPEN 10 #include <dlfcn.h> 11 #endif 12 #include <dirent.h> 13 #include <net/if.h> 14 #include <fcntl.h> 15 16 #include <rte_errno.h> 17 #include <rte_string_fns.h> 18 #include <bus_pci_driver.h> 19 #include <bus_auxiliary_driver.h> 20 21 #include "mlx5_common.h" 22 #include "mlx5_nl.h" 23 #include "mlx5_common_log.h" 24 #include "mlx5_common_private.h" 25 #include "mlx5_common_defs.h" 26 #include "mlx5_common_os.h" 27 #include "mlx5_glue.h" 28 29 #ifdef MLX5_GLUE 30 const struct mlx5_glue *mlx5_glue; 31 #endif 32 33 int 34 mlx5_get_pci_addr(const char *dev_path, struct rte_pci_addr *pci_addr) 35 { 36 FILE *file; 37 char line[32]; 38 int rc = -ENOENT; 39 MKSTR(path, "%s/device/uevent", dev_path); 40 41 file = fopen(path, "rb"); 42 if (file == NULL) { 43 rte_errno = errno; 44 return -rte_errno; 45 } 46 while (fgets(line, sizeof(line), file) == line) { 47 size_t len = strlen(line); 48 49 /* Truncate long lines. */ 50 if (len == (sizeof(line) - 1)) { 51 while (line[(len - 1)] != '\n') { 52 int ret = fgetc(file); 53 54 if (ret == EOF) 55 goto exit; 56 line[(len - 1)] = ret; 57 } 58 /* No match for long lines. */ 59 continue; 60 } 61 /* Extract information. */ 62 if (sscanf(line, 63 "PCI_SLOT_NAME=" 64 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", 65 &pci_addr->domain, 66 &pci_addr->bus, 67 &pci_addr->devid, 68 &pci_addr->function) == 4) { 69 rc = 0; 70 break; 71 } 72 } 73 exit: 74 fclose(file); 75 if (rc) 76 rte_errno = -rc; 77 return rc; 78 } 79 80 /** 81 * Extract port name, as a number, from sysfs or netlink information. 82 * 83 * @param[in] port_name_in 84 * String representing the port name. 85 * @param[out] port_info_out 86 * Port information, including port name as a number and port name 87 * type if recognized 88 * 89 * @return 90 * port_name field set according to recognized name format. 91 */ 92 void 93 mlx5_translate_port_name(const char *port_name_in, 94 struct mlx5_switch_info *port_info_out) 95 { 96 char ctrl = 0, pf_c1, pf_c2, vf_c1, vf_c2, eol; 97 char *end; 98 int sc_items; 99 int32_t ctrl_num = -1; 100 101 sc_items = sscanf(port_name_in, "%c%d", &ctrl, &ctrl_num); 102 if (sc_items == 2 && ctrl == 'c') { 103 port_info_out->ctrl_num = ctrl_num; 104 port_name_in++; /* 'c' */ 105 port_name_in += snprintf(NULL, 0, "%d", 106 port_info_out->ctrl_num); 107 } 108 /* Check for port-name as a string of the form pf0vf0 or pf0sf0 */ 109 sc_items = sscanf(port_name_in, "%c%c%d%c%c%d%c", 110 &pf_c1, &pf_c2, &port_info_out->pf_num, 111 &vf_c1, &vf_c2, &port_info_out->port_name, &eol); 112 if (sc_items == 6 && pf_c1 == 'p' && pf_c2 == 'f') { 113 if (vf_c1 == 'v' && vf_c2 == 'f') { 114 /* Kernel ver >= 5.0 or OFED ver >= 4.6 */ 115 port_info_out->name_type = 116 MLX5_PHYS_PORT_NAME_TYPE_PFVF; 117 return; 118 } 119 if (vf_c1 == 's' && vf_c2 == 'f') { 120 /* Kernel ver >= 5.11 or OFED ver >= 5.1 */ 121 port_info_out->name_type = 122 MLX5_PHYS_PORT_NAME_TYPE_PFSF; 123 return; 124 } 125 } 126 /* 127 * Check for port-name as a string of the form p0 128 * (support kernel ver >= 5.0, or OFED ver >= 4.6). 129 */ 130 sc_items = sscanf(port_name_in, "%c%d%c", 131 &pf_c1, &port_info_out->port_name, &eol); 132 if (sc_items == 2 && pf_c1 == 'p') { 133 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK; 134 return; 135 } 136 /* 137 * Check for port-name as a string of the form pf0 138 * (support kernel ver >= 5.7 for HPF representor on BF). 139 */ 140 sc_items = sscanf(port_name_in, "%c%c%d%c", 141 &pf_c1, &pf_c2, &port_info_out->pf_num, &eol); 142 if (sc_items == 3 && pf_c1 == 'p' && pf_c2 == 'f') { 143 port_info_out->port_name = -1; 144 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFHPF; 145 return; 146 } 147 /* Check for port-name as a number (support kernel ver < 5.0 */ 148 errno = 0; 149 port_info_out->port_name = strtol(port_name_in, &end, 0); 150 if (!errno && 151 (size_t)(end - port_name_in) == strlen(port_name_in)) { 152 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY; 153 return; 154 } 155 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN; 156 } 157 158 int 159 mlx5_get_ifname_sysfs(const char *ibdev_path, char *ifname) 160 { 161 DIR *dir; 162 struct dirent *dent; 163 unsigned int dev_type = 0; 164 unsigned int dev_port_prev = ~0u; 165 char match[IF_NAMESIZE] = ""; 166 167 MLX5_ASSERT(ibdev_path); 168 { 169 MKSTR(path, "%s/device/net", ibdev_path); 170 171 dir = opendir(path); 172 if (dir == NULL) { 173 rte_errno = errno; 174 return -rte_errno; 175 } 176 } 177 while ((dent = readdir(dir)) != NULL) { 178 char *name = dent->d_name; 179 FILE *file; 180 unsigned int dev_port; 181 int r; 182 183 if ((name[0] == '.') && 184 ((name[1] == '\0') || 185 ((name[1] == '.') && (name[2] == '\0')))) 186 continue; 187 188 MKSTR(path, "%s/device/net/%s/%s", 189 ibdev_path, name, 190 (dev_type ? "dev_id" : "dev_port")); 191 192 file = fopen(path, "rb"); 193 if (file == NULL) { 194 if (errno != ENOENT) 195 continue; 196 /* 197 * Switch to dev_id when dev_port does not exist as 198 * is the case with Linux kernel versions < 3.15. 199 */ 200 try_dev_id: 201 match[0] = '\0'; 202 if (dev_type) 203 break; 204 dev_type = 1; 205 dev_port_prev = ~0u; 206 rewinddir(dir); 207 continue; 208 } 209 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); 210 fclose(file); 211 if (r != 1) 212 continue; 213 /* 214 * Switch to dev_id when dev_port returns the same value for 215 * all ports. May happen when using a MOFED release older than 216 * 3.0 with a Linux kernel >= 3.15. 217 */ 218 if (dev_port == dev_port_prev) 219 goto try_dev_id; 220 dev_port_prev = dev_port; 221 if (dev_port == 0) 222 strlcpy(match, name, IF_NAMESIZE); 223 } 224 closedir(dir); 225 if (match[0] == '\0') { 226 rte_errno = ENOENT; 227 return -rte_errno; 228 } 229 strncpy(ifname, match, IF_NAMESIZE); 230 return 0; 231 } 232 233 #ifdef MLX5_GLUE 234 235 /** 236 * Suffix RTE_EAL_PMD_PATH with "-glue". 237 * 238 * This function performs a sanity check on RTE_EAL_PMD_PATH before 239 * suffixing its last component. 240 * 241 * @param buf[out] 242 * Output buffer, should be large enough otherwise NULL is returned. 243 * @param size 244 * Size of @p out. 245 * 246 * @return 247 * Pointer to @p buf or @p NULL in case suffix cannot be appended. 248 */ 249 static char * 250 mlx5_glue_path(char *buf, size_t size) 251 { 252 static const char *const bad[] = { "/", ".", "..", NULL }; 253 const char *path = RTE_EAL_PMD_PATH; 254 size_t len = strlen(path); 255 size_t off; 256 int i; 257 258 while (len && path[len - 1] == '/') 259 --len; 260 for (off = len; off && path[off - 1] != '/'; --off) 261 ; 262 for (i = 0; bad[i]; ++i) 263 if (!strncmp(path + off, bad[i], (int)(len - off))) 264 goto error; 265 i = snprintf(buf, size, "%.*s-glue", (int)len, path); 266 if (i == -1 || (size_t)i >= size) 267 goto error; 268 return buf; 269 error: 270 DRV_LOG(ERR, "unable to append \"-glue\" to last component of" 271 " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"), please" 272 " re-configure DPDK"); 273 return NULL; 274 } 275 276 static int 277 mlx5_glue_dlopen(void) 278 { 279 char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")]; 280 void *handle = NULL; 281 282 char const *path[] = { 283 /* 284 * A basic security check is necessary before trusting 285 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH. 286 */ 287 (geteuid() == getuid() && getegid() == getgid() ? 288 getenv("MLX5_GLUE_PATH") : NULL), 289 /* 290 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed 291 * variant, otherwise let dlopen() look up libraries on its 292 * own. 293 */ 294 (*RTE_EAL_PMD_PATH ? 295 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""), 296 }; 297 unsigned int i = 0; 298 void **sym; 299 const char *dlmsg; 300 301 while (!handle && i != RTE_DIM(path)) { 302 const char *end; 303 size_t len; 304 int ret; 305 306 if (!path[i]) { 307 ++i; 308 continue; 309 } 310 end = strpbrk(path[i], ":;"); 311 if (!end) 312 end = path[i] + strlen(path[i]); 313 len = end - path[i]; 314 ret = 0; 315 do { 316 char name[ret + 1]; 317 318 ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE, 319 (int)len, path[i], 320 (!len || *(end - 1) == '/') ? "" : "/"); 321 if (ret == -1) 322 break; 323 if (sizeof(name) != (size_t)ret + 1) 324 continue; 325 DRV_LOG(DEBUG, "Looking for rdma-core glue as " 326 "\"%s\"", name); 327 handle = dlopen(name, RTLD_LAZY); 328 break; 329 } while (1); 330 path[i] = end + 1; 331 if (!*end) 332 ++i; 333 } 334 if (!handle) { 335 rte_errno = EINVAL; 336 dlmsg = dlerror(); 337 if (dlmsg) 338 DRV_LOG(WARNING, "Cannot load glue library: %s", dlmsg); 339 goto glue_error; 340 } 341 sym = dlsym(handle, "mlx5_glue"); 342 if (!sym || !*sym) { 343 rte_errno = EINVAL; 344 dlmsg = dlerror(); 345 if (dlmsg) 346 DRV_LOG(ERR, "Cannot resolve glue symbol: %s", dlmsg); 347 goto glue_error; 348 } 349 mlx5_glue = *sym; 350 return 0; 351 352 glue_error: 353 if (handle) 354 dlclose(handle); 355 return -1; 356 } 357 358 #endif 359 360 /** 361 * Initialization routine for run-time dependency on rdma-core. 362 */ 363 void 364 mlx5_glue_constructor(void) 365 { 366 /* 367 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use 368 * huge pages. Calling ibv_fork_init() during init allows 369 * applications to use fork() safely for purposes other than 370 * using this PMD, which is not supported in forked processes. 371 */ 372 setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); 373 /* Match the size of Rx completion entry to the size of a cacheline. */ 374 if (RTE_CACHE_LINE_SIZE == 128) 375 setenv("MLX5_CQE_SIZE", "128", 0); 376 /* 377 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to 378 * cleanup all the Verbs resources even when the device was removed. 379 */ 380 setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1); 381 382 #ifdef MLX5_GLUE 383 if (mlx5_glue_dlopen() != 0) 384 goto glue_error; 385 #endif 386 387 #ifdef RTE_LIBRTE_MLX5_DEBUG 388 /* Glue structure must not contain any NULL pointers. */ 389 { 390 unsigned int i; 391 392 for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i) 393 MLX5_ASSERT(((const void *const *)mlx5_glue)[i]); 394 } 395 #endif 396 if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) { 397 rte_errno = EINVAL; 398 DRV_LOG(ERR, "rdma-core glue \"%s\" mismatch: \"%s\" is " 399 "required", mlx5_glue->version, MLX5_GLUE_VERSION); 400 goto glue_error; 401 } 402 mlx5_glue->fork_init(); 403 return; 404 405 glue_error: 406 DRV_LOG(WARNING, "Cannot initialize MLX5 common due to missing" 407 " run-time dependency on rdma-core libraries (libibverbs," 408 " libmlx5)"); 409 mlx5_glue = NULL; 410 } 411 412 /** 413 * Validate user arguments for remote PD and CTX. 414 * 415 * @param config 416 * Pointer to device configuration structure. 417 * 418 * @return 419 * 0 on success, a negative errno value otherwise and rte_errno is set. 420 */ 421 int 422 mlx5_os_remote_pd_and_ctx_validate(struct mlx5_common_dev_config *config) 423 { 424 int device_fd = config->device_fd; 425 int pd_handle = config->pd_handle; 426 427 #ifdef HAVE_MLX5_IBV_IMPORT_CTX_PD_AND_MR 428 if (device_fd == MLX5_ARG_UNSET && pd_handle != MLX5_ARG_UNSET) { 429 DRV_LOG(ERR, "Remote PD without CTX is not supported."); 430 rte_errno = EINVAL; 431 return -rte_errno; 432 } 433 if (device_fd != MLX5_ARG_UNSET && pd_handle == MLX5_ARG_UNSET) { 434 DRV_LOG(ERR, "Remote CTX without PD is not supported."); 435 rte_errno = EINVAL; 436 return -rte_errno; 437 } 438 DRV_LOG(DEBUG, "Remote PD and CTX is supported: (cmd_fd=%d, " 439 "pd_handle=%d).", device_fd, pd_handle); 440 #else 441 if (pd_handle != MLX5_ARG_UNSET || device_fd != MLX5_ARG_UNSET) { 442 DRV_LOG(ERR, 443 "Remote PD and CTX is not supported - maybe old rdma-core version?"); 444 rte_errno = ENOTSUP; 445 return -rte_errno; 446 } 447 #endif 448 return 0; 449 } 450 451 /** 452 * Release Protection Domain object. 453 * 454 * @param[out] cdev 455 * Pointer to the mlx5 device. 456 * 457 * @return 458 * 0 on success, a negative errno value otherwise. 459 */ 460 int 461 mlx5_os_pd_release(struct mlx5_common_device *cdev) 462 { 463 if (cdev->config.pd_handle == MLX5_ARG_UNSET) 464 return mlx5_glue->dealloc_pd(cdev->pd); 465 else 466 return mlx5_glue->unimport_pd(cdev->pd); 467 } 468 469 /** 470 * Allocate Protection Domain object. 471 * 472 * @param[out] cdev 473 * Pointer to the mlx5 device. 474 * 475 * @return 476 * 0 on success, a negative errno value otherwise. 477 */ 478 static int 479 mlx5_os_pd_create(struct mlx5_common_device *cdev) 480 { 481 cdev->pd = mlx5_glue->alloc_pd(cdev->ctx); 482 if (cdev->pd == NULL) { 483 DRV_LOG(ERR, "Failed to allocate PD: %s", rte_strerror(errno)); 484 return errno ? -errno : -ENOMEM; 485 } 486 return 0; 487 } 488 489 /** 490 * Import Protection Domain object according to given PD handle. 491 * 492 * @param[out] cdev 493 * Pointer to the mlx5 device. 494 * 495 * @return 496 * 0 on success, a negative errno value otherwise. 497 */ 498 static int 499 mlx5_os_pd_import(struct mlx5_common_device *cdev) 500 { 501 cdev->pd = mlx5_glue->import_pd(cdev->ctx, cdev->config.pd_handle); 502 if (cdev->pd == NULL) { 503 DRV_LOG(ERR, "Failed to import PD using handle=%d: %s", 504 cdev->config.pd_handle, rte_strerror(errno)); 505 return errno ? -errno : -ENOMEM; 506 } 507 return 0; 508 } 509 510 /** 511 * Prepare Protection Domain object and extract its pdn using DV API. 512 * 513 * @param[out] cdev 514 * Pointer to the mlx5 device. 515 * 516 * @return 517 * 0 on success, a negative errno value otherwise and rte_errno is set. 518 */ 519 int 520 mlx5_os_pd_prepare(struct mlx5_common_device *cdev) 521 { 522 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 523 struct mlx5dv_obj obj; 524 struct mlx5dv_pd pd_info; 525 #endif 526 int ret; 527 528 if (cdev->config.pd_handle == MLX5_ARG_UNSET) 529 ret = mlx5_os_pd_create(cdev); 530 else 531 ret = mlx5_os_pd_import(cdev); 532 if (ret) { 533 rte_errno = -ret; 534 return ret; 535 } 536 if (cdev->config.devx == 0) 537 return 0; 538 #ifdef HAVE_IBV_FLOW_DV_SUPPORT 539 obj.pd.in = cdev->pd; 540 obj.pd.out = &pd_info; 541 ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD); 542 if (ret != 0) { 543 DRV_LOG(ERR, "Fail to get PD object info."); 544 rte_errno = errno; 545 claim_zero(mlx5_os_pd_release(cdev)); 546 cdev->pd = NULL; 547 return -rte_errno; 548 } 549 cdev->pdn = pd_info.pdn; 550 return 0; 551 #else 552 DRV_LOG(ERR, "Cannot get pdn - no DV support."); 553 rte_errno = ENOTSUP; 554 return -rte_errno; 555 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */ 556 } 557 558 static struct ibv_device * 559 mlx5_os_get_ibv_device(const struct rte_pci_device *pci_dev) 560 { 561 int n; 562 struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n); 563 struct ibv_device *ibv_match = NULL; 564 uint8_t guid1[32] = {0}; 565 uint8_t guid2[32] = {0}; 566 int ret1, ret2 = -1; 567 struct rte_pci_addr paddr; 568 const struct rte_pci_addr *addr = &pci_dev->addr; 569 bool is_vf_dev = mlx5_dev_is_vf_pci(pci_dev); 570 571 if (ibv_list == NULL || !n) { 572 rte_errno = ENOSYS; 573 if (ibv_list) 574 mlx5_glue->free_device_list(ibv_list); 575 return NULL; 576 } 577 ret1 = mlx5_get_device_guid(addr, guid1, sizeof(guid1)); 578 while (n-- > 0) { 579 DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name); 580 if (mlx5_get_pci_addr(ibv_list[n]->ibdev_path, &paddr) != 0) 581 continue; 582 if (ret1 > 0) 583 ret2 = mlx5_get_device_guid(&paddr, guid2, sizeof(guid2)); 584 /* Bond device can bond secondary PCIe */ 585 if ((strstr(ibv_list[n]->name, "bond") && !is_vf_dev && 586 ((ret1 > 0 && ret2 > 0 && !memcmp(guid1, guid2, sizeof(guid1))) || 587 (addr->domain == paddr.domain && addr->bus == paddr.bus && 588 addr->devid == paddr.devid))) || 589 !rte_pci_addr_cmp(addr, &paddr)) { 590 ibv_match = ibv_list[n]; 591 break; 592 } 593 } 594 if (ibv_match == NULL) { 595 DRV_LOG(WARNING, 596 "No Verbs device matches PCI device " PCI_PRI_FMT "," 597 " are kernel drivers loaded?", 598 addr->domain, addr->bus, addr->devid, addr->function); 599 rte_errno = ENOENT; 600 } 601 mlx5_glue->free_device_list(ibv_list); 602 return ibv_match; 603 } 604 605 /* Try to disable ROCE by Netlink\Devlink. */ 606 static int 607 mlx5_nl_roce_disable(const char *addr) 608 { 609 int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC, 0); 610 int devlink_id; 611 int enable; 612 int ret; 613 614 if (nlsk_fd < 0) 615 return nlsk_fd; 616 devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd); 617 if (devlink_id < 0) { 618 ret = devlink_id; 619 DRV_LOG(DEBUG, 620 "Failed to get devlink id for ROCE operations by Netlink."); 621 goto close; 622 } 623 ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable); 624 if (ret) { 625 DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.", 626 ret); 627 goto close; 628 } else if (!enable) { 629 DRV_LOG(INFO, "ROCE has already disabled(Netlink)."); 630 goto close; 631 } 632 ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0); 633 if (ret) 634 DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret); 635 else 636 DRV_LOG(INFO, "ROCE is disabled by Netlink successfully."); 637 close: 638 close(nlsk_fd); 639 return ret; 640 } 641 642 /* Try to disable ROCE by sysfs. */ 643 static int 644 mlx5_sys_roce_disable(const char *addr) 645 { 646 FILE *file_o; 647 int enable; 648 int ret; 649 650 MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr); 651 file_o = fopen(file_p, "rb"); 652 if (!file_o) { 653 rte_errno = ENOTSUP; 654 return -ENOTSUP; 655 } 656 ret = fscanf(file_o, "%d", &enable); 657 if (ret != 1) { 658 rte_errno = EINVAL; 659 ret = EINVAL; 660 goto close; 661 } else if (!enable) { 662 ret = 0; 663 DRV_LOG(INFO, "ROCE has already disabled(sysfs)."); 664 goto close; 665 } 666 fclose(file_o); 667 file_o = fopen(file_p, "wb"); 668 if (!file_o) { 669 rte_errno = ENOTSUP; 670 return -ENOTSUP; 671 } 672 fprintf(file_o, "0\n"); 673 ret = 0; 674 close: 675 if (ret) 676 DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret); 677 else 678 DRV_LOG(INFO, "ROCE is disabled by sysfs successfully."); 679 fclose(file_o); 680 return ret; 681 } 682 683 static int 684 mlx5_roce_disable(const struct rte_device *dev) 685 { 686 char pci_addr[PCI_PRI_STR_SIZE] = { 0 }; 687 688 if (mlx5_dev_to_pci_str(dev, pci_addr, sizeof(pci_addr)) < 0) 689 return -rte_errno; 690 /* Firstly try to disable ROCE by Netlink and fallback to sysfs. */ 691 if (mlx5_nl_roce_disable(pci_addr) != 0 && 692 mlx5_sys_roce_disable(pci_addr) != 0) 693 return -rte_errno; 694 return 0; 695 } 696 697 static struct ibv_device * 698 mlx5_os_get_ibv_dev(const struct rte_device *dev) 699 { 700 struct ibv_device *ibv; 701 702 if (mlx5_dev_is_pci(dev)) 703 ibv = mlx5_os_get_ibv_device(RTE_DEV_TO_PCI_CONST(dev)); 704 else 705 ibv = mlx5_get_aux_ibv_device(RTE_DEV_TO_AUXILIARY_CONST(dev)); 706 if (ibv == NULL) { 707 rte_errno = ENODEV; 708 DRV_LOG(ERR, "Verbs device not found: %s", dev->name); 709 } 710 return ibv; 711 } 712 713 static struct ibv_device * 714 mlx5_vdpa_get_ibv_dev(const struct rte_device *dev) 715 { 716 struct ibv_device *ibv; 717 int retry; 718 719 if (mlx5_roce_disable(dev) != 0) { 720 DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".", 721 dev->name); 722 return NULL; 723 } 724 /* Wait for the IB device to appear again after reload. */ 725 for (retry = MLX5_VDPA_MAX_RETRIES; retry > 0; --retry) { 726 ibv = mlx5_os_get_ibv_dev(dev); 727 if (ibv != NULL) 728 return ibv; 729 usleep(MLX5_VDPA_USEC); 730 } 731 DRV_LOG(ERR, 732 "Cannot get IB device after disabling RoCE for \"%s\", retries exceed %d.", 733 dev->name, MLX5_VDPA_MAX_RETRIES); 734 rte_errno = EAGAIN; 735 return NULL; 736 } 737 738 static int 739 mlx5_config_doorbell_mapping_env(int dbnc) 740 { 741 char *env; 742 int value; 743 744 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 745 /* Get environment variable to store. */ 746 env = getenv(MLX5_SHUT_UP_BF); 747 value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET; 748 if (dbnc == MLX5_ARG_UNSET) 749 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1); 750 else 751 setenv(MLX5_SHUT_UP_BF, 752 dbnc == MLX5_SQ_DB_NCACHED ? "1" : "0", 1); 753 return value; 754 } 755 756 static void 757 mlx5_restore_doorbell_mapping_env(int value) 758 { 759 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 760 /* Restore the original environment variable state. */ 761 if (value == MLX5_ARG_UNSET) 762 unsetenv(MLX5_SHUT_UP_BF); 763 else 764 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); 765 } 766 767 /** 768 * Function API to open IB device. 769 * 770 * @param cdev 771 * Pointer to the mlx5 device. 772 * @param classes 773 * Chosen classes come from device arguments. 774 * 775 * @return 776 * Pointer to ibv_context on success, NULL otherwise and rte_errno is set. 777 */ 778 static struct ibv_context * 779 mlx5_open_device(struct mlx5_common_device *cdev, uint32_t classes) 780 { 781 struct ibv_device *ibv; 782 struct ibv_context *ctx = NULL; 783 int dbmap_env; 784 785 MLX5_ASSERT(cdev->config.device_fd == MLX5_ARG_UNSET); 786 if (classes & MLX5_CLASS_VDPA) 787 ibv = mlx5_vdpa_get_ibv_dev(cdev->dev); 788 else 789 ibv = mlx5_os_get_ibv_dev(cdev->dev); 790 if (!ibv) 791 return NULL; 792 DRV_LOG(INFO, "Dev information matches for device \"%s\".", ibv->name); 793 /* 794 * Configure environment variable "MLX5_BF_SHUT_UP" before the device 795 * creation. The rdma_core library checks the variable at device 796 * creation and stores the result internally. 797 */ 798 dbmap_env = mlx5_config_doorbell_mapping_env(cdev->config.dbnc); 799 /* Try to open IB device with DV first, then usual Verbs. */ 800 errno = 0; 801 ctx = mlx5_glue->dv_open_device(ibv); 802 if (ctx) { 803 cdev->config.devx = 1; 804 } else if (classes == MLX5_CLASS_ETH) { 805 /* The environment variable is still configured. */ 806 ctx = mlx5_glue->open_device(ibv); 807 if (ctx == NULL) 808 goto error; 809 } else { 810 goto error; 811 } 812 /* The device is created, no need for environment. */ 813 mlx5_restore_doorbell_mapping_env(dbmap_env); 814 return ctx; 815 error: 816 rte_errno = errno ? errno : ENODEV; 817 /* The device creation is failed, no need for environment. */ 818 mlx5_restore_doorbell_mapping_env(dbmap_env); 819 DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name); 820 return NULL; 821 } 822 823 /** 824 * Function API to import IB device. 825 * 826 * @param cdev 827 * Pointer to the mlx5 device. 828 * 829 * @return 830 * Pointer to ibv_context on success, NULL otherwise and rte_errno is set. 831 */ 832 static struct ibv_context * 833 mlx5_import_device(struct mlx5_common_device *cdev) 834 { 835 struct ibv_context *ctx = NULL; 836 837 MLX5_ASSERT(cdev->config.device_fd != MLX5_ARG_UNSET); 838 ctx = mlx5_glue->import_device(cdev->config.device_fd); 839 if (!ctx) { 840 DRV_LOG(ERR, "Failed to import device for fd=%d: %s", 841 cdev->config.device_fd, rte_strerror(errno)); 842 rte_errno = errno; 843 } 844 return ctx; 845 } 846 847 /** 848 * Function API to prepare IB device. 849 * 850 * @param cdev 851 * Pointer to the mlx5 device. 852 * @param classes 853 * Chosen classes come from device arguments. 854 * 855 * @return 856 * 0 on success, a negative errno value otherwise and rte_errno is set. 857 */ 858 int 859 mlx5_os_open_device(struct mlx5_common_device *cdev, uint32_t classes) 860 { 861 862 struct ibv_context *ctx = NULL; 863 864 if (cdev->config.device_fd == MLX5_ARG_UNSET) 865 ctx = mlx5_open_device(cdev, classes); 866 else 867 ctx = mlx5_import_device(cdev); 868 if (ctx == NULL) 869 return -rte_errno; 870 /* Hint libmlx5 to use PMD allocator for data plane resources */ 871 mlx5_set_context_attr(cdev->dev, ctx); 872 cdev->ctx = ctx; 873 return 0; 874 } 875 876 int 877 mlx5_get_device_guid(const struct rte_pci_addr *dev, uint8_t *guid, size_t len) 878 { 879 char tmp[512]; 880 char cur_ifname[IF_NAMESIZE + 1]; 881 FILE *id_file; 882 DIR *dir; 883 struct dirent *ptr; 884 int ret; 885 886 if (guid == NULL || len < sizeof(u_int64_t) + 1) 887 return -1; 888 memset(guid, 0, len); 889 snprintf(tmp, sizeof(tmp), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/net", 890 dev->domain, dev->bus, dev->devid, dev->function); 891 dir = opendir(tmp); 892 if (dir == NULL) 893 return -1; 894 /* Traverse to identify PF interface */ 895 do { 896 ptr = readdir(dir); 897 if (ptr == NULL || ptr->d_type != DT_DIR) { 898 closedir(dir); 899 return -1; 900 } 901 } while (strchr(ptr->d_name, '.') || strchr(ptr->d_name, '_') || 902 strchr(ptr->d_name, 'v')); 903 snprintf(cur_ifname, sizeof(cur_ifname), "%s", ptr->d_name); 904 closedir(dir); 905 snprintf(tmp + strlen(tmp), sizeof(tmp) - strlen(tmp), 906 "/%s/phys_switch_id", cur_ifname); 907 /* Older OFED like 5.3 doesn't support read */ 908 id_file = fopen(tmp, "r"); 909 if (!id_file) 910 return 0; 911 ret = fscanf(id_file, "%16s", guid); 912 fclose(id_file); 913 return ret; 914 } 915 916 /* 917 * Create direct mkey using the kernel ibv_reg_mr API and wrap it with a new 918 * indirect mkey created by the DevX API. 919 * This mkey should be used for DevX commands requesting mkey as a parameter. 920 */ 921 int 922 mlx5_os_wrapped_mkey_create(void *ctx, void *pd, uint32_t pdn, void *addr, 923 size_t length, struct mlx5_pmd_wrapped_mr *pmd_mr) 924 { 925 struct mlx5_klm klm = { 926 .byte_count = length, 927 .address = (uintptr_t)addr, 928 }; 929 struct mlx5_devx_mkey_attr mkey_attr = { 930 .pd = pdn, 931 .klm_array = &klm, 932 .klm_num = 1, 933 }; 934 struct mlx5_devx_obj *mkey; 935 struct ibv_mr *ibv_mr = mlx5_glue->reg_mr(pd, addr, length, 936 IBV_ACCESS_LOCAL_WRITE | 937 (haswell_broadwell_cpu ? 0 : 938 IBV_ACCESS_RELAXED_ORDERING)); 939 940 if (!ibv_mr) { 941 rte_errno = errno; 942 return -rte_errno; 943 } 944 klm.mkey = ibv_mr->lkey; 945 mkey_attr.addr = (uintptr_t)addr; 946 mkey_attr.size = length; 947 mkey = mlx5_devx_cmd_mkey_create(ctx, &mkey_attr); 948 if (!mkey) { 949 claim_zero(mlx5_glue->dereg_mr(ibv_mr)); 950 return -rte_errno; 951 } 952 pmd_mr->addr = addr; 953 pmd_mr->len = length; 954 pmd_mr->obj = (void *)ibv_mr; 955 pmd_mr->imkey = mkey; 956 pmd_mr->lkey = mkey->id; 957 return 0; 958 } 959 960 void 961 mlx5_os_wrapped_mkey_destroy(struct mlx5_pmd_wrapped_mr *pmd_mr) 962 { 963 if (!pmd_mr) 964 return; 965 if (pmd_mr->imkey) 966 claim_zero(mlx5_devx_cmd_destroy(pmd_mr->imkey)); 967 if (pmd_mr->obj) 968 claim_zero(mlx5_glue->dereg_mr(pmd_mr->obj)); 969 memset(pmd_mr, 0, sizeof(*pmd_mr)); 970 } 971 972 /** 973 * Rte_intr_handle create and init helper. 974 * 975 * @param[in] mode 976 * interrupt instance can be shared between primary and secondary 977 * processes or not. 978 * @param[in] set_fd_nonblock 979 * Whether to set fd to O_NONBLOCK. 980 * @param[in] fd 981 * Fd to set in created intr_handle. 982 * @param[in] cb 983 * Callback to register for intr_handle. 984 * @param[in] cb_arg 985 * Callback argument for cb. 986 * 987 * @return 988 * - Interrupt handle on success. 989 * - NULL on failure, with rte_errno set. 990 */ 991 struct rte_intr_handle * 992 mlx5_os_interrupt_handler_create(int mode, bool set_fd_nonblock, int fd, 993 rte_intr_callback_fn cb, void *cb_arg) 994 { 995 struct rte_intr_handle *tmp_intr_handle; 996 int ret, flags; 997 998 tmp_intr_handle = rte_intr_instance_alloc(mode); 999 if (!tmp_intr_handle) { 1000 rte_errno = ENOMEM; 1001 goto err; 1002 } 1003 if (set_fd_nonblock) { 1004 flags = fcntl(fd, F_GETFL); 1005 ret = fcntl(fd, F_SETFL, flags | O_NONBLOCK); 1006 if (ret) { 1007 rte_errno = errno; 1008 goto err; 1009 } 1010 } 1011 ret = rte_intr_fd_set(tmp_intr_handle, fd); 1012 if (ret) 1013 goto err; 1014 ret = rte_intr_type_set(tmp_intr_handle, RTE_INTR_HANDLE_EXT); 1015 if (ret) 1016 goto err; 1017 ret = rte_intr_callback_register(tmp_intr_handle, cb, cb_arg); 1018 if (ret) { 1019 rte_errno = -ret; 1020 goto err; 1021 } 1022 return tmp_intr_handle; 1023 err: 1024 rte_intr_instance_free(tmp_intr_handle); 1025 return NULL; 1026 } 1027 1028 /* Safe unregistration for interrupt callback. */ 1029 static void 1030 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle, 1031 rte_intr_callback_fn cb_fn, void *cb_arg) 1032 { 1033 uint64_t twait = 0; 1034 uint64_t start = 0; 1035 1036 do { 1037 int ret; 1038 1039 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg); 1040 if (ret >= 0) 1041 return; 1042 if (ret != -EAGAIN) { 1043 DRV_LOG(INFO, "failed to unregister interrupt" 1044 " handler (error: %d)", ret); 1045 MLX5_ASSERT(false); 1046 return; 1047 } 1048 if (twait) { 1049 struct timespec onems; 1050 1051 /* Wait one millisecond and try again. */ 1052 onems.tv_sec = 0; 1053 onems.tv_nsec = NS_PER_S / MS_PER_S; 1054 nanosleep(&onems, 0); 1055 /* Check whether one second elapsed. */ 1056 if ((rte_get_timer_cycles() - start) <= twait) 1057 continue; 1058 } else { 1059 /* 1060 * We get the amount of timer ticks for one second. 1061 * If this amount elapsed it means we spent one 1062 * second in waiting. This branch is executed once 1063 * on first iteration. 1064 */ 1065 twait = rte_get_timer_hz(); 1066 MLX5_ASSERT(twait); 1067 } 1068 /* 1069 * Timeout elapsed, show message (once a second) and retry. 1070 * We have no other acceptable option here, if we ignore 1071 * the unregistering return code the handler will not 1072 * be unregistered, fd will be closed and we may get the 1073 * crush. Hanging and messaging in the loop seems not to be 1074 * the worst choice. 1075 */ 1076 DRV_LOG(INFO, "Retrying to unregister interrupt handler"); 1077 start = rte_get_timer_cycles(); 1078 } while (true); 1079 } 1080 1081 /** 1082 * Rte_intr_handle destroy helper. 1083 * 1084 * @param[in] intr_handle 1085 * Rte_intr_handle to destroy. 1086 * @param[in] cb 1087 * Callback which is registered to intr_handle. 1088 * @param[in] cb_arg 1089 * Callback argument for cb. 1090 * 1091 */ 1092 void 1093 mlx5_os_interrupt_handler_destroy(struct rte_intr_handle *intr_handle, 1094 rte_intr_callback_fn cb, void *cb_arg) 1095 { 1096 if (rte_intr_fd_get(intr_handle) >= 0) 1097 mlx5_intr_callback_unregister(intr_handle, cb, cb_arg); 1098 rte_intr_instance_free(intr_handle); 1099 } 1100