1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2020 Mellanox Technologies, Ltd
3 */
4
5 #include <sys/types.h>
6 #include <unistd.h>
7 #include <string.h>
8 #include <stdio.h>
9 #ifdef RTE_IBVERBS_LINK_DLOPEN
10 #include <dlfcn.h>
11 #endif
12 #include <dirent.h>
13 #include <net/if.h>
14 #include <fcntl.h>
15
16 #include <rte_errno.h>
17 #include <rte_string_fns.h>
18 #include <bus_pci_driver.h>
19 #include <bus_auxiliary_driver.h>
20
21 #include "mlx5_common.h"
22 #include "mlx5_nl.h"
23 #include "mlx5_common_log.h"
24 #include "mlx5_common_private.h"
25 #include "mlx5_common_defs.h"
26 #include "mlx5_common_os.h"
27 #include "mlx5_glue.h"
28
29 #ifdef MLX5_GLUE
30 const struct mlx5_glue *mlx5_glue;
31 #endif
32
33 int
mlx5_get_pci_addr(const char * dev_path,struct rte_pci_addr * pci_addr)34 mlx5_get_pci_addr(const char *dev_path, struct rte_pci_addr *pci_addr)
35 {
36 FILE *file;
37 char line[32];
38 int rc = -ENOENT;
39 MKSTR(path, "%s/device/uevent", dev_path);
40
41 file = fopen(path, "rb");
42 if (file == NULL) {
43 rte_errno = errno;
44 return -rte_errno;
45 }
46 while (fgets(line, sizeof(line), file) == line) {
47 size_t len = strlen(line);
48
49 /* Truncate long lines. */
50 if (len == (sizeof(line) - 1)) {
51 while (line[(len - 1)] != '\n') {
52 int ret = fgetc(file);
53
54 if (ret == EOF)
55 goto exit;
56 line[(len - 1)] = ret;
57 }
58 /* No match for long lines. */
59 continue;
60 }
61 /* Extract information. */
62 if (sscanf(line,
63 "PCI_SLOT_NAME="
64 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
65 &pci_addr->domain,
66 &pci_addr->bus,
67 &pci_addr->devid,
68 &pci_addr->function) == 4) {
69 rc = 0;
70 break;
71 }
72 }
73 exit:
74 fclose(file);
75 if (rc)
76 rte_errno = -rc;
77 return rc;
78 }
79
80 /**
81 * Extract port name, as a number, from sysfs or netlink information.
82 *
83 * @param[in] port_name_in
84 * String representing the port name.
85 * @param[out] port_info_out
86 * Port information, including port name as a number and port name
87 * type if recognized
88 *
89 * @return
90 * port_name field set according to recognized name format.
91 */
92 void
mlx5_translate_port_name(const char * port_name_in,struct mlx5_switch_info * port_info_out)93 mlx5_translate_port_name(const char *port_name_in,
94 struct mlx5_switch_info *port_info_out)
95 {
96 char ctrl = 0, pf_c1, pf_c2, vf_c1, vf_c2, eol;
97 char *end;
98 int sc_items;
99 int32_t ctrl_num = -1;
100
101 sc_items = sscanf(port_name_in, "%c%d", &ctrl, &ctrl_num);
102 if (sc_items == 2 && ctrl == 'c') {
103 port_info_out->ctrl_num = ctrl_num;
104 port_name_in++; /* 'c' */
105 port_name_in += snprintf(NULL, 0, "%d",
106 port_info_out->ctrl_num);
107 }
108 /* Check for port-name as a string of the form pf0vf0 or pf0sf0 */
109 sc_items = sscanf(port_name_in, "%c%c%d%c%c%d%c",
110 &pf_c1, &pf_c2, &port_info_out->pf_num,
111 &vf_c1, &vf_c2, &port_info_out->port_name, &eol);
112 if (sc_items == 6 && pf_c1 == 'p' && pf_c2 == 'f') {
113 if (vf_c1 == 'v' && vf_c2 == 'f') {
114 /* Kernel ver >= 5.0 or OFED ver >= 4.6 */
115 port_info_out->name_type =
116 MLX5_PHYS_PORT_NAME_TYPE_PFVF;
117 return;
118 }
119 if (vf_c1 == 's' && vf_c2 == 'f') {
120 /* Kernel ver >= 5.11 or OFED ver >= 5.1 */
121 port_info_out->name_type =
122 MLX5_PHYS_PORT_NAME_TYPE_PFSF;
123 return;
124 }
125 }
126 /*
127 * Check for port-name as a string of the form p0
128 * (support kernel ver >= 5.0, or OFED ver >= 4.6).
129 */
130 sc_items = sscanf(port_name_in, "%c%d%c",
131 &pf_c1, &port_info_out->port_name, &eol);
132 if (sc_items == 2 && pf_c1 == 'p') {
133 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
134 return;
135 }
136 /*
137 * Check for port-name as a string of the form pf0
138 * (support kernel ver >= 5.7 for HPF representor on BF).
139 */
140 sc_items = sscanf(port_name_in, "%c%c%d%c",
141 &pf_c1, &pf_c2, &port_info_out->pf_num, &eol);
142 if (sc_items == 3 && pf_c1 == 'p' && pf_c2 == 'f') {
143 port_info_out->port_name = -1;
144 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFHPF;
145 return;
146 }
147 /* Check for port-name as a number (support kernel ver < 5.0 */
148 errno = 0;
149 port_info_out->port_name = strtol(port_name_in, &end, 0);
150 if (!errno &&
151 (size_t)(end - port_name_in) == strlen(port_name_in)) {
152 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
153 return;
154 }
155 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
156 }
157
158 int
mlx5_get_ifname_sysfs(const char * ibdev_path,char * ifname)159 mlx5_get_ifname_sysfs(const char *ibdev_path, char *ifname)
160 {
161 DIR *dir;
162 struct dirent *dent;
163 unsigned int dev_type = 0;
164 unsigned int dev_port_prev = ~0u;
165 char match[IF_NAMESIZE] = "";
166
167 MLX5_ASSERT(ibdev_path);
168 {
169 MKSTR(path, "%s/device/net", ibdev_path);
170
171 dir = opendir(path);
172 if (dir == NULL) {
173 rte_errno = errno;
174 return -rte_errno;
175 }
176 }
177 while ((dent = readdir(dir)) != NULL) {
178 char *name = dent->d_name;
179 FILE *file;
180 unsigned int dev_port;
181 int r;
182
183 if ((name[0] == '.') &&
184 ((name[1] == '\0') ||
185 ((name[1] == '.') && (name[2] == '\0'))))
186 continue;
187
188 MKSTR(path, "%s/device/net/%s/%s",
189 ibdev_path, name,
190 (dev_type ? "dev_id" : "dev_port"));
191
192 file = fopen(path, "rb");
193 if (file == NULL) {
194 if (errno != ENOENT)
195 continue;
196 /*
197 * Switch to dev_id when dev_port does not exist as
198 * is the case with Linux kernel versions < 3.15.
199 */
200 try_dev_id:
201 match[0] = '\0';
202 if (dev_type)
203 break;
204 dev_type = 1;
205 dev_port_prev = ~0u;
206 rewinddir(dir);
207 continue;
208 }
209 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
210 fclose(file);
211 if (r != 1)
212 continue;
213 /*
214 * Switch to dev_id when dev_port returns the same value for
215 * all ports. May happen when using a MOFED release older than
216 * 3.0 with a Linux kernel >= 3.15.
217 */
218 if (dev_port == dev_port_prev)
219 goto try_dev_id;
220 dev_port_prev = dev_port;
221 if (dev_port == 0)
222 strlcpy(match, name, IF_NAMESIZE);
223 }
224 closedir(dir);
225 if (match[0] == '\0') {
226 rte_errno = ENOENT;
227 return -rte_errno;
228 }
229 strncpy(ifname, match, IF_NAMESIZE);
230 return 0;
231 }
232
233 #ifdef MLX5_GLUE
234
235 /**
236 * Suffix RTE_EAL_PMD_PATH with "-glue".
237 *
238 * This function performs a sanity check on RTE_EAL_PMD_PATH before
239 * suffixing its last component.
240 *
241 * @param buf[out]
242 * Output buffer, should be large enough otherwise NULL is returned.
243 * @param size
244 * Size of @p out.
245 *
246 * @return
247 * Pointer to @p buf or @p NULL in case suffix cannot be appended.
248 */
249 static char *
mlx5_glue_path(char * buf,size_t size)250 mlx5_glue_path(char *buf, size_t size)
251 {
252 static const char *const bad[] = { "/", ".", "..", NULL };
253 const char *path = RTE_EAL_PMD_PATH;
254 size_t len = strlen(path);
255 size_t off;
256 int i;
257
258 while (len && path[len - 1] == '/')
259 --len;
260 for (off = len; off && path[off - 1] != '/'; --off)
261 ;
262 for (i = 0; bad[i]; ++i)
263 if (!strncmp(path + off, bad[i], (int)(len - off)))
264 goto error;
265 i = snprintf(buf, size, "%.*s-glue", (int)len, path);
266 if (i == -1 || (size_t)i >= size)
267 goto error;
268 return buf;
269 error:
270 DRV_LOG(ERR, "unable to append \"-glue\" to last component of"
271 " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"), please"
272 " re-configure DPDK");
273 return NULL;
274 }
275
276 static int
mlx5_glue_dlopen(void)277 mlx5_glue_dlopen(void)
278 {
279 char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
280 void *handle = NULL;
281
282 char const *path[] = {
283 /*
284 * A basic security check is necessary before trusting
285 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
286 */
287 (geteuid() == getuid() && getegid() == getgid() ?
288 getenv("MLX5_GLUE_PATH") : NULL),
289 /*
290 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
291 * variant, otherwise let dlopen() look up libraries on its
292 * own.
293 */
294 (*RTE_EAL_PMD_PATH ?
295 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
296 };
297 unsigned int i = 0;
298 void **sym;
299 const char *dlmsg;
300
301 while (!handle && i != RTE_DIM(path)) {
302 const char *end;
303 size_t len;
304 int ret;
305
306 if (!path[i]) {
307 ++i;
308 continue;
309 }
310 end = strpbrk(path[i], ":;");
311 if (!end)
312 end = path[i] + strlen(path[i]);
313 len = end - path[i];
314 ret = 0;
315 do {
316 char name[ret + 1];
317
318 ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
319 (int)len, path[i],
320 (!len || *(end - 1) == '/') ? "" : "/");
321 if (ret == -1)
322 break;
323 if (sizeof(name) != (size_t)ret + 1)
324 continue;
325 DRV_LOG(DEBUG, "Looking for rdma-core glue as "
326 "\"%s\"", name);
327 handle = dlopen(name, RTLD_LAZY);
328 break;
329 } while (1);
330 path[i] = end + 1;
331 if (!*end)
332 ++i;
333 }
334 if (!handle) {
335 rte_errno = EINVAL;
336 dlmsg = dlerror();
337 if (dlmsg)
338 DRV_LOG(WARNING, "Cannot load glue library: %s", dlmsg);
339 goto glue_error;
340 }
341 sym = dlsym(handle, "mlx5_glue");
342 if (!sym || !*sym) {
343 rte_errno = EINVAL;
344 dlmsg = dlerror();
345 if (dlmsg)
346 DRV_LOG(ERR, "Cannot resolve glue symbol: %s", dlmsg);
347 goto glue_error;
348 }
349 mlx5_glue = *sym;
350 return 0;
351
352 glue_error:
353 if (handle)
354 dlclose(handle);
355 return -1;
356 }
357
358 #endif
359
360 /**
361 * Initialization routine for run-time dependency on rdma-core.
362 */
363 void
mlx5_glue_constructor(void)364 mlx5_glue_constructor(void)
365 {
366 /*
367 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
368 * huge pages. Calling ibv_fork_init() during init allows
369 * applications to use fork() safely for purposes other than
370 * using this PMD, which is not supported in forked processes.
371 */
372 setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
373 /* Match the size of Rx completion entry to the size of a cacheline. */
374 if (RTE_CACHE_LINE_SIZE == 128)
375 setenv("MLX5_CQE_SIZE", "128", 0);
376 /*
377 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to
378 * cleanup all the Verbs resources even when the device was removed.
379 */
380 setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1);
381
382 #ifdef MLX5_GLUE
383 if (mlx5_glue_dlopen() != 0)
384 goto glue_error;
385 #endif
386
387 #ifdef RTE_LIBRTE_MLX5_DEBUG
388 /* Glue structure must not contain any NULL pointers. */
389 {
390 unsigned int i;
391
392 for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
393 MLX5_ASSERT(((const void *const *)mlx5_glue)[i]);
394 }
395 #endif
396 if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
397 rte_errno = EINVAL;
398 DRV_LOG(ERR, "rdma-core glue \"%s\" mismatch: \"%s\" is "
399 "required", mlx5_glue->version, MLX5_GLUE_VERSION);
400 goto glue_error;
401 }
402 mlx5_glue->fork_init();
403 return;
404
405 glue_error:
406 DRV_LOG(WARNING, "Cannot initialize MLX5 common due to missing"
407 " run-time dependency on rdma-core libraries (libibverbs,"
408 " libmlx5)");
409 mlx5_glue = NULL;
410 }
411
412 /**
413 * Validate user arguments for remote PD and CTX.
414 *
415 * @param config
416 * Pointer to device configuration structure.
417 *
418 * @return
419 * 0 on success, a negative errno value otherwise and rte_errno is set.
420 */
421 int
mlx5_os_remote_pd_and_ctx_validate(struct mlx5_common_dev_config * config)422 mlx5_os_remote_pd_and_ctx_validate(struct mlx5_common_dev_config *config)
423 {
424 int device_fd = config->device_fd;
425 int pd_handle = config->pd_handle;
426
427 #ifdef HAVE_MLX5_IBV_IMPORT_CTX_PD_AND_MR
428 if (device_fd == MLX5_ARG_UNSET && pd_handle != MLX5_ARG_UNSET) {
429 DRV_LOG(ERR, "Remote PD without CTX is not supported.");
430 rte_errno = EINVAL;
431 return -rte_errno;
432 }
433 if (device_fd != MLX5_ARG_UNSET && pd_handle == MLX5_ARG_UNSET) {
434 DRV_LOG(ERR, "Remote CTX without PD is not supported.");
435 rte_errno = EINVAL;
436 return -rte_errno;
437 }
438 DRV_LOG(DEBUG, "Remote PD and CTX is supported: (cmd_fd=%d, "
439 "pd_handle=%d).", device_fd, pd_handle);
440 #else
441 if (pd_handle != MLX5_ARG_UNSET || device_fd != MLX5_ARG_UNSET) {
442 DRV_LOG(ERR,
443 "Remote PD and CTX is not supported - maybe old rdma-core version?");
444 rte_errno = ENOTSUP;
445 return -rte_errno;
446 }
447 #endif
448 return 0;
449 }
450
451 /**
452 * Release Protection Domain object.
453 *
454 * @param[out] cdev
455 * Pointer to the mlx5 device.
456 *
457 * @return
458 * 0 on success, a negative errno value otherwise.
459 */
460 int
mlx5_os_pd_release(struct mlx5_common_device * cdev)461 mlx5_os_pd_release(struct mlx5_common_device *cdev)
462 {
463 if (cdev->config.pd_handle == MLX5_ARG_UNSET)
464 return mlx5_glue->dealloc_pd(cdev->pd);
465 else
466 return mlx5_glue->unimport_pd(cdev->pd);
467 }
468
469 /**
470 * Allocate Protection Domain object.
471 *
472 * @param[out] cdev
473 * Pointer to the mlx5 device.
474 *
475 * @return
476 * 0 on success, a negative errno value otherwise.
477 */
478 static int
mlx5_os_pd_create(struct mlx5_common_device * cdev)479 mlx5_os_pd_create(struct mlx5_common_device *cdev)
480 {
481 cdev->pd = mlx5_glue->alloc_pd(cdev->ctx);
482 if (cdev->pd == NULL) {
483 DRV_LOG(ERR, "Failed to allocate PD: %s", rte_strerror(errno));
484 return errno ? -errno : -ENOMEM;
485 }
486 return 0;
487 }
488
489 /**
490 * Import Protection Domain object according to given PD handle.
491 *
492 * @param[out] cdev
493 * Pointer to the mlx5 device.
494 *
495 * @return
496 * 0 on success, a negative errno value otherwise.
497 */
498 static int
mlx5_os_pd_import(struct mlx5_common_device * cdev)499 mlx5_os_pd_import(struct mlx5_common_device *cdev)
500 {
501 cdev->pd = mlx5_glue->import_pd(cdev->ctx, cdev->config.pd_handle);
502 if (cdev->pd == NULL) {
503 DRV_LOG(ERR, "Failed to import PD using handle=%d: %s",
504 cdev->config.pd_handle, rte_strerror(errno));
505 return errno ? -errno : -ENOMEM;
506 }
507 return 0;
508 }
509
510 /**
511 * Prepare Protection Domain object and extract its pdn using DV API.
512 *
513 * @param[out] cdev
514 * Pointer to the mlx5 device.
515 *
516 * @return
517 * 0 on success, a negative errno value otherwise and rte_errno is set.
518 */
519 int
mlx5_os_pd_prepare(struct mlx5_common_device * cdev)520 mlx5_os_pd_prepare(struct mlx5_common_device *cdev)
521 {
522 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
523 struct mlx5dv_obj obj;
524 struct mlx5dv_pd pd_info;
525 #endif
526 int ret;
527
528 if (cdev->config.pd_handle == MLX5_ARG_UNSET)
529 ret = mlx5_os_pd_create(cdev);
530 else
531 ret = mlx5_os_pd_import(cdev);
532 if (ret) {
533 rte_errno = -ret;
534 return ret;
535 }
536 if (cdev->config.devx == 0)
537 return 0;
538 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
539 obj.pd.in = cdev->pd;
540 obj.pd.out = &pd_info;
541 ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
542 if (ret != 0) {
543 DRV_LOG(ERR, "Fail to get PD object info.");
544 rte_errno = errno;
545 claim_zero(mlx5_os_pd_release(cdev));
546 cdev->pd = NULL;
547 return -rte_errno;
548 }
549 cdev->pdn = pd_info.pdn;
550 return 0;
551 #else
552 DRV_LOG(ERR, "Cannot get pdn - no DV support.");
553 rte_errno = ENOTSUP;
554 return -rte_errno;
555 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
556 }
557
558 static struct ibv_device *
mlx5_os_get_ibv_device(const struct rte_pci_device * pci_dev)559 mlx5_os_get_ibv_device(const struct rte_pci_device *pci_dev)
560 {
561 int n;
562 struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
563 struct ibv_device *ibv_match = NULL;
564 uint8_t guid1[32] = {0};
565 uint8_t guid2[32] = {0};
566 int ret1, ret2 = -1;
567 struct rte_pci_addr paddr;
568 const struct rte_pci_addr *addr = &pci_dev->addr;
569 bool is_vf_dev = mlx5_dev_is_vf_pci(pci_dev);
570
571 if (ibv_list == NULL || !n) {
572 rte_errno = ENOSYS;
573 if (ibv_list)
574 mlx5_glue->free_device_list(ibv_list);
575 return NULL;
576 }
577 ret1 = mlx5_get_device_guid(addr, guid1, sizeof(guid1));
578 while (n-- > 0) {
579 DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
580 if (mlx5_get_pci_addr(ibv_list[n]->ibdev_path, &paddr) != 0)
581 continue;
582 if (ret1 > 0)
583 ret2 = mlx5_get_device_guid(&paddr, guid2, sizeof(guid2));
584 /* Bond device can bond secondary PCIe */
585 if ((strstr(ibv_list[n]->name, "bond") && !is_vf_dev &&
586 ((ret1 > 0 && ret2 > 0 && !memcmp(guid1, guid2, sizeof(guid1))) ||
587 (addr->domain == paddr.domain && addr->bus == paddr.bus &&
588 addr->devid == paddr.devid))) ||
589 !rte_pci_addr_cmp(addr, &paddr)) {
590 ibv_match = ibv_list[n];
591 break;
592 }
593 }
594 if (ibv_match == NULL) {
595 DRV_LOG(WARNING,
596 "No Verbs device matches PCI device " PCI_PRI_FMT ","
597 " are kernel drivers loaded?",
598 addr->domain, addr->bus, addr->devid, addr->function);
599 rte_errno = ENOENT;
600 }
601 mlx5_glue->free_device_list(ibv_list);
602 return ibv_match;
603 }
604
605 /* Try to disable ROCE by Netlink\Devlink. */
606 static int
mlx5_nl_roce_disable(const char * addr)607 mlx5_nl_roce_disable(const char *addr)
608 {
609 int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC, 0);
610 int devlink_id;
611 int enable;
612 int ret;
613
614 if (nlsk_fd < 0)
615 return nlsk_fd;
616 devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
617 if (devlink_id < 0) {
618 ret = devlink_id;
619 DRV_LOG(DEBUG,
620 "Failed to get devlink id for ROCE operations by Netlink.");
621 goto close;
622 }
623 ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
624 if (ret) {
625 DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
626 ret);
627 goto close;
628 } else if (!enable) {
629 DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
630 goto close;
631 }
632 ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
633 if (ret)
634 DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
635 else
636 DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
637 close:
638 close(nlsk_fd);
639 return ret;
640 }
641
642 /* Try to disable ROCE by sysfs. */
643 static int
mlx5_sys_roce_disable(const char * addr)644 mlx5_sys_roce_disable(const char *addr)
645 {
646 FILE *file_o;
647 int enable;
648 int ret;
649
650 MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
651 file_o = fopen(file_p, "rb");
652 if (!file_o) {
653 rte_errno = ENOTSUP;
654 return -ENOTSUP;
655 }
656 ret = fscanf(file_o, "%d", &enable);
657 if (ret != 1) {
658 rte_errno = EINVAL;
659 ret = EINVAL;
660 goto close;
661 } else if (!enable) {
662 ret = 0;
663 DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
664 goto close;
665 }
666 fclose(file_o);
667 file_o = fopen(file_p, "wb");
668 if (!file_o) {
669 rte_errno = ENOTSUP;
670 return -ENOTSUP;
671 }
672 fprintf(file_o, "0\n");
673 ret = 0;
674 close:
675 if (ret)
676 DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
677 else
678 DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
679 fclose(file_o);
680 return ret;
681 }
682
683 static int
mlx5_roce_disable(const struct rte_device * dev)684 mlx5_roce_disable(const struct rte_device *dev)
685 {
686 char pci_addr[PCI_PRI_STR_SIZE] = { 0 };
687
688 if (mlx5_dev_to_pci_str(dev, pci_addr, sizeof(pci_addr)) < 0)
689 return -rte_errno;
690 /* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
691 if (mlx5_nl_roce_disable(pci_addr) != 0 &&
692 mlx5_sys_roce_disable(pci_addr) != 0)
693 return -rte_errno;
694 return 0;
695 }
696
697 static struct ibv_device *
mlx5_os_get_ibv_dev(const struct rte_device * dev)698 mlx5_os_get_ibv_dev(const struct rte_device *dev)
699 {
700 struct ibv_device *ibv;
701
702 if (mlx5_dev_is_pci(dev))
703 ibv = mlx5_os_get_ibv_device(RTE_DEV_TO_PCI_CONST(dev));
704 else
705 ibv = mlx5_get_aux_ibv_device(RTE_DEV_TO_AUXILIARY_CONST(dev));
706 if (ibv == NULL) {
707 rte_errno = ENODEV;
708 DRV_LOG(ERR, "Verbs device not found: %s", dev->name);
709 }
710 return ibv;
711 }
712
713 static struct ibv_device *
mlx5_vdpa_get_ibv_dev(const struct rte_device * dev)714 mlx5_vdpa_get_ibv_dev(const struct rte_device *dev)
715 {
716 struct ibv_device *ibv;
717 int retry;
718
719 if (mlx5_roce_disable(dev) != 0) {
720 DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
721 dev->name);
722 return NULL;
723 }
724 /* Wait for the IB device to appear again after reload. */
725 for (retry = MLX5_VDPA_MAX_RETRIES; retry > 0; --retry) {
726 ibv = mlx5_os_get_ibv_dev(dev);
727 if (ibv != NULL)
728 return ibv;
729 usleep(MLX5_VDPA_USEC);
730 }
731 DRV_LOG(ERR,
732 "Cannot get IB device after disabling RoCE for \"%s\", retries exceed %d.",
733 dev->name, MLX5_VDPA_MAX_RETRIES);
734 rte_errno = EAGAIN;
735 return NULL;
736 }
737
738 static int
mlx5_config_doorbell_mapping_env(int dbnc)739 mlx5_config_doorbell_mapping_env(int dbnc)
740 {
741 char *env;
742 int value;
743
744 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
745 /* Get environment variable to store. */
746 env = getenv(MLX5_SHUT_UP_BF);
747 value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
748 if (dbnc == MLX5_ARG_UNSET)
749 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
750 else
751 setenv(MLX5_SHUT_UP_BF,
752 dbnc == MLX5_SQ_DB_NCACHED ? "1" : "0", 1);
753 return value;
754 }
755
756 static void
mlx5_restore_doorbell_mapping_env(int value)757 mlx5_restore_doorbell_mapping_env(int value)
758 {
759 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
760 /* Restore the original environment variable state. */
761 if (value == MLX5_ARG_UNSET)
762 unsetenv(MLX5_SHUT_UP_BF);
763 else
764 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
765 }
766
767 /**
768 * Function API to open IB device.
769 *
770 * @param cdev
771 * Pointer to the mlx5 device.
772 * @param classes
773 * Chosen classes come from device arguments.
774 *
775 * @return
776 * Pointer to ibv_context on success, NULL otherwise and rte_errno is set.
777 */
778 static struct ibv_context *
mlx5_open_device(struct mlx5_common_device * cdev,uint32_t classes)779 mlx5_open_device(struct mlx5_common_device *cdev, uint32_t classes)
780 {
781 struct ibv_device *ibv;
782 struct ibv_context *ctx = NULL;
783 int dbmap_env;
784
785 MLX5_ASSERT(cdev->config.device_fd == MLX5_ARG_UNSET);
786 if (classes & MLX5_CLASS_VDPA)
787 ibv = mlx5_vdpa_get_ibv_dev(cdev->dev);
788 else
789 ibv = mlx5_os_get_ibv_dev(cdev->dev);
790 if (!ibv)
791 return NULL;
792 DRV_LOG(INFO, "Dev information matches for device \"%s\".", ibv->name);
793 /*
794 * Configure environment variable "MLX5_BF_SHUT_UP" before the device
795 * creation. The rdma_core library checks the variable at device
796 * creation and stores the result internally.
797 */
798 dbmap_env = mlx5_config_doorbell_mapping_env(cdev->config.dbnc);
799 /* Try to open IB device with DV first, then usual Verbs. */
800 errno = 0;
801 ctx = mlx5_glue->dv_open_device(ibv);
802 if (ctx) {
803 cdev->config.devx = 1;
804 } else if (classes == MLX5_CLASS_ETH) {
805 /* The environment variable is still configured. */
806 ctx = mlx5_glue->open_device(ibv);
807 if (ctx == NULL)
808 goto error;
809 } else {
810 goto error;
811 }
812 /* The device is created, no need for environment. */
813 mlx5_restore_doorbell_mapping_env(dbmap_env);
814 return ctx;
815 error:
816 rte_errno = errno ? errno : ENODEV;
817 /* The device creation is failed, no need for environment. */
818 mlx5_restore_doorbell_mapping_env(dbmap_env);
819 DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
820 return NULL;
821 }
822
823 /**
824 * Function API to import IB device.
825 *
826 * @param cdev
827 * Pointer to the mlx5 device.
828 *
829 * @return
830 * Pointer to ibv_context on success, NULL otherwise and rte_errno is set.
831 */
832 static struct ibv_context *
mlx5_import_device(struct mlx5_common_device * cdev)833 mlx5_import_device(struct mlx5_common_device *cdev)
834 {
835 struct ibv_context *ctx = NULL;
836
837 MLX5_ASSERT(cdev->config.device_fd != MLX5_ARG_UNSET);
838 ctx = mlx5_glue->import_device(cdev->config.device_fd);
839 if (!ctx) {
840 DRV_LOG(ERR, "Failed to import device for fd=%d: %s",
841 cdev->config.device_fd, rte_strerror(errno));
842 rte_errno = errno;
843 }
844 return ctx;
845 }
846
847 /**
848 * Function API to prepare IB device.
849 *
850 * @param cdev
851 * Pointer to the mlx5 device.
852 * @param classes
853 * Chosen classes come from device arguments.
854 *
855 * @return
856 * 0 on success, a negative errno value otherwise and rte_errno is set.
857 */
858 int
mlx5_os_open_device(struct mlx5_common_device * cdev,uint32_t classes)859 mlx5_os_open_device(struct mlx5_common_device *cdev, uint32_t classes)
860 {
861
862 struct ibv_context *ctx = NULL;
863
864 if (cdev->config.device_fd == MLX5_ARG_UNSET)
865 ctx = mlx5_open_device(cdev, classes);
866 else
867 ctx = mlx5_import_device(cdev);
868 if (ctx == NULL)
869 return -rte_errno;
870 /* Hint libmlx5 to use PMD allocator for data plane resources */
871 mlx5_set_context_attr(cdev->dev, ctx);
872 cdev->ctx = ctx;
873 return 0;
874 }
875
876 int
mlx5_get_device_guid(const struct rte_pci_addr * dev,uint8_t * guid,size_t len)877 mlx5_get_device_guid(const struct rte_pci_addr *dev, uint8_t *guid, size_t len)
878 {
879 char tmp[512];
880 char cur_ifname[IF_NAMESIZE + 1];
881 FILE *id_file;
882 DIR *dir;
883 struct dirent *ptr;
884 int ret;
885
886 if (guid == NULL || len < sizeof(u_int64_t) + 1)
887 return -1;
888 memset(guid, 0, len);
889 snprintf(tmp, sizeof(tmp), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/net",
890 dev->domain, dev->bus, dev->devid, dev->function);
891 dir = opendir(tmp);
892 if (dir == NULL)
893 return -1;
894 /* Traverse to identify PF interface */
895 do {
896 ptr = readdir(dir);
897 if (ptr == NULL || ptr->d_type != DT_DIR) {
898 closedir(dir);
899 return -1;
900 }
901 } while (strchr(ptr->d_name, '.') || strchr(ptr->d_name, '_') ||
902 strchr(ptr->d_name, 'v'));
903 snprintf(cur_ifname, sizeof(cur_ifname), "%s", ptr->d_name);
904 closedir(dir);
905 snprintf(tmp + strlen(tmp), sizeof(tmp) - strlen(tmp),
906 "/%s/phys_switch_id", cur_ifname);
907 /* Older OFED like 5.3 doesn't support read */
908 id_file = fopen(tmp, "r");
909 if (!id_file)
910 return 0;
911 ret = fscanf(id_file, "%16s", guid);
912 fclose(id_file);
913 return ret;
914 }
915
916 /*
917 * Create direct mkey using the kernel ibv_reg_mr API and wrap it with a new
918 * indirect mkey created by the DevX API.
919 * This mkey should be used for DevX commands requesting mkey as a parameter.
920 */
921 int
mlx5_os_wrapped_mkey_create(void * ctx,void * pd,uint32_t pdn,void * addr,size_t length,struct mlx5_pmd_wrapped_mr * pmd_mr)922 mlx5_os_wrapped_mkey_create(void *ctx, void *pd, uint32_t pdn, void *addr,
923 size_t length, struct mlx5_pmd_wrapped_mr *pmd_mr)
924 {
925 struct mlx5_klm klm = {
926 .byte_count = length,
927 .address = (uintptr_t)addr,
928 };
929 struct mlx5_devx_mkey_attr mkey_attr = {
930 .pd = pdn,
931 .klm_array = &klm,
932 .klm_num = 1,
933 };
934 struct mlx5_devx_obj *mkey;
935 struct ibv_mr *ibv_mr = mlx5_glue->reg_mr(pd, addr, length,
936 IBV_ACCESS_LOCAL_WRITE |
937 (haswell_broadwell_cpu ? 0 :
938 IBV_ACCESS_RELAXED_ORDERING));
939
940 if (!ibv_mr) {
941 rte_errno = errno;
942 return -rte_errno;
943 }
944 klm.mkey = ibv_mr->lkey;
945 mkey_attr.addr = (uintptr_t)addr;
946 mkey_attr.size = length;
947 mkey = mlx5_devx_cmd_mkey_create(ctx, &mkey_attr);
948 if (!mkey) {
949 claim_zero(mlx5_glue->dereg_mr(ibv_mr));
950 return -rte_errno;
951 }
952 pmd_mr->addr = addr;
953 pmd_mr->len = length;
954 pmd_mr->obj = (void *)ibv_mr;
955 pmd_mr->imkey = mkey;
956 pmd_mr->lkey = mkey->id;
957 return 0;
958 }
959
960 void
mlx5_os_wrapped_mkey_destroy(struct mlx5_pmd_wrapped_mr * pmd_mr)961 mlx5_os_wrapped_mkey_destroy(struct mlx5_pmd_wrapped_mr *pmd_mr)
962 {
963 if (!pmd_mr)
964 return;
965 if (pmd_mr->imkey)
966 claim_zero(mlx5_devx_cmd_destroy(pmd_mr->imkey));
967 if (pmd_mr->obj)
968 claim_zero(mlx5_glue->dereg_mr(pmd_mr->obj));
969 memset(pmd_mr, 0, sizeof(*pmd_mr));
970 }
971
972 /**
973 * Rte_intr_handle create and init helper.
974 *
975 * @param[in] mode
976 * interrupt instance can be shared between primary and secondary
977 * processes or not.
978 * @param[in] set_fd_nonblock
979 * Whether to set fd to O_NONBLOCK.
980 * @param[in] fd
981 * Fd to set in created intr_handle.
982 * @param[in] cb
983 * Callback to register for intr_handle.
984 * @param[in] cb_arg
985 * Callback argument for cb.
986 *
987 * @return
988 * - Interrupt handle on success.
989 * - NULL on failure, with rte_errno set.
990 */
991 struct rte_intr_handle *
mlx5_os_interrupt_handler_create(int mode,bool set_fd_nonblock,int fd,rte_intr_callback_fn cb,void * cb_arg)992 mlx5_os_interrupt_handler_create(int mode, bool set_fd_nonblock, int fd,
993 rte_intr_callback_fn cb, void *cb_arg)
994 {
995 struct rte_intr_handle *tmp_intr_handle;
996 int ret, flags;
997
998 tmp_intr_handle = rte_intr_instance_alloc(mode);
999 if (!tmp_intr_handle) {
1000 rte_errno = ENOMEM;
1001 goto err;
1002 }
1003 if (set_fd_nonblock) {
1004 flags = fcntl(fd, F_GETFL);
1005 ret = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
1006 if (ret) {
1007 rte_errno = errno;
1008 goto err;
1009 }
1010 }
1011 ret = rte_intr_fd_set(tmp_intr_handle, fd);
1012 if (ret)
1013 goto err;
1014 ret = rte_intr_type_set(tmp_intr_handle, RTE_INTR_HANDLE_EXT);
1015 if (ret)
1016 goto err;
1017 ret = rte_intr_callback_register(tmp_intr_handle, cb, cb_arg);
1018 if (ret) {
1019 rte_errno = -ret;
1020 goto err;
1021 }
1022 return tmp_intr_handle;
1023 err:
1024 rte_intr_instance_free(tmp_intr_handle);
1025 return NULL;
1026 }
1027
1028 /* Safe unregistration for interrupt callback. */
1029 static void
mlx5_intr_callback_unregister(const struct rte_intr_handle * handle,rte_intr_callback_fn cb_fn,void * cb_arg)1030 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
1031 rte_intr_callback_fn cb_fn, void *cb_arg)
1032 {
1033 uint64_t twait = 0;
1034 uint64_t start = 0;
1035
1036 do {
1037 int ret;
1038
1039 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
1040 if (ret >= 0)
1041 return;
1042 if (ret != -EAGAIN) {
1043 DRV_LOG(INFO, "failed to unregister interrupt"
1044 " handler (error: %d)", ret);
1045 MLX5_ASSERT(false);
1046 return;
1047 }
1048 if (twait) {
1049 struct timespec onems;
1050
1051 /* Wait one millisecond and try again. */
1052 onems.tv_sec = 0;
1053 onems.tv_nsec = NS_PER_S / MS_PER_S;
1054 nanosleep(&onems, 0);
1055 /* Check whether one second elapsed. */
1056 if ((rte_get_timer_cycles() - start) <= twait)
1057 continue;
1058 } else {
1059 /*
1060 * We get the amount of timer ticks for one second.
1061 * If this amount elapsed it means we spent one
1062 * second in waiting. This branch is executed once
1063 * on first iteration.
1064 */
1065 twait = rte_get_timer_hz();
1066 MLX5_ASSERT(twait);
1067 }
1068 /*
1069 * Timeout elapsed, show message (once a second) and retry.
1070 * We have no other acceptable option here, if we ignore
1071 * the unregistering return code the handler will not
1072 * be unregistered, fd will be closed and we may get the
1073 * crush. Hanging and messaging in the loop seems not to be
1074 * the worst choice.
1075 */
1076 DRV_LOG(INFO, "Retrying to unregister interrupt handler");
1077 start = rte_get_timer_cycles();
1078 } while (true);
1079 }
1080
1081 /**
1082 * Rte_intr_handle destroy helper.
1083 *
1084 * @param[in] intr_handle
1085 * Rte_intr_handle to destroy.
1086 * @param[in] cb
1087 * Callback which is registered to intr_handle.
1088 * @param[in] cb_arg
1089 * Callback argument for cb.
1090 *
1091 */
1092 void
mlx5_os_interrupt_handler_destroy(struct rte_intr_handle * intr_handle,rte_intr_callback_fn cb,void * cb_arg)1093 mlx5_os_interrupt_handler_destroy(struct rte_intr_handle *intr_handle,
1094 rte_intr_callback_fn cb, void *cb_arg)
1095 {
1096 if (rte_intr_fd_get(intr_handle) >= 0)
1097 mlx5_intr_callback_unregister(intr_handle, cb, cb_arg);
1098 rte_intr_instance_free(intr_handle);
1099 }
1100