xref: /dpdk/drivers/common/mlx5/linux/mlx5_common_os.c (revision f956d3d4c33cdfac5e352f457050029bd5c9b8a8)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2020 Mellanox Technologies, Ltd
3  */
4 
5 #include <sys/types.h>
6 #include <unistd.h>
7 #include <string.h>
8 #include <stdio.h>
9 #ifdef RTE_IBVERBS_LINK_DLOPEN
10 #include <dlfcn.h>
11 #endif
12 #include <dirent.h>
13 #include <net/if.h>
14 
15 #include <rte_errno.h>
16 #include <rte_string_fns.h>
17 #include <rte_bus_pci.h>
18 #include <rte_bus_auxiliary.h>
19 
20 #include "mlx5_common.h"
21 #include "mlx5_nl.h"
22 #include "mlx5_common_log.h"
23 #include "mlx5_common_private.h"
24 #include "mlx5_common_defs.h"
25 #include "mlx5_common_os.h"
26 #include "mlx5_glue.h"
27 
28 #ifdef MLX5_GLUE
/*
 * Glue table pointer for builds where MLX5_GLUE is defined; it is assigned
 * by mlx5_glue_dlopen() from the "mlx5_glue" symbol of the loaded library.
 */
const struct mlx5_glue *mlx5_glue;
30 #endif
31 
32 int
33 mlx5_get_pci_addr(const char *dev_path, struct rte_pci_addr *pci_addr)
34 {
35 	FILE *file;
36 	char line[32];
37 	int rc = -ENOENT;
38 	MKSTR(path, "%s/device/uevent", dev_path);
39 
40 	file = fopen(path, "rb");
41 	if (file == NULL) {
42 		rte_errno = errno;
43 		return -rte_errno;
44 	}
45 	while (fgets(line, sizeof(line), file) == line) {
46 		size_t len = strlen(line);
47 
48 		/* Truncate long lines. */
49 		if (len == (sizeof(line) - 1)) {
50 			while (line[(len - 1)] != '\n') {
51 				int ret = fgetc(file);
52 
53 				if (ret == EOF)
54 					goto exit;
55 				line[(len - 1)] = ret;
56 			}
57 			/* No match for long lines. */
58 			continue;
59 		}
60 		/* Extract information. */
61 		if (sscanf(line,
62 			   "PCI_SLOT_NAME="
63 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
64 			   &pci_addr->domain,
65 			   &pci_addr->bus,
66 			   &pci_addr->devid,
67 			   &pci_addr->function) == 4) {
68 			rc = 0;
69 			break;
70 		}
71 	}
72 exit:
73 	fclose(file);
74 	if (rc)
75 		rte_errno = -rc;
76 	return rc;
77 }
78 
79 /**
80  * Extract port name, as a number, from sysfs or netlink information.
81  *
82  * @param[in] port_name_in
83  *   String representing the port name.
84  * @param[out] port_info_out
85  *   Port information, including port name as a number and port name
86  *   type if recognized
87  *
88  * @return
89  *   port_name field set according to recognized name format.
90  */
91 void
92 mlx5_translate_port_name(const char *port_name_in,
93 			 struct mlx5_switch_info *port_info_out)
94 {
95 	char ctrl = 0, pf_c1, pf_c2, vf_c1, vf_c2, eol;
96 	char *end;
97 	int sc_items;
98 
99 	sc_items = sscanf(port_name_in, "%c%d",
100 			  &ctrl, &port_info_out->ctrl_num);
101 	if (sc_items == 2 && ctrl == 'c') {
102 		port_name_in++; /* 'c' */
103 		port_name_in += snprintf(NULL, 0, "%d",
104 					  port_info_out->ctrl_num);
105 	}
106 	/* Check for port-name as a string of the form pf0vf0 or pf0sf0 */
107 	sc_items = sscanf(port_name_in, "%c%c%d%c%c%d%c",
108 			  &pf_c1, &pf_c2, &port_info_out->pf_num,
109 			  &vf_c1, &vf_c2, &port_info_out->port_name, &eol);
110 	if (sc_items == 6 && pf_c1 == 'p' && pf_c2 == 'f') {
111 		if (vf_c1 == 'v' && vf_c2 == 'f') {
112 			/* Kernel ver >= 5.0 or OFED ver >= 4.6 */
113 			port_info_out->name_type =
114 					MLX5_PHYS_PORT_NAME_TYPE_PFVF;
115 			return;
116 		}
117 		if (vf_c1 == 's' && vf_c2 == 'f') {
118 			/* Kernel ver >= 5.11 or OFED ver >= 5.1 */
119 			port_info_out->name_type =
120 					MLX5_PHYS_PORT_NAME_TYPE_PFSF;
121 			return;
122 		}
123 	}
124 	/*
125 	 * Check for port-name as a string of the form p0
126 	 * (support kernel ver >= 5.0, or OFED ver >= 4.6).
127 	 */
128 	sc_items = sscanf(port_name_in, "%c%d%c",
129 			  &pf_c1, &port_info_out->port_name, &eol);
130 	if (sc_items == 2 && pf_c1 == 'p') {
131 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
132 		return;
133 	}
134 	/*
135 	 * Check for port-name as a string of the form pf0
136 	 * (support kernel ver >= 5.7 for HPF representor on BF).
137 	 */
138 	sc_items = sscanf(port_name_in, "%c%c%d%c",
139 			  &pf_c1, &pf_c2, &port_info_out->pf_num, &eol);
140 	if (sc_items == 3 && pf_c1 == 'p' && pf_c2 == 'f') {
141 		port_info_out->port_name = -1;
142 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFHPF;
143 		return;
144 	}
145 	/* Check for port-name as a number (support kernel ver < 5.0 */
146 	errno = 0;
147 	port_info_out->port_name = strtol(port_name_in, &end, 0);
148 	if (!errno &&
149 	    (size_t)(end - port_name_in) == strlen(port_name_in)) {
150 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
151 		return;
152 	}
153 	port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
154 }
155 
156 int
157 mlx5_get_ifname_sysfs(const char *ibdev_path, char *ifname)
158 {
159 	DIR *dir;
160 	struct dirent *dent;
161 	unsigned int dev_type = 0;
162 	unsigned int dev_port_prev = ~0u;
163 	char match[IF_NAMESIZE] = "";
164 
165 	MLX5_ASSERT(ibdev_path);
166 	{
167 		MKSTR(path, "%s/device/net", ibdev_path);
168 
169 		dir = opendir(path);
170 		if (dir == NULL) {
171 			rte_errno = errno;
172 			return -rte_errno;
173 		}
174 	}
175 	while ((dent = readdir(dir)) != NULL) {
176 		char *name = dent->d_name;
177 		FILE *file;
178 		unsigned int dev_port;
179 		int r;
180 
181 		if ((name[0] == '.') &&
182 		    ((name[1] == '\0') ||
183 		     ((name[1] == '.') && (name[2] == '\0'))))
184 			continue;
185 
186 		MKSTR(path, "%s/device/net/%s/%s",
187 		      ibdev_path, name,
188 		      (dev_type ? "dev_id" : "dev_port"));
189 
190 		file = fopen(path, "rb");
191 		if (file == NULL) {
192 			if (errno != ENOENT)
193 				continue;
194 			/*
195 			 * Switch to dev_id when dev_port does not exist as
196 			 * is the case with Linux kernel versions < 3.15.
197 			 */
198 try_dev_id:
199 			match[0] = '\0';
200 			if (dev_type)
201 				break;
202 			dev_type = 1;
203 			dev_port_prev = ~0u;
204 			rewinddir(dir);
205 			continue;
206 		}
207 		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
208 		fclose(file);
209 		if (r != 1)
210 			continue;
211 		/*
212 		 * Switch to dev_id when dev_port returns the same value for
213 		 * all ports. May happen when using a MOFED release older than
214 		 * 3.0 with a Linux kernel >= 3.15.
215 		 */
216 		if (dev_port == dev_port_prev)
217 			goto try_dev_id;
218 		dev_port_prev = dev_port;
219 		if (dev_port == 0)
220 			strlcpy(match, name, IF_NAMESIZE);
221 	}
222 	closedir(dir);
223 	if (match[0] == '\0') {
224 		rte_errno = ENOENT;
225 		return -rte_errno;
226 	}
227 	strncpy(ifname, match, IF_NAMESIZE);
228 	return 0;
229 }
230 
231 #ifdef MLX5_GLUE
232 
233 /**
234  * Suffix RTE_EAL_PMD_PATH with "-glue".
235  *
236  * This function performs a sanity check on RTE_EAL_PMD_PATH before
237  * suffixing its last component.
238  *
239  * @param buf[out]
240  *   Output buffer, should be large enough otherwise NULL is returned.
241  * @param size
242  *   Size of @p out.
243  *
244  * @return
245  *   Pointer to @p buf or @p NULL in case suffix cannot be appended.
246  */
247 static char *
248 mlx5_glue_path(char *buf, size_t size)
249 {
250 	static const char *const bad[] = { "/", ".", "..", NULL };
251 	const char *path = RTE_EAL_PMD_PATH;
252 	size_t len = strlen(path);
253 	size_t off;
254 	int i;
255 
256 	while (len && path[len - 1] == '/')
257 		--len;
258 	for (off = len; off && path[off - 1] != '/'; --off)
259 		;
260 	for (i = 0; bad[i]; ++i)
261 		if (!strncmp(path + off, bad[i], (int)(len - off)))
262 			goto error;
263 	i = snprintf(buf, size, "%.*s-glue", (int)len, path);
264 	if (i == -1 || (size_t)i >= size)
265 		goto error;
266 	return buf;
267 error:
268 	RTE_LOG(ERR, PMD, "unable to append \"-glue\" to last component of"
269 		" RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"), please"
270 		" re-configure DPDK");
271 	return NULL;
272 }
273 
274 static int
275 mlx5_glue_dlopen(void)
276 {
277 	char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
278 	void *handle = NULL;
279 
280 	char const *path[] = {
281 		/*
282 		 * A basic security check is necessary before trusting
283 		 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
284 		 */
285 		(geteuid() == getuid() && getegid() == getgid() ?
286 		 getenv("MLX5_GLUE_PATH") : NULL),
287 		/*
288 		 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
289 		 * variant, otherwise let dlopen() look up libraries on its
290 		 * own.
291 		 */
292 		(*RTE_EAL_PMD_PATH ?
293 		 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
294 	};
295 	unsigned int i = 0;
296 	void **sym;
297 	const char *dlmsg;
298 
299 	while (!handle && i != RTE_DIM(path)) {
300 		const char *end;
301 		size_t len;
302 		int ret;
303 
304 		if (!path[i]) {
305 			++i;
306 			continue;
307 		}
308 		end = strpbrk(path[i], ":;");
309 		if (!end)
310 			end = path[i] + strlen(path[i]);
311 		len = end - path[i];
312 		ret = 0;
313 		do {
314 			char name[ret + 1];
315 
316 			ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
317 				       (int)len, path[i],
318 				       (!len || *(end - 1) == '/') ? "" : "/");
319 			if (ret == -1)
320 				break;
321 			if (sizeof(name) != (size_t)ret + 1)
322 				continue;
323 			DRV_LOG(DEBUG, "Looking for rdma-core glue as "
324 				"\"%s\"", name);
325 			handle = dlopen(name, RTLD_LAZY);
326 			break;
327 		} while (1);
328 		path[i] = end + 1;
329 		if (!*end)
330 			++i;
331 	}
332 	if (!handle) {
333 		rte_errno = EINVAL;
334 		dlmsg = dlerror();
335 		if (dlmsg)
336 			DRV_LOG(WARNING, "Cannot load glue library: %s", dlmsg);
337 		goto glue_error;
338 	}
339 	sym = dlsym(handle, "mlx5_glue");
340 	if (!sym || !*sym) {
341 		rte_errno = EINVAL;
342 		dlmsg = dlerror();
343 		if (dlmsg)
344 			DRV_LOG(ERR, "Cannot resolve glue symbol: %s", dlmsg);
345 		goto glue_error;
346 	}
347 	mlx5_glue = *sym;
348 	return 0;
349 
350 glue_error:
351 	if (handle)
352 		dlclose(handle);
353 	return -1;
354 }
355 
356 #endif
357 
358 /**
359  * Initialization routine for run-time dependency on rdma-core.
360  */
361 void
362 mlx5_glue_constructor(void)
363 {
364 	/*
365 	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
366 	 * huge pages. Calling ibv_fork_init() during init allows
367 	 * applications to use fork() safely for purposes other than
368 	 * using this PMD, which is not supported in forked processes.
369 	 */
370 	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
371 	/* Match the size of Rx completion entry to the size of a cacheline. */
372 	if (RTE_CACHE_LINE_SIZE == 128)
373 		setenv("MLX5_CQE_SIZE", "128", 0);
374 	/*
375 	 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to
376 	 * cleanup all the Verbs resources even when the device was removed.
377 	 */
378 	setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1);
379 
380 #ifdef MLX5_GLUE
381 	if (mlx5_glue_dlopen() != 0)
382 		goto glue_error;
383 #endif
384 
385 #ifdef RTE_LIBRTE_MLX5_DEBUG
386 	/* Glue structure must not contain any NULL pointers. */
387 	{
388 		unsigned int i;
389 
390 		for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
391 			MLX5_ASSERT(((const void *const *)mlx5_glue)[i]);
392 	}
393 #endif
394 	if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
395 		rte_errno = EINVAL;
396 		DRV_LOG(ERR, "rdma-core glue \"%s\" mismatch: \"%s\" is "
397 			"required", mlx5_glue->version, MLX5_GLUE_VERSION);
398 		goto glue_error;
399 	}
400 	mlx5_glue->fork_init();
401 	return;
402 
403 glue_error:
404 	DRV_LOG(WARNING, "Cannot initialize MLX5 common due to missing"
405 		" run-time dependency on rdma-core libraries (libibverbs,"
406 		" libmlx5)");
407 	mlx5_glue = NULL;
408 }
409 
410 /**
411  * Validate user arguments for remote PD and CTX.
412  *
413  * @param config
414  *   Pointer to device configuration structure.
415  *
416  * @return
417  *   0 on success, a negative errno value otherwise and rte_errno is set.
418  */
419 int
420 mlx5_os_remote_pd_and_ctx_validate(struct mlx5_common_dev_config *config)
421 {
422 	int device_fd = config->device_fd;
423 	int pd_handle = config->pd_handle;
424 
425 #ifdef HAVE_MLX5_IBV_IMPORT_CTX_PD_AND_MR
426 	if (device_fd == MLX5_ARG_UNSET && pd_handle != MLX5_ARG_UNSET) {
427 		DRV_LOG(ERR, "Remote PD without CTX is not supported.");
428 		rte_errno = EINVAL;
429 		return -rte_errno;
430 	}
431 	if (device_fd != MLX5_ARG_UNSET && pd_handle == MLX5_ARG_UNSET) {
432 		DRV_LOG(ERR, "Remote CTX without PD is not supported.");
433 		rte_errno = EINVAL;
434 		return -rte_errno;
435 	}
436 	DRV_LOG(DEBUG, "Remote PD and CTX is supported: (cmd_fd=%d, "
437 		"pd_handle=%d).", device_fd, pd_handle);
438 #else
439 	if (pd_handle != MLX5_ARG_UNSET || device_fd != MLX5_ARG_UNSET) {
440 		DRV_LOG(ERR,
441 			"Remote PD and CTX is not supported - maybe old rdma-core version?");
442 		rte_errno = ENOTSUP;
443 		return -rte_errno;
444 	}
445 #endif
446 	return 0;
447 }
448 
449 /**
450  * Release Protection Domain object.
451  *
452  * @param[out] cdev
453  *   Pointer to the mlx5 device.
454  *
455  * @return
456  *   0 on success, a negative errno value otherwise.
457  */
458 int
459 mlx5_os_pd_release(struct mlx5_common_device *cdev)
460 {
461 	if (cdev->config.pd_handle == MLX5_ARG_UNSET)
462 		return mlx5_glue->dealloc_pd(cdev->pd);
463 	else
464 		return mlx5_glue->unimport_pd(cdev->pd);
465 }
466 
467 /**
468  * Allocate Protection Domain object.
469  *
470  * @param[out] cdev
471  *   Pointer to the mlx5 device.
472  *
473  * @return
474  *   0 on success, a negative errno value otherwise.
475  */
476 static int
477 mlx5_os_pd_create(struct mlx5_common_device *cdev)
478 {
479 	cdev->pd = mlx5_glue->alloc_pd(cdev->ctx);
480 	if (cdev->pd == NULL) {
481 		DRV_LOG(ERR, "Failed to allocate PD: %s", rte_strerror(errno));
482 		return errno ? -errno : -ENOMEM;
483 	}
484 	return 0;
485 }
486 
487 /**
488  * Import Protection Domain object according to given PD handle.
489  *
490  * @param[out] cdev
491  *   Pointer to the mlx5 device.
492  *
493  * @return
494  *   0 on success, a negative errno value otherwise.
495  */
496 static int
497 mlx5_os_pd_import(struct mlx5_common_device *cdev)
498 {
499 	cdev->pd = mlx5_glue->import_pd(cdev->ctx, cdev->config.pd_handle);
500 	if (cdev->pd == NULL) {
501 		DRV_LOG(ERR, "Failed to import PD using handle=%d: %s",
502 			cdev->config.pd_handle, rte_strerror(errno));
503 		return errno ? -errno : -ENOMEM;
504 	}
505 	return 0;
506 }
507 
508 /**
509  * Prepare Protection Domain object and extract its pdn using DV API.
510  *
511  * @param[out] cdev
512  *   Pointer to the mlx5 device.
513  *
514  * @return
515  *   0 on success, a negative errno value otherwise and rte_errno is set.
516  */
517 int
518 mlx5_os_pd_prepare(struct mlx5_common_device *cdev)
519 {
520 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
521 	struct mlx5dv_obj obj;
522 	struct mlx5dv_pd pd_info;
523 #endif
524 	int ret;
525 
526 	if (cdev->config.pd_handle == MLX5_ARG_UNSET)
527 		ret = mlx5_os_pd_create(cdev);
528 	else
529 		ret = mlx5_os_pd_import(cdev);
530 	if (ret) {
531 		rte_errno = -ret;
532 		return ret;
533 	}
534 	if (cdev->config.devx == 0)
535 		return 0;
536 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
537 	obj.pd.in = cdev->pd;
538 	obj.pd.out = &pd_info;
539 	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
540 	if (ret != 0) {
541 		DRV_LOG(ERR, "Fail to get PD object info.");
542 		rte_errno = errno;
543 		claim_zero(mlx5_os_pd_release(cdev));
544 		cdev->pd = NULL;
545 		return -rte_errno;
546 	}
547 	cdev->pdn = pd_info.pdn;
548 	return 0;
549 #else
550 	DRV_LOG(ERR, "Cannot get pdn - no DV support.");
551 	rte_errno = ENOTSUP;
552 	return -rte_errno;
553 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
554 }
555 
556 static struct ibv_device *
557 mlx5_os_get_ibv_device(const struct rte_pci_addr *addr)
558 {
559 	int n;
560 	struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
561 	struct ibv_device *ibv_match = NULL;
562 	uint8_t guid1[32] = {0};
563 	uint8_t guid2[32] = {0};
564 	int ret1, ret2 = -1;
565 	struct rte_pci_addr paddr;
566 
567 	if (ibv_list == NULL || !n) {
568 		rte_errno = ENOSYS;
569 		if (ibv_list)
570 			mlx5_glue->free_device_list(ibv_list);
571 		return NULL;
572 	}
573 	ret1 = mlx5_get_device_guid(addr, guid1, sizeof(guid1));
574 	while (n-- > 0) {
575 		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
576 		if (mlx5_get_pci_addr(ibv_list[n]->ibdev_path, &paddr) != 0)
577 			continue;
578 		if (ret1 > 0)
579 			ret2 = mlx5_get_device_guid(&paddr, guid2, sizeof(guid2));
580 		/* Bond device can bond secondary PCIe */
581 		if ((strstr(ibv_list[n]->name, "bond") &&
582 		    ((ret1 > 0 && ret2 > 0 && !memcmp(guid1, guid2, sizeof(guid1))) ||
583 		    (addr->domain == paddr.domain && addr->bus == paddr.bus &&
584 		     addr->devid == paddr.devid))) ||
585 		     !rte_pci_addr_cmp(addr, &paddr)) {
586 			ibv_match = ibv_list[n];
587 			break;
588 		}
589 	}
590 	if (ibv_match == NULL) {
591 		DRV_LOG(WARNING,
592 			"No Verbs device matches PCI device " PCI_PRI_FMT ","
593 			" are kernel drivers loaded?",
594 			addr->domain, addr->bus, addr->devid, addr->function);
595 		rte_errno = ENOENT;
596 	}
597 	mlx5_glue->free_device_list(ibv_list);
598 	return ibv_match;
599 }
600 
601 /* Try to disable ROCE by Netlink\Devlink. */
602 static int
603 mlx5_nl_roce_disable(const char *addr)
604 {
605 	int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC, 0);
606 	int devlink_id;
607 	int enable;
608 	int ret;
609 
610 	if (nlsk_fd < 0)
611 		return nlsk_fd;
612 	devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
613 	if (devlink_id < 0) {
614 		ret = devlink_id;
615 		DRV_LOG(DEBUG,
616 			"Failed to get devlink id for ROCE operations by Netlink.");
617 		goto close;
618 	}
619 	ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
620 	if (ret) {
621 		DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
622 			ret);
623 		goto close;
624 	} else if (!enable) {
625 		DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
626 		goto close;
627 	}
628 	ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
629 	if (ret)
630 		DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
631 	else
632 		DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
633 close:
634 	close(nlsk_fd);
635 	return ret;
636 }
637 
638 /* Try to disable ROCE by sysfs. */
639 static int
640 mlx5_sys_roce_disable(const char *addr)
641 {
642 	FILE *file_o;
643 	int enable;
644 	int ret;
645 
646 	MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
647 	file_o = fopen(file_p, "rb");
648 	if (!file_o) {
649 		rte_errno = ENOTSUP;
650 		return -ENOTSUP;
651 	}
652 	ret = fscanf(file_o, "%d", &enable);
653 	if (ret != 1) {
654 		rte_errno = EINVAL;
655 		ret = EINVAL;
656 		goto close;
657 	} else if (!enable) {
658 		ret = 0;
659 		DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
660 		goto close;
661 	}
662 	fclose(file_o);
663 	file_o = fopen(file_p, "wb");
664 	if (!file_o) {
665 		rte_errno = ENOTSUP;
666 		return -ENOTSUP;
667 	}
668 	fprintf(file_o, "0\n");
669 	ret = 0;
670 close:
671 	if (ret)
672 		DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
673 	else
674 		DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
675 	fclose(file_o);
676 	return ret;
677 }
678 
679 static int
680 mlx5_roce_disable(const struct rte_device *dev)
681 {
682 	char pci_addr[PCI_PRI_STR_SIZE] = { 0 };
683 
684 	if (mlx5_dev_to_pci_str(dev, pci_addr, sizeof(pci_addr)) < 0)
685 		return -rte_errno;
686 	/* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
687 	if (mlx5_nl_roce_disable(pci_addr) != 0 &&
688 	    mlx5_sys_roce_disable(pci_addr) != 0)
689 		return -rte_errno;
690 	return 0;
691 }
692 
693 static struct ibv_device *
694 mlx5_os_get_ibv_dev(const struct rte_device *dev)
695 {
696 	struct ibv_device *ibv;
697 
698 	if (mlx5_dev_is_pci(dev))
699 		ibv = mlx5_os_get_ibv_device(&RTE_DEV_TO_PCI_CONST(dev)->addr);
700 	else
701 		ibv = mlx5_get_aux_ibv_device(RTE_DEV_TO_AUXILIARY_CONST(dev));
702 	if (ibv == NULL) {
703 		rte_errno = ENODEV;
704 		DRV_LOG(ERR, "Verbs device not found: %s", dev->name);
705 	}
706 	return ibv;
707 }
708 
709 static struct ibv_device *
710 mlx5_vdpa_get_ibv_dev(const struct rte_device *dev)
711 {
712 	struct ibv_device *ibv;
713 	int retry;
714 
715 	if (mlx5_roce_disable(dev) != 0) {
716 		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
717 			dev->name);
718 		return NULL;
719 	}
720 	/* Wait for the IB device to appear again after reload. */
721 	for (retry = MLX5_VDPA_MAX_RETRIES; retry > 0; --retry) {
722 		ibv = mlx5_os_get_ibv_dev(dev);
723 		if (ibv != NULL)
724 			return ibv;
725 		usleep(MLX5_VDPA_USEC);
726 	}
727 	DRV_LOG(ERR,
728 		"Cannot get IB device after disabling RoCE for \"%s\", retries exceed %d.",
729 		dev->name, MLX5_VDPA_MAX_RETRIES);
730 	rte_errno = EAGAIN;
731 	return NULL;
732 }
733 
734 static int
735 mlx5_config_doorbell_mapping_env(int dbnc)
736 {
737 	char *env;
738 	int value;
739 
740 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
741 	/* Get environment variable to store. */
742 	env = getenv(MLX5_SHUT_UP_BF);
743 	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
744 	if (dbnc == MLX5_ARG_UNSET)
745 		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
746 	else
747 		setenv(MLX5_SHUT_UP_BF,
748 		       dbnc == MLX5_SQ_DB_NCACHED ? "1" : "0", 1);
749 	return value;
750 }
751 
752 static void
753 mlx5_restore_doorbell_mapping_env(int value)
754 {
755 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
756 	/* Restore the original environment variable state. */
757 	if (value == MLX5_ARG_UNSET)
758 		unsetenv(MLX5_SHUT_UP_BF);
759 	else
760 		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
761 }
762 
763 /**
764  * Function API to open IB device.
765  *
766  * @param cdev
767  *   Pointer to the mlx5 device.
768  * @param classes
769  *   Chosen classes come from device arguments.
770  *
771  * @return
772  *   Pointer to ibv_context on success, NULL otherwise and rte_errno is set.
773  */
774 static struct ibv_context *
775 mlx5_open_device(struct mlx5_common_device *cdev, uint32_t classes)
776 {
777 	struct ibv_device *ibv;
778 	struct ibv_context *ctx = NULL;
779 	int dbmap_env;
780 
781 	MLX5_ASSERT(cdev->config.device_fd == MLX5_ARG_UNSET);
782 	if (classes & MLX5_CLASS_VDPA)
783 		ibv = mlx5_vdpa_get_ibv_dev(cdev->dev);
784 	else
785 		ibv = mlx5_os_get_ibv_dev(cdev->dev);
786 	if (!ibv)
787 		return NULL;
788 	DRV_LOG(INFO, "Dev information matches for device \"%s\".", ibv->name);
789 	/*
790 	 * Configure environment variable "MLX5_BF_SHUT_UP" before the device
791 	 * creation. The rdma_core library checks the variable at device
792 	 * creation and stores the result internally.
793 	 */
794 	dbmap_env = mlx5_config_doorbell_mapping_env(cdev->config.dbnc);
795 	/* Try to open IB device with DV first, then usual Verbs. */
796 	errno = 0;
797 	ctx = mlx5_glue->dv_open_device(ibv);
798 	if (ctx) {
799 		cdev->config.devx = 1;
800 	} else if (classes == MLX5_CLASS_ETH) {
801 		/* The environment variable is still configured. */
802 		ctx = mlx5_glue->open_device(ibv);
803 		if (ctx == NULL)
804 			goto error;
805 	} else {
806 		goto error;
807 	}
808 	/* The device is created, no need for environment. */
809 	mlx5_restore_doorbell_mapping_env(dbmap_env);
810 	return ctx;
811 error:
812 	rte_errno = errno ? errno : ENODEV;
813 	/* The device creation is failed, no need for environment. */
814 	mlx5_restore_doorbell_mapping_env(dbmap_env);
815 	DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
816 	return NULL;
817 }
818 
819 /**
820  * Function API to import IB device.
821  *
822  * @param cdev
823  *   Pointer to the mlx5 device.
824  *
825  * @return
826  *   Pointer to ibv_context on success, NULL otherwise and rte_errno is set.
827  */
828 static struct ibv_context *
829 mlx5_import_device(struct mlx5_common_device *cdev)
830 {
831 	struct ibv_context *ctx = NULL;
832 
833 	MLX5_ASSERT(cdev->config.device_fd != MLX5_ARG_UNSET);
834 	ctx = mlx5_glue->import_device(cdev->config.device_fd);
835 	if (!ctx) {
836 		DRV_LOG(ERR, "Failed to import device for fd=%d: %s",
837 			cdev->config.device_fd, rte_strerror(errno));
838 		rte_errno = errno;
839 	}
840 	return ctx;
841 }
842 
843 /**
844  * Function API to prepare IB device.
845  *
846  * @param cdev
847  *   Pointer to the mlx5 device.
848  * @param classes
849  *   Chosen classes come from device arguments.
850  *
851  * @return
852  *   0 on success, a negative errno value otherwise and rte_errno is set.
853  */
854 int
855 mlx5_os_open_device(struct mlx5_common_device *cdev, uint32_t classes)
856 {
857 
858 	struct ibv_context *ctx = NULL;
859 
860 	if (cdev->config.device_fd == MLX5_ARG_UNSET)
861 		ctx = mlx5_open_device(cdev, classes);
862 	else
863 		ctx = mlx5_import_device(cdev);
864 	if (ctx == NULL)
865 		return -rte_errno;
866 	/* Hint libmlx5 to use PMD allocator for data plane resources */
867 	mlx5_set_context_attr(cdev->dev, ctx);
868 	cdev->ctx = ctx;
869 	return 0;
870 }
871 
872 int
873 mlx5_get_device_guid(const struct rte_pci_addr *dev, uint8_t *guid, size_t len)
874 {
875 	char tmp[512];
876 	char cur_ifname[IF_NAMESIZE + 1];
877 	FILE *id_file;
878 	DIR *dir;
879 	struct dirent *ptr;
880 	int ret;
881 
882 	if (guid == NULL || len < sizeof(u_int64_t) + 1)
883 		return -1;
884 	memset(guid, 0, len);
885 	snprintf(tmp, sizeof(tmp), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/net",
886 			dev->domain, dev->bus, dev->devid, dev->function);
887 	dir = opendir(tmp);
888 	if (dir == NULL)
889 		return -1;
890 	/* Traverse to identify PF interface */
891 	do {
892 		ptr = readdir(dir);
893 		if (ptr == NULL || ptr->d_type != DT_DIR) {
894 			closedir(dir);
895 			return -1;
896 		}
897 	} while (strchr(ptr->d_name, '.') || strchr(ptr->d_name, '_') ||
898 		 strchr(ptr->d_name, 'v'));
899 	snprintf(cur_ifname, sizeof(cur_ifname), "%s", ptr->d_name);
900 	closedir(dir);
901 	snprintf(tmp + strlen(tmp), sizeof(tmp) - strlen(tmp),
902 			"/%s/phys_switch_id", cur_ifname);
903 	/* Older OFED like 5.3 doesn't support read */
904 	id_file = fopen(tmp, "r");
905 	if (!id_file)
906 		return 0;
907 	ret = fscanf(id_file, "%16s", guid);
908 	fclose(id_file);
909 	return ret;
910 }
911 
912 /*
913  * Create direct mkey using the kernel ibv_reg_mr API and wrap it with a new
914  * indirect mkey created by the DevX API.
915  * This mkey should be used for DevX commands requesting mkey as a parameter.
916  */
917 int
918 mlx5_os_wrapped_mkey_create(void *ctx, void *pd, uint32_t pdn, void *addr,
919 			    size_t length, struct mlx5_pmd_wrapped_mr *pmd_mr)
920 {
921 	struct mlx5_klm klm = {
922 		.byte_count = length,
923 		.address = (uintptr_t)addr,
924 	};
925 	struct mlx5_devx_mkey_attr mkey_attr = {
926 		.pd = pdn,
927 		.klm_array = &klm,
928 		.klm_num = 1,
929 	};
930 	struct mlx5_devx_obj *mkey;
931 	struct ibv_mr *ibv_mr = mlx5_glue->reg_mr(pd, addr, length,
932 						  IBV_ACCESS_LOCAL_WRITE |
933 						  (haswell_broadwell_cpu ? 0 :
934 						  IBV_ACCESS_RELAXED_ORDERING));
935 
936 	if (!ibv_mr) {
937 		rte_errno = errno;
938 		return -rte_errno;
939 	}
940 	klm.mkey = ibv_mr->lkey;
941 	mkey_attr.addr = (uintptr_t)addr;
942 	mkey_attr.size = length;
943 	mkey = mlx5_devx_cmd_mkey_create(ctx, &mkey_attr);
944 	if (!mkey) {
945 		claim_zero(mlx5_glue->dereg_mr(ibv_mr));
946 		return -rte_errno;
947 	}
948 	pmd_mr->addr = addr;
949 	pmd_mr->len = length;
950 	pmd_mr->obj = (void *)ibv_mr;
951 	pmd_mr->imkey = mkey;
952 	pmd_mr->lkey = mkey->id;
953 	return 0;
954 }
955 
956 void
957 mlx5_os_wrapped_mkey_destroy(struct mlx5_pmd_wrapped_mr *pmd_mr)
958 {
959 	if (!pmd_mr)
960 		return;
961 	if (pmd_mr->imkey)
962 		claim_zero(mlx5_devx_cmd_destroy(pmd_mr->imkey));
963 	if (pmd_mr->obj)
964 		claim_zero(mlx5_glue->dereg_mr(pmd_mr->obj));
965 	memset(pmd_mr, 0, sizeof(*pmd_mr));
966 }
967