xref: /dpdk/drivers/common/mlx5/linux/mlx5_common_os.c (revision decb35d890209f603b01c1d23f35995bd51228fc)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2020 Mellanox Technologies, Ltd
3  */
4 
5 #include <sys/types.h>
6 #include <unistd.h>
7 #include <string.h>
8 #include <stdio.h>
9 #ifdef RTE_IBVERBS_LINK_DLOPEN
10 #include <dlfcn.h>
11 #endif
12 #include <dirent.h>
13 #include <net/if.h>
14 #include <fcntl.h>
15 
16 #include <rte_errno.h>
17 #include <rte_string_fns.h>
18 #include <rte_bus_pci.h>
19 #include <rte_bus_auxiliary.h>
20 
21 #include "mlx5_common.h"
22 #include "mlx5_nl.h"
23 #include "mlx5_common_log.h"
24 #include "mlx5_common_private.h"
25 #include "mlx5_common_defs.h"
26 #include "mlx5_common_os.h"
27 #include "mlx5_glue.h"
28 
#ifdef MLX5_GLUE
/*
 * Pointer to the rdma-core glue table. In this build variant it is
 * defined here, filled in by mlx5_glue_dlopen() from the "mlx5_glue"
 * symbol of the loaded library, and reset to NULL by
 * mlx5_glue_constructor() when initialization fails.
 */
const struct mlx5_glue *mlx5_glue;
#endif
32 
33 int
34 mlx5_get_pci_addr(const char *dev_path, struct rte_pci_addr *pci_addr)
35 {
36 	FILE *file;
37 	char line[32];
38 	int rc = -ENOENT;
39 	MKSTR(path, "%s/device/uevent", dev_path);
40 
41 	file = fopen(path, "rb");
42 	if (file == NULL) {
43 		rte_errno = errno;
44 		return -rte_errno;
45 	}
46 	while (fgets(line, sizeof(line), file) == line) {
47 		size_t len = strlen(line);
48 
49 		/* Truncate long lines. */
50 		if (len == (sizeof(line) - 1)) {
51 			while (line[(len - 1)] != '\n') {
52 				int ret = fgetc(file);
53 
54 				if (ret == EOF)
55 					goto exit;
56 				line[(len - 1)] = ret;
57 			}
58 			/* No match for long lines. */
59 			continue;
60 		}
61 		/* Extract information. */
62 		if (sscanf(line,
63 			   "PCI_SLOT_NAME="
64 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
65 			   &pci_addr->domain,
66 			   &pci_addr->bus,
67 			   &pci_addr->devid,
68 			   &pci_addr->function) == 4) {
69 			rc = 0;
70 			break;
71 		}
72 	}
73 exit:
74 	fclose(file);
75 	if (rc)
76 		rte_errno = -rc;
77 	return rc;
78 }
79 
80 /**
81  * Extract port name, as a number, from sysfs or netlink information.
82  *
83  * @param[in] port_name_in
84  *   String representing the port name.
85  * @param[out] port_info_out
86  *   Port information, including port name as a number and port name
87  *   type if recognized
88  *
89  * @return
90  *   port_name field set according to recognized name format.
91  */
92 void
93 mlx5_translate_port_name(const char *port_name_in,
94 			 struct mlx5_switch_info *port_info_out)
95 {
96 	char ctrl = 0, pf_c1, pf_c2, vf_c1, vf_c2, eol;
97 	char *end;
98 	int sc_items;
99 
100 	sc_items = sscanf(port_name_in, "%c%d",
101 			  &ctrl, &port_info_out->ctrl_num);
102 	if (sc_items == 2 && ctrl == 'c') {
103 		port_name_in++; /* 'c' */
104 		port_name_in += snprintf(NULL, 0, "%d",
105 					  port_info_out->ctrl_num);
106 	}
107 	/* Check for port-name as a string of the form pf0vf0 or pf0sf0 */
108 	sc_items = sscanf(port_name_in, "%c%c%d%c%c%d%c",
109 			  &pf_c1, &pf_c2, &port_info_out->pf_num,
110 			  &vf_c1, &vf_c2, &port_info_out->port_name, &eol);
111 	if (sc_items == 6 && pf_c1 == 'p' && pf_c2 == 'f') {
112 		if (vf_c1 == 'v' && vf_c2 == 'f') {
113 			/* Kernel ver >= 5.0 or OFED ver >= 4.6 */
114 			port_info_out->name_type =
115 					MLX5_PHYS_PORT_NAME_TYPE_PFVF;
116 			return;
117 		}
118 		if (vf_c1 == 's' && vf_c2 == 'f') {
119 			/* Kernel ver >= 5.11 or OFED ver >= 5.1 */
120 			port_info_out->name_type =
121 					MLX5_PHYS_PORT_NAME_TYPE_PFSF;
122 			return;
123 		}
124 	}
125 	/*
126 	 * Check for port-name as a string of the form p0
127 	 * (support kernel ver >= 5.0, or OFED ver >= 4.6).
128 	 */
129 	sc_items = sscanf(port_name_in, "%c%d%c",
130 			  &pf_c1, &port_info_out->port_name, &eol);
131 	if (sc_items == 2 && pf_c1 == 'p') {
132 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
133 		return;
134 	}
135 	/*
136 	 * Check for port-name as a string of the form pf0
137 	 * (support kernel ver >= 5.7 for HPF representor on BF).
138 	 */
139 	sc_items = sscanf(port_name_in, "%c%c%d%c",
140 			  &pf_c1, &pf_c2, &port_info_out->pf_num, &eol);
141 	if (sc_items == 3 && pf_c1 == 'p' && pf_c2 == 'f') {
142 		port_info_out->port_name = -1;
143 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFHPF;
144 		return;
145 	}
146 	/* Check for port-name as a number (support kernel ver < 5.0 */
147 	errno = 0;
148 	port_info_out->port_name = strtol(port_name_in, &end, 0);
149 	if (!errno &&
150 	    (size_t)(end - port_name_in) == strlen(port_name_in)) {
151 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
152 		return;
153 	}
154 	port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
155 }
156 
157 int
158 mlx5_get_ifname_sysfs(const char *ibdev_path, char *ifname)
159 {
160 	DIR *dir;
161 	struct dirent *dent;
162 	unsigned int dev_type = 0;
163 	unsigned int dev_port_prev = ~0u;
164 	char match[IF_NAMESIZE] = "";
165 
166 	MLX5_ASSERT(ibdev_path);
167 	{
168 		MKSTR(path, "%s/device/net", ibdev_path);
169 
170 		dir = opendir(path);
171 		if (dir == NULL) {
172 			rte_errno = errno;
173 			return -rte_errno;
174 		}
175 	}
176 	while ((dent = readdir(dir)) != NULL) {
177 		char *name = dent->d_name;
178 		FILE *file;
179 		unsigned int dev_port;
180 		int r;
181 
182 		if ((name[0] == '.') &&
183 		    ((name[1] == '\0') ||
184 		     ((name[1] == '.') && (name[2] == '\0'))))
185 			continue;
186 
187 		MKSTR(path, "%s/device/net/%s/%s",
188 		      ibdev_path, name,
189 		      (dev_type ? "dev_id" : "dev_port"));
190 
191 		file = fopen(path, "rb");
192 		if (file == NULL) {
193 			if (errno != ENOENT)
194 				continue;
195 			/*
196 			 * Switch to dev_id when dev_port does not exist as
197 			 * is the case with Linux kernel versions < 3.15.
198 			 */
199 try_dev_id:
200 			match[0] = '\0';
201 			if (dev_type)
202 				break;
203 			dev_type = 1;
204 			dev_port_prev = ~0u;
205 			rewinddir(dir);
206 			continue;
207 		}
208 		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
209 		fclose(file);
210 		if (r != 1)
211 			continue;
212 		/*
213 		 * Switch to dev_id when dev_port returns the same value for
214 		 * all ports. May happen when using a MOFED release older than
215 		 * 3.0 with a Linux kernel >= 3.15.
216 		 */
217 		if (dev_port == dev_port_prev)
218 			goto try_dev_id;
219 		dev_port_prev = dev_port;
220 		if (dev_port == 0)
221 			strlcpy(match, name, IF_NAMESIZE);
222 	}
223 	closedir(dir);
224 	if (match[0] == '\0') {
225 		rte_errno = ENOENT;
226 		return -rte_errno;
227 	}
228 	strncpy(ifname, match, IF_NAMESIZE);
229 	return 0;
230 }
231 
232 #ifdef MLX5_GLUE
233 
234 /**
235  * Suffix RTE_EAL_PMD_PATH with "-glue".
236  *
237  * This function performs a sanity check on RTE_EAL_PMD_PATH before
238  * suffixing its last component.
239  *
240  * @param buf[out]
241  *   Output buffer, should be large enough otherwise NULL is returned.
242  * @param size
243  *   Size of @p out.
244  *
245  * @return
246  *   Pointer to @p buf or @p NULL in case suffix cannot be appended.
247  */
248 static char *
249 mlx5_glue_path(char *buf, size_t size)
250 {
251 	static const char *const bad[] = { "/", ".", "..", NULL };
252 	const char *path = RTE_EAL_PMD_PATH;
253 	size_t len = strlen(path);
254 	size_t off;
255 	int i;
256 
257 	while (len && path[len - 1] == '/')
258 		--len;
259 	for (off = len; off && path[off - 1] != '/'; --off)
260 		;
261 	for (i = 0; bad[i]; ++i)
262 		if (!strncmp(path + off, bad[i], (int)(len - off)))
263 			goto error;
264 	i = snprintf(buf, size, "%.*s-glue", (int)len, path);
265 	if (i == -1 || (size_t)i >= size)
266 		goto error;
267 	return buf;
268 error:
269 	RTE_LOG(ERR, PMD, "unable to append \"-glue\" to last component of"
270 		" RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"), please"
271 		" re-configure DPDK");
272 	return NULL;
273 }
274 
275 static int
276 mlx5_glue_dlopen(void)
277 {
278 	char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
279 	void *handle = NULL;
280 
281 	char const *path[] = {
282 		/*
283 		 * A basic security check is necessary before trusting
284 		 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
285 		 */
286 		(geteuid() == getuid() && getegid() == getgid() ?
287 		 getenv("MLX5_GLUE_PATH") : NULL),
288 		/*
289 		 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
290 		 * variant, otherwise let dlopen() look up libraries on its
291 		 * own.
292 		 */
293 		(*RTE_EAL_PMD_PATH ?
294 		 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
295 	};
296 	unsigned int i = 0;
297 	void **sym;
298 	const char *dlmsg;
299 
300 	while (!handle && i != RTE_DIM(path)) {
301 		const char *end;
302 		size_t len;
303 		int ret;
304 
305 		if (!path[i]) {
306 			++i;
307 			continue;
308 		}
309 		end = strpbrk(path[i], ":;");
310 		if (!end)
311 			end = path[i] + strlen(path[i]);
312 		len = end - path[i];
313 		ret = 0;
314 		do {
315 			char name[ret + 1];
316 
317 			ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
318 				       (int)len, path[i],
319 				       (!len || *(end - 1) == '/') ? "" : "/");
320 			if (ret == -1)
321 				break;
322 			if (sizeof(name) != (size_t)ret + 1)
323 				continue;
324 			DRV_LOG(DEBUG, "Looking for rdma-core glue as "
325 				"\"%s\"", name);
326 			handle = dlopen(name, RTLD_LAZY);
327 			break;
328 		} while (1);
329 		path[i] = end + 1;
330 		if (!*end)
331 			++i;
332 	}
333 	if (!handle) {
334 		rte_errno = EINVAL;
335 		dlmsg = dlerror();
336 		if (dlmsg)
337 			DRV_LOG(WARNING, "Cannot load glue library: %s", dlmsg);
338 		goto glue_error;
339 	}
340 	sym = dlsym(handle, "mlx5_glue");
341 	if (!sym || !*sym) {
342 		rte_errno = EINVAL;
343 		dlmsg = dlerror();
344 		if (dlmsg)
345 			DRV_LOG(ERR, "Cannot resolve glue symbol: %s", dlmsg);
346 		goto glue_error;
347 	}
348 	mlx5_glue = *sym;
349 	return 0;
350 
351 glue_error:
352 	if (handle)
353 		dlclose(handle);
354 	return -1;
355 }
356 
357 #endif
358 
359 /**
360  * Initialization routine for run-time dependency on rdma-core.
361  */
362 void
363 mlx5_glue_constructor(void)
364 {
365 	/*
366 	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
367 	 * huge pages. Calling ibv_fork_init() during init allows
368 	 * applications to use fork() safely for purposes other than
369 	 * using this PMD, which is not supported in forked processes.
370 	 */
371 	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
372 	/* Match the size of Rx completion entry to the size of a cacheline. */
373 	if (RTE_CACHE_LINE_SIZE == 128)
374 		setenv("MLX5_CQE_SIZE", "128", 0);
375 	/*
376 	 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to
377 	 * cleanup all the Verbs resources even when the device was removed.
378 	 */
379 	setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1);
380 
381 #ifdef MLX5_GLUE
382 	if (mlx5_glue_dlopen() != 0)
383 		goto glue_error;
384 #endif
385 
386 #ifdef RTE_LIBRTE_MLX5_DEBUG
387 	/* Glue structure must not contain any NULL pointers. */
388 	{
389 		unsigned int i;
390 
391 		for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
392 			MLX5_ASSERT(((const void *const *)mlx5_glue)[i]);
393 	}
394 #endif
395 	if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
396 		rte_errno = EINVAL;
397 		DRV_LOG(ERR, "rdma-core glue \"%s\" mismatch: \"%s\" is "
398 			"required", mlx5_glue->version, MLX5_GLUE_VERSION);
399 		goto glue_error;
400 	}
401 	mlx5_glue->fork_init();
402 	return;
403 
404 glue_error:
405 	DRV_LOG(WARNING, "Cannot initialize MLX5 common due to missing"
406 		" run-time dependency on rdma-core libraries (libibverbs,"
407 		" libmlx5)");
408 	mlx5_glue = NULL;
409 }
410 
411 /**
412  * Validate user arguments for remote PD and CTX.
413  *
414  * @param config
415  *   Pointer to device configuration structure.
416  *
417  * @return
418  *   0 on success, a negative errno value otherwise and rte_errno is set.
419  */
420 int
421 mlx5_os_remote_pd_and_ctx_validate(struct mlx5_common_dev_config *config)
422 {
423 	int device_fd = config->device_fd;
424 	int pd_handle = config->pd_handle;
425 
426 #ifdef HAVE_MLX5_IBV_IMPORT_CTX_PD_AND_MR
427 	if (device_fd == MLX5_ARG_UNSET && pd_handle != MLX5_ARG_UNSET) {
428 		DRV_LOG(ERR, "Remote PD without CTX is not supported.");
429 		rte_errno = EINVAL;
430 		return -rte_errno;
431 	}
432 	if (device_fd != MLX5_ARG_UNSET && pd_handle == MLX5_ARG_UNSET) {
433 		DRV_LOG(ERR, "Remote CTX without PD is not supported.");
434 		rte_errno = EINVAL;
435 		return -rte_errno;
436 	}
437 	DRV_LOG(DEBUG, "Remote PD and CTX is supported: (cmd_fd=%d, "
438 		"pd_handle=%d).", device_fd, pd_handle);
439 #else
440 	if (pd_handle != MLX5_ARG_UNSET || device_fd != MLX5_ARG_UNSET) {
441 		DRV_LOG(ERR,
442 			"Remote PD and CTX is not supported - maybe old rdma-core version?");
443 		rte_errno = ENOTSUP;
444 		return -rte_errno;
445 	}
446 #endif
447 	return 0;
448 }
449 
450 /**
451  * Release Protection Domain object.
452  *
453  * @param[out] cdev
454  *   Pointer to the mlx5 device.
455  *
456  * @return
457  *   0 on success, a negative errno value otherwise.
458  */
459 int
460 mlx5_os_pd_release(struct mlx5_common_device *cdev)
461 {
462 	if (cdev->config.pd_handle == MLX5_ARG_UNSET)
463 		return mlx5_glue->dealloc_pd(cdev->pd);
464 	else
465 		return mlx5_glue->unimport_pd(cdev->pd);
466 }
467 
468 /**
469  * Allocate Protection Domain object.
470  *
471  * @param[out] cdev
472  *   Pointer to the mlx5 device.
473  *
474  * @return
475  *   0 on success, a negative errno value otherwise.
476  */
477 static int
478 mlx5_os_pd_create(struct mlx5_common_device *cdev)
479 {
480 	cdev->pd = mlx5_glue->alloc_pd(cdev->ctx);
481 	if (cdev->pd == NULL) {
482 		DRV_LOG(ERR, "Failed to allocate PD: %s", rte_strerror(errno));
483 		return errno ? -errno : -ENOMEM;
484 	}
485 	return 0;
486 }
487 
488 /**
489  * Import Protection Domain object according to given PD handle.
490  *
491  * @param[out] cdev
492  *   Pointer to the mlx5 device.
493  *
494  * @return
495  *   0 on success, a negative errno value otherwise.
496  */
497 static int
498 mlx5_os_pd_import(struct mlx5_common_device *cdev)
499 {
500 	cdev->pd = mlx5_glue->import_pd(cdev->ctx, cdev->config.pd_handle);
501 	if (cdev->pd == NULL) {
502 		DRV_LOG(ERR, "Failed to import PD using handle=%d: %s",
503 			cdev->config.pd_handle, rte_strerror(errno));
504 		return errno ? -errno : -ENOMEM;
505 	}
506 	return 0;
507 }
508 
509 /**
510  * Prepare Protection Domain object and extract its pdn using DV API.
511  *
512  * @param[out] cdev
513  *   Pointer to the mlx5 device.
514  *
515  * @return
516  *   0 on success, a negative errno value otherwise and rte_errno is set.
517  */
518 int
519 mlx5_os_pd_prepare(struct mlx5_common_device *cdev)
520 {
521 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
522 	struct mlx5dv_obj obj;
523 	struct mlx5dv_pd pd_info;
524 #endif
525 	int ret;
526 
527 	if (cdev->config.pd_handle == MLX5_ARG_UNSET)
528 		ret = mlx5_os_pd_create(cdev);
529 	else
530 		ret = mlx5_os_pd_import(cdev);
531 	if (ret) {
532 		rte_errno = -ret;
533 		return ret;
534 	}
535 	if (cdev->config.devx == 0)
536 		return 0;
537 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
538 	obj.pd.in = cdev->pd;
539 	obj.pd.out = &pd_info;
540 	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
541 	if (ret != 0) {
542 		DRV_LOG(ERR, "Fail to get PD object info.");
543 		rte_errno = errno;
544 		claim_zero(mlx5_os_pd_release(cdev));
545 		cdev->pd = NULL;
546 		return -rte_errno;
547 	}
548 	cdev->pdn = pd_info.pdn;
549 	return 0;
550 #else
551 	DRV_LOG(ERR, "Cannot get pdn - no DV support.");
552 	rte_errno = ENOTSUP;
553 	return -rte_errno;
554 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
555 }
556 
557 static struct ibv_device *
558 mlx5_os_get_ibv_device(const struct rte_pci_addr *addr)
559 {
560 	int n;
561 	struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
562 	struct ibv_device *ibv_match = NULL;
563 	uint8_t guid1[32] = {0};
564 	uint8_t guid2[32] = {0};
565 	int ret1, ret2 = -1;
566 	struct rte_pci_addr paddr;
567 
568 	if (ibv_list == NULL || !n) {
569 		rte_errno = ENOSYS;
570 		if (ibv_list)
571 			mlx5_glue->free_device_list(ibv_list);
572 		return NULL;
573 	}
574 	ret1 = mlx5_get_device_guid(addr, guid1, sizeof(guid1));
575 	while (n-- > 0) {
576 		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
577 		if (mlx5_get_pci_addr(ibv_list[n]->ibdev_path, &paddr) != 0)
578 			continue;
579 		if (ret1 > 0)
580 			ret2 = mlx5_get_device_guid(&paddr, guid2, sizeof(guid2));
581 		/* Bond device can bond secondary PCIe */
582 		if ((strstr(ibv_list[n]->name, "bond") &&
583 		    ((ret1 > 0 && ret2 > 0 && !memcmp(guid1, guid2, sizeof(guid1))) ||
584 		    (addr->domain == paddr.domain && addr->bus == paddr.bus &&
585 		     addr->devid == paddr.devid))) ||
586 		     !rte_pci_addr_cmp(addr, &paddr)) {
587 			ibv_match = ibv_list[n];
588 			break;
589 		}
590 	}
591 	if (ibv_match == NULL) {
592 		DRV_LOG(WARNING,
593 			"No Verbs device matches PCI device " PCI_PRI_FMT ","
594 			" are kernel drivers loaded?",
595 			addr->domain, addr->bus, addr->devid, addr->function);
596 		rte_errno = ENOENT;
597 	}
598 	mlx5_glue->free_device_list(ibv_list);
599 	return ibv_match;
600 }
601 
602 /* Try to disable ROCE by Netlink\Devlink. */
603 static int
604 mlx5_nl_roce_disable(const char *addr)
605 {
606 	int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC, 0);
607 	int devlink_id;
608 	int enable;
609 	int ret;
610 
611 	if (nlsk_fd < 0)
612 		return nlsk_fd;
613 	devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
614 	if (devlink_id < 0) {
615 		ret = devlink_id;
616 		DRV_LOG(DEBUG,
617 			"Failed to get devlink id for ROCE operations by Netlink.");
618 		goto close;
619 	}
620 	ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
621 	if (ret) {
622 		DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
623 			ret);
624 		goto close;
625 	} else if (!enable) {
626 		DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
627 		goto close;
628 	}
629 	ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
630 	if (ret)
631 		DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
632 	else
633 		DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
634 close:
635 	close(nlsk_fd);
636 	return ret;
637 }
638 
639 /* Try to disable ROCE by sysfs. */
640 static int
641 mlx5_sys_roce_disable(const char *addr)
642 {
643 	FILE *file_o;
644 	int enable;
645 	int ret;
646 
647 	MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
648 	file_o = fopen(file_p, "rb");
649 	if (!file_o) {
650 		rte_errno = ENOTSUP;
651 		return -ENOTSUP;
652 	}
653 	ret = fscanf(file_o, "%d", &enable);
654 	if (ret != 1) {
655 		rte_errno = EINVAL;
656 		ret = EINVAL;
657 		goto close;
658 	} else if (!enable) {
659 		ret = 0;
660 		DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
661 		goto close;
662 	}
663 	fclose(file_o);
664 	file_o = fopen(file_p, "wb");
665 	if (!file_o) {
666 		rte_errno = ENOTSUP;
667 		return -ENOTSUP;
668 	}
669 	fprintf(file_o, "0\n");
670 	ret = 0;
671 close:
672 	if (ret)
673 		DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
674 	else
675 		DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
676 	fclose(file_o);
677 	return ret;
678 }
679 
680 static int
681 mlx5_roce_disable(const struct rte_device *dev)
682 {
683 	char pci_addr[PCI_PRI_STR_SIZE] = { 0 };
684 
685 	if (mlx5_dev_to_pci_str(dev, pci_addr, sizeof(pci_addr)) < 0)
686 		return -rte_errno;
687 	/* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
688 	if (mlx5_nl_roce_disable(pci_addr) != 0 &&
689 	    mlx5_sys_roce_disable(pci_addr) != 0)
690 		return -rte_errno;
691 	return 0;
692 }
693 
694 static struct ibv_device *
695 mlx5_os_get_ibv_dev(const struct rte_device *dev)
696 {
697 	struct ibv_device *ibv;
698 
699 	if (mlx5_dev_is_pci(dev))
700 		ibv = mlx5_os_get_ibv_device(&RTE_DEV_TO_PCI_CONST(dev)->addr);
701 	else
702 		ibv = mlx5_get_aux_ibv_device(RTE_DEV_TO_AUXILIARY_CONST(dev));
703 	if (ibv == NULL) {
704 		rte_errno = ENODEV;
705 		DRV_LOG(ERR, "Verbs device not found: %s", dev->name);
706 	}
707 	return ibv;
708 }
709 
710 static struct ibv_device *
711 mlx5_vdpa_get_ibv_dev(const struct rte_device *dev)
712 {
713 	struct ibv_device *ibv;
714 	int retry;
715 
716 	if (mlx5_roce_disable(dev) != 0) {
717 		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
718 			dev->name);
719 		return NULL;
720 	}
721 	/* Wait for the IB device to appear again after reload. */
722 	for (retry = MLX5_VDPA_MAX_RETRIES; retry > 0; --retry) {
723 		ibv = mlx5_os_get_ibv_dev(dev);
724 		if (ibv != NULL)
725 			return ibv;
726 		usleep(MLX5_VDPA_USEC);
727 	}
728 	DRV_LOG(ERR,
729 		"Cannot get IB device after disabling RoCE for \"%s\", retries exceed %d.",
730 		dev->name, MLX5_VDPA_MAX_RETRIES);
731 	rte_errno = EAGAIN;
732 	return NULL;
733 }
734 
735 static int
736 mlx5_config_doorbell_mapping_env(int dbnc)
737 {
738 	char *env;
739 	int value;
740 
741 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
742 	/* Get environment variable to store. */
743 	env = getenv(MLX5_SHUT_UP_BF);
744 	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
745 	if (dbnc == MLX5_ARG_UNSET)
746 		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
747 	else
748 		setenv(MLX5_SHUT_UP_BF,
749 		       dbnc == MLX5_SQ_DB_NCACHED ? "1" : "0", 1);
750 	return value;
751 }
752 
753 static void
754 mlx5_restore_doorbell_mapping_env(int value)
755 {
756 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
757 	/* Restore the original environment variable state. */
758 	if (value == MLX5_ARG_UNSET)
759 		unsetenv(MLX5_SHUT_UP_BF);
760 	else
761 		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
762 }
763 
764 /**
765  * Function API to open IB device.
766  *
767  * @param cdev
768  *   Pointer to the mlx5 device.
769  * @param classes
770  *   Chosen classes come from device arguments.
771  *
772  * @return
773  *   Pointer to ibv_context on success, NULL otherwise and rte_errno is set.
774  */
775 static struct ibv_context *
776 mlx5_open_device(struct mlx5_common_device *cdev, uint32_t classes)
777 {
778 	struct ibv_device *ibv;
779 	struct ibv_context *ctx = NULL;
780 	int dbmap_env;
781 
782 	MLX5_ASSERT(cdev->config.device_fd == MLX5_ARG_UNSET);
783 	if (classes & MLX5_CLASS_VDPA)
784 		ibv = mlx5_vdpa_get_ibv_dev(cdev->dev);
785 	else
786 		ibv = mlx5_os_get_ibv_dev(cdev->dev);
787 	if (!ibv)
788 		return NULL;
789 	DRV_LOG(INFO, "Dev information matches for device \"%s\".", ibv->name);
790 	/*
791 	 * Configure environment variable "MLX5_BF_SHUT_UP" before the device
792 	 * creation. The rdma_core library checks the variable at device
793 	 * creation and stores the result internally.
794 	 */
795 	dbmap_env = mlx5_config_doorbell_mapping_env(cdev->config.dbnc);
796 	/* Try to open IB device with DV first, then usual Verbs. */
797 	errno = 0;
798 	ctx = mlx5_glue->dv_open_device(ibv);
799 	if (ctx) {
800 		cdev->config.devx = 1;
801 	} else if (classes == MLX5_CLASS_ETH) {
802 		/* The environment variable is still configured. */
803 		ctx = mlx5_glue->open_device(ibv);
804 		if (ctx == NULL)
805 			goto error;
806 	} else {
807 		goto error;
808 	}
809 	/* The device is created, no need for environment. */
810 	mlx5_restore_doorbell_mapping_env(dbmap_env);
811 	return ctx;
812 error:
813 	rte_errno = errno ? errno : ENODEV;
814 	/* The device creation is failed, no need for environment. */
815 	mlx5_restore_doorbell_mapping_env(dbmap_env);
816 	DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
817 	return NULL;
818 }
819 
820 /**
821  * Function API to import IB device.
822  *
823  * @param cdev
824  *   Pointer to the mlx5 device.
825  *
826  * @return
827  *   Pointer to ibv_context on success, NULL otherwise and rte_errno is set.
828  */
829 static struct ibv_context *
830 mlx5_import_device(struct mlx5_common_device *cdev)
831 {
832 	struct ibv_context *ctx = NULL;
833 
834 	MLX5_ASSERT(cdev->config.device_fd != MLX5_ARG_UNSET);
835 	ctx = mlx5_glue->import_device(cdev->config.device_fd);
836 	if (!ctx) {
837 		DRV_LOG(ERR, "Failed to import device for fd=%d: %s",
838 			cdev->config.device_fd, rte_strerror(errno));
839 		rte_errno = errno;
840 	}
841 	return ctx;
842 }
843 
844 /**
845  * Function API to prepare IB device.
846  *
847  * @param cdev
848  *   Pointer to the mlx5 device.
849  * @param classes
850  *   Chosen classes come from device arguments.
851  *
852  * @return
853  *   0 on success, a negative errno value otherwise and rte_errno is set.
854  */
855 int
856 mlx5_os_open_device(struct mlx5_common_device *cdev, uint32_t classes)
857 {
858 
859 	struct ibv_context *ctx = NULL;
860 
861 	if (cdev->config.device_fd == MLX5_ARG_UNSET)
862 		ctx = mlx5_open_device(cdev, classes);
863 	else
864 		ctx = mlx5_import_device(cdev);
865 	if (ctx == NULL)
866 		return -rte_errno;
867 	/* Hint libmlx5 to use PMD allocator for data plane resources */
868 	mlx5_set_context_attr(cdev->dev, ctx);
869 	cdev->ctx = ctx;
870 	return 0;
871 }
872 
873 int
874 mlx5_get_device_guid(const struct rte_pci_addr *dev, uint8_t *guid, size_t len)
875 {
876 	char tmp[512];
877 	char cur_ifname[IF_NAMESIZE + 1];
878 	FILE *id_file;
879 	DIR *dir;
880 	struct dirent *ptr;
881 	int ret;
882 
883 	if (guid == NULL || len < sizeof(u_int64_t) + 1)
884 		return -1;
885 	memset(guid, 0, len);
886 	snprintf(tmp, sizeof(tmp), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/net",
887 			dev->domain, dev->bus, dev->devid, dev->function);
888 	dir = opendir(tmp);
889 	if (dir == NULL)
890 		return -1;
891 	/* Traverse to identify PF interface */
892 	do {
893 		ptr = readdir(dir);
894 		if (ptr == NULL || ptr->d_type != DT_DIR) {
895 			closedir(dir);
896 			return -1;
897 		}
898 	} while (strchr(ptr->d_name, '.') || strchr(ptr->d_name, '_') ||
899 		 strchr(ptr->d_name, 'v'));
900 	snprintf(cur_ifname, sizeof(cur_ifname), "%s", ptr->d_name);
901 	closedir(dir);
902 	snprintf(tmp + strlen(tmp), sizeof(tmp) - strlen(tmp),
903 			"/%s/phys_switch_id", cur_ifname);
904 	/* Older OFED like 5.3 doesn't support read */
905 	id_file = fopen(tmp, "r");
906 	if (!id_file)
907 		return 0;
908 	ret = fscanf(id_file, "%16s", guid);
909 	fclose(id_file);
910 	return ret;
911 }
912 
913 /*
914  * Create direct mkey using the kernel ibv_reg_mr API and wrap it with a new
915  * indirect mkey created by the DevX API.
916  * This mkey should be used for DevX commands requesting mkey as a parameter.
917  */
918 int
919 mlx5_os_wrapped_mkey_create(void *ctx, void *pd, uint32_t pdn, void *addr,
920 			    size_t length, struct mlx5_pmd_wrapped_mr *pmd_mr)
921 {
922 	struct mlx5_klm klm = {
923 		.byte_count = length,
924 		.address = (uintptr_t)addr,
925 	};
926 	struct mlx5_devx_mkey_attr mkey_attr = {
927 		.pd = pdn,
928 		.klm_array = &klm,
929 		.klm_num = 1,
930 	};
931 	struct mlx5_devx_obj *mkey;
932 	struct ibv_mr *ibv_mr = mlx5_glue->reg_mr(pd, addr, length,
933 						  IBV_ACCESS_LOCAL_WRITE |
934 						  (haswell_broadwell_cpu ? 0 :
935 						  IBV_ACCESS_RELAXED_ORDERING));
936 
937 	if (!ibv_mr) {
938 		rte_errno = errno;
939 		return -rte_errno;
940 	}
941 	klm.mkey = ibv_mr->lkey;
942 	mkey_attr.addr = (uintptr_t)addr;
943 	mkey_attr.size = length;
944 	mkey = mlx5_devx_cmd_mkey_create(ctx, &mkey_attr);
945 	if (!mkey) {
946 		claim_zero(mlx5_glue->dereg_mr(ibv_mr));
947 		return -rte_errno;
948 	}
949 	pmd_mr->addr = addr;
950 	pmd_mr->len = length;
951 	pmd_mr->obj = (void *)ibv_mr;
952 	pmd_mr->imkey = mkey;
953 	pmd_mr->lkey = mkey->id;
954 	return 0;
955 }
956 
957 void
958 mlx5_os_wrapped_mkey_destroy(struct mlx5_pmd_wrapped_mr *pmd_mr)
959 {
960 	if (!pmd_mr)
961 		return;
962 	if (pmd_mr->imkey)
963 		claim_zero(mlx5_devx_cmd_destroy(pmd_mr->imkey));
964 	if (pmd_mr->obj)
965 		claim_zero(mlx5_glue->dereg_mr(pmd_mr->obj));
966 	memset(pmd_mr, 0, sizeof(*pmd_mr));
967 }
968 
969 /**
970  * Rte_intr_handle create and init helper.
971  *
972  * @param[in] mode
973  *   interrupt instance can be shared between primary and secondary
974  *   processes or not.
975  * @param[in] set_fd_nonblock
976  *   Whether to set fd to O_NONBLOCK.
977  * @param[in] fd
978  *   Fd to set in created intr_handle.
979  * @param[in] cb
980  *   Callback to register for intr_handle.
981  * @param[in] cb_arg
982  *   Callback argument for cb.
983  *
984  * @return
985  *  - Interrupt handle on success.
986  *  - NULL on failure, with rte_errno set.
987  */
988 struct rte_intr_handle *
989 mlx5_os_interrupt_handler_create(int mode, bool set_fd_nonblock, int fd,
990 				 rte_intr_callback_fn cb, void *cb_arg)
991 {
992 	struct rte_intr_handle *tmp_intr_handle;
993 	int ret, flags;
994 
995 	tmp_intr_handle = rte_intr_instance_alloc(mode);
996 	if (!tmp_intr_handle) {
997 		rte_errno = ENOMEM;
998 		goto err;
999 	}
1000 	if (set_fd_nonblock) {
1001 		flags = fcntl(fd, F_GETFL);
1002 		ret = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
1003 		if (ret) {
1004 			rte_errno = errno;
1005 			goto err;
1006 		}
1007 	}
1008 	ret = rte_intr_fd_set(tmp_intr_handle, fd);
1009 	if (ret)
1010 		goto err;
1011 	ret = rte_intr_type_set(tmp_intr_handle, RTE_INTR_HANDLE_EXT);
1012 	if (ret)
1013 		goto err;
1014 	ret = rte_intr_callback_register(tmp_intr_handle, cb, cb_arg);
1015 	if (ret) {
1016 		rte_errno = -ret;
1017 		goto err;
1018 	}
1019 	return tmp_intr_handle;
1020 err:
1021 	rte_intr_instance_free(tmp_intr_handle);
1022 	return NULL;
1023 }
1024 
1025 /* Safe unregistration for interrupt callback. */
1026 static void
1027 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
1028 			      rte_intr_callback_fn cb_fn, void *cb_arg)
1029 {
1030 	uint64_t twait = 0;
1031 	uint64_t start = 0;
1032 
1033 	do {
1034 		int ret;
1035 
1036 		ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
1037 		if (ret >= 0)
1038 			return;
1039 		if (ret != -EAGAIN) {
1040 			DRV_LOG(INFO, "failed to unregister interrupt"
1041 				      " handler (error: %d)", ret);
1042 			MLX5_ASSERT(false);
1043 			return;
1044 		}
1045 		if (twait) {
1046 			struct timespec onems;
1047 
1048 			/* Wait one millisecond and try again. */
1049 			onems.tv_sec = 0;
1050 			onems.tv_nsec = NS_PER_S / MS_PER_S;
1051 			nanosleep(&onems, 0);
1052 			/* Check whether one second elapsed. */
1053 			if ((rte_get_timer_cycles() - start) <= twait)
1054 				continue;
1055 		} else {
1056 			/*
1057 			 * We get the amount of timer ticks for one second.
1058 			 * If this amount elapsed it means we spent one
1059 			 * second in waiting. This branch is executed once
1060 			 * on first iteration.
1061 			 */
1062 			twait = rte_get_timer_hz();
1063 			MLX5_ASSERT(twait);
1064 		}
1065 		/*
1066 		 * Timeout elapsed, show message (once a second) and retry.
1067 		 * We have no other acceptable option here, if we ignore
1068 		 * the unregistering return code the handler will not
1069 		 * be unregistered, fd will be closed and we may get the
1070 		 * crush. Hanging and messaging in the loop seems not to be
1071 		 * the worst choice.
1072 		 */
1073 		DRV_LOG(INFO, "Retrying to unregister interrupt handler");
1074 		start = rte_get_timer_cycles();
1075 	} while (true);
1076 }
1077 
1078 /**
1079  * Rte_intr_handle destroy helper.
1080  *
1081  * @param[in] intr_handle
1082  *   Rte_intr_handle to destroy.
1083  * @param[in] cb
1084  *   Callback which is registered to intr_handle.
1085  * @param[in] cb_arg
1086  *   Callback argument for cb.
1087  *
1088  */
1089 void
1090 mlx5_os_interrupt_handler_destroy(struct rte_intr_handle *intr_handle,
1091 				  rte_intr_callback_fn cb, void *cb_arg)
1092 {
1093 	if (rte_intr_fd_get(intr_handle) >= 0)
1094 		mlx5_intr_callback_unregister(intr_handle, cb, cb_arg);
1095 	rte_intr_instance_free(intr_handle);
1096 }
1097