xref: /dpdk/drivers/common/mlx5/linux/mlx5_common_os.c (revision 5d52418fa4b9a7f28eaedc1d88ec5cf330381c0e)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2020 Mellanox Technologies, Ltd
3  */
4 
5 #include <sys/types.h>
6 #include <unistd.h>
7 #include <string.h>
8 #include <stdio.h>
9 #ifdef RTE_IBVERBS_LINK_DLOPEN
10 #include <dlfcn.h>
11 #endif
12 #include <dirent.h>
13 #include <net/if.h>
14 #include <fcntl.h>
15 
16 #include <rte_errno.h>
17 #include <rte_string_fns.h>
18 #include <bus_pci_driver.h>
19 #include <bus_auxiliary_driver.h>
20 
21 #include "mlx5_common.h"
22 #include "mlx5_nl.h"
23 #include "mlx5_common_log.h"
24 #include "mlx5_common_private.h"
25 #include "mlx5_common_defs.h"
26 #include "mlx5_common_os.h"
27 #include "mlx5_glue.h"
28 
#ifdef MLX5_GLUE
/*
 * Glue table pointer, defined here only when MLX5_GLUE is set.
 * mlx5_glue_dlopen() assigns it from the "mlx5_glue" symbol of the loaded
 * library; mlx5_glue_constructor() resets it to NULL on failure.
 */
const struct mlx5_glue *mlx5_glue;
#endif
32 
33 int
34 mlx5_get_pci_addr(const char *dev_path, struct rte_pci_addr *pci_addr)
35 {
36 	FILE *file;
37 	char line[32];
38 	int rc = -ENOENT;
39 	MKSTR(path, "%s/device/uevent", dev_path);
40 
41 	file = fopen(path, "rb");
42 	if (file == NULL) {
43 		rte_errno = errno;
44 		return -rte_errno;
45 	}
46 	while (fgets(line, sizeof(line), file) == line) {
47 		size_t len = strlen(line);
48 
49 		/* Truncate long lines. */
50 		if (len == (sizeof(line) - 1)) {
51 			while (line[(len - 1)] != '\n') {
52 				int ret = fgetc(file);
53 
54 				if (ret == EOF)
55 					goto exit;
56 				line[(len - 1)] = ret;
57 			}
58 			/* No match for long lines. */
59 			continue;
60 		}
61 		/* Extract information. */
62 		if (sscanf(line,
63 			   "PCI_SLOT_NAME="
64 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
65 			   &pci_addr->domain,
66 			   &pci_addr->bus,
67 			   &pci_addr->devid,
68 			   &pci_addr->function) == 4) {
69 			rc = 0;
70 			break;
71 		}
72 	}
73 exit:
74 	fclose(file);
75 	if (rc)
76 		rte_errno = -rc;
77 	return rc;
78 }
79 
80 /**
81  * Extract port name, as a number, from sysfs or netlink information.
82  *
83  * @param[in] port_name_in
84  *   String representing the port name.
85  * @param[out] port_info_out
86  *   Port information, including port name as a number and port name
87  *   type if recognized
88  *
89  * @return
90  *   port_name field set according to recognized name format.
91  */
92 void
93 mlx5_translate_port_name(const char *port_name_in,
94 			 struct mlx5_switch_info *port_info_out)
95 {
96 	char ctrl = 0, pf_c1, pf_c2, vf_c1, vf_c2, eol;
97 	char *end;
98 	int sc_items;
99 
100 	sc_items = sscanf(port_name_in, "%c%d",
101 			  &ctrl, &port_info_out->ctrl_num);
102 	if (sc_items == 2 && ctrl == 'c') {
103 		port_name_in++; /* 'c' */
104 		port_name_in += snprintf(NULL, 0, "%d",
105 					  port_info_out->ctrl_num);
106 	}
107 	/* Check for port-name as a string of the form pf0vf0 or pf0sf0 */
108 	sc_items = sscanf(port_name_in, "%c%c%d%c%c%d%c",
109 			  &pf_c1, &pf_c2, &port_info_out->pf_num,
110 			  &vf_c1, &vf_c2, &port_info_out->port_name, &eol);
111 	if (sc_items == 6 && pf_c1 == 'p' && pf_c2 == 'f') {
112 		if (vf_c1 == 'v' && vf_c2 == 'f') {
113 			/* Kernel ver >= 5.0 or OFED ver >= 4.6 */
114 			port_info_out->name_type =
115 					MLX5_PHYS_PORT_NAME_TYPE_PFVF;
116 			return;
117 		}
118 		if (vf_c1 == 's' && vf_c2 == 'f') {
119 			/* Kernel ver >= 5.11 or OFED ver >= 5.1 */
120 			port_info_out->name_type =
121 					MLX5_PHYS_PORT_NAME_TYPE_PFSF;
122 			return;
123 		}
124 	}
125 	/*
126 	 * Check for port-name as a string of the form p0
127 	 * (support kernel ver >= 5.0, or OFED ver >= 4.6).
128 	 */
129 	sc_items = sscanf(port_name_in, "%c%d%c",
130 			  &pf_c1, &port_info_out->port_name, &eol);
131 	if (sc_items == 2 && pf_c1 == 'p') {
132 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
133 		return;
134 	}
135 	/*
136 	 * Check for port-name as a string of the form pf0
137 	 * (support kernel ver >= 5.7 for HPF representor on BF).
138 	 */
139 	sc_items = sscanf(port_name_in, "%c%c%d%c",
140 			  &pf_c1, &pf_c2, &port_info_out->pf_num, &eol);
141 	if (sc_items == 3 && pf_c1 == 'p' && pf_c2 == 'f') {
142 		port_info_out->port_name = -1;
143 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFHPF;
144 		return;
145 	}
146 	/* Check for port-name as a number (support kernel ver < 5.0 */
147 	errno = 0;
148 	port_info_out->port_name = strtol(port_name_in, &end, 0);
149 	if (!errno &&
150 	    (size_t)(end - port_name_in) == strlen(port_name_in)) {
151 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
152 		return;
153 	}
154 	port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
155 }
156 
157 int
158 mlx5_get_ifname_sysfs(const char *ibdev_path, char *ifname)
159 {
160 	DIR *dir;
161 	struct dirent *dent;
162 	unsigned int dev_type = 0;
163 	unsigned int dev_port_prev = ~0u;
164 	char match[IF_NAMESIZE] = "";
165 
166 	MLX5_ASSERT(ibdev_path);
167 	{
168 		MKSTR(path, "%s/device/net", ibdev_path);
169 
170 		dir = opendir(path);
171 		if (dir == NULL) {
172 			rte_errno = errno;
173 			return -rte_errno;
174 		}
175 	}
176 	while ((dent = readdir(dir)) != NULL) {
177 		char *name = dent->d_name;
178 		FILE *file;
179 		unsigned int dev_port;
180 		int r;
181 
182 		if ((name[0] == '.') &&
183 		    ((name[1] == '\0') ||
184 		     ((name[1] == '.') && (name[2] == '\0'))))
185 			continue;
186 
187 		MKSTR(path, "%s/device/net/%s/%s",
188 		      ibdev_path, name,
189 		      (dev_type ? "dev_id" : "dev_port"));
190 
191 		file = fopen(path, "rb");
192 		if (file == NULL) {
193 			if (errno != ENOENT)
194 				continue;
195 			/*
196 			 * Switch to dev_id when dev_port does not exist as
197 			 * is the case with Linux kernel versions < 3.15.
198 			 */
199 try_dev_id:
200 			match[0] = '\0';
201 			if (dev_type)
202 				break;
203 			dev_type = 1;
204 			dev_port_prev = ~0u;
205 			rewinddir(dir);
206 			continue;
207 		}
208 		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
209 		fclose(file);
210 		if (r != 1)
211 			continue;
212 		/*
213 		 * Switch to dev_id when dev_port returns the same value for
214 		 * all ports. May happen when using a MOFED release older than
215 		 * 3.0 with a Linux kernel >= 3.15.
216 		 */
217 		if (dev_port == dev_port_prev)
218 			goto try_dev_id;
219 		dev_port_prev = dev_port;
220 		if (dev_port == 0)
221 			strlcpy(match, name, IF_NAMESIZE);
222 	}
223 	closedir(dir);
224 	if (match[0] == '\0') {
225 		rte_errno = ENOENT;
226 		return -rte_errno;
227 	}
228 	strncpy(ifname, match, IF_NAMESIZE);
229 	return 0;
230 }
231 
232 #ifdef MLX5_GLUE
233 
234 /**
235  * Suffix RTE_EAL_PMD_PATH with "-glue".
236  *
237  * This function performs a sanity check on RTE_EAL_PMD_PATH before
238  * suffixing its last component.
239  *
240  * @param buf[out]
241  *   Output buffer, should be large enough otherwise NULL is returned.
242  * @param size
243  *   Size of @p out.
244  *
245  * @return
246  *   Pointer to @p buf or @p NULL in case suffix cannot be appended.
247  */
248 static char *
249 mlx5_glue_path(char *buf, size_t size)
250 {
251 	static const char *const bad[] = { "/", ".", "..", NULL };
252 	const char *path = RTE_EAL_PMD_PATH;
253 	size_t len = strlen(path);
254 	size_t off;
255 	int i;
256 
257 	while (len && path[len - 1] == '/')
258 		--len;
259 	for (off = len; off && path[off - 1] != '/'; --off)
260 		;
261 	for (i = 0; bad[i]; ++i)
262 		if (!strncmp(path + off, bad[i], (int)(len - off)))
263 			goto error;
264 	i = snprintf(buf, size, "%.*s-glue", (int)len, path);
265 	if (i == -1 || (size_t)i >= size)
266 		goto error;
267 	return buf;
268 error:
269 	RTE_LOG(ERR, PMD, "unable to append \"-glue\" to last component of"
270 		" RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"), please"
271 		" re-configure DPDK");
272 	return NULL;
273 }
274 
275 static int
276 mlx5_glue_dlopen(void)
277 {
278 	char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
279 	void *handle = NULL;
280 
281 	char const *path[] = {
282 		/*
283 		 * A basic security check is necessary before trusting
284 		 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
285 		 */
286 		(geteuid() == getuid() && getegid() == getgid() ?
287 		 getenv("MLX5_GLUE_PATH") : NULL),
288 		/*
289 		 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
290 		 * variant, otherwise let dlopen() look up libraries on its
291 		 * own.
292 		 */
293 		(*RTE_EAL_PMD_PATH ?
294 		 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
295 	};
296 	unsigned int i = 0;
297 	void **sym;
298 	const char *dlmsg;
299 
300 	while (!handle && i != RTE_DIM(path)) {
301 		const char *end;
302 		size_t len;
303 		int ret;
304 
305 		if (!path[i]) {
306 			++i;
307 			continue;
308 		}
309 		end = strpbrk(path[i], ":;");
310 		if (!end)
311 			end = path[i] + strlen(path[i]);
312 		len = end - path[i];
313 		ret = 0;
314 		do {
315 			char name[ret + 1];
316 
317 			ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
318 				       (int)len, path[i],
319 				       (!len || *(end - 1) == '/') ? "" : "/");
320 			if (ret == -1)
321 				break;
322 			if (sizeof(name) != (size_t)ret + 1)
323 				continue;
324 			DRV_LOG(DEBUG, "Looking for rdma-core glue as "
325 				"\"%s\"", name);
326 			handle = dlopen(name, RTLD_LAZY);
327 			break;
328 		} while (1);
329 		path[i] = end + 1;
330 		if (!*end)
331 			++i;
332 	}
333 	if (!handle) {
334 		rte_errno = EINVAL;
335 		dlmsg = dlerror();
336 		if (dlmsg)
337 			DRV_LOG(WARNING, "Cannot load glue library: %s", dlmsg);
338 		goto glue_error;
339 	}
340 	sym = dlsym(handle, "mlx5_glue");
341 	if (!sym || !*sym) {
342 		rte_errno = EINVAL;
343 		dlmsg = dlerror();
344 		if (dlmsg)
345 			DRV_LOG(ERR, "Cannot resolve glue symbol: %s", dlmsg);
346 		goto glue_error;
347 	}
348 	mlx5_glue = *sym;
349 	return 0;
350 
351 glue_error:
352 	if (handle)
353 		dlclose(handle);
354 	return -1;
355 }
356 
357 #endif
358 
359 /**
360  * Initialization routine for run-time dependency on rdma-core.
361  */
362 void
363 mlx5_glue_constructor(void)
364 {
365 	/*
366 	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
367 	 * huge pages. Calling ibv_fork_init() during init allows
368 	 * applications to use fork() safely for purposes other than
369 	 * using this PMD, which is not supported in forked processes.
370 	 */
371 	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
372 	/* Match the size of Rx completion entry to the size of a cacheline. */
373 	if (RTE_CACHE_LINE_SIZE == 128)
374 		setenv("MLX5_CQE_SIZE", "128", 0);
375 	/*
376 	 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to
377 	 * cleanup all the Verbs resources even when the device was removed.
378 	 */
379 	setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1);
380 
381 #ifdef MLX5_GLUE
382 	if (mlx5_glue_dlopen() != 0)
383 		goto glue_error;
384 #endif
385 
386 #ifdef RTE_LIBRTE_MLX5_DEBUG
387 	/* Glue structure must not contain any NULL pointers. */
388 	{
389 		unsigned int i;
390 
391 		for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
392 			MLX5_ASSERT(((const void *const *)mlx5_glue)[i]);
393 	}
394 #endif
395 	if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
396 		rte_errno = EINVAL;
397 		DRV_LOG(ERR, "rdma-core glue \"%s\" mismatch: \"%s\" is "
398 			"required", mlx5_glue->version, MLX5_GLUE_VERSION);
399 		goto glue_error;
400 	}
401 	mlx5_glue->fork_init();
402 	return;
403 
404 glue_error:
405 	DRV_LOG(WARNING, "Cannot initialize MLX5 common due to missing"
406 		" run-time dependency on rdma-core libraries (libibverbs,"
407 		" libmlx5)");
408 	mlx5_glue = NULL;
409 }
410 
411 /**
412  * Validate user arguments for remote PD and CTX.
413  *
414  * @param config
415  *   Pointer to device configuration structure.
416  *
417  * @return
418  *   0 on success, a negative errno value otherwise and rte_errno is set.
419  */
420 int
421 mlx5_os_remote_pd_and_ctx_validate(struct mlx5_common_dev_config *config)
422 {
423 	int device_fd = config->device_fd;
424 	int pd_handle = config->pd_handle;
425 
426 #ifdef HAVE_MLX5_IBV_IMPORT_CTX_PD_AND_MR
427 	if (device_fd == MLX5_ARG_UNSET && pd_handle != MLX5_ARG_UNSET) {
428 		DRV_LOG(ERR, "Remote PD without CTX is not supported.");
429 		rte_errno = EINVAL;
430 		return -rte_errno;
431 	}
432 	if (device_fd != MLX5_ARG_UNSET && pd_handle == MLX5_ARG_UNSET) {
433 		DRV_LOG(ERR, "Remote CTX without PD is not supported.");
434 		rte_errno = EINVAL;
435 		return -rte_errno;
436 	}
437 	DRV_LOG(DEBUG, "Remote PD and CTX is supported: (cmd_fd=%d, "
438 		"pd_handle=%d).", device_fd, pd_handle);
439 #else
440 	if (pd_handle != MLX5_ARG_UNSET || device_fd != MLX5_ARG_UNSET) {
441 		DRV_LOG(ERR,
442 			"Remote PD and CTX is not supported - maybe old rdma-core version?");
443 		rte_errno = ENOTSUP;
444 		return -rte_errno;
445 	}
446 #endif
447 	return 0;
448 }
449 
450 /**
451  * Release Protection Domain object.
452  *
453  * @param[out] cdev
454  *   Pointer to the mlx5 device.
455  *
456  * @return
457  *   0 on success, a negative errno value otherwise.
458  */
459 int
460 mlx5_os_pd_release(struct mlx5_common_device *cdev)
461 {
462 	if (cdev->config.pd_handle == MLX5_ARG_UNSET)
463 		return mlx5_glue->dealloc_pd(cdev->pd);
464 	else
465 		return mlx5_glue->unimport_pd(cdev->pd);
466 }
467 
468 /**
469  * Allocate Protection Domain object.
470  *
471  * @param[out] cdev
472  *   Pointer to the mlx5 device.
473  *
474  * @return
475  *   0 on success, a negative errno value otherwise.
476  */
477 static int
478 mlx5_os_pd_create(struct mlx5_common_device *cdev)
479 {
480 	cdev->pd = mlx5_glue->alloc_pd(cdev->ctx);
481 	if (cdev->pd == NULL) {
482 		DRV_LOG(ERR, "Failed to allocate PD: %s", rte_strerror(errno));
483 		return errno ? -errno : -ENOMEM;
484 	}
485 	return 0;
486 }
487 
488 /**
489  * Import Protection Domain object according to given PD handle.
490  *
491  * @param[out] cdev
492  *   Pointer to the mlx5 device.
493  *
494  * @return
495  *   0 on success, a negative errno value otherwise.
496  */
497 static int
498 mlx5_os_pd_import(struct mlx5_common_device *cdev)
499 {
500 	cdev->pd = mlx5_glue->import_pd(cdev->ctx, cdev->config.pd_handle);
501 	if (cdev->pd == NULL) {
502 		DRV_LOG(ERR, "Failed to import PD using handle=%d: %s",
503 			cdev->config.pd_handle, rte_strerror(errno));
504 		return errno ? -errno : -ENOMEM;
505 	}
506 	return 0;
507 }
508 
509 /**
510  * Prepare Protection Domain object and extract its pdn using DV API.
511  *
512  * @param[out] cdev
513  *   Pointer to the mlx5 device.
514  *
515  * @return
516  *   0 on success, a negative errno value otherwise and rte_errno is set.
517  */
518 int
519 mlx5_os_pd_prepare(struct mlx5_common_device *cdev)
520 {
521 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
522 	struct mlx5dv_obj obj;
523 	struct mlx5dv_pd pd_info;
524 #endif
525 	int ret;
526 
527 	if (cdev->config.pd_handle == MLX5_ARG_UNSET)
528 		ret = mlx5_os_pd_create(cdev);
529 	else
530 		ret = mlx5_os_pd_import(cdev);
531 	if (ret) {
532 		rte_errno = -ret;
533 		return ret;
534 	}
535 	if (cdev->config.devx == 0)
536 		return 0;
537 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
538 	obj.pd.in = cdev->pd;
539 	obj.pd.out = &pd_info;
540 	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
541 	if (ret != 0) {
542 		DRV_LOG(ERR, "Fail to get PD object info.");
543 		rte_errno = errno;
544 		claim_zero(mlx5_os_pd_release(cdev));
545 		cdev->pd = NULL;
546 		return -rte_errno;
547 	}
548 	cdev->pdn = pd_info.pdn;
549 	return 0;
550 #else
551 	DRV_LOG(ERR, "Cannot get pdn - no DV support.");
552 	rte_errno = ENOTSUP;
553 	return -rte_errno;
554 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
555 }
556 
557 static struct ibv_device *
558 mlx5_os_get_ibv_device(const struct rte_pci_device *pci_dev)
559 {
560 	int n;
561 	struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
562 	struct ibv_device *ibv_match = NULL;
563 	uint8_t guid1[32] = {0};
564 	uint8_t guid2[32] = {0};
565 	int ret1, ret2 = -1;
566 	struct rte_pci_addr paddr;
567 	const struct rte_pci_addr *addr = &pci_dev->addr;
568 	bool is_vf_dev = mlx5_dev_is_vf_pci(pci_dev);
569 
570 	if (ibv_list == NULL || !n) {
571 		rte_errno = ENOSYS;
572 		if (ibv_list)
573 			mlx5_glue->free_device_list(ibv_list);
574 		return NULL;
575 	}
576 	ret1 = mlx5_get_device_guid(addr, guid1, sizeof(guid1));
577 	while (n-- > 0) {
578 		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
579 		if (mlx5_get_pci_addr(ibv_list[n]->ibdev_path, &paddr) != 0)
580 			continue;
581 		if (ret1 > 0)
582 			ret2 = mlx5_get_device_guid(&paddr, guid2, sizeof(guid2));
583 		/* Bond device can bond secondary PCIe */
584 		if ((strstr(ibv_list[n]->name, "bond") && !is_vf_dev &&
585 		     ((ret1 > 0 && ret2 > 0 && !memcmp(guid1, guid2, sizeof(guid1))) ||
586 		      (addr->domain == paddr.domain && addr->bus == paddr.bus &&
587 		       addr->devid == paddr.devid))) ||
588 		    !rte_pci_addr_cmp(addr, &paddr)) {
589 			ibv_match = ibv_list[n];
590 			break;
591 		}
592 	}
593 	if (ibv_match == NULL) {
594 		DRV_LOG(WARNING,
595 			"No Verbs device matches PCI device " PCI_PRI_FMT ","
596 			" are kernel drivers loaded?",
597 			addr->domain, addr->bus, addr->devid, addr->function);
598 		rte_errno = ENOENT;
599 	}
600 	mlx5_glue->free_device_list(ibv_list);
601 	return ibv_match;
602 }
603 
604 /* Try to disable ROCE by Netlink\Devlink. */
605 static int
606 mlx5_nl_roce_disable(const char *addr)
607 {
608 	int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC, 0);
609 	int devlink_id;
610 	int enable;
611 	int ret;
612 
613 	if (nlsk_fd < 0)
614 		return nlsk_fd;
615 	devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
616 	if (devlink_id < 0) {
617 		ret = devlink_id;
618 		DRV_LOG(DEBUG,
619 			"Failed to get devlink id for ROCE operations by Netlink.");
620 		goto close;
621 	}
622 	ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
623 	if (ret) {
624 		DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
625 			ret);
626 		goto close;
627 	} else if (!enable) {
628 		DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
629 		goto close;
630 	}
631 	ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
632 	if (ret)
633 		DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
634 	else
635 		DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
636 close:
637 	close(nlsk_fd);
638 	return ret;
639 }
640 
641 /* Try to disable ROCE by sysfs. */
642 static int
643 mlx5_sys_roce_disable(const char *addr)
644 {
645 	FILE *file_o;
646 	int enable;
647 	int ret;
648 
649 	MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
650 	file_o = fopen(file_p, "rb");
651 	if (!file_o) {
652 		rte_errno = ENOTSUP;
653 		return -ENOTSUP;
654 	}
655 	ret = fscanf(file_o, "%d", &enable);
656 	if (ret != 1) {
657 		rte_errno = EINVAL;
658 		ret = EINVAL;
659 		goto close;
660 	} else if (!enable) {
661 		ret = 0;
662 		DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
663 		goto close;
664 	}
665 	fclose(file_o);
666 	file_o = fopen(file_p, "wb");
667 	if (!file_o) {
668 		rte_errno = ENOTSUP;
669 		return -ENOTSUP;
670 	}
671 	fprintf(file_o, "0\n");
672 	ret = 0;
673 close:
674 	if (ret)
675 		DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
676 	else
677 		DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
678 	fclose(file_o);
679 	return ret;
680 }
681 
682 static int
683 mlx5_roce_disable(const struct rte_device *dev)
684 {
685 	char pci_addr[PCI_PRI_STR_SIZE] = { 0 };
686 
687 	if (mlx5_dev_to_pci_str(dev, pci_addr, sizeof(pci_addr)) < 0)
688 		return -rte_errno;
689 	/* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
690 	if (mlx5_nl_roce_disable(pci_addr) != 0 &&
691 	    mlx5_sys_roce_disable(pci_addr) != 0)
692 		return -rte_errno;
693 	return 0;
694 }
695 
696 static struct ibv_device *
697 mlx5_os_get_ibv_dev(const struct rte_device *dev)
698 {
699 	struct ibv_device *ibv;
700 
701 	if (mlx5_dev_is_pci(dev))
702 		ibv = mlx5_os_get_ibv_device(RTE_DEV_TO_PCI_CONST(dev));
703 	else
704 		ibv = mlx5_get_aux_ibv_device(RTE_DEV_TO_AUXILIARY_CONST(dev));
705 	if (ibv == NULL) {
706 		rte_errno = ENODEV;
707 		DRV_LOG(ERR, "Verbs device not found: %s", dev->name);
708 	}
709 	return ibv;
710 }
711 
712 static struct ibv_device *
713 mlx5_vdpa_get_ibv_dev(const struct rte_device *dev)
714 {
715 	struct ibv_device *ibv;
716 	int retry;
717 
718 	if (mlx5_roce_disable(dev) != 0) {
719 		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
720 			dev->name);
721 		return NULL;
722 	}
723 	/* Wait for the IB device to appear again after reload. */
724 	for (retry = MLX5_VDPA_MAX_RETRIES; retry > 0; --retry) {
725 		ibv = mlx5_os_get_ibv_dev(dev);
726 		if (ibv != NULL)
727 			return ibv;
728 		usleep(MLX5_VDPA_USEC);
729 	}
730 	DRV_LOG(ERR,
731 		"Cannot get IB device after disabling RoCE for \"%s\", retries exceed %d.",
732 		dev->name, MLX5_VDPA_MAX_RETRIES);
733 	rte_errno = EAGAIN;
734 	return NULL;
735 }
736 
737 static int
738 mlx5_config_doorbell_mapping_env(int dbnc)
739 {
740 	char *env;
741 	int value;
742 
743 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
744 	/* Get environment variable to store. */
745 	env = getenv(MLX5_SHUT_UP_BF);
746 	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
747 	if (dbnc == MLX5_ARG_UNSET)
748 		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
749 	else
750 		setenv(MLX5_SHUT_UP_BF,
751 		       dbnc == MLX5_SQ_DB_NCACHED ? "1" : "0", 1);
752 	return value;
753 }
754 
755 static void
756 mlx5_restore_doorbell_mapping_env(int value)
757 {
758 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
759 	/* Restore the original environment variable state. */
760 	if (value == MLX5_ARG_UNSET)
761 		unsetenv(MLX5_SHUT_UP_BF);
762 	else
763 		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
764 }
765 
766 /**
767  * Function API to open IB device.
768  *
769  * @param cdev
770  *   Pointer to the mlx5 device.
771  * @param classes
772  *   Chosen classes come from device arguments.
773  *
774  * @return
775  *   Pointer to ibv_context on success, NULL otherwise and rte_errno is set.
776  */
777 static struct ibv_context *
778 mlx5_open_device(struct mlx5_common_device *cdev, uint32_t classes)
779 {
780 	struct ibv_device *ibv;
781 	struct ibv_context *ctx = NULL;
782 	int dbmap_env;
783 
784 	MLX5_ASSERT(cdev->config.device_fd == MLX5_ARG_UNSET);
785 	if (classes & MLX5_CLASS_VDPA)
786 		ibv = mlx5_vdpa_get_ibv_dev(cdev->dev);
787 	else
788 		ibv = mlx5_os_get_ibv_dev(cdev->dev);
789 	if (!ibv)
790 		return NULL;
791 	DRV_LOG(INFO, "Dev information matches for device \"%s\".", ibv->name);
792 	/*
793 	 * Configure environment variable "MLX5_BF_SHUT_UP" before the device
794 	 * creation. The rdma_core library checks the variable at device
795 	 * creation and stores the result internally.
796 	 */
797 	dbmap_env = mlx5_config_doorbell_mapping_env(cdev->config.dbnc);
798 	/* Try to open IB device with DV first, then usual Verbs. */
799 	errno = 0;
800 	ctx = mlx5_glue->dv_open_device(ibv);
801 	if (ctx) {
802 		cdev->config.devx = 1;
803 	} else if (classes == MLX5_CLASS_ETH) {
804 		/* The environment variable is still configured. */
805 		ctx = mlx5_glue->open_device(ibv);
806 		if (ctx == NULL)
807 			goto error;
808 	} else {
809 		goto error;
810 	}
811 	/* The device is created, no need for environment. */
812 	mlx5_restore_doorbell_mapping_env(dbmap_env);
813 	return ctx;
814 error:
815 	rte_errno = errno ? errno : ENODEV;
816 	/* The device creation is failed, no need for environment. */
817 	mlx5_restore_doorbell_mapping_env(dbmap_env);
818 	DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
819 	return NULL;
820 }
821 
822 /**
823  * Function API to import IB device.
824  *
825  * @param cdev
826  *   Pointer to the mlx5 device.
827  *
828  * @return
829  *   Pointer to ibv_context on success, NULL otherwise and rte_errno is set.
830  */
831 static struct ibv_context *
832 mlx5_import_device(struct mlx5_common_device *cdev)
833 {
834 	struct ibv_context *ctx = NULL;
835 
836 	MLX5_ASSERT(cdev->config.device_fd != MLX5_ARG_UNSET);
837 	ctx = mlx5_glue->import_device(cdev->config.device_fd);
838 	if (!ctx) {
839 		DRV_LOG(ERR, "Failed to import device for fd=%d: %s",
840 			cdev->config.device_fd, rte_strerror(errno));
841 		rte_errno = errno;
842 	}
843 	return ctx;
844 }
845 
846 /**
847  * Function API to prepare IB device.
848  *
849  * @param cdev
850  *   Pointer to the mlx5 device.
851  * @param classes
852  *   Chosen classes come from device arguments.
853  *
854  * @return
855  *   0 on success, a negative errno value otherwise and rte_errno is set.
856  */
857 int
858 mlx5_os_open_device(struct mlx5_common_device *cdev, uint32_t classes)
859 {
860 
861 	struct ibv_context *ctx = NULL;
862 
863 	if (cdev->config.device_fd == MLX5_ARG_UNSET)
864 		ctx = mlx5_open_device(cdev, classes);
865 	else
866 		ctx = mlx5_import_device(cdev);
867 	if (ctx == NULL)
868 		return -rte_errno;
869 	/* Hint libmlx5 to use PMD allocator for data plane resources */
870 	mlx5_set_context_attr(cdev->dev, ctx);
871 	cdev->ctx = ctx;
872 	return 0;
873 }
874 
875 int
876 mlx5_get_device_guid(const struct rte_pci_addr *dev, uint8_t *guid, size_t len)
877 {
878 	char tmp[512];
879 	char cur_ifname[IF_NAMESIZE + 1];
880 	FILE *id_file;
881 	DIR *dir;
882 	struct dirent *ptr;
883 	int ret;
884 
885 	if (guid == NULL || len < sizeof(u_int64_t) + 1)
886 		return -1;
887 	memset(guid, 0, len);
888 	snprintf(tmp, sizeof(tmp), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/net",
889 			dev->domain, dev->bus, dev->devid, dev->function);
890 	dir = opendir(tmp);
891 	if (dir == NULL)
892 		return -1;
893 	/* Traverse to identify PF interface */
894 	do {
895 		ptr = readdir(dir);
896 		if (ptr == NULL || ptr->d_type != DT_DIR) {
897 			closedir(dir);
898 			return -1;
899 		}
900 	} while (strchr(ptr->d_name, '.') || strchr(ptr->d_name, '_') ||
901 		 strchr(ptr->d_name, 'v'));
902 	snprintf(cur_ifname, sizeof(cur_ifname), "%s", ptr->d_name);
903 	closedir(dir);
904 	snprintf(tmp + strlen(tmp), sizeof(tmp) - strlen(tmp),
905 			"/%s/phys_switch_id", cur_ifname);
906 	/* Older OFED like 5.3 doesn't support read */
907 	id_file = fopen(tmp, "r");
908 	if (!id_file)
909 		return 0;
910 	ret = fscanf(id_file, "%16s", guid);
911 	fclose(id_file);
912 	return ret;
913 }
914 
915 /*
916  * Create direct mkey using the kernel ibv_reg_mr API and wrap it with a new
917  * indirect mkey created by the DevX API.
918  * This mkey should be used for DevX commands requesting mkey as a parameter.
919  */
920 int
921 mlx5_os_wrapped_mkey_create(void *ctx, void *pd, uint32_t pdn, void *addr,
922 			    size_t length, struct mlx5_pmd_wrapped_mr *pmd_mr)
923 {
924 	struct mlx5_klm klm = {
925 		.byte_count = length,
926 		.address = (uintptr_t)addr,
927 	};
928 	struct mlx5_devx_mkey_attr mkey_attr = {
929 		.pd = pdn,
930 		.klm_array = &klm,
931 		.klm_num = 1,
932 	};
933 	struct mlx5_devx_obj *mkey;
934 	struct ibv_mr *ibv_mr = mlx5_glue->reg_mr(pd, addr, length,
935 						  IBV_ACCESS_LOCAL_WRITE |
936 						  (haswell_broadwell_cpu ? 0 :
937 						  IBV_ACCESS_RELAXED_ORDERING));
938 
939 	if (!ibv_mr) {
940 		rte_errno = errno;
941 		return -rte_errno;
942 	}
943 	klm.mkey = ibv_mr->lkey;
944 	mkey_attr.addr = (uintptr_t)addr;
945 	mkey_attr.size = length;
946 	mkey = mlx5_devx_cmd_mkey_create(ctx, &mkey_attr);
947 	if (!mkey) {
948 		claim_zero(mlx5_glue->dereg_mr(ibv_mr));
949 		return -rte_errno;
950 	}
951 	pmd_mr->addr = addr;
952 	pmd_mr->len = length;
953 	pmd_mr->obj = (void *)ibv_mr;
954 	pmd_mr->imkey = mkey;
955 	pmd_mr->lkey = mkey->id;
956 	return 0;
957 }
958 
959 void
960 mlx5_os_wrapped_mkey_destroy(struct mlx5_pmd_wrapped_mr *pmd_mr)
961 {
962 	if (!pmd_mr)
963 		return;
964 	if (pmd_mr->imkey)
965 		claim_zero(mlx5_devx_cmd_destroy(pmd_mr->imkey));
966 	if (pmd_mr->obj)
967 		claim_zero(mlx5_glue->dereg_mr(pmd_mr->obj));
968 	memset(pmd_mr, 0, sizeof(*pmd_mr));
969 }
970 
971 /**
972  * Rte_intr_handle create and init helper.
973  *
974  * @param[in] mode
975  *   interrupt instance can be shared between primary and secondary
976  *   processes or not.
977  * @param[in] set_fd_nonblock
978  *   Whether to set fd to O_NONBLOCK.
979  * @param[in] fd
980  *   Fd to set in created intr_handle.
981  * @param[in] cb
982  *   Callback to register for intr_handle.
983  * @param[in] cb_arg
984  *   Callback argument for cb.
985  *
986  * @return
987  *  - Interrupt handle on success.
988  *  - NULL on failure, with rte_errno set.
989  */
990 struct rte_intr_handle *
991 mlx5_os_interrupt_handler_create(int mode, bool set_fd_nonblock, int fd,
992 				 rte_intr_callback_fn cb, void *cb_arg)
993 {
994 	struct rte_intr_handle *tmp_intr_handle;
995 	int ret, flags;
996 
997 	tmp_intr_handle = rte_intr_instance_alloc(mode);
998 	if (!tmp_intr_handle) {
999 		rte_errno = ENOMEM;
1000 		goto err;
1001 	}
1002 	if (set_fd_nonblock) {
1003 		flags = fcntl(fd, F_GETFL);
1004 		ret = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
1005 		if (ret) {
1006 			rte_errno = errno;
1007 			goto err;
1008 		}
1009 	}
1010 	ret = rte_intr_fd_set(tmp_intr_handle, fd);
1011 	if (ret)
1012 		goto err;
1013 	ret = rte_intr_type_set(tmp_intr_handle, RTE_INTR_HANDLE_EXT);
1014 	if (ret)
1015 		goto err;
1016 	ret = rte_intr_callback_register(tmp_intr_handle, cb, cb_arg);
1017 	if (ret) {
1018 		rte_errno = -ret;
1019 		goto err;
1020 	}
1021 	return tmp_intr_handle;
1022 err:
1023 	rte_intr_instance_free(tmp_intr_handle);
1024 	return NULL;
1025 }
1026 
1027 /* Safe unregistration for interrupt callback. */
1028 static void
1029 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
1030 			      rte_intr_callback_fn cb_fn, void *cb_arg)
1031 {
1032 	uint64_t twait = 0;
1033 	uint64_t start = 0;
1034 
1035 	do {
1036 		int ret;
1037 
1038 		ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
1039 		if (ret >= 0)
1040 			return;
1041 		if (ret != -EAGAIN) {
1042 			DRV_LOG(INFO, "failed to unregister interrupt"
1043 				      " handler (error: %d)", ret);
1044 			MLX5_ASSERT(false);
1045 			return;
1046 		}
1047 		if (twait) {
1048 			struct timespec onems;
1049 
1050 			/* Wait one millisecond and try again. */
1051 			onems.tv_sec = 0;
1052 			onems.tv_nsec = NS_PER_S / MS_PER_S;
1053 			nanosleep(&onems, 0);
1054 			/* Check whether one second elapsed. */
1055 			if ((rte_get_timer_cycles() - start) <= twait)
1056 				continue;
1057 		} else {
1058 			/*
1059 			 * We get the amount of timer ticks for one second.
1060 			 * If this amount elapsed it means we spent one
1061 			 * second in waiting. This branch is executed once
1062 			 * on first iteration.
1063 			 */
1064 			twait = rte_get_timer_hz();
1065 			MLX5_ASSERT(twait);
1066 		}
1067 		/*
1068 		 * Timeout elapsed, show message (once a second) and retry.
1069 		 * We have no other acceptable option here, if we ignore
1070 		 * the unregistering return code the handler will not
1071 		 * be unregistered, fd will be closed and we may get the
1072 		 * crush. Hanging and messaging in the loop seems not to be
1073 		 * the worst choice.
1074 		 */
1075 		DRV_LOG(INFO, "Retrying to unregister interrupt handler");
1076 		start = rte_get_timer_cycles();
1077 	} while (true);
1078 }
1079 
1080 /**
1081  * Rte_intr_handle destroy helper.
1082  *
1083  * @param[in] intr_handle
1084  *   Rte_intr_handle to destroy.
1085  * @param[in] cb
1086  *   Callback which is registered to intr_handle.
1087  * @param[in] cb_arg
1088  *   Callback argument for cb.
1089  *
1090  */
1091 void
1092 mlx5_os_interrupt_handler_destroy(struct rte_intr_handle *intr_handle,
1093 				  rte_intr_callback_fn cb, void *cb_arg)
1094 {
1095 	if (rte_intr_fd_get(intr_handle) >= 0)
1096 		mlx5_intr_callback_unregister(intr_handle, cb, cb_arg);
1097 	rte_intr_instance_free(intr_handle);
1098 }
1099