xref: /dpdk/drivers/common/mlx5/linux/mlx5_common_os.c (revision 3cd5e500b5cb1b72ee182be4043018f22a5f8a3b)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2020 Mellanox Technologies, Ltd
3  */
4 
5 #include <sys/types.h>
6 #include <unistd.h>
7 #include <string.h>
8 #include <stdio.h>
9 #ifdef RTE_IBVERBS_LINK_DLOPEN
10 #include <dlfcn.h>
11 #endif
12 #include <dirent.h>
13 #include <net/if.h>
14 #include <fcntl.h>
15 
16 #include <rte_errno.h>
17 #include <rte_string_fns.h>
18 #include <bus_pci_driver.h>
19 #include <bus_auxiliary_driver.h>
20 
21 #include "mlx5_common.h"
22 #include "mlx5_nl.h"
23 #include "mlx5_common_log.h"
24 #include "mlx5_common_private.h"
25 #include "mlx5_common_defs.h"
26 #include "mlx5_common_os.h"
27 #include "mlx5_glue.h"
28 
29 #ifdef MLX5_GLUE
/*
 * Glue operations table used by this file. In the MLX5_GLUE build it is
 * defined here and assigned by mlx5_glue_dlopen() from the "mlx5_glue"
 * symbol of the dynamically loaded glue library.
 */
30 const struct mlx5_glue *mlx5_glue;
31 #endif
32 
33 int
mlx5_get_pci_addr(const char * dev_path,struct rte_pci_addr * pci_addr)34 mlx5_get_pci_addr(const char *dev_path, struct rte_pci_addr *pci_addr)
35 {
36 	FILE *file;
37 	char line[32];
38 	int rc = -ENOENT;
39 	MKSTR(path, "%s/device/uevent", dev_path);
40 
41 	file = fopen(path, "rb");
42 	if (file == NULL) {
43 		rte_errno = errno;
44 		return -rte_errno;
45 	}
46 	while (fgets(line, sizeof(line), file) == line) {
47 		size_t len = strlen(line);
48 
49 		/* Truncate long lines. */
50 		if (len == (sizeof(line) - 1)) {
51 			while (line[(len - 1)] != '\n') {
52 				int ret = fgetc(file);
53 
54 				if (ret == EOF)
55 					goto exit;
56 				line[(len - 1)] = ret;
57 			}
58 			/* No match for long lines. */
59 			continue;
60 		}
61 		/* Extract information. */
62 		if (sscanf(line,
63 			   "PCI_SLOT_NAME="
64 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
65 			   &pci_addr->domain,
66 			   &pci_addr->bus,
67 			   &pci_addr->devid,
68 			   &pci_addr->function) == 4) {
69 			rc = 0;
70 			break;
71 		}
72 	}
73 exit:
74 	fclose(file);
75 	if (rc)
76 		rte_errno = -rc;
77 	return rc;
78 }
79 
80 /**
81  * Extract port name, as a number, from sysfs or netlink information.
82  *
83  * @param[in] port_name_in
84  *   String representing the port name.
85  * @param[out] port_info_out
86  *   Port information, including port name as a number and port name
87  *   type if recognized
88  *
89  * @return
90  *   port_name field set according to recognized name format.
91  */
92 void
mlx5_translate_port_name(const char * port_name_in,struct mlx5_switch_info * port_info_out)93 mlx5_translate_port_name(const char *port_name_in,
94 			 struct mlx5_switch_info *port_info_out)
95 {
96 	char ctrl = 0, pf_c1, pf_c2, vf_c1, vf_c2, eol;
97 	char *end;
98 	int sc_items;
99 	int32_t ctrl_num = -1;
100 
101 	sc_items = sscanf(port_name_in, "%c%d", &ctrl, &ctrl_num);
102 	if (sc_items == 2 && ctrl == 'c') {
103 		port_info_out->ctrl_num = ctrl_num;
104 		port_name_in++; /* 'c' */
105 		port_name_in += snprintf(NULL, 0, "%d",
106 					  port_info_out->ctrl_num);
107 	}
108 	/* Check for port-name as a string of the form pf0vf0 or pf0sf0 */
109 	sc_items = sscanf(port_name_in, "%c%c%d%c%c%d%c",
110 			  &pf_c1, &pf_c2, &port_info_out->pf_num,
111 			  &vf_c1, &vf_c2, &port_info_out->port_name, &eol);
112 	if (sc_items == 6 && pf_c1 == 'p' && pf_c2 == 'f') {
113 		if (vf_c1 == 'v' && vf_c2 == 'f') {
114 			/* Kernel ver >= 5.0 or OFED ver >= 4.6 */
115 			port_info_out->name_type =
116 					MLX5_PHYS_PORT_NAME_TYPE_PFVF;
117 			return;
118 		}
119 		if (vf_c1 == 's' && vf_c2 == 'f') {
120 			/* Kernel ver >= 5.11 or OFED ver >= 5.1 */
121 			port_info_out->name_type =
122 					MLX5_PHYS_PORT_NAME_TYPE_PFSF;
123 			return;
124 		}
125 	}
126 	/*
127 	 * Check for port-name as a string of the form p0
128 	 * (support kernel ver >= 5.0, or OFED ver >= 4.6).
129 	 */
130 	sc_items = sscanf(port_name_in, "%c%d%c",
131 			  &pf_c1, &port_info_out->port_name, &eol);
132 	if (sc_items == 2 && pf_c1 == 'p') {
133 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
134 		return;
135 	}
136 	/*
137 	 * Check for port-name as a string of the form pf0
138 	 * (support kernel ver >= 5.7 for HPF representor on BF).
139 	 */
140 	sc_items = sscanf(port_name_in, "%c%c%d%c",
141 			  &pf_c1, &pf_c2, &port_info_out->pf_num, &eol);
142 	if (sc_items == 3 && pf_c1 == 'p' && pf_c2 == 'f') {
143 		port_info_out->port_name = -1;
144 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFHPF;
145 		return;
146 	}
147 	/* Check for port-name as a number (support kernel ver < 5.0 */
148 	errno = 0;
149 	port_info_out->port_name = strtol(port_name_in, &end, 0);
150 	if (!errno &&
151 	    (size_t)(end - port_name_in) == strlen(port_name_in)) {
152 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
153 		return;
154 	}
155 	port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
156 }
157 
158 int
mlx5_get_ifname_sysfs(const char * ibdev_path,char * ifname)159 mlx5_get_ifname_sysfs(const char *ibdev_path, char *ifname)
160 {
161 	DIR *dir;
162 	struct dirent *dent;
163 	unsigned int dev_type = 0;
164 	unsigned int dev_port_prev = ~0u;
165 	char match[IF_NAMESIZE] = "";
166 
167 	MLX5_ASSERT(ibdev_path);
168 	{
169 		MKSTR(path, "%s/device/net", ibdev_path);
170 
171 		dir = opendir(path);
172 		if (dir == NULL) {
173 			rte_errno = errno;
174 			return -rte_errno;
175 		}
176 	}
177 	while ((dent = readdir(dir)) != NULL) {
178 		char *name = dent->d_name;
179 		FILE *file;
180 		unsigned int dev_port;
181 		int r;
182 
183 		if ((name[0] == '.') &&
184 		    ((name[1] == '\0') ||
185 		     ((name[1] == '.') && (name[2] == '\0'))))
186 			continue;
187 
188 		MKSTR(path, "%s/device/net/%s/%s",
189 		      ibdev_path, name,
190 		      (dev_type ? "dev_id" : "dev_port"));
191 
192 		file = fopen(path, "rb");
193 		if (file == NULL) {
194 			if (errno != ENOENT)
195 				continue;
196 			/*
197 			 * Switch to dev_id when dev_port does not exist as
198 			 * is the case with Linux kernel versions < 3.15.
199 			 */
200 try_dev_id:
201 			match[0] = '\0';
202 			if (dev_type)
203 				break;
204 			dev_type = 1;
205 			dev_port_prev = ~0u;
206 			rewinddir(dir);
207 			continue;
208 		}
209 		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
210 		fclose(file);
211 		if (r != 1)
212 			continue;
213 		/*
214 		 * Switch to dev_id when dev_port returns the same value for
215 		 * all ports. May happen when using a MOFED release older than
216 		 * 3.0 with a Linux kernel >= 3.15.
217 		 */
218 		if (dev_port == dev_port_prev)
219 			goto try_dev_id;
220 		dev_port_prev = dev_port;
221 		if (dev_port == 0)
222 			strlcpy(match, name, IF_NAMESIZE);
223 	}
224 	closedir(dir);
225 	if (match[0] == '\0') {
226 		rte_errno = ENOENT;
227 		return -rte_errno;
228 	}
229 	strncpy(ifname, match, IF_NAMESIZE);
230 	return 0;
231 }
232 
233 #ifdef MLX5_GLUE
234 
235 /**
236  * Suffix RTE_EAL_PMD_PATH with "-glue".
237  *
238  * This function performs a sanity check on RTE_EAL_PMD_PATH before
239  * suffixing its last component.
240  *
241  * @param buf[out]
242  *   Output buffer, should be large enough otherwise NULL is returned.
243  * @param size
244  *   Size of @p out.
245  *
246  * @return
247  *   Pointer to @p buf or @p NULL in case suffix cannot be appended.
248  */
249 static char *
mlx5_glue_path(char * buf,size_t size)250 mlx5_glue_path(char *buf, size_t size)
251 {
252 	static const char *const bad[] = { "/", ".", "..", NULL };
253 	const char *path = RTE_EAL_PMD_PATH;
254 	size_t len = strlen(path);
255 	size_t off;
256 	int i;
257 
258 	while (len && path[len - 1] == '/')
259 		--len;
260 	for (off = len; off && path[off - 1] != '/'; --off)
261 		;
262 	for (i = 0; bad[i]; ++i)
263 		if (!strncmp(path + off, bad[i], (int)(len - off)))
264 			goto error;
265 	i = snprintf(buf, size, "%.*s-glue", (int)len, path);
266 	if (i == -1 || (size_t)i >= size)
267 		goto error;
268 	return buf;
269 error:
270 	DRV_LOG(ERR, "unable to append \"-glue\" to last component of"
271 		" RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"), please"
272 		" re-configure DPDK");
273 	return NULL;
274 }
275 
276 static int
mlx5_glue_dlopen(void)277 mlx5_glue_dlopen(void)
278 {
279 	char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
280 	void *handle = NULL;
281 
282 	char const *path[] = {
283 		/*
284 		 * A basic security check is necessary before trusting
285 		 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
286 		 */
287 		(geteuid() == getuid() && getegid() == getgid() ?
288 		 getenv("MLX5_GLUE_PATH") : NULL),
289 		/*
290 		 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
291 		 * variant, otherwise let dlopen() look up libraries on its
292 		 * own.
293 		 */
294 		(*RTE_EAL_PMD_PATH ?
295 		 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
296 	};
297 	unsigned int i = 0;
298 	void **sym;
299 	const char *dlmsg;
300 
301 	while (!handle && i != RTE_DIM(path)) {
302 		const char *end;
303 		size_t len;
304 		int ret;
305 
306 		if (!path[i]) {
307 			++i;
308 			continue;
309 		}
310 		end = strpbrk(path[i], ":;");
311 		if (!end)
312 			end = path[i] + strlen(path[i]);
313 		len = end - path[i];
314 		ret = 0;
315 		do {
316 			char name[ret + 1];
317 
318 			ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
319 				       (int)len, path[i],
320 				       (!len || *(end - 1) == '/') ? "" : "/");
321 			if (ret == -1)
322 				break;
323 			if (sizeof(name) != (size_t)ret + 1)
324 				continue;
325 			DRV_LOG(DEBUG, "Looking for rdma-core glue as "
326 				"\"%s\"", name);
327 			handle = dlopen(name, RTLD_LAZY);
328 			break;
329 		} while (1);
330 		path[i] = end + 1;
331 		if (!*end)
332 			++i;
333 	}
334 	if (!handle) {
335 		rte_errno = EINVAL;
336 		dlmsg = dlerror();
337 		if (dlmsg)
338 			DRV_LOG(WARNING, "Cannot load glue library: %s", dlmsg);
339 		goto glue_error;
340 	}
341 	sym = dlsym(handle, "mlx5_glue");
342 	if (!sym || !*sym) {
343 		rte_errno = EINVAL;
344 		dlmsg = dlerror();
345 		if (dlmsg)
346 			DRV_LOG(ERR, "Cannot resolve glue symbol: %s", dlmsg);
347 		goto glue_error;
348 	}
349 	mlx5_glue = *sym;
350 	return 0;
351 
352 glue_error:
353 	if (handle)
354 		dlclose(handle);
355 	return -1;
356 }
357 
358 #endif
359 
360 /**
361  * Initialization routine for run-time dependency on rdma-core.
362  */
363 void
mlx5_glue_constructor(void)364 mlx5_glue_constructor(void)
365 {
366 	/*
367 	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
368 	 * huge pages. Calling ibv_fork_init() during init allows
369 	 * applications to use fork() safely for purposes other than
370 	 * using this PMD, which is not supported in forked processes.
371 	 */
372 	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
373 	/* Match the size of Rx completion entry to the size of a cacheline. */
374 	if (RTE_CACHE_LINE_SIZE == 128)
375 		setenv("MLX5_CQE_SIZE", "128", 0);
376 	/*
377 	 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to
378 	 * cleanup all the Verbs resources even when the device was removed.
379 	 */
380 	setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1);
381 
382 #ifdef MLX5_GLUE
383 	if (mlx5_glue_dlopen() != 0)
384 		goto glue_error;
385 #endif
386 
387 #ifdef RTE_LIBRTE_MLX5_DEBUG
388 	/* Glue structure must not contain any NULL pointers. */
389 	{
390 		unsigned int i;
391 
392 		for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
393 			MLX5_ASSERT(((const void *const *)mlx5_glue)[i]);
394 	}
395 #endif
396 	if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
397 		rte_errno = EINVAL;
398 		DRV_LOG(ERR, "rdma-core glue \"%s\" mismatch: \"%s\" is "
399 			"required", mlx5_glue->version, MLX5_GLUE_VERSION);
400 		goto glue_error;
401 	}
402 	mlx5_glue->fork_init();
403 	return;
404 
405 glue_error:
406 	DRV_LOG(WARNING, "Cannot initialize MLX5 common due to missing"
407 		" run-time dependency on rdma-core libraries (libibverbs,"
408 		" libmlx5)");
409 	mlx5_glue = NULL;
410 }
411 
412 /**
413  * Validate user arguments for remote PD and CTX.
414  *
415  * @param config
416  *   Pointer to device configuration structure.
417  *
418  * @return
419  *   0 on success, a negative errno value otherwise and rte_errno is set.
420  */
421 int
mlx5_os_remote_pd_and_ctx_validate(struct mlx5_common_dev_config * config)422 mlx5_os_remote_pd_and_ctx_validate(struct mlx5_common_dev_config *config)
423 {
424 	int device_fd = config->device_fd;
425 	int pd_handle = config->pd_handle;
426 
427 #ifdef HAVE_MLX5_IBV_IMPORT_CTX_PD_AND_MR
428 	if (device_fd == MLX5_ARG_UNSET && pd_handle != MLX5_ARG_UNSET) {
429 		DRV_LOG(ERR, "Remote PD without CTX is not supported.");
430 		rte_errno = EINVAL;
431 		return -rte_errno;
432 	}
433 	if (device_fd != MLX5_ARG_UNSET && pd_handle == MLX5_ARG_UNSET) {
434 		DRV_LOG(ERR, "Remote CTX without PD is not supported.");
435 		rte_errno = EINVAL;
436 		return -rte_errno;
437 	}
438 	DRV_LOG(DEBUG, "Remote PD and CTX is supported: (cmd_fd=%d, "
439 		"pd_handle=%d).", device_fd, pd_handle);
440 #else
441 	if (pd_handle != MLX5_ARG_UNSET || device_fd != MLX5_ARG_UNSET) {
442 		DRV_LOG(ERR,
443 			"Remote PD and CTX is not supported - maybe old rdma-core version?");
444 		rte_errno = ENOTSUP;
445 		return -rte_errno;
446 	}
447 #endif
448 	return 0;
449 }
450 
451 /**
452  * Release Protection Domain object.
453  *
454  * @param[out] cdev
455  *   Pointer to the mlx5 device.
456  *
457  * @return
458  *   0 on success, a negative errno value otherwise.
459  */
460 int
mlx5_os_pd_release(struct mlx5_common_device * cdev)461 mlx5_os_pd_release(struct mlx5_common_device *cdev)
462 {
463 	if (cdev->config.pd_handle == MLX5_ARG_UNSET)
464 		return mlx5_glue->dealloc_pd(cdev->pd);
465 	else
466 		return mlx5_glue->unimport_pd(cdev->pd);
467 }
468 
469 /**
470  * Allocate Protection Domain object.
471  *
472  * @param[out] cdev
473  *   Pointer to the mlx5 device.
474  *
475  * @return
476  *   0 on success, a negative errno value otherwise.
477  */
478 static int
mlx5_os_pd_create(struct mlx5_common_device * cdev)479 mlx5_os_pd_create(struct mlx5_common_device *cdev)
480 {
481 	cdev->pd = mlx5_glue->alloc_pd(cdev->ctx);
482 	if (cdev->pd == NULL) {
483 		DRV_LOG(ERR, "Failed to allocate PD: %s", rte_strerror(errno));
484 		return errno ? -errno : -ENOMEM;
485 	}
486 	return 0;
487 }
488 
489 /**
490  * Import Protection Domain object according to given PD handle.
491  *
492  * @param[out] cdev
493  *   Pointer to the mlx5 device.
494  *
495  * @return
496  *   0 on success, a negative errno value otherwise.
497  */
498 static int
mlx5_os_pd_import(struct mlx5_common_device * cdev)499 mlx5_os_pd_import(struct mlx5_common_device *cdev)
500 {
501 	cdev->pd = mlx5_glue->import_pd(cdev->ctx, cdev->config.pd_handle);
502 	if (cdev->pd == NULL) {
503 		DRV_LOG(ERR, "Failed to import PD using handle=%d: %s",
504 			cdev->config.pd_handle, rte_strerror(errno));
505 		return errno ? -errno : -ENOMEM;
506 	}
507 	return 0;
508 }
509 
510 /**
511  * Prepare Protection Domain object and extract its pdn using DV API.
512  *
513  * @param[out] cdev
514  *   Pointer to the mlx5 device.
515  *
516  * @return
517  *   0 on success, a negative errno value otherwise and rte_errno is set.
518  */
519 int
mlx5_os_pd_prepare(struct mlx5_common_device * cdev)520 mlx5_os_pd_prepare(struct mlx5_common_device *cdev)
521 {
522 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
523 	struct mlx5dv_obj obj;
524 	struct mlx5dv_pd pd_info;
525 #endif
526 	int ret;
527 
528 	if (cdev->config.pd_handle == MLX5_ARG_UNSET)
529 		ret = mlx5_os_pd_create(cdev);
530 	else
531 		ret = mlx5_os_pd_import(cdev);
532 	if (ret) {
533 		rte_errno = -ret;
534 		return ret;
535 	}
536 	if (cdev->config.devx == 0)
537 		return 0;
538 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
539 	obj.pd.in = cdev->pd;
540 	obj.pd.out = &pd_info;
541 	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
542 	if (ret != 0) {
543 		DRV_LOG(ERR, "Fail to get PD object info.");
544 		rte_errno = errno;
545 		claim_zero(mlx5_os_pd_release(cdev));
546 		cdev->pd = NULL;
547 		return -rte_errno;
548 	}
549 	cdev->pdn = pd_info.pdn;
550 	return 0;
551 #else
552 	DRV_LOG(ERR, "Cannot get pdn - no DV support.");
553 	rte_errno = ENOTSUP;
554 	return -rte_errno;
555 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
556 }
557 
558 static struct ibv_device *
mlx5_os_get_ibv_device(const struct rte_pci_device * pci_dev)559 mlx5_os_get_ibv_device(const struct rte_pci_device *pci_dev)
560 {
561 	int n;
562 	struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
563 	struct ibv_device *ibv_match = NULL;
564 	uint8_t guid1[32] = {0};
565 	uint8_t guid2[32] = {0};
566 	int ret1, ret2 = -1;
567 	struct rte_pci_addr paddr;
568 	const struct rte_pci_addr *addr = &pci_dev->addr;
569 	bool is_vf_dev = mlx5_dev_is_vf_pci(pci_dev);
570 
571 	if (ibv_list == NULL || !n) {
572 		rte_errno = ENOSYS;
573 		if (ibv_list)
574 			mlx5_glue->free_device_list(ibv_list);
575 		return NULL;
576 	}
577 	ret1 = mlx5_get_device_guid(addr, guid1, sizeof(guid1));
578 	while (n-- > 0) {
579 		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
580 		if (mlx5_get_pci_addr(ibv_list[n]->ibdev_path, &paddr) != 0)
581 			continue;
582 		if (ret1 > 0)
583 			ret2 = mlx5_get_device_guid(&paddr, guid2, sizeof(guid2));
584 		/* Bond device can bond secondary PCIe */
585 		if ((strstr(ibv_list[n]->name, "bond") && !is_vf_dev &&
586 		     ((ret1 > 0 && ret2 > 0 && !memcmp(guid1, guid2, sizeof(guid1))) ||
587 		      (addr->domain == paddr.domain && addr->bus == paddr.bus &&
588 		       addr->devid == paddr.devid))) ||
589 		    !rte_pci_addr_cmp(addr, &paddr)) {
590 			ibv_match = ibv_list[n];
591 			break;
592 		}
593 	}
594 	if (ibv_match == NULL) {
595 		DRV_LOG(WARNING,
596 			"No Verbs device matches PCI device " PCI_PRI_FMT ","
597 			" are kernel drivers loaded?",
598 			addr->domain, addr->bus, addr->devid, addr->function);
599 		rte_errno = ENOENT;
600 	}
601 	mlx5_glue->free_device_list(ibv_list);
602 	return ibv_match;
603 }
604 
605 /* Try to disable ROCE by Netlink\Devlink. */
606 static int
mlx5_nl_roce_disable(const char * addr)607 mlx5_nl_roce_disable(const char *addr)
608 {
609 	int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC, 0);
610 	int devlink_id;
611 	int enable;
612 	int ret;
613 
614 	if (nlsk_fd < 0)
615 		return nlsk_fd;
616 	devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
617 	if (devlink_id < 0) {
618 		ret = devlink_id;
619 		DRV_LOG(DEBUG,
620 			"Failed to get devlink id for ROCE operations by Netlink.");
621 		goto close;
622 	}
623 	ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
624 	if (ret) {
625 		DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
626 			ret);
627 		goto close;
628 	} else if (!enable) {
629 		DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
630 		goto close;
631 	}
632 	ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
633 	if (ret)
634 		DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
635 	else
636 		DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
637 close:
638 	close(nlsk_fd);
639 	return ret;
640 }
641 
642 /* Try to disable ROCE by sysfs. */
643 static int
mlx5_sys_roce_disable(const char * addr)644 mlx5_sys_roce_disable(const char *addr)
645 {
646 	FILE *file_o;
647 	int enable;
648 	int ret;
649 
650 	MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
651 	file_o = fopen(file_p, "rb");
652 	if (!file_o) {
653 		rte_errno = ENOTSUP;
654 		return -ENOTSUP;
655 	}
656 	ret = fscanf(file_o, "%d", &enable);
657 	if (ret != 1) {
658 		rte_errno = EINVAL;
659 		ret = EINVAL;
660 		goto close;
661 	} else if (!enable) {
662 		ret = 0;
663 		DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
664 		goto close;
665 	}
666 	fclose(file_o);
667 	file_o = fopen(file_p, "wb");
668 	if (!file_o) {
669 		rte_errno = ENOTSUP;
670 		return -ENOTSUP;
671 	}
672 	fprintf(file_o, "0\n");
673 	ret = 0;
674 close:
675 	if (ret)
676 		DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
677 	else
678 		DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
679 	fclose(file_o);
680 	return ret;
681 }
682 
683 static int
mlx5_roce_disable(const struct rte_device * dev)684 mlx5_roce_disable(const struct rte_device *dev)
685 {
686 	char pci_addr[PCI_PRI_STR_SIZE] = { 0 };
687 
688 	if (mlx5_dev_to_pci_str(dev, pci_addr, sizeof(pci_addr)) < 0)
689 		return -rte_errno;
690 	/* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
691 	if (mlx5_nl_roce_disable(pci_addr) != 0 &&
692 	    mlx5_sys_roce_disable(pci_addr) != 0)
693 		return -rte_errno;
694 	return 0;
695 }
696 
697 static struct ibv_device *
mlx5_os_get_ibv_dev(const struct rte_device * dev)698 mlx5_os_get_ibv_dev(const struct rte_device *dev)
699 {
700 	struct ibv_device *ibv;
701 
702 	if (mlx5_dev_is_pci(dev))
703 		ibv = mlx5_os_get_ibv_device(RTE_DEV_TO_PCI_CONST(dev));
704 	else
705 		ibv = mlx5_get_aux_ibv_device(RTE_DEV_TO_AUXILIARY_CONST(dev));
706 	if (ibv == NULL) {
707 		rte_errno = ENODEV;
708 		DRV_LOG(ERR, "Verbs device not found: %s", dev->name);
709 	}
710 	return ibv;
711 }
712 
713 static struct ibv_device *
mlx5_vdpa_get_ibv_dev(const struct rte_device * dev)714 mlx5_vdpa_get_ibv_dev(const struct rte_device *dev)
715 {
716 	struct ibv_device *ibv;
717 	int retry;
718 
719 	if (mlx5_roce_disable(dev) != 0) {
720 		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
721 			dev->name);
722 		return NULL;
723 	}
724 	/* Wait for the IB device to appear again after reload. */
725 	for (retry = MLX5_VDPA_MAX_RETRIES; retry > 0; --retry) {
726 		ibv = mlx5_os_get_ibv_dev(dev);
727 		if (ibv != NULL)
728 			return ibv;
729 		usleep(MLX5_VDPA_USEC);
730 	}
731 	DRV_LOG(ERR,
732 		"Cannot get IB device after disabling RoCE for \"%s\", retries exceed %d.",
733 		dev->name, MLX5_VDPA_MAX_RETRIES);
734 	rte_errno = EAGAIN;
735 	return NULL;
736 }
737 
738 static int
mlx5_config_doorbell_mapping_env(int dbnc)739 mlx5_config_doorbell_mapping_env(int dbnc)
740 {
741 	char *env;
742 	int value;
743 
744 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
745 	/* Get environment variable to store. */
746 	env = getenv(MLX5_SHUT_UP_BF);
747 	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
748 	if (dbnc == MLX5_ARG_UNSET)
749 		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
750 	else
751 		setenv(MLX5_SHUT_UP_BF,
752 		       dbnc == MLX5_SQ_DB_NCACHED ? "1" : "0", 1);
753 	return value;
754 }
755 
756 static void
mlx5_restore_doorbell_mapping_env(int value)757 mlx5_restore_doorbell_mapping_env(int value)
758 {
759 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
760 	/* Restore the original environment variable state. */
761 	if (value == MLX5_ARG_UNSET)
762 		unsetenv(MLX5_SHUT_UP_BF);
763 	else
764 		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
765 }
766 
767 /**
768  * Function API to open IB device.
769  *
770  * @param cdev
771  *   Pointer to the mlx5 device.
772  * @param classes
773  *   Chosen classes come from device arguments.
774  *
775  * @return
776  *   Pointer to ibv_context on success, NULL otherwise and rte_errno is set.
777  */
778 static struct ibv_context *
mlx5_open_device(struct mlx5_common_device * cdev,uint32_t classes)779 mlx5_open_device(struct mlx5_common_device *cdev, uint32_t classes)
780 {
781 	struct ibv_device *ibv;
782 	struct ibv_context *ctx = NULL;
783 	int dbmap_env;
784 
785 	MLX5_ASSERT(cdev->config.device_fd == MLX5_ARG_UNSET);
786 	if (classes & MLX5_CLASS_VDPA)
787 		ibv = mlx5_vdpa_get_ibv_dev(cdev->dev);
788 	else
789 		ibv = mlx5_os_get_ibv_dev(cdev->dev);
790 	if (!ibv)
791 		return NULL;
792 	DRV_LOG(INFO, "Dev information matches for device \"%s\".", ibv->name);
793 	/*
794 	 * Configure environment variable "MLX5_BF_SHUT_UP" before the device
795 	 * creation. The rdma_core library checks the variable at device
796 	 * creation and stores the result internally.
797 	 */
798 	dbmap_env = mlx5_config_doorbell_mapping_env(cdev->config.dbnc);
799 	/* Try to open IB device with DV first, then usual Verbs. */
800 	errno = 0;
801 	ctx = mlx5_glue->dv_open_device(ibv);
802 	if (ctx) {
803 		cdev->config.devx = 1;
804 	} else if (classes == MLX5_CLASS_ETH) {
805 		/* The environment variable is still configured. */
806 		ctx = mlx5_glue->open_device(ibv);
807 		if (ctx == NULL)
808 			goto error;
809 	} else {
810 		goto error;
811 	}
812 	/* The device is created, no need for environment. */
813 	mlx5_restore_doorbell_mapping_env(dbmap_env);
814 	return ctx;
815 error:
816 	rte_errno = errno ? errno : ENODEV;
817 	/* The device creation is failed, no need for environment. */
818 	mlx5_restore_doorbell_mapping_env(dbmap_env);
819 	DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
820 	return NULL;
821 }
822 
823 /**
824  * Function API to import IB device.
825  *
826  * @param cdev
827  *   Pointer to the mlx5 device.
828  *
829  * @return
830  *   Pointer to ibv_context on success, NULL otherwise and rte_errno is set.
831  */
832 static struct ibv_context *
mlx5_import_device(struct mlx5_common_device * cdev)833 mlx5_import_device(struct mlx5_common_device *cdev)
834 {
835 	struct ibv_context *ctx = NULL;
836 
837 	MLX5_ASSERT(cdev->config.device_fd != MLX5_ARG_UNSET);
838 	ctx = mlx5_glue->import_device(cdev->config.device_fd);
839 	if (!ctx) {
840 		DRV_LOG(ERR, "Failed to import device for fd=%d: %s",
841 			cdev->config.device_fd, rte_strerror(errno));
842 		rte_errno = errno;
843 	}
844 	return ctx;
845 }
846 
847 /**
848  * Function API to prepare IB device.
849  *
850  * @param cdev
851  *   Pointer to the mlx5 device.
852  * @param classes
853  *   Chosen classes come from device arguments.
854  *
855  * @return
856  *   0 on success, a negative errno value otherwise and rte_errno is set.
857  */
858 int
mlx5_os_open_device(struct mlx5_common_device * cdev,uint32_t classes)859 mlx5_os_open_device(struct mlx5_common_device *cdev, uint32_t classes)
860 {
861 
862 	struct ibv_context *ctx = NULL;
863 
864 	if (cdev->config.device_fd == MLX5_ARG_UNSET)
865 		ctx = mlx5_open_device(cdev, classes);
866 	else
867 		ctx = mlx5_import_device(cdev);
868 	if (ctx == NULL)
869 		return -rte_errno;
870 	/* Hint libmlx5 to use PMD allocator for data plane resources */
871 	mlx5_set_context_attr(cdev->dev, ctx);
872 	cdev->ctx = ctx;
873 	return 0;
874 }
875 
876 int
mlx5_get_device_guid(const struct rte_pci_addr * dev,uint8_t * guid,size_t len)877 mlx5_get_device_guid(const struct rte_pci_addr *dev, uint8_t *guid, size_t len)
878 {
879 	char tmp[512];
880 	char cur_ifname[IF_NAMESIZE + 1];
881 	FILE *id_file;
882 	DIR *dir;
883 	struct dirent *ptr;
884 	int ret;
885 
886 	if (guid == NULL || len < sizeof(u_int64_t) + 1)
887 		return -1;
888 	memset(guid, 0, len);
889 	snprintf(tmp, sizeof(tmp), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/net",
890 			dev->domain, dev->bus, dev->devid, dev->function);
891 	dir = opendir(tmp);
892 	if (dir == NULL)
893 		return -1;
894 	/* Traverse to identify PF interface */
895 	do {
896 		ptr = readdir(dir);
897 		if (ptr == NULL || ptr->d_type != DT_DIR) {
898 			closedir(dir);
899 			return -1;
900 		}
901 	} while (strchr(ptr->d_name, '.') || strchr(ptr->d_name, '_') ||
902 		 strchr(ptr->d_name, 'v'));
903 	snprintf(cur_ifname, sizeof(cur_ifname), "%s", ptr->d_name);
904 	closedir(dir);
905 	snprintf(tmp + strlen(tmp), sizeof(tmp) - strlen(tmp),
906 			"/%s/phys_switch_id", cur_ifname);
907 	/* Older OFED like 5.3 doesn't support read */
908 	id_file = fopen(tmp, "r");
909 	if (!id_file)
910 		return 0;
911 	ret = fscanf(id_file, "%16s", guid);
912 	fclose(id_file);
913 	return ret;
914 }
915 
916 /*
917  * Create direct mkey using the kernel ibv_reg_mr API and wrap it with a new
918  * indirect mkey created by the DevX API.
919  * This mkey should be used for DevX commands requesting mkey as a parameter.
920  */
921 int
mlx5_os_wrapped_mkey_create(void * ctx,void * pd,uint32_t pdn,void * addr,size_t length,struct mlx5_pmd_wrapped_mr * pmd_mr)922 mlx5_os_wrapped_mkey_create(void *ctx, void *pd, uint32_t pdn, void *addr,
923 			    size_t length, struct mlx5_pmd_wrapped_mr *pmd_mr)
924 {
925 	struct mlx5_klm klm = {
926 		.byte_count = length,
927 		.address = (uintptr_t)addr,
928 	};
929 	struct mlx5_devx_mkey_attr mkey_attr = {
930 		.pd = pdn,
931 		.klm_array = &klm,
932 		.klm_num = 1,
933 	};
934 	struct mlx5_devx_obj *mkey;
935 	struct ibv_mr *ibv_mr = mlx5_glue->reg_mr(pd, addr, length,
936 						  IBV_ACCESS_LOCAL_WRITE |
937 						  (haswell_broadwell_cpu ? 0 :
938 						  IBV_ACCESS_RELAXED_ORDERING));
939 
940 	if (!ibv_mr) {
941 		rte_errno = errno;
942 		return -rte_errno;
943 	}
944 	klm.mkey = ibv_mr->lkey;
945 	mkey_attr.addr = (uintptr_t)addr;
946 	mkey_attr.size = length;
947 	mkey = mlx5_devx_cmd_mkey_create(ctx, &mkey_attr);
948 	if (!mkey) {
949 		claim_zero(mlx5_glue->dereg_mr(ibv_mr));
950 		return -rte_errno;
951 	}
952 	pmd_mr->addr = addr;
953 	pmd_mr->len = length;
954 	pmd_mr->obj = (void *)ibv_mr;
955 	pmd_mr->imkey = mkey;
956 	pmd_mr->lkey = mkey->id;
957 	return 0;
958 }
959 
960 void
mlx5_os_wrapped_mkey_destroy(struct mlx5_pmd_wrapped_mr * pmd_mr)961 mlx5_os_wrapped_mkey_destroy(struct mlx5_pmd_wrapped_mr *pmd_mr)
962 {
963 	if (!pmd_mr)
964 		return;
965 	if (pmd_mr->imkey)
966 		claim_zero(mlx5_devx_cmd_destroy(pmd_mr->imkey));
967 	if (pmd_mr->obj)
968 		claim_zero(mlx5_glue->dereg_mr(pmd_mr->obj));
969 	memset(pmd_mr, 0, sizeof(*pmd_mr));
970 }
971 
972 /**
973  * Rte_intr_handle create and init helper.
974  *
975  * @param[in] mode
976  *   interrupt instance can be shared between primary and secondary
977  *   processes or not.
978  * @param[in] set_fd_nonblock
979  *   Whether to set fd to O_NONBLOCK.
980  * @param[in] fd
981  *   Fd to set in created intr_handle.
982  * @param[in] cb
983  *   Callback to register for intr_handle.
984  * @param[in] cb_arg
985  *   Callback argument for cb.
986  *
987  * @return
988  *  - Interrupt handle on success.
989  *  - NULL on failure, with rte_errno set.
990  */
991 struct rte_intr_handle *
mlx5_os_interrupt_handler_create(int mode,bool set_fd_nonblock,int fd,rte_intr_callback_fn cb,void * cb_arg)992 mlx5_os_interrupt_handler_create(int mode, bool set_fd_nonblock, int fd,
993 				 rte_intr_callback_fn cb, void *cb_arg)
994 {
995 	struct rte_intr_handle *tmp_intr_handle;
996 	int ret, flags;
997 
998 	tmp_intr_handle = rte_intr_instance_alloc(mode);
999 	if (!tmp_intr_handle) {
1000 		rte_errno = ENOMEM;
1001 		goto err;
1002 	}
1003 	if (set_fd_nonblock) {
1004 		flags = fcntl(fd, F_GETFL);
1005 		ret = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
1006 		if (ret) {
1007 			rte_errno = errno;
1008 			goto err;
1009 		}
1010 	}
1011 	ret = rte_intr_fd_set(tmp_intr_handle, fd);
1012 	if (ret)
1013 		goto err;
1014 	ret = rte_intr_type_set(tmp_intr_handle, RTE_INTR_HANDLE_EXT);
1015 	if (ret)
1016 		goto err;
1017 	ret = rte_intr_callback_register(tmp_intr_handle, cb, cb_arg);
1018 	if (ret) {
1019 		rte_errno = -ret;
1020 		goto err;
1021 	}
1022 	return tmp_intr_handle;
1023 err:
1024 	rte_intr_instance_free(tmp_intr_handle);
1025 	return NULL;
1026 }
1027 
1028 /* Safe unregistration for interrupt callback. */
1029 static void
mlx5_intr_callback_unregister(const struct rte_intr_handle * handle,rte_intr_callback_fn cb_fn,void * cb_arg)1030 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
1031 			      rte_intr_callback_fn cb_fn, void *cb_arg)
1032 {
1033 	uint64_t twait = 0;
1034 	uint64_t start = 0;
1035 
1036 	do {
1037 		int ret;
1038 
1039 		ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
1040 		if (ret >= 0)
1041 			return;
1042 		if (ret != -EAGAIN) {
1043 			DRV_LOG(INFO, "failed to unregister interrupt"
1044 				      " handler (error: %d)", ret);
1045 			MLX5_ASSERT(false);
1046 			return;
1047 		}
1048 		if (twait) {
1049 			struct timespec onems;
1050 
1051 			/* Wait one millisecond and try again. */
1052 			onems.tv_sec = 0;
1053 			onems.tv_nsec = NS_PER_S / MS_PER_S;
1054 			nanosleep(&onems, 0);
1055 			/* Check whether one second elapsed. */
1056 			if ((rte_get_timer_cycles() - start) <= twait)
1057 				continue;
1058 		} else {
1059 			/*
1060 			 * We get the amount of timer ticks for one second.
1061 			 * If this amount elapsed it means we spent one
1062 			 * second in waiting. This branch is executed once
1063 			 * on first iteration.
1064 			 */
1065 			twait = rte_get_timer_hz();
1066 			MLX5_ASSERT(twait);
1067 		}
1068 		/*
1069 		 * Timeout elapsed, show message (once a second) and retry.
1070 		 * We have no other acceptable option here, if we ignore
1071 		 * the unregistering return code the handler will not
1072 		 * be unregistered, fd will be closed and we may get the
1073 		 * crush. Hanging and messaging in the loop seems not to be
1074 		 * the worst choice.
1075 		 */
1076 		DRV_LOG(INFO, "Retrying to unregister interrupt handler");
1077 		start = rte_get_timer_cycles();
1078 	} while (true);
1079 }
1080 
1081 /**
1082  * Rte_intr_handle destroy helper.
1083  *
1084  * @param[in] intr_handle
1085  *   Rte_intr_handle to destroy.
1086  * @param[in] cb
1087  *   Callback which is registered to intr_handle.
1088  * @param[in] cb_arg
1089  *   Callback argument for cb.
1090  *
1091  */
1092 void
mlx5_os_interrupt_handler_destroy(struct rte_intr_handle * intr_handle,rte_intr_callback_fn cb,void * cb_arg)1093 mlx5_os_interrupt_handler_destroy(struct rte_intr_handle *intr_handle,
1094 				  rte_intr_callback_fn cb, void *cb_arg)
1095 {
1096 	if (rte_intr_fd_get(intr_handle) >= 0)
1097 		mlx5_intr_callback_unregister(intr_handle, cb, cb_arg);
1098 	rte_intr_instance_free(intr_handle);
1099 }
1100