xref: /dpdk/drivers/net/mlx5/mlx5_ethdev.c (revision 14ad4f01845331a0ae98c681efa3086eeed3343a)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <stddef.h>
7 #include <assert.h>
8 #include <inttypes.h>
9 #include <unistd.h>
10 #include <stdint.h>
11 #include <stdio.h>
12 #include <string.h>
13 #include <stdlib.h>
14 #include <errno.h>
15 #include <dirent.h>
16 #include <net/if.h>
17 #include <sys/ioctl.h>
18 #include <sys/socket.h>
19 #include <netinet/in.h>
20 #include <linux/ethtool.h>
21 #include <linux/sockios.h>
22 #include <fcntl.h>
23 #include <stdalign.h>
24 #include <sys/un.h>
25 #include <time.h>
26 
27 #include <rte_atomic.h>
28 #include <rte_ethdev_driver.h>
29 #include <rte_bus_pci.h>
30 #include <rte_mbuf.h>
31 #include <rte_common.h>
32 #include <rte_interrupts.h>
33 #include <rte_malloc.h>
34 #include <rte_string_fns.h>
35 #include <rte_rwlock.h>
36 #include <rte_cycles.h>
37 
38 #include "mlx5.h"
39 #include "mlx5_glue.h"
40 #include "mlx5_rxtx.h"
41 #include "mlx5_utils.h"
42 
43 /* Supported speed values found in /usr/include/linux/ethtool.h */
44 #ifndef HAVE_SUPPORTED_40000baseKR4_Full
45 #define SUPPORTED_40000baseKR4_Full (1 << 23)
46 #endif
47 #ifndef HAVE_SUPPORTED_40000baseCR4_Full
48 #define SUPPORTED_40000baseCR4_Full (1 << 24)
49 #endif
50 #ifndef HAVE_SUPPORTED_40000baseSR4_Full
51 #define SUPPORTED_40000baseSR4_Full (1 << 25)
52 #endif
53 #ifndef HAVE_SUPPORTED_40000baseLR4_Full
54 #define SUPPORTED_40000baseLR4_Full (1 << 26)
55 #endif
56 #ifndef HAVE_SUPPORTED_56000baseKR4_Full
57 #define SUPPORTED_56000baseKR4_Full (1 << 27)
58 #endif
59 #ifndef HAVE_SUPPORTED_56000baseCR4_Full
60 #define SUPPORTED_56000baseCR4_Full (1 << 28)
61 #endif
62 #ifndef HAVE_SUPPORTED_56000baseSR4_Full
63 #define SUPPORTED_56000baseSR4_Full (1 << 29)
64 #endif
65 #ifndef HAVE_SUPPORTED_56000baseLR4_Full
66 #define SUPPORTED_56000baseLR4_Full (1 << 30)
67 #endif
68 
69 /* Add defines in case the running kernel is not the same as user headers. */
70 #ifndef ETHTOOL_GLINKSETTINGS
71 struct ethtool_link_settings {
72 	uint32_t cmd;
73 	uint32_t speed;
74 	uint8_t duplex;
75 	uint8_t port;
76 	uint8_t phy_address;
77 	uint8_t autoneg;
78 	uint8_t mdio_support;
79 	uint8_t eth_to_mdix;
80 	uint8_t eth_tp_mdix_ctrl;
81 	int8_t link_mode_masks_nwords;
82 	uint32_t reserved[8];
83 	uint32_t link_mode_masks[];
84 };
85 
86 #define ETHTOOL_GLINKSETTINGS 0x0000004c
87 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
88 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6
89 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
90 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
91 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
92 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
93 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
94 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
95 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
96 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
97 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
98 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
99 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
100 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
101 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
102 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
103 #endif
104 #ifndef HAVE_ETHTOOL_LINK_MODE_25G
105 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
106 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
107 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
108 #endif
109 #ifndef HAVE_ETHTOOL_LINK_MODE_50G
110 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
111 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
112 #endif
113 #ifndef HAVE_ETHTOOL_LINK_MODE_100G
114 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
115 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
116 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
117 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
118 #endif
119 
120 /**
121  * Get master interface name from private structure.
122  *
123  * @param[in] dev
124  *   Pointer to Ethernet device.
125  * @param[out] ifname
126  *   Interface name output buffer.
127  *
128  * @return
129  *   0 on success, a negative errno value otherwise and rte_errno is set.
130  */
131 int
132 mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE])
133 {
134 	DIR *dir;
135 	struct dirent *dent;
136 	unsigned int dev_type = 0;
137 	unsigned int dev_port_prev = ~0u;
138 	char match[IF_NAMESIZE] = "";
139 
140 	assert(ibdev_path);
141 	{
142 		MKSTR(path, "%s/device/net", ibdev_path);
143 
144 		dir = opendir(path);
145 		if (dir == NULL) {
146 			rte_errno = errno;
147 			return -rte_errno;
148 		}
149 	}
150 	while ((dent = readdir(dir)) != NULL) {
151 		char *name = dent->d_name;
152 		FILE *file;
153 		unsigned int dev_port;
154 		int r;
155 
156 		if ((name[0] == '.') &&
157 		    ((name[1] == '\0') ||
158 		     ((name[1] == '.') && (name[2] == '\0'))))
159 			continue;
160 
161 		MKSTR(path, "%s/device/net/%s/%s",
162 		      ibdev_path, name,
163 		      (dev_type ? "dev_id" : "dev_port"));
164 
165 		file = fopen(path, "rb");
166 		if (file == NULL) {
167 			if (errno != ENOENT)
168 				continue;
169 			/*
170 			 * Switch to dev_id when dev_port does not exist as
171 			 * is the case with Linux kernel versions < 3.15.
172 			 */
173 try_dev_id:
174 			match[0] = '\0';
175 			if (dev_type)
176 				break;
177 			dev_type = 1;
178 			dev_port_prev = ~0u;
179 			rewinddir(dir);
180 			continue;
181 		}
182 		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
183 		fclose(file);
184 		if (r != 1)
185 			continue;
186 		/*
187 		 * Switch to dev_id when dev_port returns the same value for
188 		 * all ports. May happen when using a MOFED release older than
189 		 * 3.0 with a Linux kernel >= 3.15.
190 		 */
191 		if (dev_port == dev_port_prev)
192 			goto try_dev_id;
193 		dev_port_prev = dev_port;
194 		if (dev_port == 0)
195 			strlcpy(match, name, sizeof(match));
196 	}
197 	closedir(dir);
198 	if (match[0] == '\0') {
199 		rte_errno = ENOENT;
200 		return -rte_errno;
201 	}
202 	strncpy(*ifname, match, sizeof(*ifname));
203 	return 0;
204 }
205 
206 /**
207  * Get interface name from private structure.
208  *
209  * This is a port representor-aware version of mlx5_get_master_ifname().
210  *
211  * @param[in] dev
212  *   Pointer to Ethernet device.
213  * @param[out] ifname
214  *   Interface name output buffer.
215  *
216  * @return
217  *   0 on success, a negative errno value otherwise and rte_errno is set.
218  */
219 int
220 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
221 {
222 	struct mlx5_priv *priv = dev->data->dev_private;
223 	unsigned int ifindex;
224 
225 	assert(priv);
226 	assert(priv->sh);
227 	ifindex = priv->nl_socket_rdma >= 0 ?
228 		  mlx5_nl_ifindex(priv->nl_socket_rdma,
229 				  priv->sh->ibdev_name,
230 				  priv->ibv_port) : 0;
231 	if (!ifindex) {
232 		if (!priv->representor)
233 			return mlx5_get_master_ifname(priv->sh->ibdev_path,
234 						      ifname);
235 		rte_errno = ENXIO;
236 		return -rte_errno;
237 	}
238 	if (if_indextoname(ifindex, &(*ifname)[0]))
239 		return 0;
240 	rte_errno = errno;
241 	return -rte_errno;
242 }
243 
244 /**
245  * Get interface name for the specified device, uses the extra base
246  * device resources to perform Netlink requests.
247  *
248  * This is a port representor-aware version of mlx5_get_master_ifname().
249  *
250  * @param[in] base
251  *   Pointer to Ethernet device to use Netlink socket from
252  *   to perfrom requests.
253  * @param[in] dev
254  *   Pointer to Ethernet device.
255  * @param[out] ifname
256  *   Interface name output buffer.
257  *
258  * @return
259  *   0 on success, a negative errno value otherwise and rte_errno is set.
260  */
261 int
262 mlx5_get_ifname_base(const struct rte_eth_dev *base,
263 		     const struct rte_eth_dev *dev,
264 		     char (*ifname)[IF_NAMESIZE])
265 {
266 	struct mlx5_priv *priv = dev->data->dev_private;
267 	struct mlx5_priv *priv_base = base->data->dev_private;
268 	unsigned int ifindex;
269 
270 	assert(priv);
271 	assert(priv->sh);
272 	assert(priv_base);
273 	ifindex = priv_base->nl_socket_rdma >= 0 ?
274 		  mlx5_nl_ifindex(priv_base->nl_socket_rdma,
275 				  priv->sh->ibdev_name,
276 				  priv->ibv_port) : 0;
277 	if (!ifindex) {
278 		if (!priv->representor)
279 			return mlx5_get_master_ifname(priv->sh->ibdev_path,
280 						      ifname);
281 		rte_errno = ENXIO;
282 		return -rte_errno;
283 	}
284 	if (if_indextoname(ifindex, &(*ifname)[0]))
285 		return 0;
286 	rte_errno = errno;
287 	return -rte_errno;
288 }
289 /**
290  * Get the interface index from device name.
291  *
292  * @param[in] dev
293  *   Pointer to Ethernet device.
294  *
295  * @return
296  *   Nonzero interface index on success, zero otherwise and rte_errno is set.
297  */
298 unsigned int
299 mlx5_ifindex(const struct rte_eth_dev *dev)
300 {
301 	char ifname[IF_NAMESIZE];
302 	unsigned int ifindex;
303 
304 	if (mlx5_get_ifname(dev, &ifname))
305 		return 0;
306 	ifindex = if_nametoindex(ifname);
307 	if (!ifindex)
308 		rte_errno = errno;
309 	return ifindex;
310 }
311 
312 /**
313  * Perform ifreq ioctl() on associated Ethernet device.
314  *
315  * @param[in] dev
316  *   Pointer to Ethernet device.
317  * @param req
318  *   Request number to pass to ioctl().
319  * @param[out] ifr
320  *   Interface request structure output buffer.
321  *
322  * @return
323  *   0 on success, a negative errno value otherwise and rte_errno is set.
324  */
325 int
326 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
327 {
328 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
329 	int ret = 0;
330 
331 	if (sock == -1) {
332 		rte_errno = errno;
333 		return -rte_errno;
334 	}
335 	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
336 	if (ret)
337 		goto error;
338 	ret = ioctl(sock, req, ifr);
339 	if (ret == -1) {
340 		rte_errno = errno;
341 		goto error;
342 	}
343 	close(sock);
344 	return 0;
345 error:
346 	close(sock);
347 	return -rte_errno;
348 }
349 
350 /**
351  * Perform ifreq ioctl() on specified Ethernet device,
352  * ifindex, name and other attributes are requested
353  * on the base device to avoid specified device Netlink
354  * socket sharing (this is not thread-safe).
355  *
356  * @param[in] base
357  *   Pointer to Ethernet device to get dev attributes.
358  * @param[in] dev
359  *   Pointer to Ethernet device to perform ioctl.
360  * @param req
361  *   Request number to pass to ioctl().
362  * @param[out] ifr
363  *   Interface request structure output buffer.
364  *
365  * @return
366  *   0 on success, a negative errno value otherwise and rte_errno is set.
367  */
368 int
369 mlx5_ifreq_base(const struct rte_eth_dev *base,
370 		const struct rte_eth_dev *dev,
371 		int req, struct ifreq *ifr)
372 {
373 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
374 	int ret = 0;
375 
376 	if (sock == -1) {
377 		rte_errno = errno;
378 		return -rte_errno;
379 	}
380 	ret = mlx5_get_ifname_base(base, dev, &ifr->ifr_name);
381 	if (ret)
382 		goto error;
383 	ret = ioctl(sock, req, ifr);
384 	if (ret == -1) {
385 		rte_errno = errno;
386 		goto error;
387 	}
388 	close(sock);
389 	return 0;
390 error:
391 	close(sock);
392 	return -rte_errno;
393 }
394 
395 /**
396  * Get device MTU.
397  *
398  * @param dev
399  *   Pointer to Ethernet device.
400  * @param[out] mtu
401  *   MTU value output buffer.
402  *
403  * @return
404  *   0 on success, a negative errno value otherwise and rte_errno is set.
405  */
406 int
407 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
408 {
409 	struct ifreq request;
410 	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
411 
412 	if (ret)
413 		return ret;
414 	*mtu = request.ifr_mtu;
415 	return 0;
416 }
417 
418 /**
419  * Set device MTU.
420  *
421  * @param dev
422  *   Pointer to Ethernet device.
423  * @param mtu
424  *   MTU value to set.
425  *
426  * @return
427  *   0 on success, a negative errno value otherwise and rte_errno is set.
428  */
429 static int
430 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
431 {
432 	struct ifreq request = { .ifr_mtu = mtu, };
433 
434 	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
435 }
436 
437 /**
438  * Set device flags.
439  *
440  * @param dev
441  *   Pointer to Ethernet device.
442  * @param keep
443  *   Bitmask for flags that must remain untouched.
444  * @param flags
445  *   Bitmask for flags to modify.
446  *
447  * @return
448  *   0 on success, a negative errno value otherwise and rte_errno is set.
449  */
450 int
451 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
452 {
453 	struct ifreq request;
454 	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
455 
456 	if (ret)
457 		return ret;
458 	request.ifr_flags &= keep;
459 	request.ifr_flags |= flags & ~keep;
460 	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
461 }
462 
/**
 * DPDK callback for Ethernet device configuration.
 *
 * Validates and stores the RSS configuration, records the requested
 * Rx/Tx queue counts and (re)builds the RSS indirection table.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_dev_configure(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	unsigned int rxqs_n = dev->data->nb_rx_queues;
	unsigned int txqs_n = dev->data->nb_tx_queues;
	unsigned int i;
	unsigned int j;
	unsigned int reta_idx_n;
	/* Nonzero when the application supplied its own RSS hash key. */
	const uint8_t use_app_rss_key =
		!!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
	int ret = 0;

	/* An application-provided key must be exactly the supported length. */
	if (use_app_rss_key &&
	    (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
	     MLX5_RSS_HASH_KEY_LEN)) {
		DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long",
			dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN));
		rte_errno = EINVAL;
		return -rte_errno;
	}
	/* (Re)allocate storage for the active RSS key. */
	priv->rss_conf.rss_key =
		rte_realloc(priv->rss_conf.rss_key,
			    MLX5_RSS_HASH_KEY_LEN, 0);
	if (!priv->rss_conf.rss_key) {
		DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
			dev->data->port_id, rxqs_n);
		rte_errno = ENOMEM;
		return -rte_errno;
	}
	/* Use the application key if given, the driver default otherwise. */
	memcpy(priv->rss_conf.rss_key,
	       use_app_rss_key ?
	       dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
	       rss_hash_default_key,
	       MLX5_RSS_HASH_KEY_LEN);
	priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN;
	priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
	priv->rxqs = (void *)dev->data->rx_queues;
	priv->txqs = (void *)dev->data->tx_queues;
	if (txqs_n != priv->txqs_n) {
		DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
			dev->data->port_id, priv->txqs_n, txqs_n);
		priv->txqs_n = txqs_n;
	}
	/* The indirection table cannot address more queues than its size. */
	if (rxqs_n > priv->config.ind_table_max_size) {
		DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
			dev->data->port_id, rxqs_n);
		rte_errno = EINVAL;
		return -rte_errno;
	}
	if (rxqs_n != priv->rxqs_n) {
		DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
			dev->data->port_id, priv->rxqs_n, rxqs_n);
		priv->rxqs_n = rxqs_n;
		/*
		 * If the requested number of RX queues is not a power of two,
		 * use the maximum indirection table size for better balancing.
		 * The result is always rounded to the next power of two.
		 */
		reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
					     priv->config.ind_table_max_size :
					     rxqs_n));
		ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
		if (ret)
			return ret;
		/*
		 * When the number of RX queues is not a power of two,
		 * the remaining table entries are padded with reused WQs
		 * and hashes are not spread uniformly.
		 */
		for (i = 0, j = 0; (i != reta_idx_n); ++i) {
			(*priv->reta_idx)[i] = j;
			if (++j == rxqs_n)
				j = 0;
		}
	}
	/* Per-process private data (Tx UAR table and friends). */
	ret = mlx5_proc_priv_init(dev);
	if (ret)
		return ret;
	return 0;
}
553 
554 /**
555  * Sets default tuning parameters.
556  *
557  * @param dev
558  *   Pointer to Ethernet device.
559  * @param[out] info
560  *   Info structure output buffer.
561  */
562 static void
563 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
564 {
565 	struct mlx5_priv *priv = dev->data->dev_private;
566 
567 	/* Minimum CPU utilization. */
568 	info->default_rxportconf.ring_size = 256;
569 	info->default_txportconf.ring_size = 256;
570 	info->default_rxportconf.burst_size = 64;
571 	info->default_txportconf.burst_size = 64;
572 	if (priv->link_speed_capa & ETH_LINK_SPEED_100G) {
573 		info->default_rxportconf.nb_queues = 16;
574 		info->default_txportconf.nb_queues = 16;
575 		if (dev->data->nb_rx_queues > 2 ||
576 		    dev->data->nb_tx_queues > 2) {
577 			/* Max Throughput. */
578 			info->default_rxportconf.ring_size = 2048;
579 			info->default_txportconf.ring_size = 2048;
580 		}
581 	} else {
582 		info->default_rxportconf.nb_queues = 8;
583 		info->default_txportconf.nb_queues = 8;
584 		if (dev->data->nb_rx_queues > 2 ||
585 		    dev->data->nb_tx_queues > 2) {
586 			/* Max Throughput. */
587 			info->default_rxportconf.ring_size = 4096;
588 			info->default_txportconf.ring_size = 4096;
589 		}
590 	}
591 }
592 
593 /**
594  * DPDK callback to get information about the device.
595  *
596  * @param dev
597  *   Pointer to Ethernet device structure.
598  * @param[out] info
599  *   Info structure output buffer.
600  */
601 void
602 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
603 {
604 	struct mlx5_priv *priv = dev->data->dev_private;
605 	struct mlx5_dev_config *config = &priv->config;
606 	unsigned int max;
607 	char ifname[IF_NAMESIZE];
608 
609 	/* FIXME: we should ask the device for these values. */
610 	info->min_rx_bufsize = 32;
611 	info->max_rx_pktlen = 65536;
612 	/*
613 	 * Since we need one CQ per QP, the limit is the minimum number
614 	 * between the two values.
615 	 */
616 	max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq,
617 		      priv->sh->device_attr.orig_attr.max_qp);
618 	/* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
619 	if (max >= 65535)
620 		max = 65535;
621 	info->max_rx_queues = max;
622 	info->max_tx_queues = max;
623 	info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES;
624 	info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
625 	info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
626 				 info->rx_queue_offload_capa);
627 	info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
628 	if (mlx5_get_ifname(dev, &ifname) == 0)
629 		info->if_index = if_nametoindex(ifname);
630 	info->reta_size = priv->reta_idx_n ?
631 		priv->reta_idx_n : config->ind_table_max_size;
632 	info->hash_key_size = MLX5_RSS_HASH_KEY_LEN;
633 	info->speed_capa = priv->link_speed_capa;
634 	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
635 	mlx5_set_default_params(dev, info);
636 	info->switch_info.name = dev->data->name;
637 	info->switch_info.domain_id = priv->domain_id;
638 	info->switch_info.port_id = priv->representor_id;
639 	if (priv->representor) {
640 		unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
641 		uint16_t port_id[i];
642 
643 		i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i);
644 		while (i--) {
645 			struct mlx5_priv *opriv =
646 				rte_eth_devices[port_id[i]].data->dev_private;
647 
648 			if (!opriv ||
649 			    opriv->representor ||
650 			    opriv->domain_id != priv->domain_id)
651 				continue;
652 			/*
653 			 * Override switch name with that of the master
654 			 * device.
655 			 */
656 			info->switch_info.name = opriv->dev_data->name;
657 			break;
658 		}
659 	}
660 }
661 
/**
 * Get device current raw clock counter.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] clock
 *   Current raw clock counter of the device.
 *
 * @return
 *   0 if the clock has correctly been read.
 *   The value of errno in case of error.
 *   NOTE(review): the error is returned positive, as produced by
 *   ibv_query_rt_values_ex(), unlike the negative-errno convention
 *   used elsewhere in this file — confirm callers expect this.
 */
int
mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct ibv_context *ctx = priv->sh->ctx;
	struct ibv_values_ex values;
	int err = 0;

	/* Request only the raw (free-running) hardware clock. */
	values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
	err = mlx5_glue->query_rt_values_ex(ctx, &values);
	if (err != 0) {
		DRV_LOG(WARNING, "Could not query the clock !");
		return err;
	}
	/* The raw counter is delivered in the tv_nsec field. */
	*clock = values.raw_clock.tv_nsec;
	return 0;
}
691 
692 /**
693  * Get firmware version of a device.
694  *
695  * @param dev
696  *   Ethernet device port.
697  * @param fw_ver
698  *   String output allocated by caller.
699  * @param fw_size
700  *   Size of the output string, including terminating null byte.
701  *
702  * @return
703  *   0 on success, or the size of the non truncated string if too big.
704  */
705 int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
706 {
707 	struct mlx5_priv *priv = dev->data->dev_private;
708 	struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr;
709 	size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1;
710 
711 	if (fw_size < size)
712 		return size;
713 	if (fw_ver != NULL)
714 		strlcpy(fw_ver, attr->fw_ver, fw_size);
715 	return 0;
716 }
717 
718 /**
719  * Get supported packet types.
720  *
721  * @param dev
722  *   Pointer to Ethernet device structure.
723  *
724  * @return
725  *   A pointer to the supported Packet types array.
726  */
727 const uint32_t *
728 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
729 {
730 	static const uint32_t ptypes[] = {
731 		/* refers to rxq_cq_to_pkt_type() */
732 		RTE_PTYPE_L2_ETHER,
733 		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
734 		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
735 		RTE_PTYPE_L4_NONFRAG,
736 		RTE_PTYPE_L4_FRAG,
737 		RTE_PTYPE_L4_TCP,
738 		RTE_PTYPE_L4_UDP,
739 		RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
740 		RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
741 		RTE_PTYPE_INNER_L4_NONFRAG,
742 		RTE_PTYPE_INNER_L4_FRAG,
743 		RTE_PTYPE_INNER_L4_TCP,
744 		RTE_PTYPE_INNER_L4_UDP,
745 		RTE_PTYPE_UNKNOWN
746 	};
747 
748 	if (dev->rx_pkt_burst == mlx5_rx_burst ||
749 	    dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
750 	    dev->rx_pkt_burst == mlx5_rx_burst_vec)
751 		return ptypes;
752 	return NULL;
753 }
754 
755 /**
756  * Retrieve the master device for representor in the same switch domain.
757  *
758  * @param dev
759  *   Pointer to representor Ethernet device structure.
760  *
761  * @return
762  *   Master device structure  on success, NULL otherwise.
763  */
764 
765 static struct rte_eth_dev *
766 mlx5_find_master_dev(struct rte_eth_dev *dev)
767 {
768 	struct mlx5_priv *priv;
769 	uint16_t port_id;
770 	uint16_t domain_id;
771 
772 	priv = dev->data->dev_private;
773 	domain_id = priv->domain_id;
774 	assert(priv->representor);
775 	RTE_ETH_FOREACH_DEV_OF(port_id, dev->device) {
776 		priv = rte_eth_devices[port_id].data->dev_private;
777 		if (priv &&
778 		    priv->master &&
779 		    priv->domain_id == domain_id)
780 			return &rte_eth_devices[port_id];
781 	}
782 	return NULL;
783 }
784 
/**
 * DPDK callback to retrieve physical link information.
 *
 * Legacy path using the deprecated ETHTOOL_GSET ioctl; used as a
 * fallback when ETHTOOL_GLINKSETTINGS is unavailable.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] link
 *   Storage for current link status.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
			       struct rte_eth_link *link)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct ethtool_cmd edata = {
		.cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
	};
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	int link_speed = 0;
	int ret;

	/* Link status comes from the interface flags (up && running). */
	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
	if (ret) {
		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
			dev->data->port_id, strerror(rte_errno));
		return ret;
	}
	dev_link = (struct rte_eth_link) {
		.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING)),
	};
	ifr = (struct ifreq) {
		.ifr_data = (void *)&edata,
	};
	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
	if (ret) {
		if (ret == -ENOTSUP && priv->representor) {
			struct rte_eth_dev *master;

			/*
			 * For representors we can try to inherit link
			 * settings from the master device. Actually
			 * link settings do not make a lot of sense
			 * for representors due to missing physical
			 * link. The old kernel drivers supported
			 * emulated settings query for representors,
			 * the new ones do not, so we have to add
			 * this code for compatibility issues.
			 */
			master = mlx5_find_master_dev(dev);
			if (master) {
				ifr = (struct ifreq) {
					.ifr_data = (void *)&edata,
				};
				/*
				 * Use special version of mlx5_ifreq()
				 * to get master device name with local
				 * device Netlink socket. Using master
				 * device Netlink socket is not thread
				 * safe.
				 */
				ret = mlx5_ifreq_base(dev, master,
						      SIOCETHTOOL, &ifr);
			}
		}
		if (ret) {
			DRV_LOG(WARNING,
				"port %u ioctl(SIOCETHTOOL,"
				" ETHTOOL_GSET) failed: %s",
				dev->data->port_id, strerror(rte_errno));
			return ret;
		}
	}
	link_speed = ethtool_cmd_speed(&edata);
	if (link_speed == -1)
		dev_link.link_speed = ETH_SPEED_NUM_NONE;
	else
		dev_link.link_speed = link_speed;
	/* Translate the kernel's SUPPORTED_* bits into DPDK speed capa. */
	priv->link_speed_capa = 0;
	if (edata.supported & SUPPORTED_Autoneg)
		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
	if (edata.supported & (SUPPORTED_1000baseT_Full |
			       SUPPORTED_1000baseKX_Full))
		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
	if (edata.supported & SUPPORTED_10000baseKR_Full)
		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
	if (edata.supported & (SUPPORTED_40000baseKR4_Full |
			       SUPPORTED_40000baseCR4_Full |
			       SUPPORTED_40000baseSR4_Full |
			       SUPPORTED_40000baseLR4_Full))
		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
			ETH_LINK_SPEED_FIXED);
	/*
	 * Speed and status must be consistent; otherwise report EAGAIN
	 * so mlx5_link_update() can retry.
	 */
	if (((dev_link.link_speed && !dev_link.link_status) ||
	     (!dev_link.link_speed && dev_link.link_status))) {
		rte_errno = EAGAIN;
		return -rte_errno;
	}
	*link = dev_link;
	return 0;
}
891 
/**
 * Retrieve physical link information (unlocked version using new ioctl).
 *
 * Uses ETHTOOL_GLINKSETTINGS in two rounds: the first request (with
 * link_mode_masks_nwords == 0) lets the kernel report the required mask
 * word count, the second fetches the actual link-mode bitmaps.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] link
 *   Storage for current link status.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
			     struct rte_eth_link *link)

{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	struct rte_eth_dev *master = NULL;
	uint64_t sc;
	int ret;

	/* Link status comes from the interface flags (up && running). */
	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
	if (ret) {
		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
			dev->data->port_id, strerror(rte_errno));
		return ret;
	}
	dev_link = (struct rte_eth_link) {
		.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING)),
	};
	ifr = (struct ifreq) {
		.ifr_data = (void *)&gcmd,
	};
	/* First round: probe for the required link_mode_masks_nwords. */
	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
	if (ret) {
		if (ret == -ENOTSUP && priv->representor) {
			/*
			 * For representors we can try to inherit link
			 * settings from the master device. Actually
			 * link settings do not make a lot of sense
			 * for representors due to missing physical
			 * link. The old kernel drivers supported
			 * emulated settings query for representors,
			 * the new ones do not, so we have to add
			 * this code for compatibility issues.
			 */
			master = mlx5_find_master_dev(dev);
			if (master) {
				ifr = (struct ifreq) {
					.ifr_data = (void *)&gcmd,
				};
				/*
				 * Avoid using master Netlink socket.
				 * This is not thread-safe.
				 */
				ret = mlx5_ifreq_base(dev, master,
						      SIOCETHTOOL, &ifr);
			}
		}
		if (ret) {
			DRV_LOG(DEBUG,
				"port %u ioctl(SIOCETHTOOL,"
				" ETHTOOL_GLINKSETTINGS) failed: %s",
				dev->data->port_id, strerror(rte_errno));
			return ret;
		}

	}
	/*
	 * The kernel returns the required mask word count negated in the
	 * handshake round; flip it to get the positive count.
	 */
	gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;

	/*
	 * VLA sized for the three link-mode bitmaps (per ethtool UAPI:
	 * supported, advertising, lp_advertising), each nwords long.
	 */
	alignas(struct ethtool_link_settings)
	uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
		     sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
	struct ethtool_link_settings *ecmd = (void *)data;

	*ecmd = gcmd;
	ifr.ifr_data = (void *)ecmd;
	/* Second round: fetch the full settings (master if fallback used). */
	ret = mlx5_ifreq_base(dev, master ? master : dev, SIOCETHTOOL, &ifr);
	if (ret) {
		DRV_LOG(DEBUG,
			"port %u ioctl(SIOCETHTOOL,"
			"ETHTOOL_GLINKSETTINGS) failed: %s",
			dev->data->port_id, strerror(rte_errno));
		return ret;
	}
	dev_link.link_speed = ecmd->speed;
	/* Only the first 64 supported-mode bits are examined. */
	sc = ecmd->link_mode_masks[0] |
		((uint64_t)ecmd->link_mode_masks[1] << 32);
	priv->link_speed_capa = 0;
	if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_20G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_56G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_25G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_50G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_100G;
	dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
				  ETH_LINK_SPEED_FIXED);
	/*
	 * Speed and status must be consistent; otherwise report EAGAIN
	 * so mlx5_link_update() can retry.
	 */
	if (((dev_link.link_speed && !dev_link.link_status) ||
	     (!dev_link.link_speed && dev_link.link_status))) {
		rte_errno = EAGAIN;
		return -rte_errno;
	}
	*link = dev_link;
	return 0;
}
1031 
/**
 * DPDK callback to retrieve physical link information.
 *
 * Tries the extended GLINKSETTINGS query first and falls back to the
 * legacy GSET ethtool query. When @p wait_to_complete is set, retries
 * for up to MLX5_LINK_STATUS_TIMEOUT seconds while the link state is
 * inconsistent (helpers return -EAGAIN).
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion.
 *
 * @return
 *   0 if link status was not updated, positive if it was, a negative errno
 *   value otherwise and rte_errno is set.
 */
int
mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
{
	int ret;
	struct rte_eth_link dev_link;
	time_t start_time = time(NULL);

	do {
		/* Prefer the extended query, fall back to the legacy one. */
		ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
		if (ret)
			ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
		if (ret == 0)
			break;
		/* Handle wait to complete situation. */
		if (wait_to_complete && ret == -EAGAIN) {
			/* Retry until the 1-second-granularity timeout expires. */
			if (abs((int)difftime(time(NULL), start_time)) <
			    MLX5_LINK_STATUS_TIMEOUT) {
				/* usleep(0) just yields the CPU before retrying. */
				usleep(0);
				continue;
			} else {
				rte_errno = EBUSY;
				return -rte_errno;
			}
		} else if (ret < 0) {
			return ret;
		}
	} while (wait_to_complete);
	/* Report whether the cached link state actually changed. */
	ret = !!memcmp(&dev->data->dev_link, &dev_link,
		       sizeof(struct rte_eth_link));
	dev->data->dev_link = dev_link;
	return ret;
}
1076 
1077 /**
1078  * DPDK callback to change the MTU.
1079  *
1080  * @param dev
1081  *   Pointer to Ethernet device structure.
1082  * @param in_mtu
1083  *   New MTU.
1084  *
1085  * @return
1086  *   0 on success, a negative errno value otherwise and rte_errno is set.
1087  */
1088 int
1089 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
1090 {
1091 	struct mlx5_priv *priv = dev->data->dev_private;
1092 	uint16_t kern_mtu = 0;
1093 	int ret;
1094 
1095 	ret = mlx5_get_mtu(dev, &kern_mtu);
1096 	if (ret)
1097 		return ret;
1098 	/* Set kernel interface MTU first. */
1099 	ret = mlx5_set_mtu(dev, mtu);
1100 	if (ret)
1101 		return ret;
1102 	ret = mlx5_get_mtu(dev, &kern_mtu);
1103 	if (ret)
1104 		return ret;
1105 	if (kern_mtu == mtu) {
1106 		priv->mtu = mtu;
1107 		DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
1108 			dev->data->port_id, mtu);
1109 		return 0;
1110 	}
1111 	rte_errno = EAGAIN;
1112 	return -rte_errno;
1113 }
1114 
1115 /**
1116  * DPDK callback to get flow control status.
1117  *
1118  * @param dev
1119  *   Pointer to Ethernet device structure.
1120  * @param[out] fc_conf
1121  *   Flow control output buffer.
1122  *
1123  * @return
1124  *   0 on success, a negative errno value otherwise and rte_errno is set.
1125  */
1126 int
1127 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1128 {
1129 	struct ifreq ifr;
1130 	struct ethtool_pauseparam ethpause = {
1131 		.cmd = ETHTOOL_GPAUSEPARAM
1132 	};
1133 	int ret;
1134 
1135 	ifr.ifr_data = (void *)&ethpause;
1136 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1137 	if (ret) {
1138 		DRV_LOG(WARNING,
1139 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
1140 			" %s",
1141 			dev->data->port_id, strerror(rte_errno));
1142 		return ret;
1143 	}
1144 	fc_conf->autoneg = ethpause.autoneg;
1145 	if (ethpause.rx_pause && ethpause.tx_pause)
1146 		fc_conf->mode = RTE_FC_FULL;
1147 	else if (ethpause.rx_pause)
1148 		fc_conf->mode = RTE_FC_RX_PAUSE;
1149 	else if (ethpause.tx_pause)
1150 		fc_conf->mode = RTE_FC_TX_PAUSE;
1151 	else
1152 		fc_conf->mode = RTE_FC_NONE;
1153 	return 0;
1154 }
1155 
1156 /**
1157  * DPDK callback to modify flow control parameters.
1158  *
1159  * @param dev
1160  *   Pointer to Ethernet device structure.
1161  * @param[in] fc_conf
1162  *   Flow control parameters.
1163  *
1164  * @return
1165  *   0 on success, a negative errno value otherwise and rte_errno is set.
1166  */
1167 int
1168 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1169 {
1170 	struct ifreq ifr;
1171 	struct ethtool_pauseparam ethpause = {
1172 		.cmd = ETHTOOL_SPAUSEPARAM
1173 	};
1174 	int ret;
1175 
1176 	ifr.ifr_data = (void *)&ethpause;
1177 	ethpause.autoneg = fc_conf->autoneg;
1178 	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1179 	    (fc_conf->mode & RTE_FC_RX_PAUSE))
1180 		ethpause.rx_pause = 1;
1181 	else
1182 		ethpause.rx_pause = 0;
1183 
1184 	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1185 	    (fc_conf->mode & RTE_FC_TX_PAUSE))
1186 		ethpause.tx_pause = 1;
1187 	else
1188 		ethpause.tx_pause = 0;
1189 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1190 	if (ret) {
1191 		DRV_LOG(WARNING,
1192 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
1193 			" failed: %s",
1194 			dev->data->port_id, strerror(rte_errno));
1195 		return ret;
1196 	}
1197 	return 0;
1198 }
1199 
1200 /**
1201  * Get PCI information from struct ibv_device.
1202  *
1203  * @param device
1204  *   Pointer to Ethernet device structure.
1205  * @param[out] pci_addr
1206  *   PCI bus address output buffer.
1207  *
1208  * @return
1209  *   0 on success, a negative errno value otherwise and rte_errno is set.
1210  */
1211 int
1212 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
1213 			    struct rte_pci_addr *pci_addr)
1214 {
1215 	FILE *file;
1216 	char line[32];
1217 	MKSTR(path, "%s/device/uevent", device->ibdev_path);
1218 
1219 	file = fopen(path, "rb");
1220 	if (file == NULL) {
1221 		rte_errno = errno;
1222 		return -rte_errno;
1223 	}
1224 	while (fgets(line, sizeof(line), file) == line) {
1225 		size_t len = strlen(line);
1226 		int ret;
1227 
1228 		/* Truncate long lines. */
1229 		if (len == (sizeof(line) - 1))
1230 			while (line[(len - 1)] != '\n') {
1231 				ret = fgetc(file);
1232 				if (ret == EOF)
1233 					break;
1234 				line[(len - 1)] = ret;
1235 			}
1236 		/* Extract information. */
1237 		if (sscanf(line,
1238 			   "PCI_SLOT_NAME="
1239 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
1240 			   &pci_addr->domain,
1241 			   &pci_addr->bus,
1242 			   &pci_addr->devid,
1243 			   &pci_addr->function) == 4) {
1244 			ret = 0;
1245 			break;
1246 		}
1247 	}
1248 	fclose(file);
1249 	return 0;
1250 }
1251 
1252 /**
1253  * Handle asynchronous removal event for entire multiport device.
1254  *
1255  * @param sh
1256  *   Infiniband device shared context.
1257  */
1258 static void
1259 mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh)
1260 {
1261 	uint32_t i;
1262 
1263 	for (i = 0; i < sh->max_port; ++i) {
1264 		struct rte_eth_dev *dev;
1265 
1266 		if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
1267 			/*
1268 			 * Or not existing port either no
1269 			 * handler installed for this port.
1270 			 */
1271 			continue;
1272 		}
1273 		dev = &rte_eth_devices[sh->port[i].ih_port_id];
1274 		assert(dev);
1275 		if (dev->data->dev_conf.intr_conf.rmv)
1276 			_rte_eth_dev_callback_process
1277 				(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
1278 	}
1279 }
1280 
/**
 * Handle shared asynchronous events the NIC (removal event
 * and link status change). Supports multiport IB device.
 *
 * Drains every pending event from the IB async queue, acknowledges it,
 * and dispatches it to the ethdev registered for the reporting IB port.
 *
 * @param cb_arg
 *   Callback argument, a struct mlx5_ibv_shared pointer.
 */
void
mlx5_dev_interrupt_handler(void *cb_arg)
{
	struct mlx5_ibv_shared *sh = cb_arg;
	struct ibv_async_event event;

	/* Read all message from the IB device and acknowledge them. */
	for (;;) {
		struct rte_eth_dev *dev;
		uint32_t tmp;

		/* The async fd is non-blocking; stop once the queue is empty. */
		if (mlx5_glue->get_async_event(sh->ctx, &event))
			break;
		/* Retrieve and check IB port index. */
		tmp = (uint32_t)event.element.port_num;
		if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
			/*
			 * The DEVICE_FATAL event is called once for
			 * entire device without port specifying.
			 * We should notify all existing ports.
			 */
			mlx5_glue->ack_async_event(&event);
			mlx5_dev_interrupt_device_fatal(sh);
			continue;
		}
		/*
		 * Debug builds stop on out-of-range ports here; release
		 * builds fall through to the runtime checks below.
		 */
		assert(tmp && (tmp <= sh->max_port));
		if (!tmp) {
			/* Unsupported device level event. */
			mlx5_glue->ack_async_event(&event);
			DRV_LOG(DEBUG,
				"unsupported common event (type %d)",
				event.event_type);
			continue;
		}
		if (tmp > sh->max_port) {
			/* Invalid IB port index. */
			mlx5_glue->ack_async_event(&event);
			DRV_LOG(DEBUG,
				"cannot handle an event (type %d)"
				"due to invalid IB port index (%u)",
				event.event_type, tmp);
			continue;
		}
		if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
			/* No handler installed. */
			mlx5_glue->ack_async_event(&event);
			DRV_LOG(DEBUG,
				"cannot handle an event (type %d)"
				"due to no handler installed for port %u",
				event.event_type, tmp);
			continue;
		}
		/* Retrieve ethernet device descriptor. */
		tmp = sh->port[tmp - 1].ih_port_id;
		dev = &rte_eth_devices[tmp];
		assert(dev);
		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
		     event.event_type == IBV_EVENT_PORT_ERR) &&
			dev->data->dev_conf.intr_conf.lsc) {
			/* Link status change: refresh state then notify. */
			mlx5_glue->ack_async_event(&event);
			if (mlx5_link_update(dev, 0) == -EAGAIN) {
				/* Link state not settled yet; yield and retry
				 * on the next queued event. */
				usleep(0);
				continue;
			}
			_rte_eth_dev_callback_process
				(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
			continue;
		}
		DRV_LOG(DEBUG,
			"port %u cannot handle an unknown event (type %d)",
			dev->data->port_id, event.event_type);
		mlx5_glue->ack_async_event(&event);
	}
}
1362 
/*
 * Unregister callback handler safely. The handler may be active
 * while we are trying to unregister it, in this case code -EAGAIN
 * is returned by rte_intr_callback_unregister(). This routine checks
 * the return code and tries to unregister handler again.
 *
 * @param handle
 *   interrupt handle
 * @param cb_fn
 *   pointer to callback routine
 * @cb_arg
 *   opaque callback parameter
 */
void
mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
			      rte_intr_callback_fn cb_fn, void *cb_arg)
{
	/*
	 * Try to reduce timeout management overhead by not calling
	 * the timer related routines on the first iteration. If the
	 * unregistering succeeds on first call there will be no
	 * timer calls at all.
	 */
	uint64_t twait = 0;
	uint64_t start = 0;

	do {
		int ret;

		ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
		if (ret >= 0)
			return;
		if (ret != -EAGAIN) {
			/* Any error other than "handler busy" is fatal here. */
			DRV_LOG(INFO, "failed to unregister interrupt"
				      " handler (error: %d)", ret);
			assert(false);
			return;
		}
		if (twait) {
			struct timespec onems;

			/* Wait one millisecond and try again. */
			onems.tv_sec = 0;
			onems.tv_nsec = NS_PER_S / MS_PER_S;
			nanosleep(&onems, 0);
			/* Check whether one second elapsed. */
			if ((rte_get_timer_cycles() - start) <= twait)
				continue;
		} else {
			/*
			 * We get the amount of timer ticks for one second.
			 * If this amount elapsed it means we spent one
			 * second in waiting. This branch is executed once
			 * on first iteration.
			 */
			twait = rte_get_timer_hz();
			assert(twait);
		}
		/*
		 * Timeout elapsed, show message (once a second) and retry.
		 * We have no other acceptable option here, if we ignore
		 * the unregistering return code the handler will not
		 * be unregistered, fd will be closed and we may get the
		 * crash. Hanging and messaging in the loop seems not to be
		 * the worst choice.
		 */
		DRV_LOG(INFO, "Retrying to unregister interrupt handler");
		start = rte_get_timer_cycles();
	} while (true);
}
1433 
/**
 * Uninstall shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of single IB device.
 *
 * Drops this port's registration and unregisters the shared interrupt
 * callback only when the last reference is released.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
static void
mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_ibv_shared *sh = priv->sh;

	/* Only the primary process manages the shared interrupt handler. */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return;
	pthread_mutex_lock(&sh->intr_mutex);
	assert(priv->ibv_port);
	assert(priv->ibv_port <= sh->max_port);
	assert(dev->data->port_id < RTE_MAX_ETHPORTS);
	/* RTE_MAX_ETHPORTS marks "no handler installed" for this port. */
	if (sh->port[priv->ibv_port - 1].ih_port_id >= RTE_MAX_ETHPORTS)
		goto exit;
	assert(sh->port[priv->ibv_port - 1].ih_port_id ==
					(uint32_t)dev->data->port_id);
	assert(sh->intr_cnt);
	sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
	/* Keep the shared callback while other ports still reference it. */
	if (!sh->intr_cnt || --sh->intr_cnt)
		goto exit;
	mlx5_intr_callback_unregister(&sh->intr_handle,
				     mlx5_dev_interrupt_handler, sh);
	sh->intr_handle.fd = 0;
	sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
exit:
	pthread_mutex_unlock(&sh->intr_mutex);
}
1469 
1470 /**
1471  * Install shared asynchronous device events handler.
1472  * This function is implemented to support event sharing
1473  * between multiple ports of single IB device.
1474  *
1475  * @param dev
1476  *   Pointer to Ethernet device.
1477  */
1478 static void
1479 mlx5_dev_shared_handler_install(struct rte_eth_dev *dev)
1480 {
1481 	struct mlx5_priv *priv = dev->data->dev_private;
1482 	struct mlx5_ibv_shared *sh = priv->sh;
1483 	int ret;
1484 	int flags;
1485 
1486 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1487 		return;
1488 	pthread_mutex_lock(&sh->intr_mutex);
1489 	assert(priv->ibv_port);
1490 	assert(priv->ibv_port <= sh->max_port);
1491 	assert(dev->data->port_id < RTE_MAX_ETHPORTS);
1492 	if (sh->port[priv->ibv_port - 1].ih_port_id < RTE_MAX_ETHPORTS) {
1493 		/* The handler is already installed for this port. */
1494 		assert(sh->intr_cnt);
1495 		goto exit;
1496 	}
1497 	sh->port[priv->ibv_port - 1].ih_port_id = (uint32_t)dev->data->port_id;
1498 	if (sh->intr_cnt) {
1499 		sh->intr_cnt++;
1500 		goto exit;
1501 	}
1502 	/* No shared handler installed. */
1503 	assert(sh->ctx->async_fd > 0);
1504 	flags = fcntl(sh->ctx->async_fd, F_GETFL);
1505 	ret = fcntl(sh->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1506 	if (ret) {
1507 		DRV_LOG(INFO, "failed to change file descriptor"
1508 			      " async event queue");
1509 		/* Indicate there will be no interrupts. */
1510 		dev->data->dev_conf.intr_conf.lsc = 0;
1511 		dev->data->dev_conf.intr_conf.rmv = 0;
1512 		sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1513 		goto exit;
1514 	}
1515 	sh->intr_handle.fd = sh->ctx->async_fd;
1516 	sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
1517 	rte_intr_callback_register(&sh->intr_handle,
1518 				   mlx5_dev_interrupt_handler, sh);
1519 	sh->intr_cnt++;
1520 exit:
1521 	pthread_mutex_unlock(&sh->intr_mutex);
1522 }
1523 
/**
 * Uninstall interrupt handler.
 *
 * Thin wrapper around the shared-handler uninstall routine, kept as
 * the public entry point.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
{
	mlx5_dev_shared_handler_uninstall(dev);
}
1535 
/**
 * Install interrupt handler.
 *
 * Thin wrapper around the shared-handler install routine, kept as
 * the public entry point.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
{
	mlx5_dev_shared_handler_install(dev);
}
1547 
/**
 * DPDK callback to bring the link DOWN.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_set_link_down(struct rte_eth_dev *dev)
{
	/* Clear IFF_UP on the kernel netdevice (keep mask ~IFF_UP,
	 * value ~IFF_UP) — presumably mlx5_set_flags(dev, keep, flags);
	 * confirm against its definition. */
	return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
}
1562 
/**
 * DPDK callback to bring the link UP.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_set_link_up(struct rte_eth_dev *dev)
{
	/* Set IFF_UP on the kernel netdevice (keep mask ~IFF_UP,
	 * value IFF_UP) — mirrors mlx5_set_link_down() above. */
	return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
}
1577 
/**
 * Configure the TX function to use.
 *
 * Selects a Tx burst routine based on the enabled Tx offloads and the
 * multi-packet write (MPW) configuration. TSO, software parser (SWP)
 * and VLAN insertion offloads force the default scalar burst.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   Pointer to selected Tx burst function.
 */
eth_tx_burst_t
mlx5_select_tx_function(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
	struct mlx5_dev_config *config = &priv->config;
	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
	/* Any TSO flavor (plain or tunneled) disables the fast paths. */
	int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
				    DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
				    DEV_TX_OFFLOAD_GRE_TNL_TSO |
				    DEV_TX_OFFLOAD_IP_TNL_TSO |
				    DEV_TX_OFFLOAD_UDP_TNL_TSO));
	/* Software parser offloads likewise require the scalar burst. */
	int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
				    DEV_TX_OFFLOAD_UDP_TNL_TSO |
				    DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM));
	int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);

	assert(priv != NULL);
	/* Select appropriate TX function. */
	if (vlan_insert || tso || swp)
		return tx_pkt_burst;
	if (config->mps == MLX5_MPW_ENHANCED) {
		/* Prefer the vectorized variants when supported. */
		if (mlx5_check_vec_tx_support(dev) > 0) {
			if (mlx5_check_raw_vec_tx_support(dev) > 0)
				tx_pkt_burst = mlx5_tx_burst_raw_vec;
			else
				tx_pkt_burst = mlx5_tx_burst_vec;
			DRV_LOG(DEBUG,
				"port %u selected enhanced MPW Tx vectorized"
				" function",
				dev->data->port_id);
		} else {
			tx_pkt_burst = mlx5_tx_burst_empw;
			DRV_LOG(DEBUG,
				"port %u selected enhanced MPW Tx function",
				dev->data->port_id);
		}
	} else if (config->mps && (config->txq_inline > 0)) {
		tx_pkt_burst = mlx5_tx_burst_mpw_inline;
		DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
			dev->data->port_id);
	} else if (config->mps) {
		tx_pkt_burst = mlx5_tx_burst_mpw;
		DRV_LOG(DEBUG, "port %u selected MPW Tx function",
			dev->data->port_id);
	}
	return tx_pkt_burst;
}
1635 
1636 /**
1637  * Configure the RX function to use.
1638  *
1639  * @param dev
1640  *   Pointer to private data structure.
1641  *
1642  * @return
1643  *   Pointer to selected Rx burst function.
1644  */
1645 eth_rx_burst_t
1646 mlx5_select_rx_function(struct rte_eth_dev *dev)
1647 {
1648 	eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1649 
1650 	assert(dev != NULL);
1651 	if (mlx5_check_vec_rx_support(dev) > 0) {
1652 		rx_pkt_burst = mlx5_rx_burst_vec;
1653 		DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1654 			dev->data->port_id);
1655 	} else if (mlx5_mprq_enabled(dev)) {
1656 		rx_pkt_burst = mlx5_rx_burst_mprq;
1657 	}
1658 	return rx_pkt_burst;
1659 }
1660 
1661 /**
1662  * Check if mlx5 device was removed.
1663  *
1664  * @param dev
1665  *   Pointer to Ethernet device structure.
1666  *
1667  * @return
1668  *   1 when device is removed, otherwise 0.
1669  */
1670 int
1671 mlx5_is_removed(struct rte_eth_dev *dev)
1672 {
1673 	struct ibv_device_attr device_attr;
1674 	struct mlx5_priv *priv = dev->data->dev_private;
1675 
1676 	if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
1677 		return 1;
1678 	return 0;
1679 }
1680 
/**
 * Get port ID list of mlx5 instances sharing a common device.
 *
 * @param[in] dev
 *   Device to look for.
 * @param[out] port_list
 *   Result buffer for collected port IDs.
 * @param port_list_n
 *   Maximum number of entries in result buffer. If 0, @p port_list can be
 *   NULL.
 *
 * @return
 *   Number of matching instances regardless of the @p port_list_n
 *   parameter, 0 if none were found.
 */
unsigned int
mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list,
		    unsigned int port_list_n)
{
	uint16_t id;
	unsigned int n = 0;

	RTE_ETH_FOREACH_DEV_OF(id, dev) {
		/* Store only as many IDs as the buffer holds, count all. */
		if (n < port_list_n)
			port_list[n] = id;
		n++;
	}
	return n;
}
1710 
1711 /**
1712  * Get the E-Switch domain id this port belongs to.
1713  *
1714  * @param[in] port
1715  *   Device port id.
1716  * @param[out] es_domain_id
1717  *   E-Switch domain id.
1718  * @param[out] es_port_id
1719  *   The port id of the port in the E-Switch.
1720  *
1721  * @return
1722  *   0 on success, a negative errno value otherwise and rte_errno is set.
1723  */
1724 int
1725 mlx5_port_to_eswitch_info(uint16_t port,
1726 			  uint16_t *es_domain_id, uint16_t *es_port_id)
1727 {
1728 	struct rte_eth_dev *dev;
1729 	struct mlx5_priv *priv;
1730 
1731 	if (port >= RTE_MAX_ETHPORTS) {
1732 		rte_errno = EINVAL;
1733 		return -rte_errno;
1734 	}
1735 	if (!rte_eth_dev_is_valid_port(port)) {
1736 		rte_errno = ENODEV;
1737 		return -rte_errno;
1738 	}
1739 	dev = &rte_eth_devices[port];
1740 	priv = dev->data->dev_private;
1741 	if (!(priv->representor || priv->master)) {
1742 		rte_errno = EINVAL;
1743 		return -rte_errno;
1744 	}
1745 	if (es_domain_id)
1746 		*es_domain_id = priv->domain_id;
1747 	if (es_port_id)
1748 		*es_port_id = priv->vport_id;
1749 	return 0;
1750 }
1751 
1752 /**
1753  * Get switch information associated with network interface.
1754  *
1755  * @param ifindex
1756  *   Network interface index.
1757  * @param[out] info
1758  *   Switch information object, populated in case of success.
1759  *
1760  * @return
1761  *   0 on success, a negative errno value otherwise and rte_errno is set.
1762  */
1763 int
1764 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
1765 {
1766 	char ifname[IF_NAMESIZE];
1767 	char port_name[IF_NAMESIZE];
1768 	FILE *file;
1769 	struct mlx5_switch_info data = {
1770 		.master = 0,
1771 		.representor = 0,
1772 		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1773 		.port_name = 0,
1774 		.switch_id = 0,
1775 	};
1776 	DIR *dir;
1777 	bool port_switch_id_set = false;
1778 	bool device_dir = false;
1779 	char c;
1780 	int ret;
1781 
1782 	if (!if_indextoname(ifindex, ifname)) {
1783 		rte_errno = errno;
1784 		return -rte_errno;
1785 	}
1786 
1787 	MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
1788 	      ifname);
1789 	MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
1790 	      ifname);
1791 	MKSTR(pci_device, "/sys/class/net/%s/device",
1792 	      ifname);
1793 
1794 	file = fopen(phys_port_name, "rb");
1795 	if (file != NULL) {
1796 		ret = fscanf(file, "%s", port_name);
1797 		fclose(file);
1798 		if (ret == 1)
1799 			mlx5_translate_port_name(port_name, &data);
1800 	}
1801 	file = fopen(phys_switch_id, "rb");
1802 	if (file == NULL) {
1803 		rte_errno = errno;
1804 		return -rte_errno;
1805 	}
1806 	port_switch_id_set =
1807 		fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1808 		c == '\n';
1809 	fclose(file);
1810 	dir = opendir(pci_device);
1811 	if (dir != NULL) {
1812 		closedir(dir);
1813 		device_dir = true;
1814 	}
1815 	if (port_switch_id_set) {
1816 		/* We have some E-Switch configuration. */
1817 		mlx5_sysfs_check_switch_info(device_dir, &data);
1818 	}
1819 	*info = data;
1820 	assert(!(data.master && data.representor));
1821 	if (data.master && data.representor) {
1822 		DRV_LOG(ERR, "ifindex %u device is recognized as master"
1823 			     " and as representor", ifindex);
1824 		rte_errno = ENODEV;
1825 		return -rte_errno;
1826 	}
1827 	return 0;
1828 }
1829 
1830 /**
1831  * Analyze gathered port parameters via Netlink to recognize master
1832  * and representor devices for E-Switch configuration.
1833  *
1834  * @param[in] num_vf_set
1835  *   flag of presence of number of VFs port attribute.
1836  * @param[inout] switch_info
1837  *   Port information, including port name as a number and port name
1838  *   type if recognized
1839  *
1840  * @return
1841  *   master and representor flags are set in switch_info according to
1842  *   recognized parameters (if any).
1843  */
1844 void
1845 mlx5_nl_check_switch_info(bool num_vf_set,
1846 			  struct mlx5_switch_info *switch_info)
1847 {
1848 	switch (switch_info->name_type) {
1849 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1850 		/*
1851 		 * Name is not recognized, assume the master,
1852 		 * check the number of VFs key presence.
1853 		 */
1854 		switch_info->master = num_vf_set;
1855 		break;
1856 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1857 		/*
1858 		 * Name is not set, this assumes the legacy naming
1859 		 * schema for master, just check if there is a
1860 		 * number of VFs key.
1861 		 */
1862 		switch_info->master = num_vf_set;
1863 		break;
1864 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1865 		/* New uplink naming schema recognized. */
1866 		switch_info->master = 1;
1867 		break;
1868 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1869 		/* Legacy representors naming schema. */
1870 		switch_info->representor = !num_vf_set;
1871 		break;
1872 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1873 		/* New representors naming schema. */
1874 		switch_info->representor = 1;
1875 		break;
1876 	}
1877 }
1878 
1879 /**
1880  * Analyze gathered port parameters via sysfs to recognize master
1881  * and representor devices for E-Switch configuration.
1882  *
1883  * @param[in] device_dir
1884  *   flag of presence of "device" directory under port device key.
1885  * @param[inout] switch_info
1886  *   Port information, including port name as a number and port name
1887  *   type if recognized
1888  *
1889  * @return
1890  *   master and representor flags are set in switch_info according to
1891  *   recognized parameters (if any).
1892  */
1893 void
1894 mlx5_sysfs_check_switch_info(bool device_dir,
1895 			     struct mlx5_switch_info *switch_info)
1896 {
1897 	switch (switch_info->name_type) {
1898 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1899 		/*
1900 		 * Name is not recognized, assume the master,
1901 		 * check the device directory presence.
1902 		 */
1903 		switch_info->master = device_dir;
1904 		break;
1905 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1906 		/*
1907 		 * Name is not set, this assumes the legacy naming
1908 		 * schema for master, just check if there is
1909 		 * a device directory.
1910 		 */
1911 		switch_info->master = device_dir;
1912 		break;
1913 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1914 		/* New uplink naming schema recognized. */
1915 		switch_info->master = 1;
1916 		break;
1917 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1918 		/* Legacy representors naming schema. */
1919 		switch_info->representor = !device_dir;
1920 		break;
1921 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1922 		/* New representors naming schema. */
1923 		switch_info->representor = 1;
1924 		break;
1925 	}
1926 }
1927 
1928 /**
1929  * Extract port name, as a number, from sysfs or netlink information.
1930  *
1931  * @param[in] port_name_in
1932  *   String representing the port name.
1933  * @param[out] port_info_out
1934  *   Port information, including port name as a number and port name
1935  *   type if recognized
1936  *
1937  * @return
1938  *   port_name field set according to recognized name format.
1939  */
1940 void
1941 mlx5_translate_port_name(const char *port_name_in,
1942 			 struct mlx5_switch_info *port_info_out)
1943 {
1944 	char pf_c1, pf_c2, vf_c1, vf_c2;
1945 	char *end;
1946 	int sc_items;
1947 
1948 	/*
1949 	 * Check for port-name as a string of the form pf0vf0
1950 	 * (support kernel ver >= 5.0 or OFED ver >= 4.6).
1951 	 */
1952 	sc_items = sscanf(port_name_in, "%c%c%d%c%c%d",
1953 			  &pf_c1, &pf_c2, &port_info_out->pf_num,
1954 			  &vf_c1, &vf_c2, &port_info_out->port_name);
1955 	if (sc_items == 6 &&
1956 	    pf_c1 == 'p' && pf_c2 == 'f' &&
1957 	    vf_c1 == 'v' && vf_c2 == 'f') {
1958 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFVF;
1959 		return;
1960 	}
1961 	/*
1962 	 * Check for port-name as a string of the form p0
1963 	 * (support kernel ver >= 5.0, or OFED ver >= 4.6).
1964 	 */
1965 	sc_items = sscanf(port_name_in, "%c%d",
1966 			  &pf_c1, &port_info_out->port_name);
1967 	if (sc_items == 2 && pf_c1 == 'p') {
1968 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
1969 		return;
1970 	}
1971 	/* Check for port-name as a number (support kernel ver < 5.0 */
1972 	errno = 0;
1973 	port_info_out->port_name = strtol(port_name_in, &end, 0);
1974 	if (!errno &&
1975 	    (size_t)(end - port_name_in) == strlen(port_name_in)) {
1976 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
1977 		return;
1978 	}
1979 	port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
1980 	return;
1981 }
1982