xref: /dpdk/drivers/net/mlx5/mlx5_ethdev.c (revision bd41389e35ee3ed29cdee851efc8433f151e5928)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <stddef.h>
7 #include <assert.h>
8 #include <inttypes.h>
9 #include <unistd.h>
10 #include <stdbool.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <string.h>
14 #include <stdlib.h>
15 #include <errno.h>
16 #include <dirent.h>
17 #include <net/if.h>
18 #include <sys/ioctl.h>
19 #include <sys/socket.h>
20 #include <netinet/in.h>
21 #include <linux/ethtool.h>
22 #include <linux/sockios.h>
23 #include <fcntl.h>
24 #include <stdalign.h>
25 #include <sys/un.h>
26 #include <time.h>
27 
28 #include <rte_atomic.h>
29 #include <rte_ethdev_driver.h>
30 #include <rte_bus_pci.h>
31 #include <rte_mbuf.h>
32 #include <rte_common.h>
33 #include <rte_interrupts.h>
34 #include <rte_malloc.h>
35 #include <rte_string_fns.h>
36 #include <rte_rwlock.h>
37 #include <rte_cycles.h>
38 
39 #include "mlx5.h"
40 #include "mlx5_glue.h"
41 #include "mlx5_rxtx.h"
42 #include "mlx5_utils.h"
43 
44 /* Supported speed values found in /usr/include/linux/ethtool.h */
45 #ifndef HAVE_SUPPORTED_40000baseKR4_Full
46 #define SUPPORTED_40000baseKR4_Full (1 << 23)
47 #endif
48 #ifndef HAVE_SUPPORTED_40000baseCR4_Full
49 #define SUPPORTED_40000baseCR4_Full (1 << 24)
50 #endif
51 #ifndef HAVE_SUPPORTED_40000baseSR4_Full
52 #define SUPPORTED_40000baseSR4_Full (1 << 25)
53 #endif
54 #ifndef HAVE_SUPPORTED_40000baseLR4_Full
55 #define SUPPORTED_40000baseLR4_Full (1 << 26)
56 #endif
57 #ifndef HAVE_SUPPORTED_56000baseKR4_Full
58 #define SUPPORTED_56000baseKR4_Full (1 << 27)
59 #endif
60 #ifndef HAVE_SUPPORTED_56000baseCR4_Full
61 #define SUPPORTED_56000baseCR4_Full (1 << 28)
62 #endif
63 #ifndef HAVE_SUPPORTED_56000baseSR4_Full
64 #define SUPPORTED_56000baseSR4_Full (1 << 29)
65 #endif
66 #ifndef HAVE_SUPPORTED_56000baseLR4_Full
67 #define SUPPORTED_56000baseLR4_Full (1 << 30)
68 #endif
69 
70 /* Add defines in case the running kernel is not the same as user headers. */
71 #ifndef ETHTOOL_GLINKSETTINGS
72 struct ethtool_link_settings {
73 	uint32_t cmd;
74 	uint32_t speed;
75 	uint8_t duplex;
76 	uint8_t port;
77 	uint8_t phy_address;
78 	uint8_t autoneg;
79 	uint8_t mdio_support;
80 	uint8_t eth_to_mdix;
81 	uint8_t eth_tp_mdix_ctrl;
82 	int8_t link_mode_masks_nwords;
83 	uint32_t reserved[8];
84 	uint32_t link_mode_masks[];
85 };
86 
87 #define ETHTOOL_GLINKSETTINGS 0x0000004c
88 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
89 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6
90 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
91 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
92 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
93 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
94 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
95 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
96 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
97 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
98 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
99 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
100 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
101 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
102 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
103 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
104 #endif
105 #ifndef HAVE_ETHTOOL_LINK_MODE_25G
106 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
107 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
108 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
109 #endif
110 #ifndef HAVE_ETHTOOL_LINK_MODE_50G
111 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
112 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
113 #endif
114 #ifndef HAVE_ETHTOOL_LINK_MODE_100G
115 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
116 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
117 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
118 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
119 #endif
120 
121 /**
122  * Get master interface name from private structure.
123  *
124  * @param[in] dev
125  *   Pointer to Ethernet device.
126  * @param[out] ifname
127  *   Interface name output buffer.
128  *
129  * @return
130  *   0 on success, a negative errno value otherwise and rte_errno is set.
131  */
132 int
133 mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE])
134 {
135 	DIR *dir;
136 	struct dirent *dent;
137 	unsigned int dev_type = 0;
138 	unsigned int dev_port_prev = ~0u;
139 	char match[IF_NAMESIZE] = "";
140 
141 	assert(ibdev_path);
142 	{
143 		MKSTR(path, "%s/device/net", ibdev_path);
144 
145 		dir = opendir(path);
146 		if (dir == NULL) {
147 			rte_errno = errno;
148 			return -rte_errno;
149 		}
150 	}
151 	while ((dent = readdir(dir)) != NULL) {
152 		char *name = dent->d_name;
153 		FILE *file;
154 		unsigned int dev_port;
155 		int r;
156 
157 		if ((name[0] == '.') &&
158 		    ((name[1] == '\0') ||
159 		     ((name[1] == '.') && (name[2] == '\0'))))
160 			continue;
161 
162 		MKSTR(path, "%s/device/net/%s/%s",
163 		      ibdev_path, name,
164 		      (dev_type ? "dev_id" : "dev_port"));
165 
166 		file = fopen(path, "rb");
167 		if (file == NULL) {
168 			if (errno != ENOENT)
169 				continue;
170 			/*
171 			 * Switch to dev_id when dev_port does not exist as
172 			 * is the case with Linux kernel versions < 3.15.
173 			 */
174 try_dev_id:
175 			match[0] = '\0';
176 			if (dev_type)
177 				break;
178 			dev_type = 1;
179 			dev_port_prev = ~0u;
180 			rewinddir(dir);
181 			continue;
182 		}
183 		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
184 		fclose(file);
185 		if (r != 1)
186 			continue;
187 		/*
188 		 * Switch to dev_id when dev_port returns the same value for
189 		 * all ports. May happen when using a MOFED release older than
190 		 * 3.0 with a Linux kernel >= 3.15.
191 		 */
192 		if (dev_port == dev_port_prev)
193 			goto try_dev_id;
194 		dev_port_prev = dev_port;
195 		if (dev_port == 0)
196 			strlcpy(match, name, sizeof(match));
197 	}
198 	closedir(dir);
199 	if (match[0] == '\0') {
200 		rte_errno = ENOENT;
201 		return -rte_errno;
202 	}
203 	strncpy(*ifname, match, sizeof(*ifname));
204 	return 0;
205 }
206 
207 /**
208  * Get interface name from private structure.
209  *
210  * This is a port representor-aware version of mlx5_get_master_ifname().
211  *
212  * @param[in] dev
213  *   Pointer to Ethernet device.
214  * @param[out] ifname
215  *   Interface name output buffer.
216  *
217  * @return
218  *   0 on success, a negative errno value otherwise and rte_errno is set.
219  */
220 int
221 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
222 {
223 	struct mlx5_priv *priv = dev->data->dev_private;
224 	unsigned int ifindex;
225 
226 	assert(priv);
227 	assert(priv->sh);
228 	ifindex = mlx5_ifindex(dev);
229 	if (!ifindex) {
230 		if (!priv->representor)
231 			return mlx5_get_master_ifname(priv->sh->ibdev_path,
232 						      ifname);
233 		rte_errno = ENXIO;
234 		return -rte_errno;
235 	}
236 	if (if_indextoname(ifindex, &(*ifname)[0]))
237 		return 0;
238 	rte_errno = errno;
239 	return -rte_errno;
240 }
241 
242 /**
243  * Get the interface index from device name.
244  *
245  * @param[in] dev
246  *   Pointer to Ethernet device.
247  *
248  * @return
249  *   Nonzero interface index on success, zero otherwise and rte_errno is set.
250  */
251 unsigned int
252 mlx5_ifindex(const struct rte_eth_dev *dev)
253 {
254 	struct mlx5_priv *priv = dev->data->dev_private;
255 	unsigned int ifindex;
256 
257 	assert(priv);
258 	assert(priv->if_index);
259 	ifindex = priv->if_index;
260 	if (!ifindex)
261 		rte_errno = ENXIO;
262 	return ifindex;
263 }
264 
265 /**
266  * Perform ifreq ioctl() on associated Ethernet device.
267  *
268  * @param[in] dev
269  *   Pointer to Ethernet device.
270  * @param req
271  *   Request number to pass to ioctl().
272  * @param[out] ifr
273  *   Interface request structure output buffer.
274  *
275  * @return
276  *   0 on success, a negative errno value otherwise and rte_errno is set.
277  */
278 int
279 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
280 {
281 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
282 	int ret = 0;
283 
284 	if (sock == -1) {
285 		rte_errno = errno;
286 		return -rte_errno;
287 	}
288 	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
289 	if (ret)
290 		goto error;
291 	ret = ioctl(sock, req, ifr);
292 	if (ret == -1) {
293 		rte_errno = errno;
294 		goto error;
295 	}
296 	close(sock);
297 	return 0;
298 error:
299 	close(sock);
300 	return -rte_errno;
301 }
302 
303 /**
304  * Get device MTU.
305  *
306  * @param dev
307  *   Pointer to Ethernet device.
308  * @param[out] mtu
309  *   MTU value output buffer.
310  *
311  * @return
312  *   0 on success, a negative errno value otherwise and rte_errno is set.
313  */
314 int
315 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
316 {
317 	struct ifreq request;
318 	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
319 
320 	if (ret)
321 		return ret;
322 	*mtu = request.ifr_mtu;
323 	return 0;
324 }
325 
326 /**
327  * Set device MTU.
328  *
329  * @param dev
330  *   Pointer to Ethernet device.
331  * @param mtu
332  *   MTU value to set.
333  *
334  * @return
335  *   0 on success, a negative errno value otherwise and rte_errno is set.
336  */
337 static int
338 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
339 {
340 	struct ifreq request = { .ifr_mtu = mtu, };
341 
342 	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
343 }
344 
345 /**
346  * Set device flags.
347  *
348  * @param dev
349  *   Pointer to Ethernet device.
350  * @param keep
351  *   Bitmask for flags that must remain untouched.
352  * @param flags
353  *   Bitmask for flags to modify.
354  *
355  * @return
356  *   0 on success, a negative errno value otherwise and rte_errno is set.
357  */
358 int
359 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
360 {
361 	struct ifreq request;
362 	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
363 
364 	if (ret)
365 		return ret;
366 	request.ifr_flags &= keep;
367 	request.ifr_flags |= flags & ~keep;
368 	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
369 }
370 
371 /**
372  * DPDK callback for Ethernet device configuration.
373  *
374  * @param dev
375  *   Pointer to Ethernet device structure.
376  *
377  * @return
378  *   0 on success, a negative errno value otherwise and rte_errno is set.
379  */
380 int
381 mlx5_dev_configure(struct rte_eth_dev *dev)
382 {
383 	struct mlx5_priv *priv = dev->data->dev_private;
384 	unsigned int rxqs_n = dev->data->nb_rx_queues;
385 	unsigned int txqs_n = dev->data->nb_tx_queues;
386 	unsigned int i;
387 	unsigned int j;
388 	unsigned int reta_idx_n;
389 	const uint8_t use_app_rss_key =
390 		!!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
391 	int ret = 0;
392 	unsigned int lro_on = mlx5_lro_on(dev);
393 
394 	if (use_app_rss_key &&
395 	    (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
396 	     MLX5_RSS_HASH_KEY_LEN)) {
397 		DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long",
398 			dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN));
399 		rte_errno = EINVAL;
400 		return -rte_errno;
401 	}
402 	priv->rss_conf.rss_key =
403 		rte_realloc(priv->rss_conf.rss_key,
404 			    MLX5_RSS_HASH_KEY_LEN, 0);
405 	if (!priv->rss_conf.rss_key) {
406 		DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
407 			dev->data->port_id, rxqs_n);
408 		rte_errno = ENOMEM;
409 		return -rte_errno;
410 	}
411 	memcpy(priv->rss_conf.rss_key,
412 	       use_app_rss_key ?
413 	       dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
414 	       rss_hash_default_key,
415 	       MLX5_RSS_HASH_KEY_LEN);
416 	priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN;
417 	priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
418 	priv->rxqs = (void *)dev->data->rx_queues;
419 	priv->txqs = (void *)dev->data->tx_queues;
420 	if (txqs_n != priv->txqs_n) {
421 		DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
422 			dev->data->port_id, priv->txqs_n, txqs_n);
423 		priv->txqs_n = txqs_n;
424 	}
425 	if (rxqs_n > priv->config.ind_table_max_size) {
426 		DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
427 			dev->data->port_id, rxqs_n);
428 		rte_errno = EINVAL;
429 		return -rte_errno;
430 	}
431 	if (rxqs_n != priv->rxqs_n) {
432 		DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
433 			dev->data->port_id, priv->rxqs_n, rxqs_n);
434 		priv->rxqs_n = rxqs_n;
435 		/*
436 		 * If the requested number of RX queues is not a power of two,
437 		 * use the maximum indirection table size for better balancing.
438 		 * The result is always rounded to the next power of two.
439 		 */
440 		reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
441 					     priv->config.ind_table_max_size :
442 					     rxqs_n));
443 		ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
444 		if (ret)
445 			return ret;
446 		/*
447 		 * When the number of RX queues is not a power of two,
448 		 * the remaining table entries are padded with reused WQs
449 		 * and hashes are not spread uniformly.
450 		 */
451 		for (i = 0, j = 0; (i != reta_idx_n); ++i) {
452 			(*priv->reta_idx)[i] = j;
453 			if (++j == rxqs_n)
454 				j = 0;
455 		}
456 	}
457 	if (lro_on && priv->config.cqe_comp) {
458 		/* CQE compressing is not supported for LRO CQEs. */
459 		DRV_LOG(WARNING, "Rx CQE compression isn't supported with LRO");
460 		priv->config.cqe_comp = 0;
461 	}
462 	ret = mlx5_proc_priv_init(dev);
463 	if (ret)
464 		return ret;
465 	return 0;
466 }
467 
468 /**
469  * Sets default tuning parameters.
470  *
471  * @param dev
472  *   Pointer to Ethernet device.
473  * @param[out] info
474  *   Info structure output buffer.
475  */
476 static void
477 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
478 {
479 	struct mlx5_priv *priv = dev->data->dev_private;
480 
481 	/* Minimum CPU utilization. */
482 	info->default_rxportconf.ring_size = 256;
483 	info->default_txportconf.ring_size = 256;
484 	info->default_rxportconf.burst_size = 64;
485 	info->default_txportconf.burst_size = 64;
486 	if (priv->link_speed_capa & ETH_LINK_SPEED_100G) {
487 		info->default_rxportconf.nb_queues = 16;
488 		info->default_txportconf.nb_queues = 16;
489 		if (dev->data->nb_rx_queues > 2 ||
490 		    dev->data->nb_tx_queues > 2) {
491 			/* Max Throughput. */
492 			info->default_rxportconf.ring_size = 2048;
493 			info->default_txportconf.ring_size = 2048;
494 		}
495 	} else {
496 		info->default_rxportconf.nb_queues = 8;
497 		info->default_txportconf.nb_queues = 8;
498 		if (dev->data->nb_rx_queues > 2 ||
499 		    dev->data->nb_tx_queues > 2) {
500 			/* Max Throughput. */
501 			info->default_rxportconf.ring_size = 4096;
502 			info->default_txportconf.ring_size = 4096;
503 		}
504 	}
505 }
506 
507 /**
508  * Sets tx mbuf limiting parameters.
509  *
510  * @param dev
511  *   Pointer to Ethernet device.
512  * @param[out] info
513  *   Info structure output buffer.
514  */
515 static void
516 mlx5_set_txlimit_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
517 {
518 	struct mlx5_priv *priv = dev->data->dev_private;
519 	struct mlx5_dev_config *config = &priv->config;
520 	unsigned int inlen;
521 	uint16_t nb_max;
522 
523 	inlen = (config->txq_inline_max == MLX5_ARG_UNSET) ?
524 		MLX5_SEND_DEF_INLINE_LEN :
525 		(unsigned int)config->txq_inline_max;
526 	assert(config->txq_inline_min >= 0);
527 	inlen = RTE_MAX(inlen, (unsigned int)config->txq_inline_min);
528 	inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX +
529 			       MLX5_ESEG_MIN_INLINE_SIZE -
530 			       MLX5_WQE_CSEG_SIZE -
531 			       MLX5_WQE_ESEG_SIZE -
532 			       MLX5_WQE_DSEG_SIZE * 2);
533 	nb_max = (MLX5_WQE_SIZE_MAX +
534 		  MLX5_ESEG_MIN_INLINE_SIZE -
535 		  MLX5_WQE_CSEG_SIZE -
536 		  MLX5_WQE_ESEG_SIZE -
537 		  MLX5_WQE_DSEG_SIZE -
538 		  inlen) / MLX5_WSEG_SIZE;
539 	info->tx_desc_lim.nb_seg_max = nb_max;
540 	info->tx_desc_lim.nb_mtu_seg_max = nb_max;
541 }
542 
543 /**
544  * DPDK callback to get information about the device.
545  *
546  * @param dev
547  *   Pointer to Ethernet device structure.
548  * @param[out] info
549  *   Info structure output buffer.
550  */
551 void
552 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
553 {
554 	struct mlx5_priv *priv = dev->data->dev_private;
555 	struct mlx5_dev_config *config = &priv->config;
556 	unsigned int max;
557 
558 	/* FIXME: we should ask the device for these values. */
559 	info->min_rx_bufsize = 32;
560 	info->max_rx_pktlen = 65536;
561 	/*
562 	 * Since we need one CQ per QP, the limit is the minimum number
563 	 * between the two values.
564 	 */
565 	max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq,
566 		      priv->sh->device_attr.orig_attr.max_qp);
567 	/* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
568 	if (max >= 65535)
569 		max = 65535;
570 	info->max_rx_queues = max;
571 	info->max_tx_queues = max;
572 	info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES;
573 	info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
574 	info->rx_offload_capa = (mlx5_get_rx_port_offloads(dev) |
575 				 info->rx_queue_offload_capa);
576 	info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
577 	info->if_index = mlx5_ifindex(dev);
578 	info->reta_size = priv->reta_idx_n ?
579 		priv->reta_idx_n : config->ind_table_max_size;
580 	info->hash_key_size = MLX5_RSS_HASH_KEY_LEN;
581 	info->speed_capa = priv->link_speed_capa;
582 	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
583 	mlx5_set_default_params(dev, info);
584 	mlx5_set_txlimit_params(dev, info);
585 	info->switch_info.name = dev->data->name;
586 	info->switch_info.domain_id = priv->domain_id;
587 	info->switch_info.port_id = priv->representor_id;
588 	if (priv->representor) {
589 		unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
590 		uint16_t port_id[i];
591 
592 		i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i);
593 		while (i--) {
594 			struct mlx5_priv *opriv =
595 				rte_eth_devices[port_id[i]].data->dev_private;
596 
597 			if (!opriv ||
598 			    opriv->representor ||
599 			    opriv->domain_id != priv->domain_id)
600 				continue;
601 			/*
602 			 * Override switch name with that of the master
603 			 * device.
604 			 */
605 			info->switch_info.name = opriv->dev_data->name;
606 			break;
607 		}
608 	}
609 }
610 
611 /**
612  * Get device current raw clock counter
613  *
614  * @param dev
615  *   Pointer to Ethernet device structure.
616  * @param[out] time
617  *   Current raw clock counter of the device.
618  *
619  * @return
620  *   0 if the clock has correctly been read
621  *   The value of errno in case of error
622  */
623 int
624 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
625 {
626 	struct mlx5_priv *priv = dev->data->dev_private;
627 	struct ibv_context *ctx = priv->sh->ctx;
628 	struct ibv_values_ex values;
629 	int err = 0;
630 
631 	values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
632 	err = mlx5_glue->query_rt_values_ex(ctx, &values);
633 	if (err != 0) {
634 		DRV_LOG(WARNING, "Could not query the clock !");
635 		return err;
636 	}
637 	*clock = values.raw_clock.tv_nsec;
638 	return 0;
639 }
640 
641 /**
642  * Get firmware version of a device.
643  *
644  * @param dev
645  *   Ethernet device port.
646  * @param fw_ver
647  *   String output allocated by caller.
648  * @param fw_size
649  *   Size of the output string, including terminating null byte.
650  *
651  * @return
652  *   0 on success, or the size of the non truncated string if too big.
653  */
654 int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
655 {
656 	struct mlx5_priv *priv = dev->data->dev_private;
657 	struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr;
658 	size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1;
659 
660 	if (fw_size < size)
661 		return size;
662 	if (fw_ver != NULL)
663 		strlcpy(fw_ver, attr->fw_ver, fw_size);
664 	return 0;
665 }
666 
667 /**
668  * Get supported packet types.
669  *
670  * @param dev
671  *   Pointer to Ethernet device structure.
672  *
673  * @return
674  *   A pointer to the supported Packet types array.
675  */
676 const uint32_t *
677 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
678 {
679 	static const uint32_t ptypes[] = {
680 		/* refers to rxq_cq_to_pkt_type() */
681 		RTE_PTYPE_L2_ETHER,
682 		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
683 		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
684 		RTE_PTYPE_L4_NONFRAG,
685 		RTE_PTYPE_L4_FRAG,
686 		RTE_PTYPE_L4_TCP,
687 		RTE_PTYPE_L4_UDP,
688 		RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
689 		RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
690 		RTE_PTYPE_INNER_L4_NONFRAG,
691 		RTE_PTYPE_INNER_L4_FRAG,
692 		RTE_PTYPE_INNER_L4_TCP,
693 		RTE_PTYPE_INNER_L4_UDP,
694 		RTE_PTYPE_UNKNOWN
695 	};
696 
697 	if (dev->rx_pkt_burst == mlx5_rx_burst ||
698 	    dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
699 	    dev->rx_pkt_burst == mlx5_rx_burst_vec)
700 		return ptypes;
701 	return NULL;
702 }
703 
704 /**
705  * Retrieve the master device for representor in the same switch domain.
706  *
707  * @param dev
708  *   Pointer to representor Ethernet device structure.
709  *
710  * @return
711  *   Master device structure  on success, NULL otherwise.
712  */
713 
714 static struct rte_eth_dev *
715 mlx5_find_master_dev(struct rte_eth_dev *dev)
716 {
717 	struct mlx5_priv *priv;
718 	uint16_t port_id;
719 	uint16_t domain_id;
720 
721 	priv = dev->data->dev_private;
722 	domain_id = priv->domain_id;
723 	assert(priv->representor);
724 	RTE_ETH_FOREACH_DEV_OF(port_id, dev->device) {
725 		priv = rte_eth_devices[port_id].data->dev_private;
726 		if (priv &&
727 		    priv->master &&
728 		    priv->domain_id == domain_id)
729 			return &rte_eth_devices[port_id];
730 	}
731 	return NULL;
732 }
733 
734 /**
735  * DPDK callback to retrieve physical link information.
736  *
737  * @param dev
738  *   Pointer to Ethernet device structure.
739  * @param[out] link
740  *   Storage for current link status.
741  *
742  * @return
743  *   0 on success, a negative errno value otherwise and rte_errno is set.
744  */
745 static int
746 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
747 			       struct rte_eth_link *link)
748 {
749 	struct mlx5_priv *priv = dev->data->dev_private;
750 	struct ethtool_cmd edata = {
751 		.cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
752 	};
753 	struct ifreq ifr;
754 	struct rte_eth_link dev_link;
755 	int link_speed = 0;
756 	int ret;
757 
758 	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
759 	if (ret) {
760 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
761 			dev->data->port_id, strerror(rte_errno));
762 		return ret;
763 	}
764 	dev_link = (struct rte_eth_link) {
765 		.link_status = ((ifr.ifr_flags & IFF_UP) &&
766 				(ifr.ifr_flags & IFF_RUNNING)),
767 	};
768 	ifr = (struct ifreq) {
769 		.ifr_data = (void *)&edata,
770 	};
771 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
772 	if (ret) {
773 		if (ret == -ENOTSUP && priv->representor) {
774 			struct rte_eth_dev *master;
775 
776 			/*
777 			 * For representors we can try to inherit link
778 			 * settings from the master device. Actually
779 			 * link settings do not make a lot of sense
780 			 * for representors due to missing physical
781 			 * link. The old kernel drivers supported
782 			 * emulated settings query for representors,
783 			 * the new ones do not, so we have to add
784 			 * this code for compatibility issues.
785 			 */
786 			master = mlx5_find_master_dev(dev);
787 			if (master) {
788 				ifr = (struct ifreq) {
789 					.ifr_data = (void *)&edata,
790 				};
791 				ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
792 			}
793 		}
794 		if (ret) {
795 			DRV_LOG(WARNING,
796 				"port %u ioctl(SIOCETHTOOL,"
797 				" ETHTOOL_GSET) failed: %s",
798 				dev->data->port_id, strerror(rte_errno));
799 			return ret;
800 		}
801 	}
802 	link_speed = ethtool_cmd_speed(&edata);
803 	if (link_speed == -1)
804 		dev_link.link_speed = ETH_SPEED_NUM_NONE;
805 	else
806 		dev_link.link_speed = link_speed;
807 	priv->link_speed_capa = 0;
808 	if (edata.supported & SUPPORTED_Autoneg)
809 		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
810 	if (edata.supported & (SUPPORTED_1000baseT_Full |
811 			       SUPPORTED_1000baseKX_Full))
812 		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
813 	if (edata.supported & SUPPORTED_10000baseKR_Full)
814 		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
815 	if (edata.supported & (SUPPORTED_40000baseKR4_Full |
816 			       SUPPORTED_40000baseCR4_Full |
817 			       SUPPORTED_40000baseSR4_Full |
818 			       SUPPORTED_40000baseLR4_Full))
819 		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
820 	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
821 				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
822 	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
823 			ETH_LINK_SPEED_FIXED);
824 	if (((dev_link.link_speed && !dev_link.link_status) ||
825 	     (!dev_link.link_speed && dev_link.link_status))) {
826 		rte_errno = EAGAIN;
827 		return -rte_errno;
828 	}
829 	*link = dev_link;
830 	return 0;
831 }
832 
833 /**
834  * Retrieve physical link information (unlocked version using new ioctl).
835  *
836  * @param dev
837  *   Pointer to Ethernet device structure.
838  * @param[out] link
839  *   Storage for current link status.
840  *
841  * @return
842  *   0 on success, a negative errno value otherwise and rte_errno is set.
843  */
844 static int
845 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
846 			     struct rte_eth_link *link)
847 
848 {
849 	struct mlx5_priv *priv = dev->data->dev_private;
850 	struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
851 	struct ifreq ifr;
852 	struct rte_eth_link dev_link;
853 	struct rte_eth_dev *master = NULL;
854 	uint64_t sc;
855 	int ret;
856 
857 	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
858 	if (ret) {
859 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
860 			dev->data->port_id, strerror(rte_errno));
861 		return ret;
862 	}
863 	dev_link = (struct rte_eth_link) {
864 		.link_status = ((ifr.ifr_flags & IFF_UP) &&
865 				(ifr.ifr_flags & IFF_RUNNING)),
866 	};
867 	ifr = (struct ifreq) {
868 		.ifr_data = (void *)&gcmd,
869 	};
870 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
871 	if (ret) {
872 		if (ret == -ENOTSUP && priv->representor) {
873 			/*
874 			 * For representors we can try to inherit link
875 			 * settings from the master device. Actually
876 			 * link settings do not make a lot of sense
877 			 * for representors due to missing physical
878 			 * link. The old kernel drivers supported
879 			 * emulated settings query for representors,
880 			 * the new ones do not, so we have to add
881 			 * this code for compatibility issues.
882 			 */
883 			master = mlx5_find_master_dev(dev);
884 			if (master) {
885 				ifr = (struct ifreq) {
886 					.ifr_data = (void *)&gcmd,
887 				};
888 				ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
889 			}
890 		}
891 		if (ret) {
892 			DRV_LOG(DEBUG,
893 				"port %u ioctl(SIOCETHTOOL,"
894 				" ETHTOOL_GLINKSETTINGS) failed: %s",
895 				dev->data->port_id, strerror(rte_errno));
896 			return ret;
897 		}
898 
899 	}
900 	gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
901 
902 	alignas(struct ethtool_link_settings)
903 	uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
904 		     sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
905 	struct ethtool_link_settings *ecmd = (void *)data;
906 
907 	*ecmd = gcmd;
908 	ifr.ifr_data = (void *)ecmd;
909 	ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
910 	if (ret) {
911 		DRV_LOG(DEBUG,
912 			"port %u ioctl(SIOCETHTOOL,"
913 			"ETHTOOL_GLINKSETTINGS) failed: %s",
914 			dev->data->port_id, strerror(rte_errno));
915 		return ret;
916 	}
917 	dev_link.link_speed = ecmd->speed;
918 	sc = ecmd->link_mode_masks[0] |
919 		((uint64_t)ecmd->link_mode_masks[1] << 32);
920 	priv->link_speed_capa = 0;
921 	if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
922 		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
923 	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
924 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
925 		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
926 	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
927 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
928 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
929 		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
930 	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
931 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
932 		priv->link_speed_capa |= ETH_LINK_SPEED_20G;
933 	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
934 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
935 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
936 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
937 		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
938 	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
939 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
940 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
941 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
942 		priv->link_speed_capa |= ETH_LINK_SPEED_56G;
943 	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
944 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
945 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
946 		priv->link_speed_capa |= ETH_LINK_SPEED_25G;
947 	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
948 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
949 		priv->link_speed_capa |= ETH_LINK_SPEED_50G;
950 	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
951 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
952 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
953 		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
954 		priv->link_speed_capa |= ETH_LINK_SPEED_100G;
955 	dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
956 				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
957 	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
958 				  ETH_LINK_SPEED_FIXED);
959 	if (((dev_link.link_speed && !dev_link.link_status) ||
960 	     (!dev_link.link_speed && dev_link.link_status))) {
961 		rte_errno = EAGAIN;
962 		return -rte_errno;
963 	}
964 	*link = dev_link;
965 	return 0;
966 }
967 
968 /**
969  * DPDK callback to retrieve physical link information.
970  *
971  * @param dev
972  *   Pointer to Ethernet device structure.
973  * @param wait_to_complete
974  *   Wait for request completion.
975  *
976  * @return
977  *   0 if link status was not updated, positive if it was, a negative errno
978  *   value otherwise and rte_errno is set.
979  */
980 int
981 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
982 {
983 	int ret;
984 	struct rte_eth_link dev_link;
985 	time_t start_time = time(NULL);
986 
987 	do {
988 		ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
989 		if (ret == -ENOTSUP)
990 			ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
991 		if (ret == 0)
992 			break;
993 		/* Handle wait to complete situation. */
994 		if (wait_to_complete && ret == -EAGAIN) {
995 			if (abs((int)difftime(time(NULL), start_time)) <
996 			    MLX5_LINK_STATUS_TIMEOUT) {
997 				usleep(0);
998 				continue;
999 			} else {
1000 				rte_errno = EBUSY;
1001 				return -rte_errno;
1002 			}
1003 		} else if (ret < 0) {
1004 			return ret;
1005 		}
1006 	} while (wait_to_complete);
1007 	ret = !!memcmp(&dev->data->dev_link, &dev_link,
1008 		       sizeof(struct rte_eth_link));
1009 	dev->data->dev_link = dev_link;
1010 	return ret;
1011 }
1012 
1013 /**
1014  * DPDK callback to change the MTU.
1015  *
1016  * @param dev
1017  *   Pointer to Ethernet device structure.
1018  * @param in_mtu
1019  *   New MTU.
1020  *
1021  * @return
1022  *   0 on success, a negative errno value otherwise and rte_errno is set.
1023  */
1024 int
1025 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
1026 {
1027 	struct mlx5_priv *priv = dev->data->dev_private;
1028 	uint16_t kern_mtu = 0;
1029 	int ret;
1030 
1031 	ret = mlx5_get_mtu(dev, &kern_mtu);
1032 	if (ret)
1033 		return ret;
1034 	/* Set kernel interface MTU first. */
1035 	ret = mlx5_set_mtu(dev, mtu);
1036 	if (ret)
1037 		return ret;
1038 	ret = mlx5_get_mtu(dev, &kern_mtu);
1039 	if (ret)
1040 		return ret;
1041 	if (kern_mtu == mtu) {
1042 		priv->mtu = mtu;
1043 		DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
1044 			dev->data->port_id, mtu);
1045 		return 0;
1046 	}
1047 	rte_errno = EAGAIN;
1048 	return -rte_errno;
1049 }
1050 
1051 /**
1052  * DPDK callback to get flow control status.
1053  *
1054  * @param dev
1055  *   Pointer to Ethernet device structure.
1056  * @param[out] fc_conf
1057  *   Flow control output buffer.
1058  *
1059  * @return
1060  *   0 on success, a negative errno value otherwise and rte_errno is set.
1061  */
1062 int
1063 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1064 {
1065 	struct ifreq ifr;
1066 	struct ethtool_pauseparam ethpause = {
1067 		.cmd = ETHTOOL_GPAUSEPARAM
1068 	};
1069 	int ret;
1070 
1071 	ifr.ifr_data = (void *)&ethpause;
1072 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1073 	if (ret) {
1074 		DRV_LOG(WARNING,
1075 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
1076 			" %s",
1077 			dev->data->port_id, strerror(rte_errno));
1078 		return ret;
1079 	}
1080 	fc_conf->autoneg = ethpause.autoneg;
1081 	if (ethpause.rx_pause && ethpause.tx_pause)
1082 		fc_conf->mode = RTE_FC_FULL;
1083 	else if (ethpause.rx_pause)
1084 		fc_conf->mode = RTE_FC_RX_PAUSE;
1085 	else if (ethpause.tx_pause)
1086 		fc_conf->mode = RTE_FC_TX_PAUSE;
1087 	else
1088 		fc_conf->mode = RTE_FC_NONE;
1089 	return 0;
1090 }
1091 
1092 /**
1093  * DPDK callback to modify flow control parameters.
1094  *
1095  * @param dev
1096  *   Pointer to Ethernet device structure.
1097  * @param[in] fc_conf
1098  *   Flow control parameters.
1099  *
1100  * @return
1101  *   0 on success, a negative errno value otherwise and rte_errno is set.
1102  */
1103 int
1104 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1105 {
1106 	struct ifreq ifr;
1107 	struct ethtool_pauseparam ethpause = {
1108 		.cmd = ETHTOOL_SPAUSEPARAM
1109 	};
1110 	int ret;
1111 
1112 	ifr.ifr_data = (void *)&ethpause;
1113 	ethpause.autoneg = fc_conf->autoneg;
1114 	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1115 	    (fc_conf->mode & RTE_FC_RX_PAUSE))
1116 		ethpause.rx_pause = 1;
1117 	else
1118 		ethpause.rx_pause = 0;
1119 
1120 	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1121 	    (fc_conf->mode & RTE_FC_TX_PAUSE))
1122 		ethpause.tx_pause = 1;
1123 	else
1124 		ethpause.tx_pause = 0;
1125 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1126 	if (ret) {
1127 		DRV_LOG(WARNING,
1128 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
1129 			" failed: %s",
1130 			dev->data->port_id, strerror(rte_errno));
1131 		return ret;
1132 	}
1133 	return 0;
1134 }
1135 
1136 /**
1137  * Get PCI information from struct ibv_device.
1138  *
1139  * @param device
1140  *   Pointer to Ethernet device structure.
1141  * @param[out] pci_addr
1142  *   PCI bus address output buffer.
1143  *
1144  * @return
1145  *   0 on success, a negative errno value otherwise and rte_errno is set.
1146  */
1147 int
1148 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
1149 			    struct rte_pci_addr *pci_addr)
1150 {
1151 	FILE *file;
1152 	char line[32];
1153 	MKSTR(path, "%s/device/uevent", device->ibdev_path);
1154 
1155 	file = fopen(path, "rb");
1156 	if (file == NULL) {
1157 		rte_errno = errno;
1158 		return -rte_errno;
1159 	}
1160 	while (fgets(line, sizeof(line), file) == line) {
1161 		size_t len = strlen(line);
1162 		int ret;
1163 
1164 		/* Truncate long lines. */
1165 		if (len == (sizeof(line) - 1))
1166 			while (line[(len - 1)] != '\n') {
1167 				ret = fgetc(file);
1168 				if (ret == EOF)
1169 					break;
1170 				line[(len - 1)] = ret;
1171 			}
1172 		/* Extract information. */
1173 		if (sscanf(line,
1174 			   "PCI_SLOT_NAME="
1175 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
1176 			   &pci_addr->domain,
1177 			   &pci_addr->bus,
1178 			   &pci_addr->devid,
1179 			   &pci_addr->function) == 4) {
1180 			ret = 0;
1181 			break;
1182 		}
1183 	}
1184 	fclose(file);
1185 	return 0;
1186 }
1187 
1188 /**
1189  * Handle asynchronous removal event for entire multiport device.
1190  *
1191  * @param sh
1192  *   Infiniband device shared context.
1193  */
1194 static void
1195 mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh)
1196 {
1197 	uint32_t i;
1198 
1199 	for (i = 0; i < sh->max_port; ++i) {
1200 		struct rte_eth_dev *dev;
1201 
1202 		if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
1203 			/*
1204 			 * Or not existing port either no
1205 			 * handler installed for this port.
1206 			 */
1207 			continue;
1208 		}
1209 		dev = &rte_eth_devices[sh->port[i].ih_port_id];
1210 		assert(dev);
1211 		if (dev->data->dev_conf.intr_conf.rmv)
1212 			_rte_eth_dev_callback_process
1213 				(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
1214 	}
1215 }
1216 
1217 /**
1218  * Handle shared asynchronous events the NIC (removal event
1219  * and link status change). Supports multiport IB device.
1220  *
1221  * @param cb_arg
1222  *   Callback argument.
1223  */
1224 void
1225 mlx5_dev_interrupt_handler(void *cb_arg)
1226 {
1227 	struct mlx5_ibv_shared *sh = cb_arg;
1228 	struct ibv_async_event event;
1229 
1230 	/* Read all message from the IB device and acknowledge them. */
1231 	for (;;) {
1232 		struct rte_eth_dev *dev;
1233 		uint32_t tmp;
1234 
1235 		if (mlx5_glue->get_async_event(sh->ctx, &event))
1236 			break;
1237 		/* Retrieve and check IB port index. */
1238 		tmp = (uint32_t)event.element.port_num;
1239 		if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
1240 			/*
1241 			 * The DEVICE_FATAL event is called once for
1242 			 * entire device without port specifying.
1243 			 * We should notify all existing ports.
1244 			 */
1245 			mlx5_glue->ack_async_event(&event);
1246 			mlx5_dev_interrupt_device_fatal(sh);
1247 			continue;
1248 		}
1249 		assert(tmp && (tmp <= sh->max_port));
1250 		if (!tmp) {
1251 			/* Unsupported devive level event. */
1252 			mlx5_glue->ack_async_event(&event);
1253 			DRV_LOG(DEBUG,
1254 				"unsupported common event (type %d)",
1255 				event.event_type);
1256 			continue;
1257 		}
1258 		if (tmp > sh->max_port) {
1259 			/* Invalid IB port index. */
1260 			mlx5_glue->ack_async_event(&event);
1261 			DRV_LOG(DEBUG,
1262 				"cannot handle an event (type %d)"
1263 				"due to invalid IB port index (%u)",
1264 				event.event_type, tmp);
1265 			continue;
1266 		}
1267 		if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
1268 			/* No handler installed. */
1269 			mlx5_glue->ack_async_event(&event);
1270 			DRV_LOG(DEBUG,
1271 				"cannot handle an event (type %d)"
1272 				"due to no handler installed for port %u",
1273 				event.event_type, tmp);
1274 			continue;
1275 		}
1276 		/* Retrieve ethernet device descriptor. */
1277 		tmp = sh->port[tmp - 1].ih_port_id;
1278 		dev = &rte_eth_devices[tmp];
1279 		assert(dev);
1280 		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
1281 		     event.event_type == IBV_EVENT_PORT_ERR) &&
1282 			dev->data->dev_conf.intr_conf.lsc) {
1283 			mlx5_glue->ack_async_event(&event);
1284 			if (mlx5_link_update(dev, 0) == -EAGAIN) {
1285 				usleep(0);
1286 				continue;
1287 			}
1288 			_rte_eth_dev_callback_process
1289 				(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
1290 			continue;
1291 		}
1292 		DRV_LOG(DEBUG,
1293 			"port %u cannot handle an unknown event (type %d)",
1294 			dev->data->port_id, event.event_type);
1295 		mlx5_glue->ack_async_event(&event);
1296 	}
1297 }
1298 
1299 /*
1300  * Unregister callback handler safely. The handler may be active
1301  * while we are trying to unregister it, in this case code -EAGAIN
1302  * is returned by rte_intr_callback_unregister(). This routine checks
1303  * the return code and tries to unregister handler again.
1304  *
1305  * @param handle
1306  *   interrupt handle
1307  * @param cb_fn
1308  *   pointer to callback routine
1309  * @cb_arg
1310  *   opaque callback parameter
1311  */
1312 void
1313 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
1314 			      rte_intr_callback_fn cb_fn, void *cb_arg)
1315 {
1316 	/*
1317 	 * Try to reduce timeout management overhead by not calling
1318 	 * the timer related routines on the first iteration. If the
1319 	 * unregistering succeeds on first call there will be no
1320 	 * timer calls at all.
1321 	 */
1322 	uint64_t twait = 0;
1323 	uint64_t start = 0;
1324 
1325 	do {
1326 		int ret;
1327 
1328 		ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
1329 		if (ret >= 0)
1330 			return;
1331 		if (ret != -EAGAIN) {
1332 			DRV_LOG(INFO, "failed to unregister interrupt"
1333 				      " handler (error: %d)", ret);
1334 			assert(false);
1335 			return;
1336 		}
1337 		if (twait) {
1338 			struct timespec onems;
1339 
1340 			/* Wait one millisecond and try again. */
1341 			onems.tv_sec = 0;
1342 			onems.tv_nsec = NS_PER_S / MS_PER_S;
1343 			nanosleep(&onems, 0);
1344 			/* Check whether one second elapsed. */
1345 			if ((rte_get_timer_cycles() - start) <= twait)
1346 				continue;
1347 		} else {
1348 			/*
1349 			 * We get the amount of timer ticks for one second.
1350 			 * If this amount elapsed it means we spent one
1351 			 * second in waiting. This branch is executed once
1352 			 * on first iteration.
1353 			 */
1354 			twait = rte_get_timer_hz();
1355 			assert(twait);
1356 		}
1357 		/*
1358 		 * Timeout elapsed, show message (once a second) and retry.
1359 		 * We have no other acceptable option here, if we ignore
1360 		 * the unregistering return code the handler will not
1361 		 * be unregistered, fd will be closed and we may get the
1362 		 * crush. Hanging and messaging in the loop seems not to be
1363 		 * the worst choice.
1364 		 */
1365 		DRV_LOG(INFO, "Retrying to unregister interrupt handler");
1366 		start = rte_get_timer_cycles();
1367 	} while (true);
1368 }
1369 
1370 /**
1371  * Handle DEVX interrupts from the NIC.
1372  * This function is probably called from the DPDK host thread.
1373  *
1374  * @param cb_arg
1375  *   Callback argument.
1376  */
1377 void
1378 mlx5_dev_interrupt_handler_devx(void *cb_arg)
1379 {
1380 #ifndef HAVE_IBV_DEVX_ASYNC
1381 	(void)cb_arg;
1382 	return;
1383 #else
1384 	struct mlx5_ibv_shared *sh = cb_arg;
1385 	union {
1386 		struct mlx5dv_devx_async_cmd_hdr cmd_resp;
1387 		uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
1388 			    MLX5_ST_SZ_BYTES(traffic_counter) +
1389 			    sizeof(struct mlx5dv_devx_async_cmd_hdr)];
1390 	} out;
1391 	uint8_t *buf = out.buf + sizeof(out.cmd_resp);
1392 
1393 	while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
1394 						   &out.cmd_resp,
1395 						   sizeof(out.buf)))
1396 		mlx5_flow_async_pool_query_handle
1397 			(sh, (uint64_t)out.cmd_resp.wr_id,
1398 			 mlx5_devx_get_out_command_status(buf));
1399 #endif /* HAVE_IBV_DEVX_ASYNC */
1400 }
1401 
1402 /**
1403  * Uninstall shared asynchronous device events handler.
1404  * This function is implemented to support event sharing
1405  * between multiple ports of single IB device.
1406  *
1407  * @param dev
1408  *   Pointer to Ethernet device.
1409  */
1410 static void
1411 mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev)
1412 {
1413 	struct mlx5_priv *priv = dev->data->dev_private;
1414 	struct mlx5_ibv_shared *sh = priv->sh;
1415 
1416 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1417 		return;
1418 	pthread_mutex_lock(&sh->intr_mutex);
1419 	assert(priv->ibv_port);
1420 	assert(priv->ibv_port <= sh->max_port);
1421 	assert(dev->data->port_id < RTE_MAX_ETHPORTS);
1422 	if (sh->port[priv->ibv_port - 1].ih_port_id >= RTE_MAX_ETHPORTS)
1423 		goto exit;
1424 	assert(sh->port[priv->ibv_port - 1].ih_port_id ==
1425 					(uint32_t)dev->data->port_id);
1426 	assert(sh->intr_cnt);
1427 	sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1428 	if (!sh->intr_cnt || --sh->intr_cnt)
1429 		goto exit;
1430 	mlx5_intr_callback_unregister(&sh->intr_handle,
1431 				     mlx5_dev_interrupt_handler, sh);
1432 	sh->intr_handle.fd = 0;
1433 	sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1434 	if (sh->intr_handle_devx.fd) {
1435 		rte_intr_callback_unregister(&sh->intr_handle_devx,
1436 					     mlx5_dev_interrupt_handler_devx,
1437 					     sh);
1438 		sh->intr_handle_devx.fd = 0;
1439 		sh->intr_handle_devx.type = RTE_INTR_HANDLE_UNKNOWN;
1440 	}
1441 	if (sh->devx_comp) {
1442 		mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
1443 		sh->devx_comp = NULL;
1444 	}
1445 exit:
1446 	pthread_mutex_unlock(&sh->intr_mutex);
1447 }
1448 
1449 /**
1450  * Install shared asynchronous device events handler.
1451  * This function is implemented to support event sharing
1452  * between multiple ports of single IB device.
1453  *
1454  * @param dev
1455  *   Pointer to Ethernet device.
1456  */
1457 static void
1458 mlx5_dev_shared_handler_install(struct rte_eth_dev *dev)
1459 {
1460 	struct mlx5_priv *priv = dev->data->dev_private;
1461 	struct mlx5_ibv_shared *sh = priv->sh;
1462 	int ret;
1463 	int flags;
1464 
1465 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1466 		return;
1467 	pthread_mutex_lock(&sh->intr_mutex);
1468 	assert(priv->ibv_port);
1469 	assert(priv->ibv_port <= sh->max_port);
1470 	assert(dev->data->port_id < RTE_MAX_ETHPORTS);
1471 	if (sh->port[priv->ibv_port - 1].ih_port_id < RTE_MAX_ETHPORTS) {
1472 		/* The handler is already installed for this port. */
1473 		assert(sh->intr_cnt);
1474 		goto exit;
1475 	}
1476 	sh->port[priv->ibv_port - 1].ih_port_id = (uint32_t)dev->data->port_id;
1477 	if (sh->intr_cnt) {
1478 		sh->intr_cnt++;
1479 		goto exit;
1480 	}
1481 	/* No shared handler installed. */
1482 	assert(sh->ctx->async_fd > 0);
1483 	flags = fcntl(sh->ctx->async_fd, F_GETFL);
1484 	ret = fcntl(sh->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1485 	if (ret) {
1486 		DRV_LOG(INFO, "failed to change file descriptor"
1487 			      " async event queue");
1488 		goto error;
1489 	}
1490 	sh->intr_handle.fd = sh->ctx->async_fd;
1491 	sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
1492 	rte_intr_callback_register(&sh->intr_handle,
1493 				   mlx5_dev_interrupt_handler, sh);
1494 	if (priv->config.devx) {
1495 #ifndef HAVE_IBV_DEVX_ASYNC
1496 		goto error_unregister;
1497 #else
1498 		sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx);
1499 		if (sh->devx_comp) {
1500 			flags = fcntl(sh->devx_comp->fd, F_GETFL);
1501 			ret = fcntl(sh->devx_comp->fd, F_SETFL,
1502 				    flags | O_NONBLOCK);
1503 			if (ret) {
1504 				DRV_LOG(INFO, "failed to change file descriptor"
1505 					      " devx async event queue");
1506 				goto error_unregister;
1507 			}
1508 			sh->intr_handle_devx.fd = sh->devx_comp->fd;
1509 			sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
1510 			rte_intr_callback_register
1511 				(&sh->intr_handle_devx,
1512 				 mlx5_dev_interrupt_handler_devx, sh);
1513 		} else {
1514 			DRV_LOG(INFO, "failed to create devx async command "
1515 				"completion");
1516 			goto error_unregister;
1517 		}
1518 #endif /* HAVE_IBV_DEVX_ASYNC */
1519 	}
1520 	sh->intr_cnt++;
1521 	goto exit;
1522 error_unregister:
1523 	rte_intr_callback_unregister(&sh->intr_handle,
1524 				     mlx5_dev_interrupt_handler, sh);
1525 error:
1526 	/* Indicate there will be no interrupts. */
1527 	dev->data->dev_conf.intr_conf.lsc = 0;
1528 	dev->data->dev_conf.intr_conf.rmv = 0;
1529 	sh->intr_handle.fd = 0;
1530 	sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1531 	sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1532 exit:
1533 	pthread_mutex_unlock(&sh->intr_mutex);
1534 }
1535 
1536 /**
1537  * Uninstall interrupt handler.
1538  *
1539  * @param dev
1540  *   Pointer to Ethernet device.
1541  */
1542 void
1543 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
1544 {
1545 	mlx5_dev_shared_handler_uninstall(dev);
1546 }
1547 
1548 /**
1549  * Install interrupt handler.
1550  *
1551  * @param dev
1552  *   Pointer to Ethernet device.
1553  */
1554 void
1555 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
1556 {
1557 	mlx5_dev_shared_handler_install(dev);
1558 }
1559 
1560 /**
1561  * DPDK callback to bring the link DOWN.
1562  *
1563  * @param dev
1564  *   Pointer to Ethernet device structure.
1565  *
1566  * @return
1567  *   0 on success, a negative errno value otherwise and rte_errno is set.
1568  */
1569 int
1570 mlx5_set_link_down(struct rte_eth_dev *dev)
1571 {
1572 	return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1573 }
1574 
1575 /**
1576  * DPDK callback to bring the link UP.
1577  *
1578  * @param dev
1579  *   Pointer to Ethernet device structure.
1580  *
1581  * @return
1582  *   0 on success, a negative errno value otherwise and rte_errno is set.
1583  */
1584 int
1585 mlx5_set_link_up(struct rte_eth_dev *dev)
1586 {
1587 	return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1588 }
1589 
1590 /**
1591  * Configure the RX function to use.
1592  *
1593  * @param dev
1594  *   Pointer to private data structure.
1595  *
1596  * @return
1597  *   Pointer to selected Rx burst function.
1598  */
1599 eth_rx_burst_t
1600 mlx5_select_rx_function(struct rte_eth_dev *dev)
1601 {
1602 	eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1603 
1604 	assert(dev != NULL);
1605 	if (mlx5_check_vec_rx_support(dev) > 0) {
1606 		rx_pkt_burst = mlx5_rx_burst_vec;
1607 		DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1608 			dev->data->port_id);
1609 	} else if (mlx5_mprq_enabled(dev)) {
1610 		rx_pkt_burst = mlx5_rx_burst_mprq;
1611 	}
1612 	return rx_pkt_burst;
1613 }
1614 
1615 /**
1616  * Check if mlx5 device was removed.
1617  *
1618  * @param dev
1619  *   Pointer to Ethernet device structure.
1620  *
1621  * @return
1622  *   1 when device is removed, otherwise 0.
1623  */
1624 int
1625 mlx5_is_removed(struct rte_eth_dev *dev)
1626 {
1627 	struct ibv_device_attr device_attr;
1628 	struct mlx5_priv *priv = dev->data->dev_private;
1629 
1630 	if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
1631 		return 1;
1632 	return 0;
1633 }
1634 
1635 /**
1636  * Get port ID list of mlx5 instances sharing a common device.
1637  *
1638  * @param[in] dev
1639  *   Device to look for.
1640  * @param[out] port_list
1641  *   Result buffer for collected port IDs.
1642  * @param port_list_n
1643  *   Maximum number of entries in result buffer. If 0, @p port_list can be
1644  *   NULL.
1645  *
1646  * @return
1647  *   Number of matching instances regardless of the @p port_list_n
1648  *   parameter, 0 if none were found.
1649  */
1650 unsigned int
1651 mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list,
1652 		    unsigned int port_list_n)
1653 {
1654 	uint16_t id;
1655 	unsigned int n = 0;
1656 
1657 	RTE_ETH_FOREACH_DEV_OF(id, dev) {
1658 		if (n < port_list_n)
1659 			port_list[n] = id;
1660 		n++;
1661 	}
1662 	return n;
1663 }
1664 
1665 /**
1666  * Get the E-Switch domain id this port belongs to.
1667  *
1668  * @param[in] port
1669  *   Device port id.
1670  * @param[out] es_domain_id
1671  *   E-Switch domain id.
1672  * @param[out] es_port_id
1673  *   The port id of the port in the E-Switch.
1674  *
1675  * @return
1676  *   0 on success, a negative errno value otherwise and rte_errno is set.
1677  */
1678 int
1679 mlx5_port_to_eswitch_info(uint16_t port,
1680 			  uint16_t *es_domain_id, uint16_t *es_port_id)
1681 {
1682 	struct rte_eth_dev *dev;
1683 	struct mlx5_priv *priv;
1684 
1685 	if (port >= RTE_MAX_ETHPORTS) {
1686 		rte_errno = EINVAL;
1687 		return -rte_errno;
1688 	}
1689 	if (!rte_eth_dev_is_valid_port(port)) {
1690 		rte_errno = ENODEV;
1691 		return -rte_errno;
1692 	}
1693 	dev = &rte_eth_devices[port];
1694 	priv = dev->data->dev_private;
1695 	if (!(priv->representor || priv->master)) {
1696 		rte_errno = EINVAL;
1697 		return -rte_errno;
1698 	}
1699 	if (es_domain_id)
1700 		*es_domain_id = priv->domain_id;
1701 	if (es_port_id)
1702 		*es_port_id = priv->vport_id;
1703 	return 0;
1704 }
1705 
1706 /**
1707  * Get switch information associated with network interface.
1708  *
1709  * @param ifindex
1710  *   Network interface index.
1711  * @param[out] info
1712  *   Switch information object, populated in case of success.
1713  *
1714  * @return
1715  *   0 on success, a negative errno value otherwise and rte_errno is set.
1716  */
1717 int
1718 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
1719 {
1720 	char ifname[IF_NAMESIZE];
1721 	char port_name[IF_NAMESIZE];
1722 	FILE *file;
1723 	struct mlx5_switch_info data = {
1724 		.master = 0,
1725 		.representor = 0,
1726 		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1727 		.port_name = 0,
1728 		.switch_id = 0,
1729 	};
1730 	DIR *dir;
1731 	bool port_switch_id_set = false;
1732 	bool device_dir = false;
1733 	char c;
1734 	int ret;
1735 
1736 	if (!if_indextoname(ifindex, ifname)) {
1737 		rte_errno = errno;
1738 		return -rte_errno;
1739 	}
1740 
1741 	MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
1742 	      ifname);
1743 	MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
1744 	      ifname);
1745 	MKSTR(pci_device, "/sys/class/net/%s/device",
1746 	      ifname);
1747 
1748 	file = fopen(phys_port_name, "rb");
1749 	if (file != NULL) {
1750 		ret = fscanf(file, "%s", port_name);
1751 		fclose(file);
1752 		if (ret == 1)
1753 			mlx5_translate_port_name(port_name, &data);
1754 	}
1755 	file = fopen(phys_switch_id, "rb");
1756 	if (file == NULL) {
1757 		rte_errno = errno;
1758 		return -rte_errno;
1759 	}
1760 	port_switch_id_set =
1761 		fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1762 		c == '\n';
1763 	fclose(file);
1764 	dir = opendir(pci_device);
1765 	if (dir != NULL) {
1766 		closedir(dir);
1767 		device_dir = true;
1768 	}
1769 	if (port_switch_id_set) {
1770 		/* We have some E-Switch configuration. */
1771 		mlx5_sysfs_check_switch_info(device_dir, &data);
1772 	}
1773 	*info = data;
1774 	assert(!(data.master && data.representor));
1775 	if (data.master && data.representor) {
1776 		DRV_LOG(ERR, "ifindex %u device is recognized as master"
1777 			     " and as representor", ifindex);
1778 		rte_errno = ENODEV;
1779 		return -rte_errno;
1780 	}
1781 	return 0;
1782 }
1783 
1784 /**
1785  * Analyze gathered port parameters via Netlink to recognize master
1786  * and representor devices for E-Switch configuration.
1787  *
1788  * @param[in] num_vf_set
1789  *   flag of presence of number of VFs port attribute.
1790  * @param[inout] switch_info
1791  *   Port information, including port name as a number and port name
1792  *   type if recognized
1793  *
1794  * @return
1795  *   master and representor flags are set in switch_info according to
1796  *   recognized parameters (if any).
1797  */
1798 void
1799 mlx5_nl_check_switch_info(bool num_vf_set,
1800 			  struct mlx5_switch_info *switch_info)
1801 {
1802 	switch (switch_info->name_type) {
1803 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1804 		/*
1805 		 * Name is not recognized, assume the master,
1806 		 * check the number of VFs key presence.
1807 		 */
1808 		switch_info->master = num_vf_set;
1809 		break;
1810 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1811 		/*
1812 		 * Name is not set, this assumes the legacy naming
1813 		 * schema for master, just check if there is a
1814 		 * number of VFs key.
1815 		 */
1816 		switch_info->master = num_vf_set;
1817 		break;
1818 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1819 		/* New uplink naming schema recognized. */
1820 		switch_info->master = 1;
1821 		break;
1822 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1823 		/* Legacy representors naming schema. */
1824 		switch_info->representor = !num_vf_set;
1825 		break;
1826 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1827 		/* New representors naming schema. */
1828 		switch_info->representor = 1;
1829 		break;
1830 	}
1831 }
1832 
1833 /**
1834  * Analyze gathered port parameters via sysfs to recognize master
1835  * and representor devices for E-Switch configuration.
1836  *
1837  * @param[in] device_dir
1838  *   flag of presence of "device" directory under port device key.
1839  * @param[inout] switch_info
1840  *   Port information, including port name as a number and port name
1841  *   type if recognized
1842  *
1843  * @return
1844  *   master and representor flags are set in switch_info according to
1845  *   recognized parameters (if any).
1846  */
1847 void
1848 mlx5_sysfs_check_switch_info(bool device_dir,
1849 			     struct mlx5_switch_info *switch_info)
1850 {
1851 	switch (switch_info->name_type) {
1852 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1853 		/*
1854 		 * Name is not recognized, assume the master,
1855 		 * check the device directory presence.
1856 		 */
1857 		switch_info->master = device_dir;
1858 		break;
1859 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1860 		/*
1861 		 * Name is not set, this assumes the legacy naming
1862 		 * schema for master, just check if there is
1863 		 * a device directory.
1864 		 */
1865 		switch_info->master = device_dir;
1866 		break;
1867 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1868 		/* New uplink naming schema recognized. */
1869 		switch_info->master = 1;
1870 		break;
1871 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1872 		/* Legacy representors naming schema. */
1873 		switch_info->representor = !device_dir;
1874 		break;
1875 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1876 		/* New representors naming schema. */
1877 		switch_info->representor = 1;
1878 		break;
1879 	}
1880 }
1881 
1882 /**
1883  * Extract port name, as a number, from sysfs or netlink information.
1884  *
1885  * @param[in] port_name_in
1886  *   String representing the port name.
1887  * @param[out] port_info_out
1888  *   Port information, including port name as a number and port name
1889  *   type if recognized
1890  *
1891  * @return
1892  *   port_name field set according to recognized name format.
1893  */
1894 void
1895 mlx5_translate_port_name(const char *port_name_in,
1896 			 struct mlx5_switch_info *port_info_out)
1897 {
1898 	char pf_c1, pf_c2, vf_c1, vf_c2;
1899 	char *end;
1900 	int sc_items;
1901 
1902 	/*
1903 	 * Check for port-name as a string of the form pf0vf0
1904 	 * (support kernel ver >= 5.0 or OFED ver >= 4.6).
1905 	 */
1906 	sc_items = sscanf(port_name_in, "%c%c%d%c%c%d",
1907 			  &pf_c1, &pf_c2, &port_info_out->pf_num,
1908 			  &vf_c1, &vf_c2, &port_info_out->port_name);
1909 	if (sc_items == 6 &&
1910 	    pf_c1 == 'p' && pf_c2 == 'f' &&
1911 	    vf_c1 == 'v' && vf_c2 == 'f') {
1912 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFVF;
1913 		return;
1914 	}
1915 	/*
1916 	 * Check for port-name as a string of the form p0
1917 	 * (support kernel ver >= 5.0, or OFED ver >= 4.6).
1918 	 */
1919 	sc_items = sscanf(port_name_in, "%c%d",
1920 			  &pf_c1, &port_info_out->port_name);
1921 	if (sc_items == 2 && pf_c1 == 'p') {
1922 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
1923 		return;
1924 	}
1925 	/* Check for port-name as a number (support kernel ver < 5.0 */
1926 	errno = 0;
1927 	port_info_out->port_name = strtol(port_name_in, &end, 0);
1928 	if (!errno &&
1929 	    (size_t)(end - port_name_in) == strlen(port_name_in)) {
1930 		port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
1931 		return;
1932 	}
1933 	port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
1934 	return;
1935 }
1936