xref: /dpdk/drivers/net/mlx5/mlx5_ethdev.c (revision 5feecc57d90b97c579b16d1083ea167f7564530b)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #define _GNU_SOURCE
7 
8 #include <stddef.h>
9 #include <assert.h>
10 #include <inttypes.h>
11 #include <unistd.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <string.h>
15 #include <stdlib.h>
16 #include <errno.h>
17 #include <dirent.h>
18 #include <net/if.h>
19 #include <sys/ioctl.h>
20 #include <sys/socket.h>
21 #include <netinet/in.h>
22 #include <linux/ethtool.h>
23 #include <linux/sockios.h>
24 #include <fcntl.h>
25 #include <stdalign.h>
26 #include <sys/un.h>
27 #include <time.h>
28 
29 #include <rte_atomic.h>
30 #include <rte_ethdev_driver.h>
31 #include <rte_bus_pci.h>
32 #include <rte_mbuf.h>
33 #include <rte_common.h>
34 #include <rte_interrupts.h>
35 #include <rte_malloc.h>
36 #include <rte_string_fns.h>
37 
38 #include "mlx5.h"
39 #include "mlx5_glue.h"
40 #include "mlx5_rxtx.h"
41 #include "mlx5_utils.h"
42 
43 /* Add defines in case the running kernel is not the same as user headers. */
#ifndef ETHTOOL_GLINKSETTINGS
/*
 * Fallback definition of the kernel's struct ethtool_link_settings for
 * builds against headers that predate the ETHTOOL_GLINKSETTINGS ioctl.
 * Layout must match the kernel ABI exactly.
 */
struct ethtool_link_settings {
	uint32_t cmd;
	uint32_t speed;
	uint8_t duplex;
	uint8_t port;
	uint8_t phy_address;
	uint8_t autoneg;
	uint8_t mdio_support;
	uint8_t eth_to_mdix; /* NOTE(review): kernel calls this eth_tp_mdix; name-only difference, field unused here — confirm. */
	uint8_t eth_tp_mdix_ctrl;
	/* Negative on the first ioctl pass: kernel advertises mask size. */
	int8_t link_mode_masks_nwords;
	uint32_t reserved[8];
	/* Flexible array: supported, advertising and peer bitmaps. */
	uint32_t link_mode_masks[];
};

#define ETHTOOL_GLINKSETTINGS 0x0000004c
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif
/* Newer link-mode bits may also be missing from older headers. */
#ifndef HAVE_ETHTOOL_LINK_MODE_25G
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_50G
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_100G
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif
93 
94 /**
95  * Get interface name from private structure.
96  *
97  * @param[in] dev
98  *   Pointer to Ethernet device.
99  * @param[out] ifname
100  *   Interface name output buffer.
101  *
102  * @return
103  *   0 on success, a negative errno value otherwise and rte_errno is set.
104  */
int
mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
{
	struct priv *priv = dev->data->dev_private;
	DIR *dir;
	struct dirent *dent;
	/* 0: match netdevs on "dev_port", 1: match on "dev_id" (fallback). */
	unsigned int dev_type = 0;
	unsigned int dev_port_prev = ~0u;
	char match[IF_NAMESIZE] = "";

	{
		/* Scan netdevs exposed under the IB device sysfs directory. */
		MKSTR(path, "%s/device/net", priv->ibdev_path);

		dir = opendir(path);
		if (dir == NULL) {
			rte_errno = errno;
			return -rte_errno;
		}
	}
	while ((dent = readdir(dir)) != NULL) {
		char *name = dent->d_name;
		FILE *file;
		unsigned int dev_port;
		int r;

		/* Skip "." and ".." entries. */
		if ((name[0] == '.') &&
		    ((name[1] == '\0') ||
		     ((name[1] == '.') && (name[2] == '\0'))))
			continue;

		MKSTR(path, "%s/device/net/%s/%s",
		      priv->ibdev_path, name,
		      (dev_type ? "dev_id" : "dev_port"));

		file = fopen(path, "rb");
		if (file == NULL) {
			if (errno != ENOENT)
				continue;
			/*
			 * Switch to dev_id when dev_port does not exist as
			 * is the case with Linux kernel versions < 3.15.
			 */
try_dev_id:
			match[0] = '\0';
			if (dev_type)
				break;
			dev_type = 1;
			dev_port_prev = ~0u;
			/* Restart the directory scan with the new attribute. */
			rewinddir(dir);
			continue;
		}
		/* dev_id is hexadecimal, dev_port is decimal. */
		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
		fclose(file);
		if (r != 1)
			continue;
		/*
		 * Switch to dev_id when dev_port returns the same value for
		 * all ports. May happen when using a MOFED release older than
		 * 3.0 with a Linux kernel >= 3.15.
		 */
		if (dev_port == dev_port_prev)
			goto try_dev_id;
		dev_port_prev = dev_port;
		/* Ports are 1-based in priv, 0-based in sysfs. */
		if (dev_port == (priv->port - 1u))
			strlcpy(match, name, sizeof(match));
	}
	closedir(dir);
	if (match[0] == '\0') {
		rte_errno = ENOENT;
		return -rte_errno;
	}
	/* match is NUL-terminated and the same size as *ifname. */
	strncpy(*ifname, match, sizeof(*ifname));
	return 0;
}
179 
180 /**
181  * Perform ifreq ioctl() on associated Ethernet device.
182  *
183  * @param[in] dev
184  *   Pointer to Ethernet device.
185  * @param req
186  *   Request number to pass to ioctl().
187  * @param[out] ifr
188  *   Interface request structure output buffer.
189  *
190  * @return
191  *   0 on success, a negative errno value otherwise and rte_errno is set.
192  */
193 int
194 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
195 {
196 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
197 	int ret = 0;
198 
199 	if (sock == -1) {
200 		rte_errno = errno;
201 		return -rte_errno;
202 	}
203 	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
204 	if (ret)
205 		goto error;
206 	ret = ioctl(sock, req, ifr);
207 	if (ret == -1) {
208 		rte_errno = errno;
209 		goto error;
210 	}
211 	close(sock);
212 	return 0;
213 error:
214 	close(sock);
215 	return -rte_errno;
216 }
217 
218 /**
219  * Get device MTU.
220  *
221  * @param dev
222  *   Pointer to Ethernet device.
223  * @param[out] mtu
224  *   MTU value output buffer.
225  *
226  * @return
227  *   0 on success, a negative errno value otherwise and rte_errno is set.
228  */
229 int
230 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
231 {
232 	struct ifreq request;
233 	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
234 
235 	if (ret)
236 		return ret;
237 	*mtu = request.ifr_mtu;
238 	return 0;
239 }
240 
241 /**
242  * Set device MTU.
243  *
244  * @param dev
245  *   Pointer to Ethernet device.
246  * @param mtu
247  *   MTU value to set.
248  *
249  * @return
250  *   0 on success, a negative errno value otherwise and rte_errno is set.
251  */
252 static int
253 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
254 {
255 	struct ifreq request = { .ifr_mtu = mtu, };
256 
257 	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
258 }
259 
260 /**
261  * Set device flags.
262  *
263  * @param dev
264  *   Pointer to Ethernet device.
265  * @param keep
266  *   Bitmask for flags that must remain untouched.
267  * @param flags
268  *   Bitmask for flags to modify.
269  *
270  * @return
271  *   0 on success, a negative errno value otherwise and rte_errno is set.
272  */
273 int
274 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
275 {
276 	struct ifreq request;
277 	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
278 
279 	if (ret)
280 		return ret;
281 	request.ifr_flags &= keep;
282 	request.ifr_flags |= flags & ~keep;
283 	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
284 }
285 
286 /**
287  * DPDK callback for Ethernet device configuration.
288  *
289  * @param dev
290  *   Pointer to Ethernet device structure.
291  *
292  * @return
293  *   0 on success, a negative errno value otherwise and rte_errno is set.
294  */
int
mlx5_dev_configure(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int rxqs_n = dev->data->nb_rx_queues;
	unsigned int txqs_n = dev->data->nb_tx_queues;
	unsigned int i;
	unsigned int j;
	unsigned int reta_idx_n;
	/* Non-zero when the application supplied its own RSS hash key. */
	const uint8_t use_app_rss_key =
		!!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
	uint64_t supp_tx_offloads = mlx5_get_tx_port_offloads(dev);
	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
	uint64_t supp_rx_offloads =
		(mlx5_get_rx_port_offloads() |
		 mlx5_get_rx_queue_offloads(dev));
	uint64_t rx_offloads = dev->data->dev_conf.rxmode.offloads;
	int ret = 0;

	/* Reject any requested offload bit this port does not support. */
	if ((tx_offloads & supp_tx_offloads) != tx_offloads) {
		DRV_LOG(ERR,
			"port %u some Tx offloads are not supported requested"
			" 0x%" PRIx64 " supported 0x%" PRIx64,
			dev->data->port_id, tx_offloads, supp_tx_offloads);
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
	if ((rx_offloads & supp_rx_offloads) != rx_offloads) {
		DRV_LOG(ERR,
			"port %u some Rx offloads are not supported requested"
			" 0x%" PRIx64 " supported 0x%" PRIx64,
			dev->data->port_id, rx_offloads, supp_rx_offloads);
		rte_errno = ENOTSUP;
		return -rte_errno;
	}
	/* An application-provided RSS key must have the exact PMD length. */
	if (use_app_rss_key &&
	    (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
	     rss_hash_default_key_len)) {
		DRV_LOG(ERR, "port %u RSS key len must be %zu Bytes long",
			dev->data->port_id, rss_hash_default_key_len);
		rte_errno = EINVAL;
		return -rte_errno;
	}
	/* (Re)allocate storage for the active key; old contents replaced. */
	priv->rss_conf.rss_key =
		rte_realloc(priv->rss_conf.rss_key,
			    rss_hash_default_key_len, 0);
	if (!priv->rss_conf.rss_key) {
		DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
			dev->data->port_id, rxqs_n);
		rte_errno = ENOMEM;
		return -rte_errno;
	}
	/* Use the application key when given, else the built-in default. */
	memcpy(priv->rss_conf.rss_key,
	       use_app_rss_key ?
	       dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
	       rss_hash_default_key,
	       rss_hash_default_key_len);
	priv->rss_conf.rss_key_len = rss_hash_default_key_len;
	priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
	priv->rxqs = (void *)dev->data->rx_queues;
	priv->txqs = (void *)dev->data->tx_queues;
	if (txqs_n != priv->txqs_n) {
		DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
			dev->data->port_id, priv->txqs_n, txqs_n);
		priv->txqs_n = txqs_n;
	}
	/* The indirection table cannot address more queues than its size. */
	if (rxqs_n > priv->config.ind_table_max_size) {
		DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
			dev->data->port_id, rxqs_n);
		rte_errno = EINVAL;
		return -rte_errno;
	}
	/* Unchanged Rx queue count: keep the existing indirection table. */
	if (rxqs_n == priv->rxqs_n)
		return 0;
	DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
		dev->data->port_id, priv->rxqs_n, rxqs_n);
	priv->rxqs_n = rxqs_n;
	/* If the requested number of RX queues is not a power of two, use the
	 * maximum indirection table size for better balancing.
	 * The result is always rounded to the next power of two. */
	reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
				     priv->config.ind_table_max_size :
				     rxqs_n));
	ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
	if (ret)
		return ret;
	/* When the number of RX queues is not a power of two, the remaining
	 * table entries are padded with reused WQs and hashes are not spread
	 * uniformly. */
	for (i = 0, j = 0; (i != reta_idx_n); ++i) {
		(*priv->reta_idx)[i] = j;
		if (++j == rxqs_n)
			j = 0;
	}
	return 0;
}
391 
392 /**
393  * DPDK callback to get information about the device.
394  *
395  * @param dev
396  *   Pointer to Ethernet device structure.
397  * @param[out] info
398  *   Info structure output buffer.
399  */
400 void
401 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
402 {
403 	struct priv *priv = dev->data->dev_private;
404 	struct mlx5_dev_config *config = &priv->config;
405 	unsigned int max;
406 	char ifname[IF_NAMESIZE];
407 
408 	info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
409 	/* FIXME: we should ask the device for these values. */
410 	info->min_rx_bufsize = 32;
411 	info->max_rx_pktlen = 65536;
412 	/*
413 	 * Since we need one CQ per QP, the limit is the minimum number
414 	 * between the two values.
415 	 */
416 	max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
417 		      priv->device_attr.orig_attr.max_qp);
418 	/* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
419 	if (max >= 65535)
420 		max = 65535;
421 	info->max_rx_queues = max;
422 	info->max_tx_queues = max;
423 	info->max_mac_addrs = RTE_DIM(priv->mac);
424 	info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
425 	info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
426 				 info->rx_queue_offload_capa);
427 	info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
428 	if (mlx5_get_ifname(dev, &ifname) == 0)
429 		info->if_index = if_nametoindex(ifname);
430 	info->reta_size = priv->reta_idx_n ?
431 		priv->reta_idx_n : config->ind_table_max_size;
432 	info->hash_key_size = rss_hash_default_key_len;
433 	info->speed_capa = priv->link_speed_capa;
434 	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
435 }
436 
437 /**
438  * Get supported packet types.
439  *
440  * @param dev
441  *   Pointer to Ethernet device structure.
442  *
443  * @return
444  *   A pointer to the supported Packet types array.
445  */
446 const uint32_t *
447 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
448 {
449 	static const uint32_t ptypes[] = {
450 		/* refers to rxq_cq_to_pkt_type() */
451 		RTE_PTYPE_L2_ETHER,
452 		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
453 		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
454 		RTE_PTYPE_L4_NONFRAG,
455 		RTE_PTYPE_L4_FRAG,
456 		RTE_PTYPE_L4_TCP,
457 		RTE_PTYPE_L4_UDP,
458 		RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
459 		RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
460 		RTE_PTYPE_INNER_L4_NONFRAG,
461 		RTE_PTYPE_INNER_L4_FRAG,
462 		RTE_PTYPE_INNER_L4_TCP,
463 		RTE_PTYPE_INNER_L4_UDP,
464 		RTE_PTYPE_UNKNOWN
465 	};
466 
467 	if (dev->rx_pkt_burst == mlx5_rx_burst ||
468 	    dev->rx_pkt_burst == mlx5_rx_burst_vec)
469 		return ptypes;
470 	return NULL;
471 }
472 
473 /**
474  * DPDK callback to retrieve physical link information.
475  *
476  * @param dev
477  *   Pointer to Ethernet device structure.
478  * @param[out] link
479  *   Storage for current link status.
480  *
481  * @return
482  *   0 on success, a negative errno value otherwise and rte_errno is set.
483  */
static int
mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
			       struct rte_eth_link *link)
{
	struct priv *priv = dev->data->dev_private;
	struct ethtool_cmd edata = {
		.cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
	};
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	int link_speed = 0;
	int ret;

	/* Link status comes from the interface flags, not from ethtool. */
	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
	if (ret) {
		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
			dev->data->port_id, strerror(rte_errno));
		return ret;
	}
	memset(&dev_link, 0, sizeof(dev_link));
	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING));
	/* Speed, duplex and capabilities come from the legacy GSET ioctl. */
	ifr.ifr_data = (void *)&edata;
	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
	if (ret) {
		DRV_LOG(WARNING,
			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
			dev->data->port_id, strerror(rte_errno));
		return ret;
	}
	link_speed = ethtool_cmd_speed(&edata);
	/* -1 means speed unknown; report 0 in that case. */
	if (link_speed == -1)
		dev_link.link_speed = 0;
	else
		dev_link.link_speed = link_speed;
	/* Translate legacy SUPPORTED_* bits into ETH_LINK_SPEED_* capa. */
	priv->link_speed_capa = 0;
	if (edata.supported & SUPPORTED_Autoneg)
		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
	if (edata.supported & (SUPPORTED_1000baseT_Full |
			       SUPPORTED_1000baseKX_Full))
		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
	if (edata.supported & SUPPORTED_10000baseKR_Full)
		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
	if (edata.supported & (SUPPORTED_40000baseKR4_Full |
			       SUPPORTED_40000baseCR4_Full |
			       SUPPORTED_40000baseSR4_Full |
			       SUPPORTED_40000baseLR4_Full))
		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
			ETH_LINK_SPEED_FIXED);
	/* Speed/status mismatch means the link is still settling: retry. */
	if ((dev_link.link_speed && !dev_link.link_status) ||
	    (!dev_link.link_speed && dev_link.link_status)) {
		rte_errno = EAGAIN;
		return -rte_errno;
	}
	*link = dev_link;
	return 0;
}
544 
545 /**
546  * Retrieve physical link information (unlocked version using new ioctl).
547  *
548  * @param dev
549  *   Pointer to Ethernet device structure.
550  * @param[out] link
551  *   Storage for current link status.
552  *
553  * @return
554  *   0 on success, a negative errno value otherwise and rte_errno is set.
555  */
static int
mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
			     struct rte_eth_link *link)

{
	struct priv *priv = dev->data->dev_private;
	struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	uint64_t sc;
	int ret;

	/* Link status comes from the interface flags, not from ethtool. */
	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
	if (ret) {
		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
			dev->data->port_id, strerror(rte_errno));
		return ret;
	}
	memset(&dev_link, 0, sizeof(dev_link));
	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING));
	/*
	 * GLINKSETTINGS handshake, pass 1: with nwords == 0 the kernel
	 * replies with a negative link_mode_masks_nwords advertising the
	 * mask size it expects.
	 */
	ifr.ifr_data = (void *)&gcmd;
	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
	if (ret) {
		DRV_LOG(DEBUG,
			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
			" failed: %s",
			dev->data->port_id, strerror(rte_errno));
		return ret;
	}
	gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;

	/* Room for the supported/advertising/peer masks (3 sets of words). */
	alignas(struct ethtool_link_settings)
	uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
		     sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
	struct ethtool_link_settings *ecmd = (void *)data;

	/* Pass 2: positive nwords makes the kernel fill in the masks. */
	*ecmd = gcmd;
	ifr.ifr_data = (void *)ecmd;
	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
	if (ret) {
		DRV_LOG(DEBUG,
			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
			" failed: %s",
			dev->data->port_id, strerror(rte_errno));
		return ret;
	}
	dev_link.link_speed = ecmd->speed;
	/* Only the first 64 supported-mode bits are examined. */
	sc = ecmd->link_mode_masks[0] |
		((uint64_t)ecmd->link_mode_masks[1] << 32);
	priv->link_speed_capa = 0;
	if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_20G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_56G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_25G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_50G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_100G;
	dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
				  ETH_LINK_SPEED_FIXED);
	/* Speed/status mismatch means the link is still settling: retry. */
	if ((dev_link.link_speed && !dev_link.link_status) ||
	    (!dev_link.link_speed && dev_link.link_status)) {
		rte_errno = EAGAIN;
		return -rte_errno;
	}
	*link = dev_link;
	return 0;
}
653 
654 /**
655  * DPDK callback to retrieve physical link information.
656  *
657  * @param dev
658  *   Pointer to Ethernet device structure.
659  * @param wait_to_complete
660  *   Wait for request completion.
661  *
662  * @return
663  *   0 if link status was not updated, positive if it was, a negative errno
664  *   value otherwise and rte_errno is set.
665  */
int
mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
{
	int ret;
	struct rte_eth_link dev_link;
	time_t start_time = time(NULL);

	do {
		/* Try the legacy GSET ioctl first, then GLINKSETTINGS. */
		ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
		if (ret)
			ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
		if (ret == 0)
			break;
		/* Handle wait to complete situation. */
		if (wait_to_complete && ret == -EAGAIN) {
			/* Keep polling until MLX5_LINK_STATUS_TIMEOUT secs. */
			if (abs((int)difftime(time(NULL), start_time)) <
			    MLX5_LINK_STATUS_TIMEOUT) {
				usleep(0);
				continue;
			} else {
				rte_errno = EBUSY;
				return -rte_errno;
			}
		} else if (ret < 0) {
			return ret;
		}
	} while (wait_to_complete);
	/* Return 1 when the cached link information actually changed. */
	ret = !!memcmp(&dev->data->dev_link, &dev_link,
		       sizeof(struct rte_eth_link));
	dev->data->dev_link = dev_link;
	return ret;
}
698 
699 /**
700  * DPDK callback to change the MTU.
701  *
702  * @param dev
703  *   Pointer to Ethernet device structure.
704  * @param in_mtu
705  *   New MTU.
706  *
707  * @return
708  *   0 on success, a negative errno value otherwise and rte_errno is set.
709  */
710 int
711 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
712 {
713 	struct priv *priv = dev->data->dev_private;
714 	uint16_t kern_mtu = 0;
715 	int ret;
716 
717 	ret = mlx5_get_mtu(dev, &kern_mtu);
718 	if (ret)
719 		return ret;
720 	/* Set kernel interface MTU first. */
721 	ret = mlx5_set_mtu(dev, mtu);
722 	if (ret)
723 		return ret;
724 	ret = mlx5_get_mtu(dev, &kern_mtu);
725 	if (ret)
726 		return ret;
727 	if (kern_mtu == mtu) {
728 		priv->mtu = mtu;
729 		DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
730 			dev->data->port_id, mtu);
731 		return 0;
732 	}
733 	rte_errno = EAGAIN;
734 	return -rte_errno;
735 }
736 
737 /**
738  * DPDK callback to get flow control status.
739  *
740  * @param dev
741  *   Pointer to Ethernet device structure.
742  * @param[out] fc_conf
743  *   Flow control output buffer.
744  *
745  * @return
746  *   0 on success, a negative errno value otherwise and rte_errno is set.
747  */
748 int
749 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
750 {
751 	struct ifreq ifr;
752 	struct ethtool_pauseparam ethpause = {
753 		.cmd = ETHTOOL_GPAUSEPARAM
754 	};
755 	int ret;
756 
757 	ifr.ifr_data = (void *)&ethpause;
758 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
759 	if (ret) {
760 		DRV_LOG(WARNING,
761 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
762 			" %s",
763 			dev->data->port_id, strerror(rte_errno));
764 		return ret;
765 	}
766 	fc_conf->autoneg = ethpause.autoneg;
767 	if (ethpause.rx_pause && ethpause.tx_pause)
768 		fc_conf->mode = RTE_FC_FULL;
769 	else if (ethpause.rx_pause)
770 		fc_conf->mode = RTE_FC_RX_PAUSE;
771 	else if (ethpause.tx_pause)
772 		fc_conf->mode = RTE_FC_TX_PAUSE;
773 	else
774 		fc_conf->mode = RTE_FC_NONE;
775 	return 0;
776 }
777 
778 /**
779  * DPDK callback to modify flow control parameters.
780  *
781  * @param dev
782  *   Pointer to Ethernet device structure.
783  * @param[in] fc_conf
784  *   Flow control parameters.
785  *
786  * @return
787  *   0 on success, a negative errno value otherwise and rte_errno is set.
788  */
789 int
790 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
791 {
792 	struct ifreq ifr;
793 	struct ethtool_pauseparam ethpause = {
794 		.cmd = ETHTOOL_SPAUSEPARAM
795 	};
796 	int ret;
797 
798 	ifr.ifr_data = (void *)&ethpause;
799 	ethpause.autoneg = fc_conf->autoneg;
800 	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
801 	    (fc_conf->mode & RTE_FC_RX_PAUSE))
802 		ethpause.rx_pause = 1;
803 	else
804 		ethpause.rx_pause = 0;
805 
806 	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
807 	    (fc_conf->mode & RTE_FC_TX_PAUSE))
808 		ethpause.tx_pause = 1;
809 	else
810 		ethpause.tx_pause = 0;
811 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
812 	if (ret) {
813 		DRV_LOG(WARNING,
814 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
815 			" failed: %s",
816 			dev->data->port_id, strerror(rte_errno));
817 		return ret;
818 	}
819 	return 0;
820 }
821 
822 /**
823  * Get PCI information from struct ibv_device.
824  *
825  * @param device
826  *   Pointer to Ethernet device structure.
827  * @param[out] pci_addr
828  *   PCI bus address output buffer.
829  *
830  * @return
831  *   0 on success, a negative errno value otherwise and rte_errno is set.
832  */
833 int
834 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
835 			    struct rte_pci_addr *pci_addr)
836 {
837 	FILE *file;
838 	char line[32];
839 	MKSTR(path, "%s/device/uevent", device->ibdev_path);
840 
841 	file = fopen(path, "rb");
842 	if (file == NULL) {
843 		rte_errno = errno;
844 		return -rte_errno;
845 	}
846 	while (fgets(line, sizeof(line), file) == line) {
847 		size_t len = strlen(line);
848 		int ret;
849 
850 		/* Truncate long lines. */
851 		if (len == (sizeof(line) - 1))
852 			while (line[(len - 1)] != '\n') {
853 				ret = fgetc(file);
854 				if (ret == EOF)
855 					break;
856 				line[(len - 1)] = ret;
857 			}
858 		/* Extract information. */
859 		if (sscanf(line,
860 			   "PCI_SLOT_NAME="
861 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
862 			   &pci_addr->domain,
863 			   &pci_addr->bus,
864 			   &pci_addr->devid,
865 			   &pci_addr->function) == 4) {
866 			ret = 0;
867 			break;
868 		}
869 	}
870 	fclose(file);
871 	return 0;
872 }
873 
874 /**
875  * Device status handler.
876  *
877  * @param dev
878  *   Pointer to Ethernet device.
879  * @param events
880  *   Pointer to event flags holder.
881  *
882  * @return
883  *   Events bitmap of callback process which can be called immediately.
884  */
static uint32_t
mlx5_dev_status_handler(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	struct ibv_async_event event;
	uint32_t ret = 0;

	/* Link still settling (EAGAIN): yield and report nothing yet. */
	if (mlx5_link_update(dev, 0) == -EAGAIN) {
		usleep(0);
		return 0;
	}
	/* Read all message and acknowledge them. */
	for (;;) {
		if (mlx5_glue->get_async_event(priv->ctx, &event))
			break;
		/* Only forward events the application subscribed to. */
		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
			event.event_type == IBV_EVENT_PORT_ERR) &&
			(dev->data->dev_conf.intr_conf.lsc == 1))
			ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
		else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
			dev->data->dev_conf.intr_conf.rmv == 1)
			ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
		else
			DRV_LOG(DEBUG,
				"port %u event type %d on not handled",
				dev->data->port_id, event.event_type);
		/* Every retrieved event must be acknowledged. */
		mlx5_glue->ack_async_event(&event);
	}
	return ret;
}
915 
916 /**
917  * Handle interrupts from the NIC.
918  *
919  * @param[in] intr_handle
920  *   Interrupt handler.
921  * @param cb_arg
922  *   Callback argument.
923  */
924 void
925 mlx5_dev_interrupt_handler(void *cb_arg)
926 {
927 	struct rte_eth_dev *dev = cb_arg;
928 	uint32_t events;
929 
930 	events = mlx5_dev_status_handler(dev);
931 	if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
932 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
933 	if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
934 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
935 }
936 
937 /**
938  * Handle interrupts from the socket.
939  *
940  * @param cb_arg
941  *   Callback argument.
942  */
static void
mlx5_dev_handler_socket(void *cb_arg)
{
	/* Forward the socket event to the IPC handler. */
	mlx5_socket_handle((struct rte_eth_dev *)cb_arg);
}
950 
951 /**
952  * Uninstall interrupt handler.
953  *
954  * @param dev
955  *   Pointer to Ethernet device.
956  */
957 void
958 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
959 {
960 	struct priv *priv = dev->data->dev_private;
961 
962 	if (dev->data->dev_conf.intr_conf.lsc ||
963 	    dev->data->dev_conf.intr_conf.rmv)
964 		rte_intr_callback_unregister(&priv->intr_handle,
965 					     mlx5_dev_interrupt_handler, dev);
966 	if (priv->primary_socket)
967 		rte_intr_callback_unregister(&priv->intr_handle_socket,
968 					     mlx5_dev_handler_socket, dev);
969 	priv->intr_handle.fd = 0;
970 	priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
971 	priv->intr_handle_socket.fd = 0;
972 	priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
973 }
974 
975 /**
976  * Install interrupt handler.
977  *
978  * @param dev
979  *   Pointer to Ethernet device.
980  */
void
mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	int ret;
	int flags;

	assert(priv->ctx->async_fd > 0);
	/* Async event fd must be non-blocking so the handler never stalls. */
	flags = fcntl(priv->ctx->async_fd, F_GETFL);
	ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
	if (ret) {
		DRV_LOG(INFO,
			"port %u failed to change file descriptor async event"
			" queue",
			dev->data->port_id);
		/* Degrade gracefully: disable interrupt-driven events. */
		dev->data->dev_conf.intr_conf.lsc = 0;
		dev->data->dev_conf.intr_conf.rmv = 0;
	}
	if (dev->data->dev_conf.intr_conf.lsc ||
	    dev->data->dev_conf.intr_conf.rmv) {
		priv->intr_handle.fd = priv->ctx->async_fd;
		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
		rte_intr_callback_register(&priv->intr_handle,
					   mlx5_dev_interrupt_handler, dev);
	}
	/* Socket used by secondary processes to reach the primary. */
	ret = mlx5_socket_init(dev);
	if (ret)
		DRV_LOG(ERR, "port %u cannot initialise socket: %s",
			dev->data->port_id, strerror(rte_errno));
	else if (priv->primary_socket) {
		priv->intr_handle_socket.fd = priv->primary_socket;
		priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
		rte_intr_callback_register(&priv->intr_handle_socket,
					   mlx5_dev_handler_socket, dev);
	}
}
1017 
1018 /**
1019  * DPDK callback to bring the link DOWN.
1020  *
1021  * @param dev
1022  *   Pointer to Ethernet device structure.
1023  *
1024  * @return
1025  *   0 on success, a negative errno value otherwise and rte_errno is set.
1026  */
int
mlx5_set_link_down(struct rte_eth_dev *dev)
{
	/* Keep all flags except IFF_UP untouched; clear IFF_UP itself. */
	return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
}
1032 
1033 /**
1034  * DPDK callback to bring the link UP.
1035  *
1036  * @param dev
1037  *   Pointer to Ethernet device structure.
1038  *
1039  * @return
1040  *   0 on success, a negative errno value otherwise and rte_errno is set.
1041  */
int
mlx5_set_link_up(struct rte_eth_dev *dev)
{
	/* Keep all flags except IFF_UP untouched; set IFF_UP itself. */
	return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
}
1047 
1048 /**
1049  * Configure the TX function to use.
1050  *
1051  * @param dev
1052  *   Pointer to private data structure.
1053  *
1054  * @return
1055  *   Pointer to selected Tx burst function.
1056  */
1057 eth_tx_burst_t
1058 mlx5_select_tx_function(struct rte_eth_dev *dev)
1059 {
1060 	struct priv *priv = dev->data->dev_private;
1061 	eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
1062 	struct mlx5_dev_config *config = &priv->config;
1063 	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
1064 	int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
1065 				    DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
1066 				    DEV_TX_OFFLOAD_GRE_TNL_TSO));
1067 	int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);
1068 
1069 	assert(priv != NULL);
1070 	/* Select appropriate TX function. */
1071 	if (vlan_insert || tso)
1072 		return tx_pkt_burst;
1073 	if (config->mps == MLX5_MPW_ENHANCED) {
1074 		if (mlx5_check_vec_tx_support(dev) > 0) {
1075 			if (mlx5_check_raw_vec_tx_support(dev) > 0)
1076 				tx_pkt_burst = mlx5_tx_burst_raw_vec;
1077 			else
1078 				tx_pkt_burst = mlx5_tx_burst_vec;
1079 			DRV_LOG(DEBUG,
1080 				"port %u selected enhanced MPW Tx vectorized"
1081 				" function",
1082 				dev->data->port_id);
1083 		} else {
1084 			tx_pkt_burst = mlx5_tx_burst_empw;
1085 			DRV_LOG(DEBUG,
1086 				"port %u selected enhanced MPW Tx function",
1087 				dev->data->port_id);
1088 		}
1089 	} else if (config->mps && (config->txq_inline > 0)) {
1090 		tx_pkt_burst = mlx5_tx_burst_mpw_inline;
1091 		DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
1092 			dev->data->port_id);
1093 	} else if (config->mps) {
1094 		tx_pkt_burst = mlx5_tx_burst_mpw;
1095 		DRV_LOG(DEBUG, "port %u selected MPW Tx function",
1096 			dev->data->port_id);
1097 	}
1098 	return tx_pkt_burst;
1099 }
1100 
1101 /**
1102  * Configure the RX function to use.
1103  *
1104  * @param dev
1105  *   Pointer to private data structure.
1106  *
1107  * @return
1108  *   Pointer to selected Rx burst function.
1109  */
1110 eth_rx_burst_t
1111 mlx5_select_rx_function(struct rte_eth_dev *dev)
1112 {
1113 	eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1114 
1115 	assert(dev != NULL);
1116 	if (mlx5_check_vec_rx_support(dev) > 0) {
1117 		rx_pkt_burst = mlx5_rx_burst_vec;
1118 		DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1119 			dev->data->port_id);
1120 	}
1121 	return rx_pkt_burst;
1122 }
1123 
1124 /**
1125  * Check if mlx5 device was removed.
1126  *
1127  * @param dev
1128  *   Pointer to Ethernet device structure.
1129  *
1130  * @return
1131  *   1 when device is removed, otherwise 0.
1132  */
1133 int
1134 mlx5_is_removed(struct rte_eth_dev *dev)
1135 {
1136 	struct ibv_device_attr device_attr;
1137 	struct priv *priv = dev->data->dev_private;
1138 
1139 	if (mlx5_glue->query_device(priv->ctx, &device_attr) == EIO)
1140 		return 1;
1141 	return 0;
1142 }
1143