/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox.
 */

#define _GNU_SOURCE

#include <stddef.h>
#include <assert.h>
#include <inttypes.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <dirent.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/utsname.h>
#include <netinet/in.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include <linux/version.h>
#include <fcntl.h>
#include <stdalign.h>
#include <sys/un.h>

#include <rte_atomic.h>
#include <rte_ethdev_driver.h>
#include <rte_bus_pci.h>
#include <rte_mbuf.h>
#include <rte_common.h>
#include <rte_interrupts.h>
#include <rte_alarm.h>
#include <rte_malloc.h>

#include "mlx5.h"
#include "mlx5_glue.h"
#include "mlx5_rxtx.h"
#include "mlx5_utils.h"

/* Add defines in case the running kernel is not the same as user headers. */
#ifndef ETHTOOL_GLINKSETTINGS
struct ethtool_link_settings {
	uint32_t cmd;
	uint32_t speed;
	uint8_t duplex;
	uint8_t port;
	uint8_t phy_address;
	uint8_t autoneg;
	uint8_t mdio_support;
	uint8_t eth_tp_mdix;
	uint8_t eth_tp_mdix_ctrl;
	int8_t link_mode_masks_nwords;
	uint32_t reserved[8];
	uint32_t link_mode_masks[];
};

#define ETHTOOL_GLINKSETTINGS 0x0000004c
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_25G
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_50G
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_100G
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif

/**
 * Get interface name from private structure.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[out] ifname
 *   Interface name output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
{
	DIR *dir;
	struct dirent *dent;
	unsigned int dev_type = 0;
	unsigned int dev_port_prev = ~0u;
	char match[IF_NAMESIZE] = "";

	{
		MKSTR(path, "%s/device/net", priv->ibdev_path);

		dir = opendir(path);
		if (dir == NULL)
			return -1;
	}
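	/*
	 * Scan every netdev exposed under the IB device and keep the one
	 * whose dev_port (or dev_id on older kernels) matches this port.
	 */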
	while ((dent = readdir(dir)) != NULL) {
		char *name = dent->d_name;
		FILE *file;
		unsigned int dev_port;
		int r;

		if ((name[0] == '.') &&
		    ((name[1] == '\0') ||
		     ((name[1] == '.') && (name[2] == '\0'))))
			continue;

		MKSTR(path, "%s/device/net/%s/%s",
		      priv->ibdev_path, name,
		      (dev_type ? "dev_id" : "dev_port"));

		file = fopen(path, "rb");
		if (file == NULL) {
			if (errno != ENOENT)
				continue;
			/*
			 * Switch to dev_id when dev_port does not exist as
			 * is the case with Linux kernel versions < 3.15.
			 */
try_dev_id:
			match[0] = '\0';
			if (dev_type)
				break;
			dev_type = 1;
			dev_port_prev = ~0u;
			rewinddir(dir);
			continue;
		}
		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
		fclose(file);
		if (r != 1)
			continue;
		/*
		 * Switch to dev_id when dev_port returns the same value for
		 * all ports. May happen when using a MOFED release older than
		 * 3.0 with a Linux kernel >= 3.15.
		 */
		if (dev_port == dev_port_prev)
			goto try_dev_id;
		dev_port_prev = dev_port;
		if (dev_port == (priv->port - 1u))
			snprintf(match, sizeof(match), "%s", name);
	}
	closedir(dir);
	if (match[0] == '\0')
		return -1;
	strncpy(*ifname, match, sizeof(*ifname));
	return 0;
}

/**
 * Perform ifreq ioctl() on associated Ethernet device.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param req
 *   Request number to pass to ioctl().
 * @param[out] ifr
 *   Interface request structure output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr)
{
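	/* Any AF_INET datagram socket works as a channel for these ioctls. */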
	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
	int ret = -1;

	if (sock == -1)
		return ret;
	if (priv_get_ifname(priv, &ifr->ifr_name) == 0)
		ret = ioctl(sock, req, ifr);
	close(sock);
	return ret;
}

/**
 * Return the number of active VFs for the current device.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[out] num_vfs
 *   Number of active VFs.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_num_vfs(struct priv *priv, uint16_t *num_vfs)
{
	/* The sysfs entry name depends on the kernel and driver in use. */
	const char **name = (const char *[]){
		"sriov_numvfs",
		"mlx5_num_vfs",
		NULL,
	};

	do {
		int n;
		FILE *file;
		MKSTR(path, "%s/device/%s", priv->ibdev_path, *name);

		file = fopen(path, "rb");
		if (!file)
			continue;
		n = fscanf(file, "%" SCNu16, num_vfs);
		fclose(file);
		if (n == 1)
			return 0;
	} while (*(++name));
	return -1;
}

/**
 * Get device MTU.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] mtu
 *   MTU value output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_mtu(struct priv *priv, uint16_t *mtu)
{
	struct ifreq request;
	int ret = priv_ifreq(priv, SIOCGIFMTU, &request);

	if (ret)
		return ret;
	*mtu = request.ifr_mtu;
	return 0;
}

/**
 * Set device MTU.
 *
 * @param priv
 *   Pointer to private structure.
 * @param mtu
 *   MTU value to set.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_set_mtu(struct priv *priv, uint16_t mtu)
{
	struct ifreq request = { .ifr_mtu = mtu, };

	return priv_ifreq(priv, SIOCSIFMTU, &request);
}

/**
 * Set device flags.
 *
 * @param priv
 *   Pointer to private structure.
 * @param keep
 *   Bitmask for flags that must remain untouched.
 * @param flags
 *   Bitmask for flags to modify.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
{
	struct ifreq request;
	int ret = priv_ifreq(priv, SIOCGIFFLAGS, &request);

	if (ret)
		return ret;
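	/* Preserve the flags selected by "keep", then merge in the others. */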
	request.ifr_flags &= keep;
	request.ifr_flags |= flags & ~keep;
	return priv_ifreq(priv, SIOCSIFFLAGS, &request);
}

/**
 * Ethernet device configuration.
 *
 * Prepare the driver for a given number of TX and RX queues.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
dev_configure(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int rxqs_n = dev->data->nb_rx_queues;
	unsigned int txqs_n = dev->data->nb_tx_queues;
	unsigned int i;
	unsigned int j;
	unsigned int reta_idx_n;
	const uint8_t use_app_rss_key =
		!!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
	uint64_t supp_tx_offloads = mlx5_priv_get_tx_port_offloads(priv);
	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
	uint64_t supp_rx_offloads =
		(mlx5_priv_get_rx_port_offloads(priv) |
		 mlx5_priv_get_rx_queue_offloads(priv));
	uint64_t rx_offloads = dev->data->dev_conf.rxmode.offloads;

	if ((tx_offloads & supp_tx_offloads) != tx_offloads) {
		ERROR("Some Tx offloads are not supported: "
		      "requested 0x%" PRIx64 ", supported 0x%" PRIx64,
		      tx_offloads, supp_tx_offloads);
		return ENOTSUP;
	}
	if ((rx_offloads & supp_rx_offloads) != rx_offloads) {
		ERROR("Some Rx offloads are not supported: "
		      "requested 0x%" PRIx64 ", supported 0x%" PRIx64,
		      rx_offloads, supp_rx_offloads);
		return ENOTSUP;
	}
	if (use_app_rss_key &&
	    (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
	     rss_hash_default_key_len)) {
		/* MLX5 RSS supports only 40-byte keys. */
		return EINVAL;
	}
	priv->rss_conf.rss_key =
		rte_realloc(priv->rss_conf.rss_key,
			    rss_hash_default_key_len, 0);
	if (!priv->rss_conf.rss_key) {
		ERROR("cannot allocate RSS hash key memory (%u)", rxqs_n);
		return ENOMEM;
	}
	memcpy(priv->rss_conf.rss_key,
	       use_app_rss_key ?
	       dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
	       rss_hash_default_key,
	       rss_hash_default_key_len);
	priv->rss_conf.rss_key_len = rss_hash_default_key_len;
	priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
	priv->rxqs = (void *)dev->data->rx_queues;
	priv->txqs = (void *)dev->data->tx_queues;
	if (txqs_n != priv->txqs_n) {
		INFO("%p: TX queues number update: %u -> %u",
		     (void *)dev, priv->txqs_n, txqs_n);
		priv->txqs_n = txqs_n;
	}
	if (rxqs_n > priv->config.ind_table_max_size) {
		ERROR("cannot handle this many RX queues (%u)", rxqs_n);
		return EINVAL;
	}
	if (rxqs_n == priv->rxqs_n)
		return 0;
	INFO("%p: RX queues number update: %u -> %u",
	     (void *)dev, priv->rxqs_n, rxqs_n);
	priv->rxqs_n = rxqs_n;
	/* If the requested number of RX queues is not a power of two, use the
	 * maximum indirection table size for better balancing.
	 * The result is always rounded to the next power of two. */
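	/* (rxqs_n & (rxqs_n - 1)) is nonzero exactly when rxqs_n is not a
	 * power of two. For example, rxqs_n = 6 selects the full
	 * ind_table_max_size (e.g. 512), while rxqs_n = 8 keeps
	 * reta_idx_n = 8. */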
	reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
				     priv->config.ind_table_max_size :
				     rxqs_n));
	if (priv_rss_reta_index_resize(priv, reta_idx_n))
		return ENOMEM;
	/* When the number of RX queues is not a power of two, the remaining
	 * table entries are padded with reused WQs and hashes are not spread
	 * uniformly. */
	for (i = 0, j = 0; (i != reta_idx_n); ++i) {
		(*priv->reta_idx)[i] = j;
		if (++j == rxqs_n)
			j = 0;
	}
	return 0;
}

/**
 * DPDK callback for Ethernet device configuration.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_dev_configure(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	int ret;

	priv_lock(priv);
	ret = dev_configure(dev);
	assert(ret >= 0);
	priv_unlock(priv);
	return -ret;
}

/**
 * DPDK callback to get information about the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] info
 *   Info structure output buffer.
 */
void
mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
{
	struct priv *priv = dev->data->dev_private;
	struct mlx5_dev_config *config = &priv->config;
	unsigned int max;
	char ifname[IF_NAMESIZE];

	info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);

	priv_lock(priv);
	/* FIXME: we should ask the device for these values. */
	info->min_rx_bufsize = 32;
	info->max_rx_pktlen = 65536;
	/*
	 * Since we need one CQ per QP, the limit is the minimum number
	 * between the two values.
	 */
	max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
		      priv->device_attr.orig_attr.max_qp);
	/* max_rx_queues is uint16_t, so cap the value at 65535. */
	if (max >= 65535)
		max = 65535;
	info->max_rx_queues = max;
	info->max_tx_queues = max;
	info->max_mac_addrs = RTE_DIM(priv->mac);
	info->rx_queue_offload_capa =
		mlx5_priv_get_rx_queue_offloads(priv);
	info->rx_offload_capa = (mlx5_priv_get_rx_port_offloads(priv) |
				 info->rx_queue_offload_capa);
	info->tx_offload_capa = mlx5_priv_get_tx_port_offloads(priv);
	if (priv_get_ifname(priv, &ifname) == 0)
		info->if_index = if_nametoindex(ifname);
	info->reta_size = priv->reta_idx_n ?
		priv->reta_idx_n : config->ind_table_max_size;
	info->hash_key_size = priv->rss_conf.rss_key_len;
	info->speed_capa = priv->link_speed_capa;
	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
	priv_unlock(priv);
}

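/**
 * DPDK callback to get supported packet types.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   Pointer to the supported packet types array, NULL otherwise.
 */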
const uint32_t *
mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
{
	static const uint32_t ptypes[] = {
		/* refers to rxq_cq_to_pkt_type() */
		RTE_PTYPE_L2_ETHER,
		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
		RTE_PTYPE_L4_NONFRAG,
		RTE_PTYPE_L4_FRAG,
		RTE_PTYPE_L4_TCP,
		RTE_PTYPE_L4_UDP,
		RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
		RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
		RTE_PTYPE_INNER_L4_NONFRAG,
		RTE_PTYPE_INNER_L4_FRAG,
		RTE_PTYPE_INNER_L4_TCP,
		RTE_PTYPE_INNER_L4_UDP,
		RTE_PTYPE_UNKNOWN
	};

	if (dev->rx_pkt_burst == mlx5_rx_burst ||
	    dev->rx_pkt_burst == mlx5_rx_burst_vec)
		return ptypes;
	return NULL;
}

/**
 * Retrieve physical link information (unlocked version using the deprecated
 * ETHTOOL_GSET ioctl).
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion (ignored).
 *
 * @return
 *   0 when the link status has changed, -1 on error or when it is unchanged.
 */
static int
mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete)
{
	struct priv *priv = dev->data->dev_private;
	struct ethtool_cmd edata = {
		.cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
	};
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	int link_speed = 0;

	/* priv_lock() is not taken to allow concurrent calls. */

	(void)wait_to_complete;
	if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
		WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
		return -1;
	}
	memset(&dev_link, 0, sizeof(dev_link));
	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING));
	ifr.ifr_data = (void *)&edata;
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
		     strerror(errno));
		return -1;
	}
	link_speed = ethtool_cmd_speed(&edata);
	if (link_speed == -1)
		dev_link.link_speed = 0;
	else
		dev_link.link_speed = link_speed;
	priv->link_speed_capa = 0;
	if (edata.supported & SUPPORTED_Autoneg)
		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
	if (edata.supported & (SUPPORTED_1000baseT_Full |
			       SUPPORTED_1000baseKX_Full))
		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
	if (edata.supported & SUPPORTED_10000baseKR_Full)
		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
	if (edata.supported & (SUPPORTED_40000baseKR4_Full |
			       SUPPORTED_40000baseCR4_Full |
			       SUPPORTED_40000baseSR4_Full |
			       SUPPORTED_40000baseLR4_Full))
		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
			ETH_LINK_SPEED_FIXED);
	if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
		/* Link status changed. */
		dev->data->dev_link = dev_link;
		return 0;
	}
	/* Link status is still the same. */
	return -1;
}

/**
 * Retrieve physical link information (unlocked version using the newer
 * ETHTOOL_GLINKSETTINGS ioctl).
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion (ignored).
 *
 * @return
 *   0 when the link status has changed, -1 on error or when it is unchanged.
 */
static int
mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete)
{
	struct priv *priv = dev->data->dev_private;
	struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	uint64_t sc;

	(void)wait_to_complete;
	if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
		WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
		return -1;
	}
	memset(&dev_link, 0, sizeof(dev_link));
	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING));
	ifr.ifr_data = (void *)&gcmd;
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s",
		      strerror(errno));
		return -1;
	}
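	/*
	 * ETHTOOL_GLINKSETTINGS handshake: the first request carries zero
	 * link mode mask words, so the kernel replies with the negated
	 * number of 32-bit words it needs per bitmap. Negate it back and
	 * repeat the request with a properly sized buffer.
	 */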
	gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;

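	/*
	 * Room for the three link mode bitmaps (supported, advertising,
	 * lp_advertising) that follow the fixed-size header.
	 */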
	alignas(struct ethtool_link_settings)
	uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
		     sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
	struct ethtool_link_settings *ecmd = (void *)data;

	*ecmd = gcmd;
	ifr.ifr_data = (void *)ecmd;
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s",
		      strerror(errno));
		return -1;
	}
	dev_link.link_speed = ecmd->speed;
	sc = ecmd->link_mode_masks[0] |
		((uint64_t)ecmd->link_mode_masks[1] << 32);
	priv->link_speed_capa = 0;
	if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_20G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_56G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_25G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_50G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_100G;
	dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
				  ETH_LINK_SPEED_FIXED);
	if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
		/* Link status changed. */
		dev->data->dev_link = dev_link;
		return 0;
	}
	/* Link status is still the same. */
	return -1;
}

/**
 * Enable receiving and transmitting traffic.
 *
 * @param priv
 *   Pointer to private structure.
 */
static void
priv_link_start(struct priv *priv)
{
	struct rte_eth_dev *dev = priv->dev;
	int err;

	dev->tx_pkt_burst = priv_select_tx_function(priv, dev);
	dev->rx_pkt_burst = priv_select_rx_function(priv, dev);
	err = priv_dev_traffic_enable(priv, dev);
	if (err)
		ERROR("%p: error occurred while configuring control flows: %s",
		      (void *)priv, strerror(err));
	err = priv_flow_start(priv, &priv->flows);
	if (err)
		ERROR("%p: error occurred while configuring flows: %s",
		      (void *)priv, strerror(err));
}

/**
 * Disable receiving and transmitting traffic.
 *
 * @param priv
 *   Pointer to private structure.
 */
static void
priv_link_stop(struct priv *priv)
{
	struct rte_eth_dev *dev = priv->dev;

	priv_flow_stop(priv, &priv->flows);
	priv_dev_traffic_disable(priv, dev);
	dev->rx_pkt_burst = removed_rx_burst;
	dev->tx_pkt_burst = removed_tx_burst;
}

/**
 * Retrieve physical link information and update rx/tx_pkt_burst callbacks
 * accordingly.
 *
 * @param priv
 *   Pointer to private structure.
 * @param wait_to_complete
 *   Wait for request completion (ignored).
 *
 * @return
 *   0 when the link status has changed, -1 on error or when it is unchanged.
 */
int
priv_link_update(struct priv *priv, int wait_to_complete)
{
	struct rte_eth_dev *dev = priv->dev;
	struct utsname utsname;
	int ver[3];
	int ret;
	struct rte_eth_link dev_link = dev->data->dev_link;

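	/*
	 * ETHTOOL_GLINKSETTINGS is only trusted on kernels 4.9 and newer,
	 * older ones fall back to the deprecated ETHTOOL_GSET request.
	 */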
	if (uname(&utsname) == -1 ||
	    sscanf(utsname.release, "%d.%d.%d",
		   &ver[0], &ver[1], &ver[2]) != 3 ||
	    KERNEL_VERSION(ver[0], ver[1], ver[2]) < KERNEL_VERSION(4, 9, 0))
		ret = mlx5_link_update_unlocked_gset(dev, wait_to_complete);
	else
		ret = mlx5_link_update_unlocked_gs(dev, wait_to_complete);
	/* If lsc interrupt is disabled, should always be ready for traffic. */
	if (!dev->data->dev_conf.intr_conf.lsc) {
		priv_link_start(priv);
		return ret;
	}
	/* Re-select burst callbacks only if link status has been changed. */
	if (!ret && dev_link.link_status != dev->data->dev_link.link_status) {
		if (dev->data->dev_link.link_status == ETH_LINK_UP)
			priv_link_start(priv);
		else
			priv_link_stop(priv);
	}
	return ret;
}

/**
 * Query the link status until it changes to the desired state.
 * The number of query attempts is bounded by MLX5_MAX_LINK_QUERY_ATTEMPTS.
 *
 * @param priv
 *   Pointer to private structure.
 * @param status
 *   Desired link status.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
priv_force_link_status_change(struct priv *priv, int status)
{
	int try = 0;

	while (try < MLX5_MAX_LINK_QUERY_ATTEMPTS) {
		priv_link_update(priv, 0);
		if (priv->dev->data->dev_link.link_status == status)
			return 0;
		try++;
		sleep(1);
	}
	return -EAGAIN;
}

/**
 * DPDK callback to retrieve physical link information.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion (ignored).
 *
 * @return
 *   0 when the link status has changed, -1 on error or when it is unchanged.
 */
int
mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
{
	struct priv *priv = dev->data->dev_private;
	int ret;

	priv_lock(priv);
	ret = priv_link_update(priv, wait_to_complete);
	priv_unlock(priv);
	return ret;
}

/**
 * DPDK callback to change the MTU.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param mtu
 *   New MTU.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct priv *priv = dev->data->dev_private;
	uint16_t kern_mtu;
	int ret = 0;

	priv_lock(priv);
	ret = priv_get_mtu(priv, &kern_mtu);
	if (ret)
		goto out;
	/* Set kernel interface MTU first. */
	ret = priv_set_mtu(priv, mtu);
	if (ret)
		goto out;
	ret = priv_get_mtu(priv, &kern_mtu);
	if (ret)
		goto out;
	if (kern_mtu == mtu) {
		priv->mtu = mtu;
		DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
	}
	priv_unlock(priv);
	return 0;
out:
	ret = errno;
	WARN("cannot set port %u MTU to %u: %s", priv->port, mtu,
	     strerror(ret));
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}

/**
 * DPDK callback to get flow control status.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] fc_conf
 *   Flow control output buffer.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
{
	struct priv *priv = dev->data->dev_private;
	struct ifreq ifr;
	struct ethtool_pauseparam ethpause = {
		.cmd = ETHTOOL_GPAUSEPARAM
	};
	int ret;

	ifr.ifr_data = (void *)&ethpause;
	priv_lock(priv);
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		ret = errno;
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
		     " failed: %s",
		     strerror(ret));
		goto out;
	}

	fc_conf->autoneg = ethpause.autoneg;
	if (ethpause.rx_pause && ethpause.tx_pause)
		fc_conf->mode = RTE_FC_FULL;
	else if (ethpause.rx_pause)
		fc_conf->mode = RTE_FC_RX_PAUSE;
	else if (ethpause.tx_pause)
		fc_conf->mode = RTE_FC_TX_PAUSE;
	else
		fc_conf->mode = RTE_FC_NONE;
	ret = 0;

out:
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}

/**
 * DPDK callback to modify flow control parameters.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[in] fc_conf
 *   Flow control parameters.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
{
	struct priv *priv = dev->data->dev_private;
	struct ifreq ifr;
	struct ethtool_pauseparam ethpause = {
		.cmd = ETHTOOL_SPAUSEPARAM
	};
	int ret;

	ifr.ifr_data = (void *)&ethpause;
	ethpause.autoneg = fc_conf->autoneg;
	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
	    (fc_conf->mode & RTE_FC_RX_PAUSE))
		ethpause.rx_pause = 1;
	else
		ethpause.rx_pause = 0;

	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
	    (fc_conf->mode & RTE_FC_TX_PAUSE))
		ethpause.tx_pause = 1;
	else
		ethpause.tx_pause = 0;

	priv_lock(priv);
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		ret = errno;
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
		     " failed: %s",
		     strerror(ret));
		goto out;
	}
	ret = 0;

out:
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}

/**
 * Get PCI information from struct ibv_device.
 *
 * @param device
 *   Pointer to the IB device structure.
 * @param[out] pci_addr
 *   PCI bus address output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
			    struct rte_pci_addr *pci_addr)
{
	FILE *file;
	char line[32];
	int found = 0;
	MKSTR(path, "%s/device/uevent", device->ibdev_path);

	file = fopen(path, "rb");
	if (file == NULL)
		return -1;
	while (fgets(line, sizeof(line), file) == line) {
		size_t len = strlen(line);
		int ret;

		/* Truncate long lines. */
		if (len == (sizeof(line) - 1))
			while (line[(len - 1)] != '\n') {
				ret = fgetc(file);
				if (ret == EOF)
					break;
				line[(len - 1)] = ret;
			}
		/* Extract information. */
		if (sscanf(line,
			   "PCI_SLOT_NAME="
			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
			   &pci_addr->domain,
			   &pci_addr->bus,
			   &pci_addr->devid,
			   &pci_addr->function) == 4) {
			found = 1;
			break;
		}
	}
	fclose(file);
	if (!found) {
		/* No PCI_SLOT_NAME entry, honor the documented contract. */
		errno = ENOENT;
		return -1;
	}
	return 0;
}

/**
 * Update the link status.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 when the callback can be invoked immediately,
 *   1 when the update has been deferred to an alarm.
 */
static int
priv_link_status_update(struct priv *priv)
{
	struct rte_eth_link *link = &priv->dev->data->dev_link;

	priv_link_update(priv, 0);
	if (((link->link_speed == 0) && link->link_status) ||
		((link->link_speed != 0) && !link->link_status)) {
		/*
		 * Inconsistent status. Event likely occurred before the
		 * kernel netdevice exposes the new status.
		 */
		if (!priv->pending_alarm) {
			priv->pending_alarm = 1;
			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
					  mlx5_dev_link_status_handler,
					  priv->dev);
		}
		return 1;
	} else if (unlikely(priv->pending_alarm)) {
		/* Link interrupt occurred while alarm is already scheduled. */
		priv->pending_alarm = 0;
		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
	}
	return 0;
}

/**
 * Device status handler.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   Bitmap of events whose callbacks can be invoked immediately.
 */
static uint32_t
priv_dev_status_handler(struct priv *priv)
{
	struct ibv_async_event event;
	uint32_t ret = 0;

	/* Read all messages and acknowledge them. */
	for (;;) {
		if (mlx5_glue->get_async_event(priv->ctx, &event))
			break;
		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
			event.event_type == IBV_EVENT_PORT_ERR) &&
			(priv->dev->data->dev_conf.intr_conf.lsc == 1))
			ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
		else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
			priv->dev->data->dev_conf.intr_conf.rmv == 1)
			ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
		else
			DEBUG("event type %d on port %d not handled",
			      event.event_type, event.element.port_num);
		mlx5_glue->ack_async_event(&event);
	}
	if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
		if (priv_link_status_update(priv))
			ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
	return ret;
}

/**
 * Handle delayed link status event.
 *
 * @param arg
 *   Registered argument.
 */
void
mlx5_dev_link_status_handler(void *arg)
{
	struct rte_eth_dev *dev = arg;
	struct priv *priv = dev->data->dev_private;
	int ret;

	while (!priv_trylock(priv)) {
		/* Alarm is being canceled. */
		if (priv->pending_alarm == 0)
			return;
		rte_pause();
	}
	priv->pending_alarm = 0;
	ret = priv_link_status_update(priv);
	priv_unlock(priv);
	if (!ret)
		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
}

/**
 * Handle interrupts from the NIC.
 *
 * @param cb_arg
 *   Callback argument.
 */
void
mlx5_dev_interrupt_handler(void *cb_arg)
{
	struct rte_eth_dev *dev = cb_arg;
	struct priv *priv = dev->data->dev_private;
	uint32_t events;

	priv_lock(priv);
	events = priv_dev_status_handler(priv);
	priv_unlock(priv);
	if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
	if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
}

/**
 * Handle interrupts from the socket.
 *
 * @param cb_arg
 *   Callback argument.
 */
static void
mlx5_dev_handler_socket(void *cb_arg)
{
	struct rte_eth_dev *dev = cb_arg;
	struct priv *priv = dev->data->dev_private;

	priv_lock(priv);
	priv_socket_handle(priv);
	priv_unlock(priv);
}

/**
 * Uninstall interrupt handler.
 *
 * @param priv
 *   Pointer to private structure.
 * @param dev
 *   Pointer to the rte_eth_dev structure.
 */
void
priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
{
	if (dev->data->dev_conf.intr_conf.lsc ||
	    dev->data->dev_conf.intr_conf.rmv)
		rte_intr_callback_unregister(&priv->intr_handle,
					     mlx5_dev_interrupt_handler, dev);
	if (priv->primary_socket)
		rte_intr_callback_unregister(&priv->intr_handle_socket,
					     mlx5_dev_handler_socket, dev);
	if (priv->pending_alarm) {
		priv->pending_alarm = 0;
		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev);
	}
	priv->intr_handle.fd = 0;
	priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
	priv->intr_handle_socket.fd = 0;
	priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
}

/**
 * Install interrupt handler.
 *
 * @param priv
 *   Pointer to private structure.
 * @param dev
 *   Pointer to the rte_eth_dev structure.
 */
void
priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
{
	int rc, flags;

	assert(priv->ctx->async_fd > 0);
	flags = fcntl(priv->ctx->async_fd, F_GETFL);
	rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
	if (rc < 0) {
		INFO("failed to set async event queue non-blocking");
		dev->data->dev_conf.intr_conf.lsc = 0;
		dev->data->dev_conf.intr_conf.rmv = 0;
	}
	if (dev->data->dev_conf.intr_conf.lsc ||
	    dev->data->dev_conf.intr_conf.rmv) {
		priv->intr_handle.fd = priv->ctx->async_fd;
		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
		rte_intr_callback_register(&priv->intr_handle,
					   mlx5_dev_interrupt_handler, dev);
	}

	rc = priv_socket_init(priv);
	if (!rc && priv->primary_socket) {
		priv->intr_handle_socket.fd = priv->primary_socket;
		priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
		rte_intr_callback_register(&priv->intr_handle_socket,
					   mlx5_dev_handler_socket, dev);
	}
}

/**
 * Change the link state (UP / DOWN).
 *
 * @param priv
 *   Pointer to private data structure.
 * @param up
 *   Nonzero for link up, otherwise link down.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
priv_dev_set_link(struct priv *priv, int up)
{
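	/* Keep all flags except IFF_UP untouched, then set or clear IFF_UP. */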
	return priv_set_flags(priv, ~IFF_UP, up ? IFF_UP : ~IFF_UP);
}

/**
 * DPDK callback to bring the link DOWN.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
mlx5_set_link_down(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	int err;

	priv_lock(priv);
	err = priv_dev_set_link(priv, 0);
	priv_unlock(priv);
	return err;
}

/**
 * DPDK callback to bring the link UP.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
mlx5_set_link_up(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	int err;

	priv_lock(priv);
	err = priv_dev_set_link(priv, 1);
	priv_unlock(priv);
	return err;
}

/**
 * Configure the TX function to use.
 *
 * @param priv
 *   Pointer to private data structure.
 * @param dev
 *   Pointer to rte_eth_dev structure.
 *
 * @return
 *   Pointer to selected Tx burst function.
 */
eth_tx_burst_t
priv_select_tx_function(struct priv *priv, struct rte_eth_dev *dev)
{
	eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
	struct mlx5_dev_config *config = &priv->config;
	uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
	int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
				    DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
				    DEV_TX_OFFLOAD_GRE_TNL_TSO));
	int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);

	assert(priv != NULL);
	/* Select appropriate TX function. */
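	/*
	 * The multi-packet write and vectorized paths do not handle VLAN
	 * insertion or TSO, keep the default burst function when either
	 * offload is requested.
	 */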
	if (vlan_insert || tso)
		return tx_pkt_burst;
	if (config->mps == MLX5_MPW_ENHANCED) {
		if (priv_check_vec_tx_support(priv, dev) > 0) {
			if (priv_check_raw_vec_tx_support(priv, dev) > 0)
				tx_pkt_burst = mlx5_tx_burst_raw_vec;
			else
				tx_pkt_burst = mlx5_tx_burst_vec;
			DEBUG("selected Enhanced MPW TX vectorized function");
		} else {
			tx_pkt_burst = mlx5_tx_burst_empw;
			DEBUG("selected Enhanced MPW TX function");
		}
	} else if (config->mps && (config->txq_inline > 0)) {
		tx_pkt_burst = mlx5_tx_burst_mpw_inline;
		DEBUG("selected MPW inline TX function");
	} else if (config->mps) {
		tx_pkt_burst = mlx5_tx_burst_mpw;
		DEBUG("selected MPW TX function");
	}
	return tx_pkt_burst;
}

/**
 * Configure the RX function to use.
 *
 * @param priv
 *   Pointer to private data structure.
 * @param dev
 *   Pointer to rte_eth_dev structure.
 *
 * @return
 *   Pointer to selected Rx burst function.
 */
eth_rx_burst_t
priv_select_rx_function(struct priv *priv, __rte_unused struct rte_eth_dev *dev)
{
	eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;

	assert(priv != NULL);
	if (priv_check_vec_rx_support(priv) > 0) {
		rx_pkt_burst = mlx5_rx_burst_vec;
		DEBUG("selected RX vectorized function");
	}
	return rx_pkt_burst;
}

/**
 * Check if mlx5 device was removed.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   1 when device is removed, otherwise 0.
 */
int
mlx5_is_removed(struct rte_eth_dev *dev)
{
	struct ibv_device_attr device_attr;
	struct priv *priv = dev->data->dev_private;

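	/* A query failing with EIO means the device has been unplugged. */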
	if (mlx5_glue->query_device(priv->ctx, &device_attr) == EIO)
		return 1;
	return 0;
}