1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <stddef.h>
7 #include <inttypes.h>
8 #include <unistd.h>
9 #include <stdbool.h>
10 #include <stdint.h>
11 #include <stdio.h>
12 #include <string.h>
13 #include <stdlib.h>
14 #include <errno.h>
15 #include <dirent.h>
16 #include <net/if.h>
17 #include <sys/ioctl.h>
18 #include <sys/socket.h>
19 #include <netinet/in.h>
20 #include <linux/ethtool.h>
21 #include <linux/sockios.h>
22 #include <fcntl.h>
23 #include <stdalign.h>
24 #include <sys/un.h>
25 #include <time.h>
26 
27 #include <ethdev_linux_ethtool.h>
28 #include <ethdev_driver.h>
29 #include <bus_pci_driver.h>
30 #include <rte_mbuf.h>
31 #include <rte_common.h>
32 #include <rte_eal_paging.h>
33 #include <rte_interrupts.h>
34 #include <rte_malloc.h>
35 #include <rte_string_fns.h>
36 #include <rte_rwlock.h>
37 #include <rte_cycles.h>
38 
39 #include <mlx5_glue.h>
40 #include <mlx5_devx_cmds.h>
41 #include <mlx5_common.h>
42 #include <mlx5_malloc.h>
43 #include <mlx5_nl.h>
44 
45 #include "mlx5.h"
46 #include "mlx5_rxtx.h"
47 #include "mlx5_utils.h"
48 
49 /* Get interface index from SubFunction device name. */
50 int
51 mlx5_auxiliary_get_ifindex(const char *sf_name)
52 {
53 	char if_name[IF_NAMESIZE] = { 0 };
54 
55 	if (mlx5_auxiliary_get_child_name(sf_name, "/net",
56 					  if_name, sizeof(if_name)) != 0)
57 		return -rte_errno;
58 	return if_nametoindex(if_name);
59 }
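
/*
 * Illustrative note, not part of the original code: SubFunction auxiliary
 * devices typically have names such as "mlx5_core.sf.<N>" (example value
 * only); the helper above resolves the netdev exposed under the device's
 * "/net" sysfs child and maps it to an interface index, e.g.:
 *
 *	char ifn[IF_NAMESIZE] = { 0 };
 *
 *	if (mlx5_auxiliary_get_child_name("mlx5_core.sf.4", "/net",
 *					  ifn, sizeof(ifn)) == 0)
 *		(void)if_nametoindex(ifn);
 */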
60 
61 /**
62  * Get interface name from private structure.
63  *
64  * This is a port representor-aware version of mlx5_get_ifname_sysfs().
65  *
66  * @param[in] dev
67  *   Pointer to Ethernet device.
68  * @param[out] ifname
69  *   Interface name output buffer.
70  *
71  * @return
72  *   0 on success, a negative errno value otherwise and rte_errno is set.
73  */
74 int
75 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[MLX5_NAMESIZE])
76 {
77 	struct mlx5_priv *priv = dev->data->dev_private;
78 	unsigned int ifindex;
79 
80 	MLX5_ASSERT(priv);
81 	MLX5_ASSERT(priv->sh);
82 	if (priv->master && priv->sh->bond.ifindex > 0) {
83 		memcpy(ifname, priv->sh->bond.ifname, MLX5_NAMESIZE);
84 		return 0;
85 	}
86 	ifindex = mlx5_ifindex(dev);
87 	if (!ifindex) {
88 		if (!priv->representor)
89 			return mlx5_get_ifname_sysfs(priv->sh->ibdev_path,
90 						     *ifname);
91 		rte_errno = ENXIO;
92 		return -rte_errno;
93 	}
94 	if (if_indextoname(ifindex, &(*ifname)[0]))
95 		return 0;
96 	rte_errno = errno;
97 	return -rte_errno;
98 }
99 
100 /**
101  * Perform ifreq ioctl() on associated netdev ifname.
102  *
103  * @param[in] ifname
104  *   Pointer to netdev name.
105  * @param req
106  *   Request number to pass to ioctl().
107  * @param[out] ifr
108  *   Interface request structure output buffer.
109  *
110  * @return
111  *   0 on success, a negative errno value otherwise and rte_errno is set.
112  */
113 static int
114 mlx5_ifreq_by_ifname(const char *ifname, int req, struct ifreq *ifr)
115 {
116 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
117 	int ret = 0;
118 
119 	if (sock == -1) {
120 		rte_errno = errno;
121 		return -rte_errno;
122 	}
123 	rte_strscpy(ifr->ifr_name, ifname, sizeof(ifr->ifr_name));
124 	ret = ioctl(sock, req, ifr);
125 	if (ret == -1) {
126 		rte_errno = errno;
127 		goto error;
128 	}
129 	close(sock);
130 	return 0;
131 error:
132 	close(sock);
133 	return -rte_errno;
134 }
135 
136 /**
137  * Perform ifreq ioctl() on associated Ethernet device.
138  *
139  * @param[in] dev
140  *   Pointer to Ethernet device.
141  * @param req
142  *   Request number to pass to ioctl().
143  * @param[out] ifr
144  *   Interface request structure output buffer.
145  *
146  * @return
147  *   0 on success, a negative errno value otherwise and rte_errno is set.
148  */
149 static int
150 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
151 {
152 	char ifname[sizeof(ifr->ifr_name)];
153 	int ret;
154 
155 	ret = mlx5_get_ifname(dev, &ifname);
156 	if (ret)
157 		return -rte_errno;
158 	return mlx5_ifreq_by_ifname(ifname, req, ifr);
159 }
160 
161 /**
162  * Get device MTU.
163  *
164  * @param dev
165  *   Pointer to Ethernet device.
166  * @param[out] mtu
167  *   MTU value output buffer.
168  *
169  * @return
170  *   0 on success, a negative errno value otherwise and rte_errno is set.
171  */
172 int
173 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
174 {
175 	struct ifreq request;
176 	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
177 
178 	if (ret)
179 		return ret;
180 	*mtu = request.ifr_mtu;
181 	return 0;
182 }
183 
184 /**
185  * Set device MTU.
186  *
187  * @param dev
188  *   Pointer to Ethernet device.
189  * @param mtu
190  *   MTU value to set.
191  *
192  * @return
193  *   0 on success, a negative errno value otherwise and rte_errno is set.
194  */
195 int
196 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
197 {
198 	struct ifreq request = { .ifr_mtu = mtu, };
199 
200 	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
201 }
202 
203 /**
204  * Set device flags.
205  *
206  * @param dev
207  *   Pointer to Ethernet device.
208  * @param keep
209  *   Bitmask for flags that must remain untouched.
210  * @param flags
211  *   Bitmask for flags to modify.
212  *
213  * @return
214  *   0 on success, a negative errno value otherwise and rte_errno is set.
215  */
216 static int
217 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
218 {
219 	struct ifreq request;
220 	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
221 
222 	if (ret)
223 		return ret;
224 	request.ifr_flags &= keep;
225 	request.ifr_flags |= flags & ~keep;
226 	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
227 }
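
/*
 * Illustrative note, not part of the original code: in mlx5_set_flags() the
 * bits set in "keep" are preserved from the current interface flags, while
 * the remaining bits are taken from "flags". The link control callbacks
 * further below use it as:
 *
 *	mlx5_set_flags(dev, ~IFF_UP, IFF_UP);	// raise IFF_UP, keep the rest
 *	mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);	// clear IFF_UP, keep the rest
 */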
228 
229 /**
230  * Get the device's current raw clock counter.
231  *
232  * @param dev
233  *   Pointer to Ethernet device structure.
234  * @param[out] clock
235  *   Current raw clock counter of the device.
236  *
237  * @return
238  *   0 if the clock has been read correctly,
239  *   the value of errno in case of error.
240  */
241 int
242 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
243 {
244 	struct mlx5_priv *priv = dev->data->dev_private;
245 	struct ibv_context *ctx = priv->sh->cdev->ctx;
246 	struct ibv_values_ex values;
247 	int err = 0;
248 
249 	values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
250 	err = mlx5_glue->query_rt_values_ex(ctx, &values);
251 	if (err != 0) {
252 		DRV_LOG(WARNING, "Could not query the clock!");
253 		return err;
254 	}
255 	*clock = values.raw_clock.tv_nsec;
256 	return 0;
257 }
258 
259 /**
260  * Retrieve the master device for a representor in the same switch domain.
261  *
262  * @param dev
263  *   Pointer to representor Ethernet device structure.
264  *
265  * @return
266  *   Master device structure on success, NULL otherwise.
267  */
268 static struct rte_eth_dev *
269 mlx5_find_master_dev(struct rte_eth_dev *dev)
270 {
271 	struct mlx5_priv *priv;
272 	uint16_t port_id;
273 	uint16_t domain_id;
274 
275 	priv = dev->data->dev_private;
276 	domain_id = priv->domain_id;
277 	MLX5_ASSERT(priv->representor);
278 	MLX5_ETH_FOREACH_DEV(port_id, dev->device) {
279 		struct mlx5_priv *opriv =
280 			rte_eth_devices[port_id].data->dev_private;
281 		if (opriv &&
282 		    opriv->master &&
283 		    opriv->domain_id == domain_id &&
284 		    opriv->sh == priv->sh)
285 			return &rte_eth_devices[port_id];
286 	}
287 	return NULL;
288 }
289 
290 /**
291  * Retrieve physical link information (unlocked version using the legacy ioctl).
292  *
293  * @param dev
294  *   Pointer to Ethernet device structure.
295  * @param[out] link
296  *   Storage for current link status.
297  *
298  * @return
299  *   0 on success, a negative errno value otherwise and rte_errno is set.
300  */
301 static int
302 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
303 			       struct rte_eth_link *link)
304 {
305 	struct mlx5_priv *priv = dev->data->dev_private;
306 	struct ethtool_cmd edata = {
307 		.cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
308 	};
309 	struct ifreq ifr;
310 	struct rte_eth_link dev_link;
311 	int link_speed = 0;
312 	int ret;
313 
314 	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
315 	if (ret) {
316 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
317 			dev->data->port_id, strerror(rte_errno));
318 		return ret;
319 	}
320 	dev_link = (struct rte_eth_link) {
321 		.link_status = ((ifr.ifr_flags & IFF_UP) &&
322 				(ifr.ifr_flags & IFF_RUNNING)),
323 	};
324 	ifr = (struct ifreq) {
325 		.ifr_data = (void *)&edata,
326 	};
327 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
328 	if (ret) {
329 		if (ret == -ENOTSUP && priv->representor) {
330 			struct rte_eth_dev *master;
331 
332 			/*
333 			 * For representors we can try to inherit link
334 			 * settings from the master device. Link
335 			 * settings do not make much sense for
336 			 * representors anyway, since they have no
337 			 * physical link. Old kernel drivers supported
338 			 * an emulated settings query for representors,
339 			 * newer ones do not, so this fallback is kept
340 			 * for compatibility.
341 			 */
342 			master = mlx5_find_master_dev(dev);
343 			if (master) {
344 				ifr = (struct ifreq) {
345 					.ifr_data = (void *)&edata,
346 				};
347 				ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
348 			}
349 		}
350 		if (ret) {
351 			DRV_LOG(WARNING,
352 				"port %u ioctl(SIOCETHTOOL,"
353 				" ETHTOOL_GSET) failed: %s",
354 				dev->data->port_id, strerror(rte_errno));
355 			return ret;
356 		}
357 	}
358 	link_speed = ethtool_cmd_speed(&edata);
359 	if (link_speed == -1)
360 		dev_link.link_speed = RTE_ETH_SPEED_NUM_UNKNOWN;
361 	else
362 		dev_link.link_speed = link_speed;
363 	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
364 				RTE_ETH_LINK_HALF_DUPLEX : RTE_ETH_LINK_FULL_DUPLEX);
365 	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
366 			RTE_ETH_LINK_SPEED_FIXED);
367 	*link = dev_link;
368 	priv->link_speed_capa = rte_eth_link_speed_gset(edata.supported);
369 	return 0;
370 }
371 
372 /**
373  * Retrieve physical link information (unlocked version using the new ioctl).
374  *
375  * @param dev
376  *   Pointer to Ethernet device structure.
377  * @param[out] link
378  *   Storage for current link status.
379  *
380  * @return
381  *   0 on success, a negative errno value otherwise and rte_errno is set.
382  */
383 static int
384 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
385 			     struct rte_eth_link *link)
386 
387 {
388 	struct mlx5_priv *priv = dev->data->dev_private;
389 	struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
390 	struct ifreq ifr;
391 	struct rte_eth_link dev_link;
392 	struct rte_eth_dev *master = NULL;
393 	int ret;
394 
395 	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
396 	if (ret) {
397 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
398 			dev->data->port_id, strerror(rte_errno));
399 		return ret;
400 	}
401 	dev_link = (struct rte_eth_link) {
402 		.link_status = ((ifr.ifr_flags & IFF_UP) &&
403 				(ifr.ifr_flags & IFF_RUNNING)),
404 	};
405 	ifr = (struct ifreq) {
406 		.ifr_data = (void *)&gcmd,
407 	};
408 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
409 	if (ret) {
410 		if (ret == -ENOTSUP && priv->representor) {
411 			/*
412 			 * For representors we can try to inherit link
413 			 * settings from the master device. Link
414 			 * settings do not make much sense for
415 			 * representors anyway, since they have no
416 			 * physical link. Old kernel drivers supported
417 			 * an emulated settings query for representors,
418 			 * newer ones do not, so this fallback is kept
419 			 * for compatibility.
420 			 */
421 			master = mlx5_find_master_dev(dev);
422 			if (master) {
423 				ifr = (struct ifreq) {
424 					.ifr_data = (void *)&gcmd,
425 				};
426 				ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
427 			}
428 		}
429 		if (ret) {
430 			DRV_LOG(DEBUG,
431 				"port %u ioctl(SIOCETHTOOL,"
432 				" ETHTOOL_GLINKSETTINGS) failed: %s",
433 				dev->data->port_id, strerror(rte_errno));
434 			return ret;
435 		}
436 	}
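	/*
	 * Handshake: the kernel reports the required number of 32-bit
	 * link-mode mask words as a negative link_mode_masks_nwords in the
	 * first ETHTOOL_GLINKSETTINGS reply; negate it and repeat the request
	 * with a buffer sized accordingly.
	 */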
437 	gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
438 
439 	alignas(struct ethtool_link_settings)
440 	uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
441 		     sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
442 	struct ethtool_link_settings *ecmd = (void *)data;
443 
444 	*ecmd = gcmd;
445 	ifr.ifr_data = (void *)ecmd;
446 	ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
447 	if (ret) {
448 		DRV_LOG(DEBUG,
449 			"port %u ioctl(SIOCETHTOOL,"
450 			"ETHTOOL_GLINKSETTINGS) failed: %s",
451 			dev->data->port_id, strerror(rte_errno));
452 		return ret;
453 	}
454 
455 	dev_link.link_speed = (ecmd->speed == UINT32_MAX) ?
456 				RTE_ETH_SPEED_NUM_UNKNOWN : ecmd->speed;
457 	dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
458 				RTE_ETH_LINK_HALF_DUPLEX : RTE_ETH_LINK_FULL_DUPLEX);
459 	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
460 				  RTE_ETH_LINK_SPEED_FIXED);
461 	*link = dev_link;
462 
463 	priv->link_speed_capa = rte_eth_link_speed_glink(ecmd->link_mode_masks,
464 			ecmd->link_mode_masks_nwords);
465 
466 	return 0;
467 }
468 
469 /**
470  * DPDK callback to retrieve physical link information.
471  *
472  * @param dev
473  *   Pointer to Ethernet device structure.
474  * @param wait_to_complete
475  *   Wait for request completion.
476  *
477  * @return
478  *   0 if link status was not updated, positive if it was, a negative errno
479  *   value otherwise and rte_errno is set.
480  */
481 int
482 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
483 {
484 	int ret;
485 	struct rte_eth_link dev_link;
486 	time_t start_time = time(NULL);
487 	int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT;
488 
489 	do {
490 		ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
491 		if (ret == -ENOTSUP)
492 			ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
493 		if (ret == 0)
494 			break;
495 		/* Handle wait to complete situation. */
496 		if ((wait_to_complete || retry) && ret == -EAGAIN) {
497 			if (abs((int)difftime(time(NULL), start_time)) <
498 			    MLX5_LINK_STATUS_TIMEOUT) {
499 				usleep(0);
500 				continue;
501 			} else {
502 				rte_errno = EBUSY;
503 				return -rte_errno;
504 			}
505 		} else if (ret < 0) {
506 			return ret;
507 		}
508 	} while (wait_to_complete || retry-- > 0);
509 	ret = !!memcmp(&dev->data->dev_link, &dev_link,
510 		       sizeof(struct rte_eth_link));
511 	dev->data->dev_link = dev_link;
512 	return ret;
513 }
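
/*
 * Hedged usage sketch (hypothetical caller, not part of this file), showing
 * how the return convention above distinguishes "unchanged" from "updated":
 *
 *	int ret = mlx5_link_update(dev, 0);
 *
 *	if (ret < 0)
 *		return ret;		// error, rte_errno is set
 *	if (ret > 0)
 *		handle_lsc(dev);	// link status changed (hypothetical hook)
 */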
514 
515 /**
516  * DPDK callback to get flow control status.
517  *
518  * @param dev
519  *   Pointer to Ethernet device structure.
520  * @param[out] fc_conf
521  *   Flow control output buffer.
522  *
523  * @return
524  *   0 on success, a negative errno value otherwise and rte_errno is set.
525  */
526 int
527 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
528 {
529 	struct ifreq ifr;
530 	struct ethtool_pauseparam ethpause = {
531 		.cmd = ETHTOOL_GPAUSEPARAM
532 	};
533 	int ret;
534 
535 	ifr.ifr_data = (void *)&ethpause;
536 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
537 	if (ret) {
538 		DRV_LOG(DEBUG,
539 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
540 			" %s",
541 			dev->data->port_id, strerror(rte_errno));
542 		return ret;
543 	}
544 	fc_conf->autoneg = ethpause.autoneg;
545 	if (ethpause.rx_pause && ethpause.tx_pause)
546 		fc_conf->mode = RTE_ETH_FC_FULL;
547 	else if (ethpause.rx_pause)
548 		fc_conf->mode = RTE_ETH_FC_RX_PAUSE;
549 	else if (ethpause.tx_pause)
550 		fc_conf->mode = RTE_ETH_FC_TX_PAUSE;
551 	else
552 		fc_conf->mode = RTE_ETH_FC_NONE;
553 	return 0;
554 }
555 
556 /**
557  * DPDK callback to modify flow control parameters.
558  *
559  * @param dev
560  *   Pointer to Ethernet device structure.
561  * @param[in] fc_conf
562  *   Flow control parameters.
563  *
564  * @return
565  *   0 on success, a negative errno value otherwise and rte_errno is set.
566  */
567 int
568 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
569 {
570 	struct ifreq ifr;
571 	struct ethtool_pauseparam ethpause = {
572 		.cmd = ETHTOOL_SPAUSEPARAM
573 	};
574 	int ret;
575 
576 	ifr.ifr_data = (void *)&ethpause;
577 	ethpause.autoneg = fc_conf->autoneg;
578 	if (((fc_conf->mode & RTE_ETH_FC_FULL) == RTE_ETH_FC_FULL) ||
579 	    (fc_conf->mode & RTE_ETH_FC_RX_PAUSE))
580 		ethpause.rx_pause = 1;
581 	else
582 		ethpause.rx_pause = 0;
583 
584 	if (((fc_conf->mode & RTE_ETH_FC_FULL) == RTE_ETH_FC_FULL) ||
585 	    (fc_conf->mode & RTE_ETH_FC_TX_PAUSE))
586 		ethpause.tx_pause = 1;
587 	else
588 		ethpause.tx_pause = 0;
589 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
590 	if (ret) {
591 		DRV_LOG(WARNING,
592 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
593 			" failed: %s",
594 			dev->data->port_id, strerror(rte_errno));
595 		return ret;
596 	}
597 	return 0;
598 }
599 
600 /**
601  * Handle asynchronous removal event for the entire multiport device.
602  *
603  * @param sh
604  *   Infiniband device shared context.
605  */
606 static void
607 mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh)
608 {
609 	uint32_t i;
610 
611 	for (i = 0; i < sh->max_port; ++i) {
612 		struct rte_eth_dev *dev;
613 		struct mlx5_priv *priv;
614 
615 		if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
616 			/*
617 			 * Either the port does not exist or no
618 			 * handler is installed for this port.
619 			 */
620 			continue;
621 		}
622 		dev = &rte_eth_devices[sh->port[i].ih_port_id];
623 		MLX5_ASSERT(dev);
624 		priv = dev->data->dev_private;
625 		MLX5_ASSERT(priv);
626 		if (!priv->rmv_notified && dev->data->dev_conf.intr_conf.rmv) {
627 			/* Notify driver about removal only once. */
628 			priv->rmv_notified = 1;
629 			rte_eth_dev_callback_process
630 				(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
631 		}
632 	}
633 }
634 
635 static bool
636 mlx5_dev_nl_ifindex_verify(uint32_t if_index, struct mlx5_priv *priv)
637 {
638 	struct mlx5_bond_info *bond = &priv->sh->bond;
639 	int i;
640 
641 	if (bond->n_port == 0)
642 		return (if_index == priv->if_index);
643 
644 	if (if_index == bond->ifindex)
645 		return true;
646 	for (i = 0; i < bond->n_port; i++) {
647 		if (i >= MLX5_BOND_MAX_PORTS)
648 			return false;
649 		if (if_index == bond->ports[i].ifindex)
650 			return true;
651 	}
652 
653 	return false;
654 }
655 
656 static void
657 mlx5_link_update_bond(struct rte_eth_dev *dev)
658 {
659 	struct mlx5_priv *priv = dev->data->dev_private;
660 	struct mlx5_bond_info *bond = &priv->sh->bond;
661 	struct ifreq ifr = (struct ifreq) {
662 		.ifr_flags = 0,
663 	};
664 	int ret;
665 
666 	ret = mlx5_ifreq_by_ifname(bond->ifname, SIOCGIFFLAGS, &ifr);
667 	if (ret) {
668 		DRV_LOG(WARNING, "ifname %s ioctl(SIOCGIFFLAGS) failed: %s",
669 			bond->ifname, strerror(rte_errno));
670 		return;
671 	}
672 	dev->data->dev_link.link_status =
673 		((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING));
674 }
675 
676 static void
677 mlx5_dev_interrupt_nl_cb(struct nlmsghdr *hdr, void *cb_arg)
678 {
679 	struct mlx5_dev_ctx_shared *sh = cb_arg;
680 	uint32_t i;
681 	uint32_t if_index;
682 
683 	if (mlx5_nl_parse_link_status_update(hdr, &if_index) < 0)
684 		return;
685 	for (i = 0; i < sh->max_port; i++) {
686 		struct mlx5_dev_shared_port *port = &sh->port[i];
687 		struct rte_eth_dev *dev;
688 		struct mlx5_priv *priv;
689 
690 		if (port->nl_ih_port_id >= RTE_MAX_ETHPORTS)
691 			continue;
692 		dev = &rte_eth_devices[port->nl_ih_port_id];
693 		/* Probing may initiate an LSC before configuration is done. */
694 		if (dev->data->dev_configured &&
695 		    !dev->data->dev_conf.intr_conf.lsc)
696 			break;
697 		priv = dev->data->dev_private;
698 		if (mlx5_dev_nl_ifindex_verify(if_index, priv)) {
699 			/* Block logical LSC events. */
700 			uint16_t prev_status = dev->data->dev_link.link_status;
701 
702 			if (mlx5_link_update(dev, 0) < 0) {
703 				DRV_LOG(ERR, "Failed to update link status: %s",
704 					rte_strerror(rte_errno));
705 			} else {
706 				if (priv->sh->bond.n_port)
707 					mlx5_link_update_bond(dev);
708 				if (prev_status != dev->data->dev_link.link_status)
709 					rte_eth_dev_callback_process
710 						(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
711 			}
712 			break;
713 		}
714 	}
715 }
716 
717 void
718 mlx5_dev_interrupt_handler_nl(void *arg)
719 {
720 	struct mlx5_dev_ctx_shared *sh = arg;
721 	int nlsk_fd = rte_intr_fd_get(sh->intr_handle_nl);
722 
723 	if (nlsk_fd < 0)
724 		return;
725 	if (mlx5_nl_read_events(nlsk_fd, mlx5_dev_interrupt_nl_cb, sh) < 0)
726 		DRV_LOG(ERR, "Failed to process Netlink events: %s",
727 			rte_strerror(rte_errno));
728 }
729 
730 /**
731  * Handle shared asynchronous events from the NIC (removal event
732  * and link status change). Supports multiport IB devices.
733  *
734  * @param cb_arg
735  *   Callback argument.
736  */
737 void
738 mlx5_dev_interrupt_handler(void *cb_arg)
739 {
740 	struct mlx5_dev_ctx_shared *sh = cb_arg;
741 	struct ibv_async_event event;
742 
743 	/* Read all messages from the IB device and acknowledge them. */
744 	for (;;) {
745 		struct rte_eth_dev *dev;
746 		uint32_t tmp;
747 
748 		if (mlx5_glue->get_async_event(sh->cdev->ctx, &event)) {
749 			if (errno == EIO) {
750 				DRV_LOG(DEBUG,
751 					"IBV async event queue closed on: %s",
752 					sh->ibdev_name);
753 				mlx5_dev_interrupt_device_fatal(sh);
754 			}
755 			break;
756 		}
757 		if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
758 			/*
759 			 * The DEVICE_FATAL event can be delivered by the
760 			 * kernel twice - from the mlx5 and uverbs layers -
761 			 * and the port index is not applicable. Notify all
762 			 * existing ports.
763 			 */
764 			mlx5_dev_interrupt_device_fatal(sh);
765 			mlx5_glue->ack_async_event(&event);
766 			continue;
767 		}
768 		/* Retrieve and check IB port index. */
769 		tmp = (uint32_t)event.element.port_num;
770 		MLX5_ASSERT(tmp <= sh->max_port);
771 		if (!tmp) {
772 			/* Unsupported device level event. */
773 			mlx5_glue->ack_async_event(&event);
774 			DRV_LOG(DEBUG,
775 				"unsupported common event (type %d)",
776 				event.event_type);
777 			continue;
778 		}
779 		if (tmp > sh->max_port) {
780 			/* Invalid IB port index. */
781 			mlx5_glue->ack_async_event(&event);
782 			DRV_LOG(DEBUG,
783 				"cannot handle an event (type %d)"
784 				"due to invalid IB port index (%u)",
785 				event.event_type, tmp);
786 			continue;
787 		}
788 		if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
789 			/* No handler installed. */
790 			mlx5_glue->ack_async_event(&event);
791 			DRV_LOG(DEBUG,
792 				"cannot handle an event (type %d)"
793 				"due to no handler installed for port %u",
794 				event.event_type, tmp);
795 			continue;
796 		}
797 		/* Retrieve ethernet device descriptor. */
798 		tmp = sh->port[tmp - 1].ih_port_id;
799 		dev = &rte_eth_devices[tmp];
800 		MLX5_ASSERT(dev);
801 		DRV_LOG(DEBUG,
802 			"port %u cannot handle an unknown event (type %d)",
803 			dev->data->port_id, event.event_type);
804 		mlx5_glue->ack_async_event(&event);
805 	}
806 }
807 
808 /**
809  * Handle DEVX interrupts from the NIC.
810  * This function is typically called from the DPDK host thread.
811  *
812  * @param cb_arg
813  *   Callback argument.
814  */
815 void
816 mlx5_dev_interrupt_handler_devx(void *cb_arg)
817 {
818 #ifndef HAVE_IBV_DEVX_ASYNC
819 	(void)cb_arg;
820 	return;
821 #else
822 	struct mlx5_dev_ctx_shared *sh = cb_arg;
823 	union {
824 		struct mlx5dv_devx_async_cmd_hdr cmd_resp;
825 		uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
826 			    MLX5_ST_SZ_BYTES(traffic_counter) +
827 			    sizeof(struct mlx5dv_devx_async_cmd_hdr)];
828 	} out;
829 	uint8_t *buf = out.buf + sizeof(out.cmd_resp);
830 
831 	while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
832 						   &out.cmd_resp,
833 						   sizeof(out.buf)))
834 		mlx5_flow_async_pool_query_handle
835 			(sh, (uint64_t)out.cmd_resp.wr_id,
836 			 mlx5_devx_get_out_command_status(buf));
837 #endif /* HAVE_IBV_DEVX_ASYNC */
838 }
839 
840 /**
841  * DPDK callback to bring the link DOWN.
842  *
843  * @param dev
844  *   Pointer to Ethernet device structure.
845  *
846  * @return
847  *   0 on success, a negative errno value otherwise and rte_errno is set.
848  */
849 int
850 mlx5_set_link_down(struct rte_eth_dev *dev)
851 {
852 	return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
853 }
854 
855 /**
856  * DPDK callback to bring the link UP.
857  *
858  * @param dev
859  *   Pointer to Ethernet device structure.
860  *
861  * @return
862  *   0 on success, a negative errno value otherwise and rte_errno is set.
863  */
864 int
865 mlx5_set_link_up(struct rte_eth_dev *dev)
866 {
867 	return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
868 }
869 
870 /**
871  * Check if mlx5 device was removed.
872  *
873  * @param dev
874  *   Pointer to Ethernet device structure.
875  *
876  * @return
877  *   1 when device is removed, otherwise 0.
878  */
879 int
880 mlx5_is_removed(struct rte_eth_dev *dev)
881 {
882 	struct ibv_device_attr device_attr;
883 	struct mlx5_priv *priv = dev->data->dev_private;
884 
885 	if (mlx5_glue->query_device(priv->sh->cdev->ctx, &device_attr) == EIO)
886 		return 1;
887 	return 0;
888 }
889 
890 /**
891  * Analyze gathered port parameters via sysfs to recognize master
892  * and representor devices for E-Switch configuration.
893  *
894  * @param[in] device_dir
895  *   Flag indicating presence of the "device" directory in the port sysfs path.
896  * @param[inout] switch_info
897  *   Port information, including port name as a number and port name
898  *   type, if recognized.
899  *
900  * @return
901  *   The master and representor flags in switch_info are set according to
902  *   the recognized parameters (if any).
903  */
904 static void
905 mlx5_sysfs_check_switch_info(bool device_dir,
906 			     struct mlx5_switch_info *switch_info)
907 {
908 	switch (switch_info->name_type) {
909 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
910 		/*
911 		 * Name is not recognized, assume the master,
912 		 * check the device directory presence.
913 		 */
914 		switch_info->master = device_dir;
915 		break;
916 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
917 		/*
918 		 * Name is not set, this assumes the legacy naming
919 		 * schema for master, just check if there is
920 		 * a device directory.
921 		 */
922 		switch_info->master = device_dir;
923 		break;
924 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
925 		/* New uplink naming schema recognized. */
926 		switch_info->master = 1;
927 		break;
928 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
929 		/* Legacy representors naming schema. */
930 		switch_info->representor = !device_dir;
931 		break;
932 	case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
933 		/* Fallthrough */
934 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
935 		/* Fallthrough */
936 	case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
937 		/* New representors naming schema. */
938 		switch_info->representor = 1;
939 		break;
940 	default:
941 		switch_info->master = device_dir;
942 		break;
943 	}
944 }
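
/*
 * Illustrative note (example values, not taken from this file): the name
 * types handled above come from parsing the sysfs "phys_port_name" entry,
 * e.g. "p0" for an uplink, "pf0vf2" for a VF representor, "pf0sf1" for an
 * SF representor, or a plain number such as "2" for the legacy representor
 * naming; when the file is absent or unreadable the type stays NOTSET.
 */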
945 
946 /**
947  * Get switch information associated with network interface.
948  *
949  * @param ifindex
950  *   Network interface index.
951  * @param[out] info
952  *   Switch information object, populated in case of success.
953  *
954  * @return
955  *   0 on success, a negative errno value otherwise and rte_errno is set.
956  */
957 int
958 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
959 {
960 	char ifname[IF_NAMESIZE];
961 	char *port_name = NULL;
962 	size_t port_name_size = 0;
963 	FILE *file;
964 	struct mlx5_switch_info data = {
965 		.master = 0,
966 		.representor = 0,
967 		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
968 		.port_name = 0,
969 		.switch_id = 0,
970 	};
971 	DIR *dir;
972 	bool port_switch_id_set = false;
973 	bool device_dir = false;
974 	char c;
975 	ssize_t line_size;
976 
977 	if (!if_indextoname(ifindex, ifname)) {
978 		rte_errno = errno;
979 		return -rte_errno;
980 	}
981 
982 	MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
983 	      ifname);
984 	MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
985 	      ifname);
986 	MKSTR(pci_device, "/sys/class/net/%s/device",
987 	      ifname);
988 
989 	file = fopen(phys_port_name, "rb");
990 	if (file != NULL) {
991 		char *tail_nl;
992 
993 		line_size = getline(&port_name, &port_name_size, file);
994 		if (line_size < 0) {
995 			free(port_name);
996 			fclose(file);
997 			rte_errno = errno;
998 			return -rte_errno;
999 		} else if (line_size > 0) {
1000 			/* Remove trailing newline character. */
1001 			tail_nl = strchr(port_name, '\n');
1002 			if (tail_nl)
1003 				*tail_nl = '\0';
1004 			mlx5_translate_port_name(port_name, &data);
1005 		}
1006 		free(port_name);
1007 		fclose(file);
1008 	}
1009 	file = fopen(phys_switch_id, "rb");
1010 	if (file == NULL) {
1011 		rte_errno = errno;
1012 		return -rte_errno;
1013 	}
1014 	port_switch_id_set =
1015 		fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1016 		c == '\n';
1017 	fclose(file);
1018 	dir = opendir(pci_device);
1019 	if (dir != NULL) {
1020 		closedir(dir);
1021 		device_dir = true;
1022 	}
1023 	if (port_switch_id_set) {
1024 		/* We have some E-Switch configuration. */
1025 		mlx5_sysfs_check_switch_info(device_dir, &data);
1026 	}
1027 	*info = data;
1028 	MLX5_ASSERT(!(data.master && data.representor));
1029 	if (data.master && data.representor) {
1030 		DRV_LOG(ERR, "ifindex %u device is recognized as master"
1031 			     " and as representor", ifindex);
1032 		rte_errno = ENODEV;
1033 		return -rte_errno;
1034 	}
1035 	return 0;
1036 }
1037 
1038 /**
1039  * Get bond information associated with network interface.
1040  *
1041  * @param pf_ifindex
1042  *   Network interface index of the bond slave interface.
1043  * @param[out] ifindex
1044  *   Pointer to bond ifindex.
1045  * @param[out] ifname
1046  *   Pointer to bond ifname.
1047  *
1048  * @return
1049  *   0 on success, a negative errno value otherwise and rte_errno is set.
1050  */
1051 int
1052 mlx5_sysfs_bond_info(unsigned int pf_ifindex, unsigned int *ifindex,
1053 		     char *ifname)
1054 {
1055 	char name[IF_NAMESIZE];
1056 	FILE *file;
1057 	unsigned int index;
1058 	int ret;
1059 
1060 	if (!if_indextoname(pf_ifindex, name) || !strlen(name)) {
1061 		rte_errno = errno;
1062 		return -rte_errno;
1063 	}
1064 	MKSTR(bond_if, "/sys/class/net/%s/master/ifindex", name);
1065 	/* read bond ifindex */
1066 	file = fopen(bond_if, "rb");
1067 	if (file == NULL) {
1068 		rte_errno = errno;
1069 		return -rte_errno;
1070 	}
1071 	ret = fscanf(file, "%u", &index);
1072 	fclose(file);
1073 	if (ret <= 0) {
1074 		rte_errno = errno;
1075 		return -rte_errno;
1076 	}
1077 	if (ifindex)
1078 		*ifindex = index;
1079 
1080 	/* Resolve the bond device name from its ifindex. */
1081 	if (ifname) {
1082 		if (!if_indextoname(index, ifname)) {
1083 			rte_errno = errno;
1084 			return -rte_errno;
1085 		}
1086 	}
1087 	return 0;
1088 }
1089 
1090 /**
1091  * DPDK callback to retrieve plug-in module EEPROM information (type and size).
1092  *
1093  * @param dev
1094  *   Pointer to Ethernet device structure.
1095  * @param[out] modinfo
1096  *   Storage for plug-in module EEPROM information.
1097  *
1098  * @return
1099  *   0 on success, a negative errno value otherwise and rte_errno is set.
1100  */
1101 int
1102 mlx5_get_module_info(struct rte_eth_dev *dev,
1103 		     struct rte_eth_dev_module_info *modinfo)
1104 {
1105 	struct ethtool_modinfo info = {
1106 		.cmd = ETHTOOL_GMODULEINFO,
1107 	};
1108 	struct ifreq ifr = (struct ifreq) {
1109 		.ifr_data = (void *)&info,
1110 	};
1111 	int ret = 0;
1112 
1113 	if (!dev) {
1114 		DRV_LOG(WARNING, "missing argument, cannot get module info");
1115 		rte_errno = EINVAL;
1116 		return -rte_errno;
1117 	}
1118 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1119 	if (ret) {
1120 		DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1121 			dev->data->port_id, strerror(rte_errno));
1122 		return ret;
1123 	}
1124 	modinfo->type = info.type;
1125 	modinfo->eeprom_len = info.eeprom_len;
1126 	return ret;
1127 }
1128 
1129 /**
1130  * DPDK callback to retrieve plug-in module EEPROM data.
1131  *
1132  * @param dev
1133  *   Pointer to Ethernet device structure.
1134  * @param[out] info
1135  *   Storage for plug-in module EEPROM data.
1136  *
1137  * @return
1138  *   0 on success, a negative errno value otherwise and rte_errno is set.
1139  */
1140 int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
1141 			   struct rte_dev_eeprom_info *info)
1142 {
1143 	struct ethtool_eeprom *eeprom;
1144 	struct ifreq ifr;
1145 	int ret = 0;
1146 
1147 	if (!dev) {
1148 		DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
1149 		rte_errno = EINVAL;
1150 		return -rte_errno;
1151 	}
1152 	eeprom = mlx5_malloc(MLX5_MEM_ZERO,
1153 			     (sizeof(struct ethtool_eeprom) + info->length), 0,
1154 			     SOCKET_ID_ANY);
1155 	if (!eeprom) {
1156 		DRV_LOG(WARNING, "port %u cannot allocate memory for "
1157 			"eeprom data", dev->data->port_id);
1158 		rte_errno = ENOMEM;
1159 		return -rte_errno;
1160 	}
1161 	eeprom->cmd = ETHTOOL_GMODULEEEPROM;
1162 	eeprom->offset = info->offset;
1163 	eeprom->len = info->length;
1164 	ifr = (struct ifreq) {
1165 		.ifr_data = (void *)eeprom,
1166 	};
1167 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1168 	if (ret)
1169 		DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1170 			dev->data->port_id, strerror(rte_errno));
1171 	else
1172 		rte_memcpy(info->data, eeprom->data, info->length);
1173 	mlx5_free(eeprom);
1174 	return ret;
1175 }
1176 
1177 /**
1178  * Read device counters table.
1179  *
1180  * @param dev
1181  *   Pointer to Ethernet device.
1182  * @param[in] pf
1183  *   PF index in case of bonding device, -1 otherwise
1184  * @param[out] stats
1185  *   Counters table output buffer.
1186  *
1187  * @return
1188  *   0 on success and stats is filled, negative errno value otherwise and
1189  *   rte_errno is set.
1190  */
1191 static int
1192 _mlx5_os_read_dev_counters(struct rte_eth_dev *dev, int pf, uint64_t *stats)
1193 {
1194 	struct mlx5_priv *priv = dev->data->dev_private;
1195 	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1196 	unsigned int i;
1197 	struct ifreq ifr;
1198 	unsigned int max_stats_n = RTE_MAX(xstats_ctrl->stats_n, xstats_ctrl->stats_n_2nd);
1199 	unsigned int stats_sz = max_stats_n * sizeof(uint64_t);
1200 	unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz];
1201 	struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf;
1202 	int ret;
1203 	uint16_t i_idx, o_idx;
1204 
1205 	et_stats->cmd = ETHTOOL_GSTATS;
1206 	/* Pass the maximum value, the driver may ignore this. */
1207 	et_stats->n_stats = max_stats_n;
1208 	ifr.ifr_data = (caddr_t)et_stats;
1209 	if (pf >= 0)
1210 		ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[pf].ifname,
1211 					   SIOCETHTOOL, &ifr);
1212 	else
1213 		ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1214 	if (ret) {
1215 		DRV_LOG(WARNING,
1216 			"port %u unable to read statistic values from device",
1217 			dev->data->port_id);
1218 		return ret;
1219 	}
1220 	if (pf <= 0) {
1221 		for (i = 0; i != xstats_ctrl->mlx5_stats_n; i++) {
1222 			i_idx = xstats_ctrl->dev_table_idx[i];
1223 			if (i_idx == UINT16_MAX || xstats_ctrl->info[i].dev)
1224 				continue;
1225 			o_idx = xstats_ctrl->xstats_o_idx[i];
1226 			stats[o_idx] += (uint64_t)et_stats->data[i_idx];
1227 		}
1228 	} else {
1229 		for (i = 0; i != xstats_ctrl->mlx5_stats_n; i++) {
1230 			i_idx = xstats_ctrl->dev_table_idx_2nd[i];
1231 			if (i_idx == UINT16_MAX)
1232 				continue;
1233 			o_idx = xstats_ctrl->xstats_o_idx_2nd[i];
1234 			stats[o_idx] += (uint64_t)et_stats->data[i_idx];
1235 		}
1236 	}
1237 	return 0;
1238 }
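
/*
 * Illustrative note on the buffer sizing above (layout defined by the
 * ethtool UAPI, not specific to this driver): the ETHTOOL_GSTATS reply is a
 * struct ethtool_stats header immediately followed by n_stats 64-bit values,
 * e.g.:
 *
 *	unsigned char buf[sizeof(struct ethtool_stats) +
 *			  n_stats * sizeof(uint64_t)];
 *	struct ethtool_stats *st = (struct ethtool_stats *)buf;
 *
 *	st->cmd = ETHTOOL_GSTATS;
 *	st->n_stats = n_stats;
 *	// after ioctl(SIOCETHTOOL) the values are in st->data[0..n_stats-1]
 */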
1239 
1240 /*
1241  * Read device counters.
1242  *
1243  * @param dev
1244  *   Pointer to Ethernet device.
1245  * @param bond_master
1246  *   Indicate if the device is a bond master.
1247  * @param stats
1248  *   Counters table output buffer.
1249  *
1250  * @return
1251  *   0 on success and stats is filled, negative errno value otherwise and
1252  *   rte_errno is set.
1253  */
1254 int
1255 mlx5_os_read_dev_counters(struct rte_eth_dev *dev, bool bond_master, uint64_t *stats)
1256 {
1257 	struct mlx5_priv *priv = dev->data->dev_private;
1258 	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1259 	int ret = 0, i;
1260 
1261 	memset(stats, 0, sizeof(*stats) * xstats_ctrl->mlx5_stats_n);
1262 	/* Read ifreq counters. */
1263 	if (bond_master) {
1264 		/* Sum xstats from bonding device member ports. */
1265 		for (i = 0; i < priv->sh->bond.n_port; i++) {
1266 			ret = _mlx5_os_read_dev_counters(dev, i, stats);
1267 			if (ret)
1268 				return ret;
1269 		}
1270 	} else {
1271 		ret = _mlx5_os_read_dev_counters(dev, -1, stats);
1272 		if (ret)
1273 			return ret;
1274 	}
1275 	/*
1276 	 * Read IB counters.
1277 	 * The counters are unique per IB device but not per net IF.
1278 	 * In bonding mode, getting the stats name only from 1 port is enough.
1279 	 */
1280 	for (i = 0; i != xstats_ctrl->mlx5_stats_n; i++) {
1281 		if (!xstats_ctrl->info[i].dev)
1282 			continue;
1283 		/* Return the last xstats counter value if the read fails. */
1284 		if (mlx5_os_read_dev_stat(priv, xstats_ctrl->info[i].ctr_name,
1285 					  &stats[i]) == 0)
1286 			xstats_ctrl->xstats[i] = stats[i];
1287 		else
1288 			stats[i] = xstats_ctrl->xstats[i];
1289 	}
1290 	return ret;
1291 }
1292 
1293 /*
1294  * Query the number of statistics provided by ETHTOOL.
1295  *
1296  * @param dev
1297  *   Pointer to Ethernet device.
1298  * @param bond_master
1299  *   Indicate if the device is a bond master.
1300  * @param n_stats
1301  *   Pointer to number of stats to store.
1302  * @param n_stats_sec
1303  *   Pointer to number of stats to store for the 2nd port of the bond.
1304  *
1305  * @return
1306  *   0 on success, negative errno value otherwise and rte_errno is set.
1307  */
1308 int
1309 mlx5_os_get_stats_n(struct rte_eth_dev *dev, bool bond_master,
1310 		    uint16_t *n_stats, uint16_t *n_stats_sec)
1311 {
1312 	struct mlx5_priv *priv = dev->data->dev_private;
1313 	struct ethtool_drvinfo drvinfo;
1314 	struct ifreq ifr;
1315 	int ret;
1316 
1317 	drvinfo.cmd = ETHTOOL_GDRVINFO;
1318 	ifr.ifr_data = (caddr_t)&drvinfo;
1319 	/* Bonding PFs. */
1320 	if (bond_master) {
1321 		ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
1322 					   SIOCETHTOOL, &ifr);
1323 		if (ret) {
1324 			DRV_LOG(WARNING, "bonding port %u unable to query number of"
1325 				" statistics for the 1st slave, %d", PORT_ID(priv), ret);
1326 			return ret;
1327 		}
1328 		*n_stats = drvinfo.n_stats;
1329 		ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[1].ifname,
1330 					   SIOCETHTOOL, &ifr);
1331 		if (ret) {
1332 			DRV_LOG(WARNING, "bonding port %u unable to query number of"
1333 				" statistics for the 2nd slave, %d", PORT_ID(priv), ret);
1334 			return ret;
1335 		}
1336 		*n_stats_sec = drvinfo.n_stats;
1337 	} else {
1338 		ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1339 		if (ret) {
1340 			DRV_LOG(WARNING, "port %u unable to query number of statistics",
1341 				PORT_ID(priv));
1342 			return ret;
1343 		}
1344 		*n_stats = drvinfo.n_stats;
1345 	}
1346 	return 0;
1347 }
1348 
1349 static const struct mlx5_counter_ctrl mlx5_counters_init[] = {
1350 	{
1351 		.dpdk_name = "rx_unicast_bytes",
1352 		.ctr_name = "rx_vport_unicast_bytes",
1353 	},
1354 	{
1355 		.dpdk_name = "rx_multicast_bytes",
1356 		.ctr_name = "rx_vport_multicast_bytes",
1357 	},
1358 	{
1359 		.dpdk_name = "rx_broadcast_bytes",
1360 		.ctr_name = "rx_vport_broadcast_bytes",
1361 	},
1362 	{
1363 		.dpdk_name = "rx_unicast_packets",
1364 		.ctr_name = "rx_vport_unicast_packets",
1365 	},
1366 	{
1367 		.dpdk_name = "rx_multicast_packets",
1368 		.ctr_name = "rx_vport_multicast_packets",
1369 	},
1370 	{
1371 		.dpdk_name = "rx_broadcast_packets",
1372 		.ctr_name = "rx_vport_broadcast_packets",
1373 	},
1374 	{
1375 		.dpdk_name = "tx_unicast_bytes",
1376 		.ctr_name = "tx_vport_unicast_bytes",
1377 	},
1378 	{
1379 		.dpdk_name = "tx_multicast_bytes",
1380 		.ctr_name = "tx_vport_multicast_bytes",
1381 	},
1382 	{
1383 		.dpdk_name = "tx_broadcast_bytes",
1384 		.ctr_name = "tx_vport_broadcast_bytes",
1385 	},
1386 	{
1387 		.dpdk_name = "tx_unicast_packets",
1388 		.ctr_name = "tx_vport_unicast_packets",
1389 	},
1390 	{
1391 		.dpdk_name = "tx_multicast_packets",
1392 		.ctr_name = "tx_vport_multicast_packets",
1393 	},
1394 	{
1395 		.dpdk_name = "tx_broadcast_packets",
1396 		.ctr_name = "tx_vport_broadcast_packets",
1397 	},
1398 	{
1399 		.dpdk_name = "rx_wqe_errors",
1400 		.ctr_name = "rx_wqe_err",
1401 	},
1402 	{
1403 		.dpdk_name = "rx_phy_crc_errors",
1404 		.ctr_name = "rx_crc_errors_phy",
1405 	},
1406 	{
1407 		.dpdk_name = "rx_phy_in_range_len_errors",
1408 		.ctr_name = "rx_in_range_len_errors_phy",
1409 	},
1410 	{
1411 		.dpdk_name = "rx_phy_symbol_errors",
1412 		.ctr_name = "rx_symbol_err_phy",
1413 	},
1414 	{
1415 		.dpdk_name = "tx_phy_errors",
1416 		.ctr_name = "tx_errors_phy",
1417 	},
1418 	{
1419 		.dpdk_name = "rx_out_of_buffer",
1420 		.ctr_name = "out_of_buffer",
1421 		.dev = 1,
1422 	},
1423 	{
1424 		.dpdk_name = "tx_phy_packets",
1425 		.ctr_name = "tx_packets_phy",
1426 	},
1427 	{
1428 		.dpdk_name = "rx_phy_packets",
1429 		.ctr_name = "rx_packets_phy",
1430 	},
1431 	{
1432 		.dpdk_name = "tx_phy_discard_packets",
1433 		.ctr_name = "tx_discards_phy",
1434 	},
1435 	{
1436 		.dpdk_name = "rx_phy_discard_packets",
1437 		.ctr_name = "rx_discards_phy",
1438 	},
1439 	{
1440 		.dpdk_name = "rx_prio0_buf_discard_packets",
1441 		.ctr_name = "rx_prio0_buf_discard",
1442 	},
1443 	{
1444 		.dpdk_name = "rx_prio1_buf_discard_packets",
1445 		.ctr_name = "rx_prio1_buf_discard",
1446 	},
1447 	{
1448 		.dpdk_name = "rx_prio2_buf_discard_packets",
1449 		.ctr_name = "rx_prio2_buf_discard",
1450 	},
1451 	{
1452 		.dpdk_name = "rx_prio3_buf_discard_packets",
1453 		.ctr_name = "rx_prio3_buf_discard",
1454 	},
1455 	{
1456 		.dpdk_name = "rx_prio4_buf_discard_packets",
1457 		.ctr_name = "rx_prio4_buf_discard",
1458 	},
1459 	{
1460 		.dpdk_name = "rx_prio5_buf_discard_packets",
1461 		.ctr_name = "rx_prio5_buf_discard",
1462 	},
1463 	{
1464 		.dpdk_name = "rx_prio6_buf_discard_packets",
1465 		.ctr_name = "rx_prio6_buf_discard",
1466 	},
1467 	{
1468 		.dpdk_name = "rx_prio7_buf_discard_packets",
1469 		.ctr_name = "rx_prio7_buf_discard",
1470 	},
1471 	{
1472 		.dpdk_name = "rx_prio0_cong_discard_packets",
1473 		.ctr_name = "rx_prio0_cong_discard",
1474 	},
1475 	{
1476 		.dpdk_name = "rx_prio1_cong_discard_packets",
1477 		.ctr_name = "rx_prio1_cong_discard",
1478 	},
1479 	{
1480 		.dpdk_name = "rx_prio2_cong_discard_packets",
1481 		.ctr_name = "rx_prio2_cong_discard",
1482 	},
1483 	{
1484 		.dpdk_name = "rx_prio3_cong_discard_packets",
1485 		.ctr_name = "rx_prio3_cong_discard",
1486 	},
1487 	{
1488 		.dpdk_name = "rx_prio4_cong_discard_packets",
1489 		.ctr_name = "rx_prio4_cong_discard",
1490 	},
1491 	{
1492 		.dpdk_name = "rx_prio5_cong_discard_packets",
1493 		.ctr_name = "rx_prio5_cong_discard",
1494 	},
1495 	{
1496 		.dpdk_name = "rx_prio6_cong_discard_packets",
1497 		.ctr_name = "rx_prio6_cong_discard",
1498 	},
1499 	{
1500 		.dpdk_name = "rx_prio7_cong_discard_packets",
1501 		.ctr_name = "rx_prio7_cong_discard",
1502 	},
1503 	{
1504 		.dpdk_name = "tx_phy_bytes",
1505 		.ctr_name = "tx_bytes_phy",
1506 	},
1507 	{
1508 		.dpdk_name = "rx_phy_bytes",
1509 		.ctr_name = "rx_bytes_phy",
1510 	},
1511 	/* Representor only */
1512 	{
1513 		.dpdk_name = "rx_vport_packets",
1514 		.ctr_name = "vport_rx_packets",
1515 	},
1516 	{
1517 		.dpdk_name = "rx_vport_bytes",
1518 		.ctr_name = "vport_rx_bytes",
1519 	},
1520 	{
1521 		.dpdk_name = "tx_vport_packets",
1522 		.ctr_name = "vport_tx_packets",
1523 	},
1524 	{
1525 		.dpdk_name = "tx_vport_bytes",
1526 		.ctr_name = "vport_tx_bytes",
1527 	},
1528 	/**
1529 	 * Device counters: These counters are for the
1530 	 * entire PCI device (NIC). They are not
1531 	 * counted on a per port/queue basis.
1532 	 */
1533 	{
1534 		.dpdk_name = "rx_pci_signal_integrity",
1535 		.ctr_name = "rx_pci_signal_integrity",
1536 	},
1537 	{
1538 		.dpdk_name = "tx_pci_signal_integrity",
1539 		.ctr_name = "tx_pci_signal_integrity",
1540 	},
1541 	{
1542 		.dpdk_name = "outbound_pci_buffer_overflow",
1543 		.ctr_name = "outbound_pci_buffer_overflow",
1544 	},
1545 	{
1546 		.dpdk_name = "outbound_pci_stalled_rd",
1547 		.ctr_name = "outbound_pci_stalled_rd",
1548 	},
1549 	{
1550 		.dpdk_name = "outbound_pci_stalled_wr",
1551 		.ctr_name = "outbound_pci_stalled_wr",
1552 	},
1553 	{
1554 		.dpdk_name = "outbound_pci_stalled_rd_events",
1555 		.ctr_name = "outbound_pci_stalled_rd_events",
1556 	},
1557 	{
1558 		.dpdk_name = "outbound_pci_stalled_wr_events",
1559 		.ctr_name = "outbound_pci_stalled_wr_events",
1560 	},
1561 	{
1562 		.dpdk_name = "dev_out_of_buffer",
1563 		.ctr_name = "dev_out_of_buffer",
1564 	},
1565 };
1566 
1567 static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init);
1568 
1569 static int
1570 mlx5_os_get_stats_strings(struct rte_eth_dev *dev, bool bond_master,
1571 			  struct ethtool_gstrings *strings,
1572 			  uint32_t stats_n, uint32_t stats_n_2nd)
1573 {
1574 	struct mlx5_priv *priv = dev->data->dev_private;
1575 	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1576 	struct ifreq ifr;
1577 	int ret;
1578 	uint32_t i, j, idx;
1579 
1580 	/* Ensure there is no out-of-bounds access below. */
1581 	MLX5_ASSERT(xstats_n <= MLX5_MAX_XSTATS);
1582 	strings->cmd = ETHTOOL_GSTRINGS;
1583 	strings->string_set = ETH_SS_STATS;
1584 	strings->len = stats_n;
1585 	ifr.ifr_data = (caddr_t)strings;
1586 	if (bond_master)
1587 		ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
1588 					   SIOCETHTOOL, &ifr);
1589 	else
1590 		ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1591 	if (ret) {
1592 		DRV_LOG(WARNING, "port %u unable to get statistic names with %d",
1593 			PORT_ID(priv), ret);
1594 		return ret;
1595 	}
1596 	/* Reorganize the loop order to reduce the number of iterations. */
1597 	for (j = 0; j < xstats_n; j++) {
1598 		xstats_ctrl->dev_table_idx[j] = UINT16_MAX;
1599 		for (i = 0; i < stats_n; i++) {
1600 			const char *curr_string =
1601 				(const char *)&strings->data[i * ETH_GSTRING_LEN];
1602 
1603 			if (!strcmp(mlx5_counters_init[j].ctr_name, curr_string)) {
1604 				idx = xstats_ctrl->mlx5_stats_n++;
1605 				xstats_ctrl->dev_table_idx[j] = i;
1606 				xstats_ctrl->xstats_o_idx[j] = idx;
1607 				xstats_ctrl->info[idx] = mlx5_counters_init[j];
1608 			}
1609 		}
1610 	}
1611 	if (!bond_master) {
1612 		/* Add dev counters, unique per IB device. */
1613 		for (j = 0; j != xstats_n; j++) {
1614 			if (mlx5_counters_init[j].dev) {
1615 				idx = xstats_ctrl->mlx5_stats_n++;
1616 				xstats_ctrl->info[idx] = mlx5_counters_init[j];
1617 				xstats_ctrl->hw_stats[idx] = 0;
1618 			}
1619 		}
1620 		return 0;
1621 	}
1622 
1623 	strings->len = stats_n_2nd;
1624 	ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[1].ifname,
1625 				   SIOCETHTOOL, &ifr);
1626 	if (ret) {
1627 		DRV_LOG(WARNING, "port %u unable to get statistic names for 2nd slave with %d",
1628 			PORT_ID(priv), ret);
1629 		return ret;
1630 	}
1631 	/* The 2nd slave port may have a different string set, depending on the configuration. */
1632 	for (j = 0; j != xstats_n; j++) {
1633 		xstats_ctrl->dev_table_idx_2nd[j] = UINT16_MAX;
1634 		for (i = 0; i != stats_n_2nd; i++) {
1635 			const char *curr_string =
1636 				(const char *)&strings->data[i * ETH_GSTRING_LEN];
1637 
1638 			if (!strcmp(mlx5_counters_init[j].ctr_name, curr_string)) {
1639 				xstats_ctrl->dev_table_idx_2nd[j] = i;
1640 				if (xstats_ctrl->dev_table_idx[j] != UINT16_MAX) {
1641 					/* Already mapped in the 1st slave port. */
1642 					idx = xstats_ctrl->xstats_o_idx[j];
1643 					xstats_ctrl->xstats_o_idx_2nd[j] = idx;
1644 				} else {
1645 					/* Append the new items to the end of the map. */
1646 					idx = xstats_ctrl->mlx5_stats_n++;
1647 					xstats_ctrl->xstats_o_idx_2nd[j] = idx;
1648 					xstats_ctrl->info[idx] = mlx5_counters_init[j];
1649 				}
1650 			}
1651 		}
1652 	}
1653 	/* Device counters are always appended last. */
1654 	for (j = 0; j != xstats_n; j++) {
1655 		if (mlx5_counters_init[j].dev) {
1656 			idx = xstats_ctrl->mlx5_stats_n++;
1657 			xstats_ctrl->info[idx] = mlx5_counters_init[j];
1658 			xstats_ctrl->hw_stats[idx] = 0;
1659 		}
1660 	}
1661 	return 0;
1662 }
1663 
1664 /**
1665  * Init the structures to read device counters.
1666  *
1667  * @param dev
1668  *   Pointer to Ethernet device.
1669  */
1670 void
1671 mlx5_os_stats_init(struct rte_eth_dev *dev)
1672 {
1673 	struct mlx5_priv *priv = dev->data->dev_private;
1674 	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1675 	struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
1676 	struct ethtool_gstrings *strings = NULL;
1677 	uint16_t dev_stats_n = 0;
1678 	uint16_t dev_stats_n_2nd = 0;
1679 	unsigned int max_stats_n;
1680 	unsigned int str_sz;
1681 	int ret;
1682 	bool bond_master = (priv->master && priv->pf_bond >= 0);
1683 
1684 	/* Reset so that values do not aggregate across re-initializations. */
1685 	xstats_ctrl->mlx5_stats_n = 0;
1686 	ret = mlx5_os_get_stats_n(dev, bond_master, &dev_stats_n, &dev_stats_n_2nd);
1687 	if (ret < 0) {
1688 		DRV_LOG(WARNING, "port %u no extended statistics available",
1689 			dev->data->port_id);
1690 		return;
1691 	}
1692 	max_stats_n = RTE_MAX(dev_stats_n, dev_stats_n_2nd);
1693 	/* Allocate memory to grab stat names and values. */
1694 	str_sz = max_stats_n * ETH_GSTRING_LEN;
1695 	strings = (struct ethtool_gstrings *)
1696 		  mlx5_malloc(0, str_sz + sizeof(struct ethtool_gstrings), 0,
1697 			      SOCKET_ID_ANY);
1698 	if (!strings) {
1699 		DRV_LOG(WARNING, "port %u unable to allocate memory for xstats",
1700 			dev->data->port_id);
1701 		return;
1702 	}
1703 	ret = mlx5_os_get_stats_strings(dev, bond_master, strings,
1704 					dev_stats_n, dev_stats_n_2nd);
1705 	if (ret < 0) {
1706 		DRV_LOG(WARNING, "port %u failed to get the stats strings",
1707 			dev->data->port_id);
1708 		goto free;
1709 	}
1710 	xstats_ctrl->stats_n = dev_stats_n;
1711 	xstats_ctrl->stats_n_2nd = dev_stats_n_2nd;
1712 	/* Copy to the base values on the first read. */
1713 	ret = mlx5_os_read_dev_counters(dev, bond_master, xstats_ctrl->base);
1714 	if (ret)
1715 		DRV_LOG(ERR, "port %u cannot read device counters: %s",
1716 			dev->data->port_id, strerror(rte_errno));
1717 	mlx5_os_read_dev_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base);
1718 	stats_ctrl->imissed = 0;
1719 free:
1720 	mlx5_free(strings);
1721 }
1722 
1723 /**
1724  * Get MAC address by querying netdevice.
1725  *
1726  * @param[in] dev
1727  *   Pointer to Ethernet device.
1728  * @param[out] mac
1729  *   MAC address output buffer.
1730  *
1731  * @return
1732  *   0 on success, a negative errno value otherwise and rte_errno is set.
1733  */
1734 int
1735 mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN])
1736 {
1737 	struct ifreq request;
1738 	int ret;
1739 
1740 	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
1741 	if (ret)
1742 		return ret;
1743 	memcpy(mac, request.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
1744 	return 0;
1745 }
1746 
1747 /*
1748  * Query dropless_rq private flag value provided by ETHTOOL.
1749  *
1750  * @param dev
1751  *   Pointer to Ethernet device.
1752  *
1753  * @return
1754  *   - 0 on success, flag is not set.
1755  *   - 1 on success, flag is set.
1756  *   - negative errno value otherwise and rte_errno is set.
1757  */
1758 int mlx5_get_flag_dropless_rq(struct rte_eth_dev *dev)
1759 {
1760 	struct ethtool_sset_info *sset_info = NULL;
1761 	struct ethtool_drvinfo drvinfo;
1762 	struct ifreq ifr;
1763 	struct ethtool_gstrings *strings = NULL;
1764 	struct ethtool_value flags;
1765 	const int32_t flag_len = sizeof(flags.data) * CHAR_BIT;
1766 	int32_t str_sz;
1767 	int32_t len;
1768 	int32_t i;
1769 	int ret;
1770 
1771 	sset_info = mlx5_malloc(0, sizeof(struct ethtool_sset_info) +
1772 			sizeof(uint32_t), 0, SOCKET_ID_ANY);
1773 	if (sset_info == NULL) {
1774 		rte_errno = ENOMEM;
1775 		return -rte_errno;
1776 	}
1777 	sset_info->cmd = ETHTOOL_GSSET_INFO;
1778 	sset_info->reserved = 0;
1779 	sset_info->sset_mask = 1ULL << ETH_SS_PRIV_FLAGS;
1780 	ifr.ifr_data = (caddr_t)sset_info;
1781 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1782 	if (!ret) {
1783 		const uint32_t *sset_lengths = sset_info->data;
1784 
1785 		len = sset_info->sset_mask ? sset_lengths[0] : 0;
1786 	} else if (ret == -EOPNOTSUPP) {
1787 		drvinfo.cmd = ETHTOOL_GDRVINFO;
1788 		ifr.ifr_data = (caddr_t)&drvinfo;
1789 		ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1790 		if (ret) {
1791 			DRV_LOG(WARNING, "port %u cannot get the driver info",
1792 				dev->data->port_id);
1793 			goto exit;
1794 		}
1795 		len = *(uint32_t *)((char *)&drvinfo +
1796 			offsetof(struct ethtool_drvinfo, n_priv_flags));
1797 	} else {
1798 		DRV_LOG(WARNING, "port %u cannot get the sset info",
1799 			dev->data->port_id);
1800 		goto exit;
1801 	}
1802 	if (!len) {
1803 		DRV_LOG(WARNING, "port %u does not have private flag",
1804 			dev->data->port_id);
1805 		rte_errno = EOPNOTSUPP;
1806 		ret = -rte_errno;
1807 		goto exit;
1808 	} else if (len > flag_len) {
1809 		DRV_LOG(WARNING, "port %u maximal private flags number is %d",
1810 			dev->data->port_id, flag_len);
1811 		len = flag_len;
1812 	}
1813 	str_sz = ETH_GSTRING_LEN * len;
1814 	strings = (struct ethtool_gstrings *)
1815 		  mlx5_malloc(0, str_sz + sizeof(struct ethtool_gstrings), 0,
1816 			      SOCKET_ID_ANY);
1817 	if (!strings) {
1818 		DRV_LOG(WARNING, "port %u unable to allocate memory for"
1819 			" private flags", dev->data->port_id);
1820 		rte_errno = ENOMEM;
1821 		ret = -rte_errno;
1822 		goto exit;
1823 	}
1824 	strings->cmd = ETHTOOL_GSTRINGS;
1825 	strings->string_set = ETH_SS_PRIV_FLAGS;
1826 	strings->len = len;
1827 	ifr.ifr_data = (caddr_t)strings;
1828 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1829 	if (ret) {
1830 		DRV_LOG(WARNING, "port %u unable to get private flags strings",
1831 			dev->data->port_id);
1832 		goto exit;
1833 	}
1834 	for (i = 0; i < len; i++) {
1835 		strings->data[(i + 1) * ETH_GSTRING_LEN - 1] = 0;
1836 		if (!strcmp((const char *)strings->data + i * ETH_GSTRING_LEN,
1837 			     "dropless_rq"))
1838 			break;
1839 	}
1840 	if (i == len) {
1841 		DRV_LOG(WARNING, "port %u does not support dropless_rq",
1842 			dev->data->port_id);
1843 		rte_errno = EOPNOTSUPP;
1844 		ret = -rte_errno;
1845 		goto exit;
1846 	}
1847 	flags.cmd = ETHTOOL_GPFLAGS;
1848 	ifr.ifr_data = (caddr_t)&flags;
1849 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1850 	if (ret) {
1851 		DRV_LOG(WARNING, "port %u unable to get private flags status",
1852 			dev->data->port_id);
1853 		goto exit;
1854 	}
1855 	ret = !!(flags.data & (1U << i));
1856 exit:
1857 	mlx5_free(strings);
1858 	mlx5_free(sset_info);
1859 	return ret;
1860 }
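
/*
 * Summary of the private-flag query sequence used above (grounded in the
 * function itself): ETHTOOL_GSSET_INFO (or ETHTOOL_GDRVINFO as a fallback)
 * yields the number of private flags, ETHTOOL_GSTRINGS with ETH_SS_PRIV_FLAGS
 * yields their names, and ETHTOOL_GPFLAGS returns a bitmask in which bit i
 * corresponds to string i, hence the final test:
 *
 *	ret = !!(flags.data & (1U << i));
 */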
1861 
1862 /**
1863  * Unmaps HCA PCI BAR from the current process address space.
1864  *
1865  * @param dev
1866  *   Pointer to Ethernet device structure.
1867  */
1868 void mlx5_txpp_unmap_hca_bar(struct rte_eth_dev *dev)
1869 {
1870 	struct mlx5_proc_priv *ppriv = dev->process_private;
1871 
1872 	if (ppriv && ppriv->hca_bar) {
1873 		rte_mem_unmap(ppriv->hca_bar, MLX5_ST_SZ_BYTES(initial_seg));
1874 		ppriv->hca_bar = NULL;
1875 	}
1876 }
1877 
1878 /**
1879  * Maps HCA PCI BAR to the current process address space.
1880  * Stores the pointer in the process private structure, allowing the
1881  * internal and real-time counters to be read directly from the HW.
1882  *
1883  * @param dev
1884  *   Pointer to Ethernet device structure.
1885  *
1886  * @return
1887  *   0 on success, with a non-NULL pointer to the mapped area stored in the
1888  *   process private structure; negative otherwise and the pointer stays NULL.
1889  */
1890 int mlx5_txpp_map_hca_bar(struct rte_eth_dev *dev)
1891 {
1892 	struct mlx5_proc_priv *ppriv = dev->process_private;
1893 	char pci_addr[PCI_PRI_STR_SIZE] = { 0 };
1894 	void *base, *expected = NULL;
1895 	int fd, ret;
1896 
1897 	if (!ppriv) {
1898 		rte_errno = ENOMEM;
1899 		return -rte_errno;
1900 	}
1901 	if (ppriv->hca_bar)
1902 		return 0;
1903 	ret = mlx5_dev_to_pci_str(dev->device, pci_addr, sizeof(pci_addr));
1904 	if (ret < 0)
1905 		return -rte_errno;
1906 	/* Open PCI device resource 0 - HCA initialization segment. */
1907 	MKSTR(name, "/sys/bus/pci/devices/%s/resource0", pci_addr);
1908 	fd = open(name, O_RDWR | O_SYNC);
1909 	if (fd == -1) {
1910 		rte_errno = ENOTSUP;
1911 		return -ENOTSUP;
1912 	}
1913 	base = rte_mem_map(NULL, MLX5_ST_SZ_BYTES(initial_seg),
1914 			   RTE_PROT_READ, RTE_MAP_SHARED, fd, 0);
1915 	close(fd);
1916 	if (!base) {
1917 		rte_errno = ENOTSUP;
1918 		return -ENOTSUP;
1919 	}
1920 	/* Check there is no concurrent mapping in other thread. */
1921 	if (!__atomic_compare_exchange_n(&ppriv->hca_bar, &expected,
1922 					 base, false,
1923 					 __ATOMIC_RELAXED, __ATOMIC_RELAXED))
1924 		rte_mem_unmap(base, MLX5_ST_SZ_BYTES(initial_seg));
1925 	return 0;
1926 }
1927 
1928