xref: /dpdk/drivers/net/mlx5/linux/mlx5_ethdev_os.c (revision 1be61fe1333cd4a8a4bba6eb0d68780073fe3512)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5 
6 #include <stddef.h>
7 #include <inttypes.h>
8 #include <unistd.h>
9 #include <stdbool.h>
10 #include <stdint.h>
11 #include <stdio.h>
12 #include <string.h>
13 #include <stdlib.h>
14 #include <errno.h>
15 #include <dirent.h>
16 #include <net/if.h>
17 #include <sys/ioctl.h>
18 #include <sys/socket.h>
19 #include <netinet/in.h>
20 #include <linux/ethtool.h>
21 #include <linux/sockios.h>
22 #include <fcntl.h>
23 #include <stdalign.h>
24 #include <sys/un.h>
25 #include <time.h>
26 
27 #include <ethdev_linux_ethtool.h>
28 #include <ethdev_driver.h>
29 #include <bus_pci_driver.h>
30 #include <rte_mbuf.h>
31 #include <rte_common.h>
32 #include <rte_eal_paging.h>
33 #include <rte_interrupts.h>
34 #include <rte_malloc.h>
35 #include <rte_string_fns.h>
36 #include <rte_rwlock.h>
37 #include <rte_cycles.h>
38 
39 #include <mlx5_glue.h>
40 #include <mlx5_devx_cmds.h>
41 #include <mlx5_common.h>
42 #include <mlx5_malloc.h>
43 #include <mlx5_nl.h>
44 
45 #include "mlx5.h"
46 #include "mlx5_rxtx.h"
47 #include "mlx5_utils.h"
48 
49 /* Get interface index from SubFunction device name. */
50 int
51 mlx5_auxiliary_get_ifindex(const char *sf_name)
52 {
53 	char if_name[IF_NAMESIZE] = { 0 };
54 
55 	if (mlx5_auxiliary_get_child_name(sf_name, "/net",
56 					  if_name, sizeof(if_name)) != 0)
57 		return -rte_errno;
58 	return if_nametoindex(if_name);
59 }
60 
61 /**
62  * Get interface name from private structure.
63  *
64  * This is a port representor-aware version of mlx5_get_ifname_sysfs().
65  *
66  * @param[in] dev
67  *   Pointer to Ethernet device.
68  * @param[out] ifname
69  *   Interface name output buffer.
70  *
71  * @return
72  *   0 on success, a negative errno value otherwise and rte_errno is set.
73  */
74 int
75 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[MLX5_NAMESIZE])
76 {
77 	struct mlx5_priv *priv = dev->data->dev_private;
78 	unsigned int ifindex;
79 
80 	MLX5_ASSERT(priv);
81 	MLX5_ASSERT(priv->sh);
82 	if (priv->master && priv->sh->bond.ifindex > 0) {
83 		memcpy(ifname, priv->sh->bond.ifname, MLX5_NAMESIZE);
84 		return 0;
85 	}
86 	ifindex = mlx5_ifindex(dev);
87 	if (!ifindex) {
88 		if (!priv->representor)
89 			return mlx5_get_ifname_sysfs(priv->sh->ibdev_path,
90 						     *ifname);
91 		rte_errno = ENXIO;
92 		return -rte_errno;
93 	}
94 	if (if_indextoname(ifindex, &(*ifname)[0]))
95 		return 0;
96 	rte_errno = errno;
97 	return -rte_errno;
98 }
99 
100 /**
101  * Perform ifreq ioctl() on associated netdev ifname.
102  *
103  * @param[in] ifname
104  *   Pointer to netdev name.
105  * @param req
106  *   Request number to pass to ioctl().
107  * @param[out] ifr
108  *   Interface request structure output buffer.
109  *
110  * @return
111  *   0 on success, a negative errno value otherwise and rte_errno is set.
112  */
113 static int
114 mlx5_ifreq_by_ifname(const char *ifname, int req, struct ifreq *ifr)
115 {
116 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
117 	int ret = 0;
118 
119 	if (sock == -1) {
120 		rte_errno = errno;
121 		return -rte_errno;
122 	}
123 	rte_strscpy(ifr->ifr_name, ifname, sizeof(ifr->ifr_name));
124 	ret = ioctl(sock, req, ifr);
125 	if (ret == -1) {
126 		rte_errno = errno;
127 		goto error;
128 	}
129 	close(sock);
130 	return 0;
131 error:
132 	close(sock);
133 	return -rte_errno;
134 }
135 
136 /**
137  * Perform ifreq ioctl() on associated Ethernet device.
138  *
139  * @param[in] dev
140  *   Pointer to Ethernet device.
141  * @param req
142  *   Request number to pass to ioctl().
143  * @param[out] ifr
144  *   Interface request structure output buffer.
145  *
146  * @return
147  *   0 on success, a negative errno value otherwise and rte_errno is set.
148  */
149 static int
150 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
151 {
152 	char ifname[sizeof(ifr->ifr_name)];
153 	int ret;
154 
155 	ret = mlx5_get_ifname(dev, &ifname);
156 	if (ret)
157 		return -rte_errno;
158 	return mlx5_ifreq_by_ifname(ifname, req, ifr);
159 }
160 
161 /**
162  * Get device MTU.
163  *
164  * @param dev
165  *   Pointer to Ethernet device.
166  * @param[out] mtu
167  *   MTU value output buffer.
168  *
169  * @return
170  *   0 on success, a negative errno value otherwise and rte_errno is set.
171  */
172 int
173 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
174 {
175 	struct ifreq request;
176 	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
177 
178 	if (ret)
179 		return ret;
180 	*mtu = request.ifr_mtu;
181 	return 0;
182 }
183 
184 /**
185  * Set device MTU.
186  *
187  * @param dev
188  *   Pointer to Ethernet device.
189  * @param mtu
190  *   MTU value to set.
191  *
192  * @return
193  *   0 on success, a negative errno value otherwise and rte_errno is set.
194  */
195 int
196 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
197 {
198 	struct ifreq request = { .ifr_mtu = mtu, };
199 
200 	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
201 }
202 
203 /**
204  * Set device flags.
205  *
206  * @param dev
207  *   Pointer to Ethernet device.
208  * @param keep
209  *   Bitmask for flags that must remain untouched.
210  * @param flags
211  *   Bitmask for flags to modify.
212  *
213  * @return
214  *   0 on success, a negative errno value otherwise and rte_errno is set.
215  */
216 static int
217 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
218 {
219 	struct ifreq request;
220 	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
221 
222 	if (ret)
223 		return ret;
224 	request.ifr_flags &= keep;
225 	request.ifr_flags |= flags & ~keep;
226 	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
227 }
228 
229 /**
230  * Get device current raw clock counter
231  *
232  * @param dev
233  *   Pointer to Ethernet device structure.
234  * @param[out] time
235  *   Current raw clock counter of the device.
236  *
237  * @return
238  *   0 if the clock has correctly been read
239  *   The value of errno in case of error
240  */
241 int
242 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
243 {
244 	struct mlx5_priv *priv = dev->data->dev_private;
245 	struct ibv_context *ctx = priv->sh->cdev->ctx;
246 	struct ibv_values_ex values;
247 	int err = 0;
248 
249 	values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
250 	err = mlx5_glue->query_rt_values_ex(ctx, &values);
251 	if (err != 0) {
252 		DRV_LOG(WARNING, "Could not query the clock !");
253 		return err;
254 	}
255 	*clock = values.raw_clock.tv_nsec;
256 	return 0;
257 }
258 
259 /**
260  * Retrieve the master device for representor in the same switch domain.
261  *
262  * @param dev
263  *   Pointer to representor Ethernet device structure.
264  *
265  * @return
266  *   Master device structure  on success, NULL otherwise.
267  */
268 static struct rte_eth_dev *
269 mlx5_find_master_dev(struct rte_eth_dev *dev)
270 {
271 	struct mlx5_priv *priv;
272 	uint16_t port_id;
273 	uint16_t domain_id;
274 
275 	priv = dev->data->dev_private;
276 	domain_id = priv->domain_id;
277 	MLX5_ASSERT(priv->representor);
278 	MLX5_ETH_FOREACH_DEV(port_id, dev->device) {
279 		struct mlx5_priv *opriv =
280 			rte_eth_devices[port_id].data->dev_private;
281 		if (opriv &&
282 		    opriv->master &&
283 		    opriv->domain_id == domain_id &&
284 		    opriv->sh == priv->sh)
285 			return &rte_eth_devices[port_id];
286 	}
287 	return NULL;
288 }
289 
290 /**
291  * DPDK callback to retrieve physical link information.
292  *
293  * @param dev
294  *   Pointer to Ethernet device structure.
295  * @param[out] link
296  *   Storage for current link status.
297  *
298  * @return
299  *   0 on success, a negative errno value otherwise and rte_errno is set.
300  */
301 static int
302 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
303 			       struct rte_eth_link *link)
304 {
305 	struct mlx5_priv *priv = dev->data->dev_private;
306 	struct ethtool_cmd edata = {
307 		.cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
308 	};
309 	struct ifreq ifr;
310 	struct rte_eth_link dev_link;
311 	int link_speed = 0;
312 	int ret;
313 
314 	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
315 	if (ret) {
316 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
317 			dev->data->port_id, strerror(rte_errno));
318 		return ret;
319 	}
320 	dev_link = (struct rte_eth_link) {
321 		.link_status = ((ifr.ifr_flags & IFF_UP) &&
322 				(ifr.ifr_flags & IFF_RUNNING)),
323 	};
324 	ifr = (struct ifreq) {
325 		.ifr_data = (void *)&edata,
326 	};
327 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
328 	if (ret) {
329 		if (ret == -ENOTSUP && priv->representor) {
330 			struct rte_eth_dev *master;
331 
332 			/*
333 			 * For representors we can try to inherit link
334 			 * settings from the master device. Actually
335 			 * link settings do not make a lot of sense
336 			 * for representors due to missing physical
337 			 * link. The old kernel drivers supported
338 			 * emulated settings query for representors,
339 			 * the new ones do not, so we have to add
340 			 * this code for compatibility issues.
341 			 */
342 			master = mlx5_find_master_dev(dev);
343 			if (master) {
344 				ifr = (struct ifreq) {
345 					.ifr_data = (void *)&edata,
346 				};
347 				ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
348 			}
349 		}
350 		if (ret) {
351 			DRV_LOG(WARNING,
352 				"port %u ioctl(SIOCETHTOOL,"
353 				" ETHTOOL_GSET) failed: %s",
354 				dev->data->port_id, strerror(rte_errno));
355 			return ret;
356 		}
357 	}
358 	link_speed = ethtool_cmd_speed(&edata);
359 	if (link_speed == -1)
360 		dev_link.link_speed = RTE_ETH_SPEED_NUM_UNKNOWN;
361 	else
362 		dev_link.link_speed = link_speed;
363 	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
364 				RTE_ETH_LINK_HALF_DUPLEX : RTE_ETH_LINK_FULL_DUPLEX);
365 	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
366 			RTE_ETH_LINK_SPEED_FIXED);
367 	*link = dev_link;
368 	priv->link_speed_capa = rte_eth_link_speed_gset(edata.supported);
369 	return 0;
370 }
371 
372 /**
373  * Retrieve physical link information (unlocked version using new ioctl).
374  *
375  * @param dev
376  *   Pointer to Ethernet device structure.
377  * @param[out] link
378  *   Storage for current link status.
379  *
380  * @return
381  *   0 on success, a negative errno value otherwise and rte_errno is set.
382  */
383 static int
384 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
385 			     struct rte_eth_link *link)
386 
387 {
388 	struct mlx5_priv *priv = dev->data->dev_private;
389 	struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
390 	struct ifreq ifr;
391 	struct rte_eth_link dev_link;
392 	struct rte_eth_dev *master = NULL;
393 	int ret;
394 
395 	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
396 	if (ret) {
397 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
398 			dev->data->port_id, strerror(rte_errno));
399 		return ret;
400 	}
401 	dev_link = (struct rte_eth_link) {
402 		.link_status = ((ifr.ifr_flags & IFF_UP) &&
403 				(ifr.ifr_flags & IFF_RUNNING)),
404 	};
405 	ifr = (struct ifreq) {
406 		.ifr_data = (void *)&gcmd,
407 	};
408 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
409 	if (ret) {
410 		if (ret == -ENOTSUP && priv->representor) {
411 			/*
412 			 * For representors we can try to inherit link
413 			 * settings from the master device. Actually
414 			 * link settings do not make a lot of sense
415 			 * for representors due to missing physical
416 			 * link. The old kernel drivers supported
417 			 * emulated settings query for representors,
418 			 * the new ones do not, so we have to add
419 			 * this code for compatibility issues.
420 			 */
421 			master = mlx5_find_master_dev(dev);
422 			if (master) {
423 				ifr = (struct ifreq) {
424 					.ifr_data = (void *)&gcmd,
425 				};
426 				ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
427 			}
428 		}
429 		if (ret) {
430 			DRV_LOG(DEBUG,
431 				"port %u ioctl(SIOCETHTOOL,"
432 				" ETHTOOL_GLINKSETTINGS) failed: %s",
433 				dev->data->port_id, strerror(rte_errno));
434 			return ret;
435 		}
436 	}
437 	gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
438 
439 	alignas(struct ethtool_link_settings)
440 	uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
441 		     sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
442 	struct ethtool_link_settings *ecmd = (void *)data;
443 
444 	*ecmd = gcmd;
445 	ifr.ifr_data = (void *)ecmd;
446 	ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
447 	if (ret) {
448 		DRV_LOG(DEBUG,
449 			"port %u ioctl(SIOCETHTOOL,"
450 			"ETHTOOL_GLINKSETTINGS) failed: %s",
451 			dev->data->port_id, strerror(rte_errno));
452 		return ret;
453 	}
454 
455 	dev_link.link_speed = (ecmd->speed == UINT32_MAX) ?
456 				RTE_ETH_SPEED_NUM_UNKNOWN : ecmd->speed;
457 	dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
458 				RTE_ETH_LINK_HALF_DUPLEX : RTE_ETH_LINK_FULL_DUPLEX);
459 	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
460 				  RTE_ETH_LINK_SPEED_FIXED);
461 	*link = dev_link;
462 
463 	priv->link_speed_capa = rte_eth_link_speed_glink(ecmd->link_mode_masks,
464 			ecmd->link_mode_masks_nwords);
465 
466 	return 0;
467 }
468 
469 /**
470  * DPDK callback to retrieve physical link information.
471  *
472  * @param dev
473  *   Pointer to Ethernet device structure.
474  * @param wait_to_complete
475  *   Wait for request completion.
476  *
477  * @return
478  *   0 if link status was not updated, positive if it was, a negative errno
479  *   value otherwise and rte_errno is set.
480  */
481 int
482 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
483 {
484 	int ret;
485 	struct rte_eth_link dev_link;
486 	time_t start_time = time(NULL);
487 	int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT;
488 
489 	do {
490 		ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
491 		if (ret == -ENOTSUP)
492 			ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
493 		if (ret == 0)
494 			break;
495 		/* Handle wait to complete situation. */
496 		if ((wait_to_complete || retry) && ret == -EAGAIN) {
497 			if (abs((int)difftime(time(NULL), start_time)) <
498 			    MLX5_LINK_STATUS_TIMEOUT) {
499 				usleep(0);
500 				continue;
501 			} else {
502 				rte_errno = EBUSY;
503 				return -rte_errno;
504 			}
505 		} else if (ret < 0) {
506 			return ret;
507 		}
508 	} while (wait_to_complete || retry-- > 0);
509 	ret = !!memcmp(&dev->data->dev_link, &dev_link,
510 		       sizeof(struct rte_eth_link));
511 	dev->data->dev_link = dev_link;
512 	return ret;
513 }
514 
515 /**
516  * DPDK callback to get flow control status.
517  *
518  * @param dev
519  *   Pointer to Ethernet device structure.
520  * @param[out] fc_conf
521  *   Flow control output buffer.
522  *
523  * @return
524  *   0 on success, a negative errno value otherwise and rte_errno is set.
525  */
526 int
527 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
528 {
529 	struct ifreq ifr;
530 	struct ethtool_pauseparam ethpause = {
531 		.cmd = ETHTOOL_GPAUSEPARAM
532 	};
533 	int ret;
534 
535 	ifr.ifr_data = (void *)&ethpause;
536 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
537 	if (ret) {
538 		DRV_LOG(DEBUG,
539 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
540 			" %s",
541 			dev->data->port_id, strerror(rte_errno));
542 		return ret;
543 	}
544 	fc_conf->autoneg = ethpause.autoneg;
545 	if (ethpause.rx_pause && ethpause.tx_pause)
546 		fc_conf->mode = RTE_ETH_FC_FULL;
547 	else if (ethpause.rx_pause)
548 		fc_conf->mode = RTE_ETH_FC_RX_PAUSE;
549 	else if (ethpause.tx_pause)
550 		fc_conf->mode = RTE_ETH_FC_TX_PAUSE;
551 	else
552 		fc_conf->mode = RTE_ETH_FC_NONE;
553 	return 0;
554 }
555 
556 /**
557  * DPDK callback to modify flow control parameters.
558  *
559  * @param dev
560  *   Pointer to Ethernet device structure.
561  * @param[in] fc_conf
562  *   Flow control parameters.
563  *
564  * @return
565  *   0 on success, a negative errno value otherwise and rte_errno is set.
566  */
567 int
568 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
569 {
570 	struct ifreq ifr;
571 	struct ethtool_pauseparam ethpause = {
572 		.cmd = ETHTOOL_SPAUSEPARAM
573 	};
574 	int ret;
575 
576 	ifr.ifr_data = (void *)&ethpause;
577 	ethpause.autoneg = fc_conf->autoneg;
578 	if (((fc_conf->mode & RTE_ETH_FC_FULL) == RTE_ETH_FC_FULL) ||
579 	    (fc_conf->mode & RTE_ETH_FC_RX_PAUSE))
580 		ethpause.rx_pause = 1;
581 	else
582 		ethpause.rx_pause = 0;
583 
584 	if (((fc_conf->mode & RTE_ETH_FC_FULL) == RTE_ETH_FC_FULL) ||
585 	    (fc_conf->mode & RTE_ETH_FC_TX_PAUSE))
586 		ethpause.tx_pause = 1;
587 	else
588 		ethpause.tx_pause = 0;
589 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
590 	if (ret) {
591 		DRV_LOG(WARNING,
592 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
593 			" failed: %s",
594 			dev->data->port_id, strerror(rte_errno));
595 		return ret;
596 	}
597 	return 0;
598 }
599 
600 /**
601  * Handle asynchronous removal event for entire multiport device.
602  *
603  * @param sh
604  *   Infiniband device shared context.
605  */
606 static void
607 mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh)
608 {
609 	uint32_t i;
610 
611 	for (i = 0; i < sh->max_port; ++i) {
612 		struct rte_eth_dev *dev;
613 		struct mlx5_priv *priv;
614 
615 		if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
616 			/*
617 			 * Or not existing port either no
618 			 * handler installed for this port.
619 			 */
620 			continue;
621 		}
622 		dev = &rte_eth_devices[sh->port[i].ih_port_id];
623 		MLX5_ASSERT(dev);
624 		priv = dev->data->dev_private;
625 		MLX5_ASSERT(priv);
626 		if (!priv->rmv_notified && dev->data->dev_conf.intr_conf.rmv) {
627 			/* Notify driver about removal only once. */
628 			priv->rmv_notified = 1;
629 			rte_eth_dev_callback_process
630 				(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
631 		}
632 	}
633 }
634 
635 static bool
636 mlx5_dev_nl_ifindex_verify(uint32_t if_index, struct mlx5_priv *priv)
637 {
638 	struct mlx5_bond_info *bond = &priv->sh->bond;
639 	int i;
640 
641 	if (bond->n_port == 0)
642 		return (if_index == priv->if_index);
643 
644 	if (if_index == bond->ifindex)
645 		return true;
646 	for (i = 0; i < bond->n_port; i++) {
647 		if (i >= MLX5_BOND_MAX_PORTS)
648 			return false;
649 		if (if_index == bond->ports[i].ifindex)
650 			return true;
651 	}
652 
653 	return false;
654 }
655 
656 static void
657 mlx5_link_update_bond(struct rte_eth_dev *dev)
658 {
659 	struct mlx5_priv *priv = dev->data->dev_private;
660 	struct mlx5_bond_info *bond = &priv->sh->bond;
661 	struct ifreq ifr = (struct ifreq) {
662 		.ifr_flags = 0,
663 	};
664 	int ret;
665 
666 	ret = mlx5_ifreq_by_ifname(bond->ifname, SIOCGIFFLAGS, &ifr);
667 	if (ret) {
668 		DRV_LOG(WARNING, "ifname %s ioctl(SIOCGIFFLAGS) failed: %s",
669 			bond->ifname, strerror(rte_errno));
670 		return;
671 	}
672 	dev->data->dev_link.link_status =
673 		((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING));
674 }
675 
676 static void
677 mlx5_dev_interrupt_nl_cb(struct nlmsghdr *hdr, void *cb_arg)
678 {
679 	struct mlx5_dev_ctx_shared *sh = cb_arg;
680 	uint32_t i;
681 	uint32_t if_index;
682 
683 	if (mlx5_nl_parse_link_status_update(hdr, &if_index) < 0)
684 		return;
685 	for (i = 0; i < sh->max_port; i++) {
686 		struct mlx5_dev_shared_port *port = &sh->port[i];
687 		struct rte_eth_dev *dev;
688 		struct mlx5_priv *priv;
689 
690 		if (port->nl_ih_port_id >= RTE_MAX_ETHPORTS)
691 			continue;
692 		dev = &rte_eth_devices[port->nl_ih_port_id];
693 		/* Probing may initiate an LSC before configuration is done. */
694 		if (dev->data->dev_configured &&
695 		    !dev->data->dev_conf.intr_conf.lsc)
696 			break;
697 		priv = dev->data->dev_private;
698 		if (mlx5_dev_nl_ifindex_verify(if_index, priv)) {
699 			/* Block logical LSC events. */
700 			uint16_t prev_status = dev->data->dev_link.link_status;
701 
702 			if (mlx5_link_update(dev, 0) < 0) {
703 				DRV_LOG(ERR, "Failed to update link status: %s",
704 					rte_strerror(rte_errno));
705 			} else {
706 				if (priv->sh->bond.n_port)
707 					mlx5_link_update_bond(dev);
708 				if (prev_status != dev->data->dev_link.link_status)
709 					rte_eth_dev_callback_process
710 						(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
711 			}
712 			break;
713 		}
714 	}
715 }
716 
717 void
718 mlx5_dev_interrupt_handler_nl(void *arg)
719 {
720 	struct mlx5_dev_ctx_shared *sh = arg;
721 	int nlsk_fd = rte_intr_fd_get(sh->intr_handle_nl);
722 
723 	if (nlsk_fd < 0)
724 		return;
725 	if (mlx5_nl_read_events(nlsk_fd, mlx5_dev_interrupt_nl_cb, sh) < 0)
726 		DRV_LOG(ERR, "Failed to process Netlink events: %s",
727 			rte_strerror(rte_errno));
728 }
729 
730 /**
731  * Handle shared asynchronous events the NIC (removal event
732  * and link status change). Supports multiport IB device.
733  *
734  * @param cb_arg
735  *   Callback argument.
736  */
737 void
738 mlx5_dev_interrupt_handler(void *cb_arg)
739 {
740 	struct mlx5_dev_ctx_shared *sh = cb_arg;
741 	struct ibv_async_event event;
742 
743 	/* Read all message from the IB device and acknowledge them. */
744 	for (;;) {
745 		struct rte_eth_dev *dev;
746 		uint32_t tmp;
747 
748 		if (mlx5_glue->get_async_event(sh->cdev->ctx, &event)) {
749 			if (errno == EIO) {
750 				DRV_LOG(DEBUG,
751 					"IBV async event queue closed on: %s",
752 					sh->ibdev_name);
753 				mlx5_dev_interrupt_device_fatal(sh);
754 			}
755 			break;
756 		}
757 		if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
758 			/*
759 			 * The DEVICE_FATAL event can be called by kernel
760 			 * twice - from mlx5 and uverbs layers, and port
761 			 * index is not applicable. We should notify all
762 			 * existing ports.
763 			 */
764 			mlx5_dev_interrupt_device_fatal(sh);
765 			mlx5_glue->ack_async_event(&event);
766 			continue;
767 		}
768 		/* Retrieve and check IB port index. */
769 		tmp = (uint32_t)event.element.port_num;
770 		MLX5_ASSERT(tmp <= sh->max_port);
771 		if (!tmp) {
772 			/* Unsupported device level event. */
773 			mlx5_glue->ack_async_event(&event);
774 			DRV_LOG(DEBUG,
775 				"unsupported common event (type %d)",
776 				event.event_type);
777 			continue;
778 		}
779 		if (tmp > sh->max_port) {
780 			/* Invalid IB port index. */
781 			mlx5_glue->ack_async_event(&event);
782 			DRV_LOG(DEBUG,
783 				"cannot handle an event (type %d)"
784 				"due to invalid IB port index (%u)",
785 				event.event_type, tmp);
786 			continue;
787 		}
788 		if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
789 			/* No handler installed. */
790 			mlx5_glue->ack_async_event(&event);
791 			DRV_LOG(DEBUG,
792 				"cannot handle an event (type %d)"
793 				"due to no handler installed for port %u",
794 				event.event_type, tmp);
795 			continue;
796 		}
797 		/* Retrieve ethernet device descriptor. */
798 		tmp = sh->port[tmp - 1].ih_port_id;
799 		dev = &rte_eth_devices[tmp];
800 		MLX5_ASSERT(dev);
801 		DRV_LOG(DEBUG,
802 			"port %u cannot handle an unknown event (type %d)",
803 			dev->data->port_id, event.event_type);
804 		mlx5_glue->ack_async_event(&event);
805 	}
806 }
807 
808 /**
809  * Handle DEVX interrupts from the NIC.
810  * This function is probably called from the DPDK host thread.
811  *
812  * @param cb_arg
813  *   Callback argument.
814  */
815 void
816 mlx5_dev_interrupt_handler_devx(void *cb_arg)
817 {
818 #ifndef HAVE_IBV_DEVX_ASYNC
819 	(void)cb_arg;
820 	return;
821 #else
822 	struct mlx5_dev_ctx_shared *sh = cb_arg;
823 	union {
824 		struct mlx5dv_devx_async_cmd_hdr cmd_resp;
825 		uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
826 			    MLX5_ST_SZ_BYTES(traffic_counter) +
827 			    sizeof(struct mlx5dv_devx_async_cmd_hdr)];
828 	} out;
829 	uint8_t *buf = out.buf + sizeof(out.cmd_resp);
830 
831 	while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
832 						   &out.cmd_resp,
833 						   sizeof(out.buf)))
834 		mlx5_flow_async_pool_query_handle
835 			(sh, (uint64_t)out.cmd_resp.wr_id,
836 			 mlx5_devx_get_out_command_status(buf));
837 #endif /* HAVE_IBV_DEVX_ASYNC */
838 }
839 
840 /**
841  * DPDK callback to bring the link DOWN.
842  *
843  * @param dev
844  *   Pointer to Ethernet device structure.
845  *
846  * @return
847  *   0 on success, a negative errno value otherwise and rte_errno is set.
848  */
849 int
850 mlx5_set_link_down(struct rte_eth_dev *dev)
851 {
852 	return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
853 }
854 
855 /**
856  * DPDK callback to bring the link UP.
857  *
858  * @param dev
859  *   Pointer to Ethernet device structure.
860  *
861  * @return
862  *   0 on success, a negative errno value otherwise and rte_errno is set.
863  */
864 int
865 mlx5_set_link_up(struct rte_eth_dev *dev)
866 {
867 	return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
868 }
869 
870 /**
871  * Check if mlx5 device was removed.
872  *
873  * @param dev
874  *   Pointer to Ethernet device structure.
875  *
876  * @return
877  *   1 when device is removed, otherwise 0.
878  */
879 int
880 mlx5_is_removed(struct rte_eth_dev *dev)
881 {
882 	struct ibv_device_attr device_attr;
883 	struct mlx5_priv *priv = dev->data->dev_private;
884 
885 	if (mlx5_glue->query_device(priv->sh->cdev->ctx, &device_attr) == EIO)
886 		return 1;
887 	return 0;
888 }
889 
890 /**
891  * Analyze gathered port parameters via sysfs to recognize master
892  * and representor devices for E-Switch configuration.
893  *
894  * @param[in] device_dir
895  *   flag of presence of "device" directory under port device key.
896  * @param[inout] switch_info
897  *   Port information, including port name as a number and port name
898  *   type if recognized
899  *
900  * @return
901  *   master and representor flags are set in switch_info according to
902  *   recognized parameters (if any).
903  */
904 static void
905 mlx5_sysfs_check_switch_info(bool device_dir,
906 			     struct mlx5_switch_info *switch_info)
907 {
908 	switch (switch_info->name_type) {
909 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
910 		/*
911 		 * Name is not recognized, assume the master,
912 		 * check the device directory presence.
913 		 */
914 		switch_info->master = device_dir;
915 		break;
916 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
917 		/*
918 		 * Name is not set, this assumes the legacy naming
919 		 * schema for master, just check if there is
920 		 * a device directory.
921 		 */
922 		switch_info->master = device_dir;
923 		break;
924 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
925 		/* New uplink naming schema recognized. */
926 		switch_info->master = 1;
927 		break;
928 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
929 		/* Legacy representors naming schema. */
930 		switch_info->representor = !device_dir;
931 		break;
932 	case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
933 		/* Fallthrough */
934 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
935 		/* Fallthrough */
936 	case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
937 		/* New representors naming schema. */
938 		switch_info->representor = 1;
939 		break;
940 	default:
941 		switch_info->master = device_dir;
942 		break;
943 	}
944 }
945 
946 /**
947  * Get switch information associated with network interface.
948  *
949  * @param ifindex
950  *   Network interface index.
951  * @param[out] info
952  *   Switch information object, populated in case of success.
953  *
954  * @return
955  *   0 on success, a negative errno value otherwise and rte_errno is set.
956  */
957 int
958 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
959 {
960 	char ifname[IF_NAMESIZE];
961 	char *port_name = NULL;
962 	size_t port_name_size = 0;
963 	FILE *file;
964 	struct mlx5_switch_info data = {
965 		.master = 0,
966 		.representor = 0,
967 		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
968 		.port_name = 0,
969 		.switch_id = 0,
970 	};
971 	DIR *dir;
972 	bool port_switch_id_set = false;
973 	bool device_dir = false;
974 	char c;
975 	ssize_t line_size;
976 
977 	if (!if_indextoname(ifindex, ifname)) {
978 		rte_errno = errno;
979 		return -rte_errno;
980 	}
981 
982 	MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
983 	      ifname);
984 	MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
985 	      ifname);
986 	MKSTR(pci_device, "/sys/class/net/%s/device",
987 	      ifname);
988 
989 	file = fopen(phys_port_name, "rb");
990 	if (file != NULL) {
991 		char *tail_nl;
992 
993 		line_size = getline(&port_name, &port_name_size, file);
994 		if (line_size < 0) {
995 			free(port_name);
996 			fclose(file);
997 			rte_errno = errno;
998 			return -rte_errno;
999 		} else if (line_size > 0) {
1000 			/* Remove tailing newline character. */
1001 			tail_nl = strchr(port_name, '\n');
1002 			if (tail_nl)
1003 				*tail_nl = '\0';
1004 			mlx5_translate_port_name(port_name, &data);
1005 		}
1006 		free(port_name);
1007 		fclose(file);
1008 	}
1009 	file = fopen(phys_switch_id, "rb");
1010 	if (file == NULL) {
1011 		rte_errno = errno;
1012 		return -rte_errno;
1013 	}
1014 	port_switch_id_set =
1015 		fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1016 		c == '\n';
1017 	fclose(file);
1018 	dir = opendir(pci_device);
1019 	if (dir != NULL) {
1020 		closedir(dir);
1021 		device_dir = true;
1022 	}
1023 	if (port_switch_id_set) {
1024 		/* We have some E-Switch configuration. */
1025 		mlx5_sysfs_check_switch_info(device_dir, &data);
1026 	}
1027 	*info = data;
1028 	MLX5_ASSERT(!(data.master && data.representor));
1029 	if (data.master && data.representor) {
1030 		DRV_LOG(ERR, "ifindex %u device is recognized as master"
1031 			     " and as representor", ifindex);
1032 		rte_errno = ENODEV;
1033 		return -rte_errno;
1034 	}
1035 	return 0;
1036 }
1037 
1038 /**
1039  * Get bond information associated with network interface.
1040  *
1041  * @param pf_ifindex
1042  *   Network interface index of bond slave interface
1043  * @param[out] ifindex
1044  *   Pointer to bond ifindex.
1045  * @param[out] ifname
1046  *   Pointer to bond ifname.
1047  *
1048  * @return
1049  *   0 on success, a negative errno value otherwise and rte_errno is set.
1050  */
1051 int
1052 mlx5_sysfs_bond_info(unsigned int pf_ifindex, unsigned int *ifindex,
1053 		     char *ifname)
1054 {
1055 	char name[IF_NAMESIZE];
1056 	FILE *file;
1057 	unsigned int index;
1058 	int ret;
1059 
1060 	if (!if_indextoname(pf_ifindex, name) || !strlen(name)) {
1061 		rte_errno = errno;
1062 		return -rte_errno;
1063 	}
1064 	MKSTR(bond_if, "/sys/class/net/%s/master/ifindex", name);
1065 	/* read bond ifindex */
1066 	file = fopen(bond_if, "rb");
1067 	if (file == NULL) {
1068 		rte_errno = errno;
1069 		return -rte_errno;
1070 	}
1071 	ret = fscanf(file, "%u", &index);
1072 	fclose(file);
1073 	if (ret <= 0) {
1074 		rte_errno = errno;
1075 		return -rte_errno;
1076 	}
1077 	if (ifindex)
1078 		*ifindex = index;
1079 
1080 	/* read bond device name from symbol link */
1081 	if (ifname) {
1082 		if (!if_indextoname(index, ifname)) {
1083 			rte_errno = errno;
1084 			return -rte_errno;
1085 		}
1086 	}
1087 	return 0;
1088 }
1089 
1090 /**
1091  * DPDK callback to retrieve plug-in module EEPROM information (type and size).
1092  *
1093  * @param dev
1094  *   Pointer to Ethernet device structure.
1095  * @param[out] modinfo
1096  *   Storage for plug-in module EEPROM information.
1097  *
1098  * @return
1099  *   0 on success, a negative errno value otherwise and rte_errno is set.
1100  */
1101 int
1102 mlx5_get_module_info(struct rte_eth_dev *dev,
1103 		     struct rte_eth_dev_module_info *modinfo)
1104 {
1105 	struct ethtool_modinfo info = {
1106 		.cmd = ETHTOOL_GMODULEINFO,
1107 	};
1108 	struct ifreq ifr = (struct ifreq) {
1109 		.ifr_data = (void *)&info,
1110 	};
1111 	int ret = 0;
1112 
1113 	if (!dev) {
1114 		DRV_LOG(WARNING, "missing argument, cannot get module info");
1115 		rte_errno = EINVAL;
1116 		return -rte_errno;
1117 	}
1118 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1119 	if (ret) {
1120 		DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1121 			dev->data->port_id, strerror(rte_errno));
1122 		return ret;
1123 	}
1124 	modinfo->type = info.type;
1125 	modinfo->eeprom_len = info.eeprom_len;
1126 	return ret;
1127 }
1128 
1129 /**
1130  * DPDK callback to retrieve plug-in module EEPROM data.
1131  *
1132  * @param dev
1133  *   Pointer to Ethernet device structure.
1134  * @param[out] info
1135  *   Storage for plug-in module EEPROM data.
1136  *
1137  * @return
1138  *   0 on success, a negative errno value otherwise and rte_errno is set.
1139  */
1140 int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
1141 			   struct rte_dev_eeprom_info *info)
1142 {
1143 	struct ethtool_eeprom *eeprom;
1144 	struct ifreq ifr;
1145 	int ret = 0;
1146 
1147 	if (!dev) {
1148 		DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
1149 		rte_errno = EINVAL;
1150 		return -rte_errno;
1151 	}
1152 	eeprom = mlx5_malloc(MLX5_MEM_ZERO,
1153 			     (sizeof(struct ethtool_eeprom) + info->length), 0,
1154 			     SOCKET_ID_ANY);
1155 	if (!eeprom) {
1156 		DRV_LOG(WARNING, "port %u cannot allocate memory for "
1157 			"eeprom data", dev->data->port_id);
1158 		rte_errno = ENOMEM;
1159 		return -rte_errno;
1160 	}
1161 	eeprom->cmd = ETHTOOL_GMODULEEEPROM;
1162 	eeprom->offset = info->offset;
1163 	eeprom->len = info->length;
1164 	ifr = (struct ifreq) {
1165 		.ifr_data = (void *)eeprom,
1166 	};
1167 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1168 	if (ret)
1169 		DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1170 			dev->data->port_id, strerror(rte_errno));
1171 	else
1172 		rte_memcpy(info->data, eeprom->data, info->length);
1173 	mlx5_free(eeprom);
1174 	return ret;
1175 }
1176 
1177 /**
1178  * Read device counters table.
1179  *
1180  * @param dev
1181  *   Pointer to Ethernet device.
1182  * @param[in] pf
1183  *   PF index in case of bonding device, -1 otherwise
1184  * @param[out] stats
1185  *   Counters table output buffer.
1186  *
1187  * @return
1188  *   0 on success and stats is filled, negative errno value otherwise and
1189  *   rte_errno is set.
1190  */
1191 static int
1192 _mlx5_os_read_dev_counters(struct rte_eth_dev *dev, int pf, uint64_t *stats)
1193 {
1194 	struct mlx5_priv *priv = dev->data->dev_private;
1195 	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1196 	unsigned int i;
1197 	struct ifreq ifr;
1198 	unsigned int max_stats_n = RTE_MAX(xstats_ctrl->stats_n, xstats_ctrl->stats_n_2nd);
1199 	unsigned int stats_sz = max_stats_n * sizeof(uint64_t);
1200 	unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz];
1201 	struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf;
1202 	int ret;
1203 	uint16_t i_idx, o_idx;
1204 	uint32_t total_stats = xstats_n;
1205 
1206 	et_stats->cmd = ETHTOOL_GSTATS;
1207 	/* Pass the maximum value, the driver may ignore this. */
1208 	et_stats->n_stats = max_stats_n;
1209 	ifr.ifr_data = (caddr_t)et_stats;
1210 	if (pf >= 0)
1211 		ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[pf].ifname,
1212 					   SIOCETHTOOL, &ifr);
1213 	else
1214 		ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1215 	if (ret) {
1216 		DRV_LOG(WARNING,
1217 			"port %u unable to read statistic values from device",
1218 			dev->data->port_id);
1219 		return ret;
1220 	}
1221 	if (pf <= 0) {
1222 		for (i = 0; i != total_stats; i++) {
1223 			i_idx = xstats_ctrl->dev_table_idx[i];
1224 			o_idx = xstats_ctrl->xstats_o_idx[i];
1225 			if (i_idx == UINT16_MAX || xstats_ctrl->info[o_idx].dev)
1226 				continue;
1227 			stats[o_idx] += (uint64_t)et_stats->data[i_idx];
1228 		}
1229 	} else {
1230 		for (i = 0; i != total_stats; i++) {
1231 			i_idx = xstats_ctrl->dev_table_idx_2nd[i];
1232 			o_idx = xstats_ctrl->xstats_o_idx_2nd[i];
1233 			if (i_idx == UINT16_MAX || xstats_ctrl->info[o_idx].dev)
1234 				continue;
1235 			stats[o_idx] += (uint64_t)et_stats->data[i_idx];
1236 		}
1237 	}
1238 	return 0;
1239 }
1240 
1241 /*
1242  * Read device counters.
1243  *
1244  * @param dev
1245  *   Pointer to Ethernet device.
1246  * @param bond_master
1247  *   Indicate if the device is a bond master.
1248  * @param stats
1249  *   Counters table output buffer.
1250  *
1251  * @return
1252  *   0 on success and stats is filled, negative errno value otherwise and
1253  *   rte_errno is set.
1254  */
1255 int
1256 mlx5_os_read_dev_counters(struct rte_eth_dev *dev, bool bond_master, uint64_t *stats)
1257 {
1258 	struct mlx5_priv *priv = dev->data->dev_private;
1259 	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1260 	int ret = 0, i;
1261 
1262 	memset(stats, 0, sizeof(*stats) * xstats_ctrl->mlx5_stats_n);
1263 	/* Read ifreq counters. */
1264 	if (bond_master) {
1265 		/* Sum xstats from bonding device member ports. */
1266 		for (i = 0; i < priv->sh->bond.n_port; i++) {
1267 			ret = _mlx5_os_read_dev_counters(dev, i, stats);
1268 			if (ret)
1269 				return ret;
1270 		}
1271 	} else {
1272 		ret = _mlx5_os_read_dev_counters(dev, -1, stats);
1273 		if (ret)
1274 			return ret;
1275 	}
1276 	/*
1277 	 * Read IB dev counters.
1278 	 * The counters are unique per IB device but not per netdev IF.
1279 	 * In bonding mode, getting the stats name only from 1 port is enough.
1280 	 */
1281 	for (i = xstats_ctrl->dev_cnt_start; i < xstats_ctrl->mlx5_stats_n; i++) {
1282 		if (!xstats_ctrl->info[i].dev)
1283 			continue;
1284 		/* return last xstats counter if fail to read. */
1285 		if (mlx5_os_read_dev_stat(priv, xstats_ctrl->info[i].ctr_name,
1286 					  &stats[i]) == 0)
1287 			xstats_ctrl->xstats[i] = stats[i];
1288 		else
1289 			stats[i] = xstats_ctrl->xstats[i];
1290 	}
1291 	return ret;
1292 }
1293 
1294 /*
1295  * Query the number of statistics provided by ETHTOOL.
1296  *
1297  * @param dev
1298  *   Pointer to Ethernet device.
1299  * @param bond_master
1300  *   Indicate if the device is a bond master.
1301  * @param n_stats
1302  *   Pointer to number of stats to store.
1303  * @param n_stats_sec
1304  *   Pointer to number of stats to store for the 2nd port of the bond.
1305  *
1306  * @return
1307  *   0 on success, negative errno value otherwise and rte_errno is set.
1308  */
1309 int
1310 mlx5_os_get_stats_n(struct rte_eth_dev *dev, bool bond_master,
1311 		    uint16_t *n_stats, uint16_t *n_stats_sec)
1312 {
1313 	struct mlx5_priv *priv = dev->data->dev_private;
1314 	struct ethtool_drvinfo drvinfo;
1315 	struct ifreq ifr;
1316 	int ret;
1317 
1318 	drvinfo.cmd = ETHTOOL_GDRVINFO;
1319 	ifr.ifr_data = (caddr_t)&drvinfo;
1320 	/* Bonding PFs. */
1321 	if (bond_master) {
1322 		ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
1323 					   SIOCETHTOOL, &ifr);
1324 		if (ret) {
1325 			DRV_LOG(WARNING, "bonding port %u unable to query number of"
1326 				" statistics for the 1st slave, %d", PORT_ID(priv), ret);
1327 			return ret;
1328 		}
1329 		*n_stats = drvinfo.n_stats;
1330 		ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[1].ifname,
1331 					   SIOCETHTOOL, &ifr);
1332 		if (ret) {
1333 			DRV_LOG(WARNING, "bonding port %u unable to query number of"
1334 				" statistics for the 2nd slave, %d", PORT_ID(priv), ret);
1335 			return ret;
1336 		}
1337 		*n_stats_sec = drvinfo.n_stats;
1338 	} else {
1339 		ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1340 		if (ret) {
1341 			DRV_LOG(WARNING, "port %u unable to query number of statistics",
1342 				PORT_ID(priv));
1343 			return ret;
1344 		}
1345 		*n_stats = drvinfo.n_stats;
1346 	}
1347 	return 0;
1348 }
1349 
1350 static const struct mlx5_counter_ctrl mlx5_counters_init[] = {
1351 	{
1352 		.dpdk_name = "rx_unicast_bytes",
1353 		.ctr_name = "rx_vport_unicast_bytes",
1354 	},
1355 	{
1356 		.dpdk_name = "rx_multicast_bytes",
1357 		.ctr_name = "rx_vport_multicast_bytes",
1358 	},
1359 	{
1360 		.dpdk_name = "rx_broadcast_bytes",
1361 		.ctr_name = "rx_vport_broadcast_bytes",
1362 	},
1363 	{
1364 		.dpdk_name = "rx_unicast_packets",
1365 		.ctr_name = "rx_vport_unicast_packets",
1366 	},
1367 	{
1368 		.dpdk_name = "rx_multicast_packets",
1369 		.ctr_name = "rx_vport_multicast_packets",
1370 	},
1371 	{
1372 		.dpdk_name = "rx_broadcast_packets",
1373 		.ctr_name = "rx_vport_broadcast_packets",
1374 	},
1375 	{
1376 		.dpdk_name = "tx_unicast_bytes",
1377 		.ctr_name = "tx_vport_unicast_bytes",
1378 	},
1379 	{
1380 		.dpdk_name = "tx_multicast_bytes",
1381 		.ctr_name = "tx_vport_multicast_bytes",
1382 	},
1383 	{
1384 		.dpdk_name = "tx_broadcast_bytes",
1385 		.ctr_name = "tx_vport_broadcast_bytes",
1386 	},
1387 	{
1388 		.dpdk_name = "tx_unicast_packets",
1389 		.ctr_name = "tx_vport_unicast_packets",
1390 	},
1391 	{
1392 		.dpdk_name = "tx_multicast_packets",
1393 		.ctr_name = "tx_vport_multicast_packets",
1394 	},
1395 	{
1396 		.dpdk_name = "tx_broadcast_packets",
1397 		.ctr_name = "tx_vport_broadcast_packets",
1398 	},
1399 	{
1400 		.dpdk_name = "rx_wqe_errors",
1401 		.ctr_name = "rx_wqe_err",
1402 	},
1403 	{
1404 		.dpdk_name = "rx_phy_crc_errors",
1405 		.ctr_name = "rx_crc_errors_phy",
1406 	},
1407 	{
1408 		.dpdk_name = "rx_phy_in_range_len_errors",
1409 		.ctr_name = "rx_in_range_len_errors_phy",
1410 	},
1411 	{
1412 		.dpdk_name = "rx_phy_symbol_errors",
1413 		.ctr_name = "rx_symbol_err_phy",
1414 	},
1415 	{
1416 		.dpdk_name = "tx_phy_errors",
1417 		.ctr_name = "tx_errors_phy",
1418 	},
1419 	{
1420 		.dpdk_name = "rx_out_of_buffer",
1421 		.ctr_name = "out_of_buffer",
1422 		.dev = 1,
1423 	},
1424 	{
1425 		.dpdk_name = "hairpin_out_of_buffer",
1426 		.ctr_name = "hairpin_out_of_buffer",
1427 		.dev = 1,
1428 	},
1429 	{
1430 		.dpdk_name = "dev_internal_queue_oob",
1431 		.ctr_name = "dev_internal_queue_oob",
1432 	},
1433 	{
1434 		.dpdk_name = "tx_phy_packets",
1435 		.ctr_name = "tx_packets_phy",
1436 	},
1437 	{
1438 		.dpdk_name = "rx_phy_packets",
1439 		.ctr_name = "rx_packets_phy",
1440 	},
1441 	{
1442 		.dpdk_name = "tx_phy_discard_packets",
1443 		.ctr_name = "tx_discards_phy",
1444 	},
1445 	{
1446 		.dpdk_name = "rx_phy_discard_packets",
1447 		.ctr_name = "rx_discards_phy",
1448 	},
1449 	{
1450 		.dpdk_name = "rx_prio0_buf_discard_packets",
1451 		.ctr_name = "rx_prio0_buf_discard",
1452 	},
1453 	{
1454 		.dpdk_name = "rx_prio1_buf_discard_packets",
1455 		.ctr_name = "rx_prio1_buf_discard",
1456 	},
1457 	{
1458 		.dpdk_name = "rx_prio2_buf_discard_packets",
1459 		.ctr_name = "rx_prio2_buf_discard",
1460 	},
1461 	{
1462 		.dpdk_name = "rx_prio3_buf_discard_packets",
1463 		.ctr_name = "rx_prio3_buf_discard",
1464 	},
1465 	{
1466 		.dpdk_name = "rx_prio4_buf_discard_packets",
1467 		.ctr_name = "rx_prio4_buf_discard",
1468 	},
1469 	{
1470 		.dpdk_name = "rx_prio5_buf_discard_packets",
1471 		.ctr_name = "rx_prio5_buf_discard",
1472 	},
1473 	{
1474 		.dpdk_name = "rx_prio6_buf_discard_packets",
1475 		.ctr_name = "rx_prio6_buf_discard",
1476 	},
1477 	{
1478 		.dpdk_name = "rx_prio7_buf_discard_packets",
1479 		.ctr_name = "rx_prio7_buf_discard",
1480 	},
1481 	{
1482 		.dpdk_name = "rx_prio0_cong_discard_packets",
1483 		.ctr_name = "rx_prio0_cong_discard",
1484 	},
1485 	{
1486 		.dpdk_name = "rx_prio1_cong_discard_packets",
1487 		.ctr_name = "rx_prio1_cong_discard",
1488 	},
1489 	{
1490 		.dpdk_name = "rx_prio2_cong_discard_packets",
1491 		.ctr_name = "rx_prio2_cong_discard",
1492 	},
1493 	{
1494 		.dpdk_name = "rx_prio3_cong_discard_packets",
1495 		.ctr_name = "rx_prio3_cong_discard",
1496 	},
1497 	{
1498 		.dpdk_name = "rx_prio4_cong_discard_packets",
1499 		.ctr_name = "rx_prio4_cong_discard",
1500 	},
1501 	{
1502 		.dpdk_name = "rx_prio5_cong_discard_packets",
1503 		.ctr_name = "rx_prio5_cong_discard",
1504 	},
1505 	{
1506 		.dpdk_name = "rx_prio6_cong_discard_packets",
1507 		.ctr_name = "rx_prio6_cong_discard",
1508 	},
1509 	{
1510 		.dpdk_name = "rx_prio7_cong_discard_packets",
1511 		.ctr_name = "rx_prio7_cong_discard",
1512 	},
1513 	{
1514 		.dpdk_name = "tx_phy_bytes",
1515 		.ctr_name = "tx_bytes_phy",
1516 	},
1517 	{
1518 		.dpdk_name = "rx_phy_bytes",
1519 		.ctr_name = "rx_bytes_phy",
1520 	},
1521 	/* Representor only */
1522 	{
1523 		.dpdk_name = "rx_vport_packets",
1524 		.ctr_name = "vport_rx_packets",
1525 	},
1526 	{
1527 		.dpdk_name = "rx_vport_bytes",
1528 		.ctr_name = "vport_rx_bytes",
1529 	},
1530 	{
1531 		.dpdk_name = "tx_vport_packets",
1532 		.ctr_name = "vport_tx_packets",
1533 	},
1534 	{
1535 		.dpdk_name = "tx_vport_bytes",
1536 		.ctr_name = "vport_tx_bytes",
1537 	},
1538 	/**
1539 	 * Device counters: These counters are for the
1540 	 * entire PCI device (NIC). These counters are
1541 	 * not counting on a per port/queue basis.
1542 	 */
1543 	{
1544 		.dpdk_name = "rx_pci_signal_integrity",
1545 		.ctr_name = "rx_pci_signal_integrity",
1546 	},
1547 	{
1548 		.dpdk_name = "tx_pci_signal_integrity",
1549 		.ctr_name = "tx_pci_signal_integrity",
1550 	},
1551 	{
1552 		.dpdk_name = "outbound_pci_buffer_overflow",
1553 		.ctr_name = "outbound_pci_buffer_overflow",
1554 	},
1555 	{
1556 		.dpdk_name = "outbound_pci_stalled_rd",
1557 		.ctr_name = "outbound_pci_stalled_rd",
1558 	},
1559 	{
1560 		.dpdk_name = "outbound_pci_stalled_wr",
1561 		.ctr_name = "outbound_pci_stalled_wr",
1562 	},
1563 	{
1564 		.dpdk_name = "outbound_pci_stalled_rd_events",
1565 		.ctr_name = "outbound_pci_stalled_rd_events",
1566 	},
1567 	{
1568 		.dpdk_name = "outbound_pci_stalled_wr_events",
1569 		.ctr_name = "outbound_pci_stalled_wr_events",
1570 	},
1571 	{
1572 		.dpdk_name = "dev_out_of_buffer",
1573 		.ctr_name = "dev_out_of_buffer",
1574 	},
1575 };
1576 
1577 const unsigned int xstats_n = RTE_DIM(mlx5_counters_init);
1578 
1579 static int
1580 mlx5_os_get_stats_strings(struct rte_eth_dev *dev, bool bond_master,
1581 			  struct ethtool_gstrings *strings,
1582 			  uint32_t stats_n, uint32_t stats_n_2nd)
1583 {
1584 	struct mlx5_priv *priv = dev->data->dev_private;
1585 	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1586 	struct ifreq ifr;
1587 	int ret;
1588 	uint32_t i, j, idx;
1589 
1590 	/* Ensure no out of bounds access before. */
1591 	MLX5_ASSERT(xstats_n <= MLX5_MAX_XSTATS);
1592 	strings->cmd = ETHTOOL_GSTRINGS;
1593 	strings->string_set = ETH_SS_STATS;
1594 	strings->len = stats_n;
1595 	ifr.ifr_data = (caddr_t)strings;
1596 	if (bond_master)
1597 		ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
1598 					   SIOCETHTOOL, &ifr);
1599 	else
1600 		ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1601 	if (ret) {
1602 		DRV_LOG(WARNING, "port %u unable to get statistic names with %d",
1603 			PORT_ID(priv), ret);
1604 		return ret;
1605 	}
1606 	/* Reorganize the orders to reduce the iterations. */
1607 	for (j = 0; j < xstats_n; j++) {
1608 		xstats_ctrl->dev_table_idx[j] = UINT16_MAX;
1609 		for (i = 0; i < stats_n; i++) {
1610 			const char *curr_string =
1611 				(const char *)&strings->data[i * ETH_GSTRING_LEN];
1612 
1613 			if (!strcmp(mlx5_counters_init[j].ctr_name, curr_string)) {
1614 				idx = xstats_ctrl->mlx5_stats_n++;
1615 				xstats_ctrl->dev_table_idx[j] = i;
1616 				xstats_ctrl->xstats_o_idx[j] = idx;
1617 				xstats_ctrl->info[idx] = mlx5_counters_init[j];
1618 			}
1619 		}
1620 	}
1621 	if (!bond_master) {
1622 		/* Add dev counters, unique per IB device. */
1623 		xstats_ctrl->dev_cnt_start = xstats_ctrl->mlx5_stats_n;
1624 		for (j = 0; j != xstats_n; j++) {
1625 			if (mlx5_counters_init[j].dev) {
1626 				idx = xstats_ctrl->mlx5_stats_n++;
1627 				xstats_ctrl->info[idx] = mlx5_counters_init[j];
1628 				xstats_ctrl->hw_stats[idx] = 0;
1629 			}
1630 		}
1631 		return 0;
1632 	}
1633 
1634 	strings->len = stats_n_2nd;
1635 	ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[1].ifname,
1636 				   SIOCETHTOOL, &ifr);
1637 	if (ret) {
1638 		DRV_LOG(WARNING, "port %u unable to get statistic names for 2nd slave with %d",
1639 			PORT_ID(priv), ret);
1640 		return ret;
1641 	}
1642 	/* The 2nd slave port may have a different strings set, based on the configuration. */
1643 	for (j = 0; j != xstats_n; j++) {
1644 		xstats_ctrl->dev_table_idx_2nd[j] = UINT16_MAX;
1645 		for (i = 0; i != stats_n_2nd; i++) {
1646 			const char *curr_string =
1647 				(const char *)&strings->data[i * ETH_GSTRING_LEN];
1648 
1649 			if (!strcmp(mlx5_counters_init[j].ctr_name, curr_string)) {
1650 				xstats_ctrl->dev_table_idx_2nd[j] = i;
1651 				if (xstats_ctrl->dev_table_idx[j] != UINT16_MAX) {
1652 					/* Already mapped in the 1st slave port. */
1653 					idx = xstats_ctrl->xstats_o_idx[j];
1654 					xstats_ctrl->xstats_o_idx_2nd[j] = idx;
1655 				} else {
1656 					/* Append the new items to the end of the map. */
1657 					idx = xstats_ctrl->mlx5_stats_n++;
1658 					xstats_ctrl->xstats_o_idx_2nd[j] = idx;
1659 					xstats_ctrl->info[idx] = mlx5_counters_init[j];
1660 				}
1661 			}
1662 		}
1663 	}
1664 	/* Dev counters are always at the last now. */
1665 	xstats_ctrl->dev_cnt_start = xstats_ctrl->mlx5_stats_n;
1666 	for (j = 0; j != xstats_n; j++) {
1667 		if (mlx5_counters_init[j].dev) {
1668 			idx = xstats_ctrl->mlx5_stats_n++;
1669 			xstats_ctrl->info[idx] = mlx5_counters_init[j];
1670 			xstats_ctrl->hw_stats[idx] = 0;
1671 		}
1672 	}
1673 	return 0;
1674 }
1675 
1676 /**
1677  * Init the structures to read device counters.
1678  *
1679  * @param dev
1680  *   Pointer to Ethernet device.
1681  */
1682 void
1683 mlx5_os_stats_init(struct rte_eth_dev *dev)
1684 {
1685 	struct mlx5_priv *priv = dev->data->dev_private;
1686 	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1687 	struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
1688 	struct ethtool_gstrings *strings = NULL;
1689 	uint16_t dev_stats_n = 0;
1690 	uint16_t dev_stats_n_2nd = 0;
1691 	unsigned int max_stats_n;
1692 	unsigned int str_sz;
1693 	int ret;
1694 	bool bond_master = (priv->master && priv->pf_bond >= 0);
1695 
1696 	/* So that it won't aggregate for each init. */
1697 	xstats_ctrl->mlx5_stats_n = 0;
1698 	ret = mlx5_os_get_stats_n(dev, bond_master, &dev_stats_n, &dev_stats_n_2nd);
1699 	if (ret < 0) {
1700 		DRV_LOG(WARNING, "port %u no extended statistics available",
1701 			dev->data->port_id);
1702 		return;
1703 	}
1704 	max_stats_n = RTE_MAX(dev_stats_n, dev_stats_n_2nd);
1705 	/* Allocate memory to grab stat names and values. */
1706 	str_sz = max_stats_n * ETH_GSTRING_LEN;
1707 	strings = (struct ethtool_gstrings *)
1708 		  mlx5_malloc(0, str_sz + sizeof(struct ethtool_gstrings), 0,
1709 			      SOCKET_ID_ANY);
1710 	if (!strings) {
1711 		DRV_LOG(WARNING, "port %u unable to allocate memory for xstats",
1712 			dev->data->port_id);
1713 		return;
1714 	}
1715 	ret = mlx5_os_get_stats_strings(dev, bond_master, strings,
1716 					dev_stats_n, dev_stats_n_2nd);
1717 	if (ret < 0) {
1718 		DRV_LOG(WARNING, "port %u failed to get the stats strings",
1719 			dev->data->port_id);
1720 		goto free;
1721 	}
1722 	xstats_ctrl->stats_n = dev_stats_n;
1723 	xstats_ctrl->stats_n_2nd = dev_stats_n_2nd;
1724 	/* Copy to base at first time. */
1725 	ret = mlx5_os_read_dev_counters(dev, bond_master, xstats_ctrl->base);
1726 	if (ret)
1727 		DRV_LOG(ERR, "port %u cannot read device counters: %s",
1728 			dev->data->port_id, strerror(rte_errno));
1729 	mlx5_os_read_dev_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base);
1730 	stats_ctrl->imissed = 0;
1731 free:
1732 	mlx5_free(strings);
1733 }
1734 
1735 /**
1736  * Get MAC address by querying netdevice.
1737  *
1738  * @param[in] dev
1739  *   Pointer to Ethernet device.
1740  * @param[out] mac
1741  *   MAC address output buffer.
1742  *
1743  * @return
1744  *   0 on success, a negative errno value otherwise and rte_errno is set.
1745  */
1746 int
1747 mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN])
1748 {
1749 	struct ifreq request;
1750 	int ret;
1751 
1752 	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
1753 	if (ret)
1754 		return ret;
1755 	memcpy(mac, request.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
1756 	return 0;
1757 }
1758 
1759 /*
1760  * Query dropless_rq private flag value provided by ETHTOOL.
1761  *
1762  * @param dev
1763  *   Pointer to Ethernet device.
1764  *
1765  * @return
1766  *   - 0 on success, flag is not set.
1767  *   - 1 on success, flag is set.
1768  *   - negative errno value otherwise and rte_errno is set.
1769  */
1770 int mlx5_get_flag_dropless_rq(struct rte_eth_dev *dev)
1771 {
1772 	struct ethtool_sset_info *sset_info = NULL;
1773 	struct ethtool_drvinfo drvinfo;
1774 	struct ifreq ifr;
1775 	struct ethtool_gstrings *strings = NULL;
1776 	struct ethtool_value flags;
1777 	const int32_t flag_len = sizeof(flags.data) * CHAR_BIT;
1778 	int32_t str_sz;
1779 	int32_t len;
1780 	int32_t i;
1781 	int ret;
1782 
1783 	sset_info = mlx5_malloc(0, sizeof(struct ethtool_sset_info) +
1784 			sizeof(uint32_t), 0, SOCKET_ID_ANY);
1785 	if (sset_info == NULL) {
1786 		rte_errno = ENOMEM;
1787 		return -rte_errno;
1788 	}
1789 	sset_info->cmd = ETHTOOL_GSSET_INFO;
1790 	sset_info->reserved = 0;
1791 	sset_info->sset_mask = 1ULL << ETH_SS_PRIV_FLAGS;
1792 	ifr.ifr_data = (caddr_t)&sset_info;
1793 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1794 	if (!ret) {
1795 		const uint32_t *sset_lengths = sset_info->data;
1796 
1797 		len = sset_info->sset_mask ? sset_lengths[0] : 0;
1798 	} else if (ret == -EOPNOTSUPP) {
1799 		drvinfo.cmd = ETHTOOL_GDRVINFO;
1800 		ifr.ifr_data = (caddr_t)&drvinfo;
1801 		ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1802 		if (ret) {
1803 			DRV_LOG(WARNING, "port %u cannot get the driver info",
1804 				dev->data->port_id);
1805 			goto exit;
1806 		}
1807 		len = *(uint32_t *)((char *)&drvinfo +
1808 			offsetof(struct ethtool_drvinfo, n_priv_flags));
1809 	} else {
1810 		DRV_LOG(WARNING, "port %u cannot get the sset info",
1811 			dev->data->port_id);
1812 		goto exit;
1813 	}
1814 	if (!len) {
1815 		DRV_LOG(WARNING, "port %u does not have private flag",
1816 			dev->data->port_id);
1817 		rte_errno = EOPNOTSUPP;
1818 		ret = -rte_errno;
1819 		goto exit;
1820 	} else if (len > flag_len) {
1821 		DRV_LOG(WARNING, "port %u maximal private flags number is %d",
1822 			dev->data->port_id, flag_len);
1823 		len = flag_len;
1824 	}
1825 	str_sz = ETH_GSTRING_LEN * len;
1826 	strings = (struct ethtool_gstrings *)
1827 		  mlx5_malloc(0, str_sz + sizeof(struct ethtool_gstrings), 0,
1828 			      SOCKET_ID_ANY);
1829 	if (!strings) {
1830 		DRV_LOG(WARNING, "port %u unable to allocate memory for"
1831 			" private flags", dev->data->port_id);
1832 		rte_errno = ENOMEM;
1833 		ret = -rte_errno;
1834 		goto exit;
1835 	}
1836 	strings->cmd = ETHTOOL_GSTRINGS;
1837 	strings->string_set = ETH_SS_PRIV_FLAGS;
1838 	strings->len = len;
1839 	ifr.ifr_data = (caddr_t)strings;
1840 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1841 	if (ret) {
1842 		DRV_LOG(WARNING, "port %u unable to get private flags strings",
1843 			dev->data->port_id);
1844 		goto exit;
1845 	}
1846 	for (i = 0; i < len; i++) {
1847 		strings->data[(i + 1) * ETH_GSTRING_LEN - 1] = 0;
1848 		if (!strcmp((const char *)strings->data + i * ETH_GSTRING_LEN,
1849 			     "dropless_rq"))
1850 			break;
1851 	}
1852 	if (i == len) {
1853 		DRV_LOG(WARNING, "port %u does not support dropless_rq",
1854 			dev->data->port_id);
1855 		rte_errno = EOPNOTSUPP;
1856 		ret = -rte_errno;
1857 		goto exit;
1858 	}
1859 	flags.cmd = ETHTOOL_GPFLAGS;
1860 	ifr.ifr_data = (caddr_t)&flags;
1861 	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1862 	if (ret) {
1863 		DRV_LOG(WARNING, "port %u unable to get private flags status",
1864 			dev->data->port_id);
1865 		goto exit;
1866 	}
1867 	ret = !!(flags.data & (1U << i));
1868 exit:
1869 	mlx5_free(strings);
1870 	mlx5_free(sset_info);
1871 	return ret;
1872 }
1873 
1874 /**
1875  * Unmaps HCA PCI BAR from the current process address space.
1876  *
1877  * @param dev
1878  *   Pointer to Ethernet device structure.
1879  */
1880 void mlx5_txpp_unmap_hca_bar(struct rte_eth_dev *dev)
1881 {
1882 	struct mlx5_proc_priv *ppriv = dev->process_private;
1883 
1884 	if (ppriv && ppriv->hca_bar) {
1885 		rte_mem_unmap(ppriv->hca_bar, MLX5_ST_SZ_BYTES(initial_seg));
1886 		ppriv->hca_bar = NULL;
1887 	}
1888 }
1889 
1890 /**
1891  * Maps HCA PCI BAR to the current process address space.
1892  * Stores pointer in the process private structure allowing
1893  * to read internal and real time counter directly from the HW.
1894  *
1895  * @param dev
1896  *   Pointer to Ethernet device structure.
1897  *
1898  * @return
1899  *   0 on success and not NULL pointer to mapped area in process structure.
1900  *   negative otherwise and NULL pointer
1901  */
1902 int mlx5_txpp_map_hca_bar(struct rte_eth_dev *dev)
1903 {
1904 	struct mlx5_proc_priv *ppriv = dev->process_private;
1905 	char pci_addr[PCI_PRI_STR_SIZE] = { 0 };
1906 	void *base, *expected = NULL;
1907 	int fd, ret;
1908 
1909 	if (!ppriv) {
1910 		rte_errno = ENOMEM;
1911 		return -rte_errno;
1912 	}
1913 	if (ppriv->hca_bar)
1914 		return 0;
1915 	ret = mlx5_dev_to_pci_str(dev->device, pci_addr, sizeof(pci_addr));
1916 	if (ret < 0)
1917 		return -rte_errno;
1918 	/* Open PCI device resource 0 - HCA initialize segment */
1919 	MKSTR(name, "/sys/bus/pci/devices/%s/resource0", pci_addr);
1920 	fd = open(name, O_RDWR | O_SYNC);
1921 	if (fd == -1) {
1922 		rte_errno = ENOTSUP;
1923 		return -ENOTSUP;
1924 	}
1925 	base = rte_mem_map(NULL, MLX5_ST_SZ_BYTES(initial_seg),
1926 			   RTE_PROT_READ, RTE_MAP_SHARED, fd, 0);
1927 	close(fd);
1928 	if (!base) {
1929 		rte_errno = ENOTSUP;
1930 		return -ENOTSUP;
1931 	}
1932 	/* Check there is no concurrent mapping in other thread. */
1933 	if (!rte_atomic_compare_exchange_strong_explicit(&ppriv->hca_bar, &expected,
1934 					 base,
1935 					 rte_memory_order_relaxed, rte_memory_order_relaxed))
1936 		rte_mem_unmap(base, MLX5_ST_SZ_BYTES(initial_seg));
1937 	return 0;
1938 }
1939 
1940