xref: /dpdk/drivers/net/mlx5/mlx5_ethdev.c (revision 78a38edf66de67c8f52d0fcf17865c0dd9937013)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <stddef.h>
35 #include <assert.h>
36 #include <unistd.h>
37 #include <stdint.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <stdlib.h>
41 #include <errno.h>
42 #include <dirent.h>
43 #include <net/if.h>
44 #include <sys/ioctl.h>
45 #include <sys/socket.h>
46 #include <netinet/in.h>
47 #include <linux/if.h>
48 #include <linux/ethtool.h>
49 #include <linux/sockios.h>
50 #include <fcntl.h>
51 
52 /* DPDK headers don't like -pedantic. */
53 #ifdef PEDANTIC
54 #pragma GCC diagnostic ignored "-pedantic"
55 #endif
56 #include <rte_atomic.h>
57 #include <rte_ethdev.h>
58 #include <rte_mbuf.h>
59 #include <rte_common.h>
60 #include <rte_interrupts.h>
61 #include <rte_alarm.h>
62 #ifdef PEDANTIC
63 #pragma GCC diagnostic error "-pedantic"
64 #endif
65 
66 #include "mlx5.h"
67 #include "mlx5_rxtx.h"
68 #include "mlx5_utils.h"
69 
/**
 * Get interface name from private structure.
 *
 * Walks the IB device's "device/net" sysfs directory looking for the
 * netdevice whose "dev_port" attribute (or "dev_id" on Linux kernels
 * older than 3.15) matches the IB port stored in @p priv.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[out] ifname
 *   Interface name output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
{
	DIR *dir;
	struct dirent *dent;
	unsigned int dev_type = 0;		/* 0: read dev_port, 1: read dev_id. */
	unsigned int dev_port_prev = ~0u;	/* Previous port value seen. */
	char match[IF_NAMESIZE] = "";		/* Best candidate so far. */

	{
		MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path);

		dir = opendir(path);
		if (dir == NULL)
			return -1;
	}
	while ((dent = readdir(dir)) != NULL) {
		char *name = dent->d_name;
		FILE *file;
		unsigned int dev_port;
		int r;

		/* Skip "." and ".." directory entries. */
		if ((name[0] == '.') &&
		    ((name[1] == '\0') ||
		     ((name[1] == '.') && (name[2] == '\0'))))
			continue;

		MKSTR(path, "%s/device/net/%s/%s",
		      priv->ctx->device->ibdev_path, name,
		      (dev_type ? "dev_id" : "dev_port"));

		file = fopen(path, "rb");
		if (file == NULL) {
			/* Errors other than a missing attribute only
			 * disqualify this entry, not the whole scan. */
			if (errno != ENOENT)
				continue;
			/*
			 * Switch to dev_id when dev_port does not exist as
			 * is the case with Linux kernel versions < 3.15.
			 */
try_dev_id:
			match[0] = '\0';
			if (dev_type)
				break;
			dev_type = 1;
			dev_port_prev = ~0u;
			/* Restart the scan with the other attribute. */
			rewinddir(dir);
			continue;
		}
		/* dev_id is hexadecimal, dev_port is decimal. */
		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
		fclose(file);
		if (r != 1)
			continue;
		/*
		 * Switch to dev_id when dev_port returns the same value for
		 * all ports. May happen when using a MOFED release older than
		 * 3.0 with a Linux kernel >= 3.15.
		 */
		if (dev_port == dev_port_prev)
			goto try_dev_id;
		dev_port_prev = dev_port;
		/* IB ports are 1-based while dev_port/dev_id start at 0. */
		if (dev_port == (priv->port - 1u))
			snprintf(match, sizeof(match), "%s", name);
	}
	closedir(dir);
	if (match[0] == '\0')
		return -1;
	/* match is NUL-terminated and the same size as *ifname. */
	strncpy(*ifname, match, sizeof(*ifname));
	return 0;
}
150 
151 /**
152  * Read from sysfs entry.
153  *
154  * @param[in] priv
155  *   Pointer to private structure.
156  * @param[in] entry
157  *   Entry name relative to sysfs path.
158  * @param[out] buf
159  *   Data output buffer.
160  * @param size
161  *   Buffer size.
162  *
163  * @return
164  *   0 on success, -1 on failure and errno is set.
165  */
166 static int
167 priv_sysfs_read(const struct priv *priv, const char *entry,
168 		char *buf, size_t size)
169 {
170 	char ifname[IF_NAMESIZE];
171 	FILE *file;
172 	int ret;
173 	int err;
174 
175 	if (priv_get_ifname(priv, &ifname))
176 		return -1;
177 
178 	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
179 	      ifname, entry);
180 
181 	file = fopen(path, "rb");
182 	if (file == NULL)
183 		return -1;
184 	ret = fread(buf, 1, size, file);
185 	err = errno;
186 	if (((size_t)ret < size) && (ferror(file)))
187 		ret = -1;
188 	else
189 		ret = size;
190 	fclose(file);
191 	errno = err;
192 	return ret;
193 }
194 
195 /**
196  * Write to sysfs entry.
197  *
198  * @param[in] priv
199  *   Pointer to private structure.
200  * @param[in] entry
201  *   Entry name relative to sysfs path.
202  * @param[in] buf
203  *   Data buffer.
204  * @param size
205  *   Buffer size.
206  *
207  * @return
208  *   0 on success, -1 on failure and errno is set.
209  */
210 static int
211 priv_sysfs_write(const struct priv *priv, const char *entry,
212 		 char *buf, size_t size)
213 {
214 	char ifname[IF_NAMESIZE];
215 	FILE *file;
216 	int ret;
217 	int err;
218 
219 	if (priv_get_ifname(priv, &ifname))
220 		return -1;
221 
222 	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
223 	      ifname, entry);
224 
225 	file = fopen(path, "wb");
226 	if (file == NULL)
227 		return -1;
228 	ret = fwrite(buf, 1, size, file);
229 	err = errno;
230 	if (((size_t)ret < size) || (ferror(file)))
231 		ret = -1;
232 	else
233 		ret = size;
234 	fclose(file);
235 	errno = err;
236 	return ret;
237 }
238 
/**
 * Get unsigned long sysfs property.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[in] name
 *   Entry name relative to sysfs path.
 * @param[out] value
 *   Value output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
{
	int ret;
	unsigned long value_ret;
	char *endptr;
	char value_str[32];

	ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
	if (ret == -1) {
		DEBUG("cannot read %s value from sysfs: %s",
		      name, strerror(errno));
		return -1;
	}
	value_str[ret] = '\0';
	errno = 0;
	value_ret = strtoul(value_str, &endptr, 0);
	/* strtoul() leaves errno untouched when no digits are found and
	 * returns 0; check endptr as well so garbage input is rejected
	 * instead of being silently reported as zero. */
	if (errno || (endptr == value_str)) {
		if (!errno)
			errno = EINVAL;
		DEBUG("invalid %s value `%s': %s", name, value_str,
		      strerror(errno));
		return -1;
	}
	*value = value_ret;
	return 0;
}
276 
277 /**
278  * Set unsigned long sysfs property.
279  *
280  * @param priv
281  *   Pointer to private structure.
282  * @param[in] name
283  *   Entry name relative to sysfs path.
284  * @param value
285  *   Value to set.
286  *
287  * @return
288  *   0 on success, -1 on failure and errno is set.
289  */
290 static int
291 priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
292 {
293 	int ret;
294 	MKSTR(value_str, "%lu", value);
295 
296 	ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1));
297 	if (ret == -1) {
298 		DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
299 		      name, value_str, value, strerror(errno));
300 		return -1;
301 	}
302 	return 0;
303 }
304 
305 /**
306  * Perform ifreq ioctl() on associated Ethernet device.
307  *
308  * @param[in] priv
309  *   Pointer to private structure.
310  * @param req
311  *   Request number to pass to ioctl().
312  * @param[out] ifr
313  *   Interface request structure output buffer.
314  *
315  * @return
316  *   0 on success, -1 on failure and errno is set.
317  */
318 int
319 priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr)
320 {
321 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
322 	int ret = -1;
323 
324 	if (sock == -1)
325 		return ret;
326 	if (priv_get_ifname(priv, &ifr->ifr_name) == 0)
327 		ret = ioctl(sock, req, ifr);
328 	close(sock);
329 	return ret;
330 }
331 
/**
 * Get device MTU.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] mtu
 *   MTU value output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_mtu(struct priv *priv, uint16_t *mtu)
{
	unsigned long value;

	if (priv_get_sysfs_ulong(priv, "mtu", &value))
		return -1;
	*mtu = (uint16_t)value;
	return 0;
}
353 
/**
 * Set device MTU.
 *
 * @param priv
 *   Pointer to private structure.
 * @param mtu
 *   MTU value to set.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_set_mtu(struct priv *priv, uint16_t mtu)
{
	unsigned long value = mtu;

	return priv_set_sysfs_ulong(priv, "mtu", value);
}
370 
/**
 * Set device flags.
 *
 * @param priv
 *   Pointer to private structure.
 * @param keep
 *   Bitmask for flags that must remain untouched.
 * @param flags
 *   Bitmask for flags to modify.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
{
	unsigned long cur;

	if (priv_get_sysfs_ulong(priv, "flags", &cur) == -1)
		return -1;
	/* Preserve the bits in "keep", overwrite the rest with "flags". */
	cur = (cur & keep) | flags;
	return priv_set_sysfs_ulong(priv, "flags", cur);
}
395 
/**
 * Ethernet device configuration.
 *
 * Prepare the driver for a given number of TX and RX queues.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
dev_configure(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int rxqs_n = dev->data->nb_rx_queues;
	unsigned int txqs_n = dev->data->nb_tx_queues;
	unsigned int i;
	unsigned int j;
	unsigned int reta_idx_n;

	/* Mirror the queue arrays owned by the ethdev layer. */
	priv->rxqs = (void *)dev->data->rx_queues;
	priv->txqs = (void *)dev->data->tx_queues;
	if (txqs_n != priv->txqs_n) {
		INFO("%p: TX queues number update: %u -> %u",
		     (void *)dev, priv->txqs_n, txqs_n);
		priv->txqs_n = txqs_n;
	}
	/* The RSS indirection table cannot address more queues than this. */
	if (rxqs_n > priv->ind_table_max_size) {
		ERROR("cannot handle this many RX queues (%u)", rxqs_n);
		return EINVAL;
	}
	/* Nothing else to do if the RX queue count is unchanged. */
	if (rxqs_n == priv->rxqs_n)
		return 0;
	INFO("%p: RX queues number update: %u -> %u",
	     (void *)dev, priv->rxqs_n, rxqs_n);
	priv->rxqs_n = rxqs_n;
	/* If the requested number of RX queues is not a power of two, use the
	 * maximum indirection table size for better balancing.
	 * The result is always rounded to the next power of two. */
	reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
				     priv->ind_table_max_size :
				     rxqs_n));
	if (priv_rss_reta_index_resize(priv, reta_idx_n))
		return ENOMEM;
	/* When the number of RX queues is not a power of two, the remaining
	 * table entries are padded with reused WQs and hashes are not spread
	 * uniformly. */
	for (i = 0, j = 0; (i != reta_idx_n); ++i) {
		(*priv->reta_idx)[i] = j;
		if (++j == rxqs_n)
			j = 0;
	}
	return 0;
}
451 
452 /**
453  * DPDK callback for Ethernet device configuration.
454  *
455  * @param dev
456  *   Pointer to Ethernet device structure.
457  *
458  * @return
459  *   0 on success, negative errno value on failure.
460  */
461 int
462 mlx5_dev_configure(struct rte_eth_dev *dev)
463 {
464 	struct priv *priv = dev->data->dev_private;
465 	int ret;
466 
467 	priv_lock(priv);
468 	ret = dev_configure(dev);
469 	assert(ret >= 0);
470 	priv_unlock(priv);
471 	return -ret;
472 }
473 
/**
 * DPDK callback to get information about the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] info
 *   Info structure output buffer.
 */
void
mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int max;
	char ifname[IF_NAMESIZE];

	priv_lock(priv);
	/* FIXME: we should ask the device for these values. */
	info->min_rx_bufsize = 32;
	info->max_rx_pktlen = 65536;
	/*
	 * Since we need one CQ per QP, the limit is the minimum number
	 * between the two values.
	 */
	max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ?
	       priv->device_attr.max_qp : priv->device_attr.max_cq);
	/* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
	if (max >= 65535)
		max = 65535;
	info->max_rx_queues = max;
	info->max_tx_queues = max;
	info->max_mac_addrs = RTE_DIM(priv->mac);
	/* Checksum offloads are only advertised when supported by HW. */
	info->rx_offload_capa =
		(priv->hw_csum ?
		 (DEV_RX_OFFLOAD_IPV4_CKSUM |
		  DEV_RX_OFFLOAD_UDP_CKSUM |
		  DEV_RX_OFFLOAD_TCP_CKSUM) :
		 0);
	info->tx_offload_capa =
		(priv->hw_csum ?
		 (DEV_TX_OFFLOAD_IPV4_CKSUM |
		  DEV_TX_OFFLOAD_UDP_CKSUM |
		  DEV_TX_OFFLOAD_TCP_CKSUM) :
		 0);
	/* if_index is best-effort; left untouched on lookup failure. */
	if (priv_get_ifname(priv, &ifname) == 0)
		info->if_index = if_nametoindex(ifname);
	/* FIXME: RETA update/query API expects the callee to know the size of
	 * the indirection table, for this PMD the size varies depending on
	 * the number of RX queues, it becomes impossible to find the correct
	 * size if it is not fixed.
	 * The API should be updated to solve this problem. */
	info->reta_size = priv->ind_table_max_size;
	priv_unlock(priv);
}
527 
528 const uint32_t *
529 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
530 {
531 	static const uint32_t ptypes[] = {
532 		/* refers to rxq_cq_to_pkt_type() */
533 		RTE_PTYPE_L3_IPV4,
534 		RTE_PTYPE_L3_IPV6,
535 		RTE_PTYPE_INNER_L3_IPV4,
536 		RTE_PTYPE_INNER_L3_IPV6,
537 		RTE_PTYPE_UNKNOWN
538 
539 	};
540 
541 	if (dev->rx_pkt_burst == mlx5_rx_burst ||
542 	    dev->rx_pkt_burst == mlx5_rx_burst_sp)
543 		return ptypes;
544 	return NULL;
545 }
546 
/**
 * DPDK callback to retrieve physical link information (unlocked version).
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion (ignored).
 *
 * @return
 *   0 when the link status changed (dev->data->dev_link updated),
 *   -1 on ioctl failure or when the status is unchanged.
 */
static int
mlx5_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete)
{
	struct priv *priv = dev->data->dev_private;
	struct ethtool_cmd edata = {
		.cmd = ETHTOOL_GSET
	};
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	int link_speed = 0;

	(void)wait_to_complete;
	/* Link is "up" only when the interface is both UP and RUNNING. */
	if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
		WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
		return -1;
	}
	memset(&dev_link, 0, sizeof(dev_link));
	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING));
	/* Query speed/duplex through the ethtool ioctl interface. */
	ifr.ifr_data = &edata;
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
		     strerror(errno));
		return -1;
	}
	link_speed = ethtool_cmd_speed(&edata);
	/* -1 means the speed is unknown. */
	if (link_speed == -1)
		dev_link.link_speed = 0;
	else
		dev_link.link_speed = link_speed;
	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
		/* Link status changed. */
		dev->data->dev_link = dev_link;
		return 0;
	}
	/* Link status is still the same. */
	return -1;
}
595 
596 /**
597  * DPDK callback to retrieve physical link information.
598  *
599  * @param dev
600  *   Pointer to Ethernet device structure.
601  * @param wait_to_complete
602  *   Wait for request completion (ignored).
603  */
604 int
605 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
606 {
607 	struct priv *priv = dev->data->dev_private;
608 	int ret;
609 
610 	priv_lock(priv);
611 	ret = mlx5_link_update_unlocked(dev, wait_to_complete);
612 	priv_unlock(priv);
613 	return ret;
614 }
615 
/**
 * DPDK callback to change the MTU.
 *
 * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be
 * received). Use this as a hint to enable/disable scattered packets support
 * and improve performance when not needed.
 * Since failure is not an option, reconfiguring queues on the fly is not
 * recommended.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param mtu
 *   New MTU.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct priv *priv = dev->data->dev_private;
	int ret = 0;
	unsigned int i;
	/* RX burst function to restore once queues are reconfigured. */
	uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) =
		mlx5_rx_burst;

	priv_lock(priv);
	/* Set kernel interface MTU first. */
	if (priv_set_mtu(priv, mtu)) {
		ret = errno;
		WARN("cannot set port %u MTU to %u: %s", priv->port, mtu,
		     strerror(ret));
		goto out;
	} else
		DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
	priv->mtu = mtu;
	/* Temporarily replace RX handler with a fake one, assuming it has not
	 * been copied elsewhere. */
	dev->rx_pkt_burst = removed_rx_burst;
	/* Make sure everyone has left mlx5_rx_burst() and uses
	 * removed_rx_burst() instead. */
	rte_wmb();
	usleep(1000);
	/* Reconfigure each RX queue. */
	for (i = 0; (i != priv->rxqs_n); ++i) {
		struct rxq *rxq = (*priv->rxqs)[i];
		unsigned int max_frame_len;
		int sp;

		if (rxq == NULL)
			continue;
		/* Calculate new maximum frame length according to MTU and
		 * toggle scattered support (sp) if necessary. */
		max_frame_len = (priv->mtu + ETHER_HDR_LEN +
				 (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN));
		sp = (max_frame_len > (rxq->mb_len - RTE_PKTMBUF_HEADROOM));
		/* Provide new values to rxq_setup(). */
		dev->data->dev_conf.rxmode.jumbo_frame = sp;
		dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
		ret = rxq_rehash(dev, rxq);
		if (ret) {
			/* Force SP RX if that queue requires it and abort. */
			if (rxq->sp)
				rx_func = mlx5_rx_burst_sp;
			break;
		}
		/* Scattered burst function takes priority. */
		if (rxq->sp)
			rx_func = mlx5_rx_burst_sp;
	}
	/* Burst functions can now be called again. */
	rte_wmb();
	dev->rx_pkt_burst = rx_func;
out:
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}
694 
695 /**
696  * DPDK callback to get flow control status.
697  *
698  * @param dev
699  *   Pointer to Ethernet device structure.
700  * @param[out] fc_conf
701  *   Flow control output buffer.
702  *
703  * @return
704  *   0 on success, negative errno value on failure.
705  */
706 int
707 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
708 {
709 	struct priv *priv = dev->data->dev_private;
710 	struct ifreq ifr;
711 	struct ethtool_pauseparam ethpause = {
712 		.cmd = ETHTOOL_GPAUSEPARAM
713 	};
714 	int ret;
715 
716 	ifr.ifr_data = &ethpause;
717 	priv_lock(priv);
718 	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
719 		ret = errno;
720 		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
721 		     " failed: %s",
722 		     strerror(ret));
723 		goto out;
724 	}
725 
726 	fc_conf->autoneg = ethpause.autoneg;
727 	if (ethpause.rx_pause && ethpause.tx_pause)
728 		fc_conf->mode = RTE_FC_FULL;
729 	else if (ethpause.rx_pause)
730 		fc_conf->mode = RTE_FC_RX_PAUSE;
731 	else if (ethpause.tx_pause)
732 		fc_conf->mode = RTE_FC_TX_PAUSE;
733 	else
734 		fc_conf->mode = RTE_FC_NONE;
735 	ret = 0;
736 
737 out:
738 	priv_unlock(priv);
739 	assert(ret >= 0);
740 	return -ret;
741 }
742 
743 /**
744  * DPDK callback to modify flow control parameters.
745  *
746  * @param dev
747  *   Pointer to Ethernet device structure.
748  * @param[in] fc_conf
749  *   Flow control parameters.
750  *
751  * @return
752  *   0 on success, negative errno value on failure.
753  */
754 int
755 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
756 {
757 	struct priv *priv = dev->data->dev_private;
758 	struct ifreq ifr;
759 	struct ethtool_pauseparam ethpause = {
760 		.cmd = ETHTOOL_SPAUSEPARAM
761 	};
762 	int ret;
763 
764 	ifr.ifr_data = &ethpause;
765 	ethpause.autoneg = fc_conf->autoneg;
766 	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
767 	    (fc_conf->mode & RTE_FC_RX_PAUSE))
768 		ethpause.rx_pause = 1;
769 	else
770 		ethpause.rx_pause = 0;
771 
772 	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
773 	    (fc_conf->mode & RTE_FC_TX_PAUSE))
774 		ethpause.tx_pause = 1;
775 	else
776 		ethpause.tx_pause = 0;
777 
778 	priv_lock(priv);
779 	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
780 		ret = errno;
781 		WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
782 		     " failed: %s",
783 		     strerror(ret));
784 		goto out;
785 	}
786 	ret = 0;
787 
788 out:
789 	priv_unlock(priv);
790 	assert(ret >= 0);
791 	return -ret;
792 }
793 
794 /**
795  * Get PCI information from struct ibv_device.
796  *
797  * @param device
798  *   Pointer to Ethernet device structure.
799  * @param[out] pci_addr
800  *   PCI bus address output buffer.
801  *
802  * @return
803  *   0 on success, -1 on failure and errno is set.
804  */
805 int
806 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
807 			    struct rte_pci_addr *pci_addr)
808 {
809 	FILE *file;
810 	char line[32];
811 	MKSTR(path, "%s/device/uevent", device->ibdev_path);
812 
813 	file = fopen(path, "rb");
814 	if (file == NULL)
815 		return -1;
816 	while (fgets(line, sizeof(line), file) == line) {
817 		size_t len = strlen(line);
818 		int ret;
819 
820 		/* Truncate long lines. */
821 		if (len == (sizeof(line) - 1))
822 			while (line[(len - 1)] != '\n') {
823 				ret = fgetc(file);
824 				if (ret == EOF)
825 					break;
826 				line[(len - 1)] = ret;
827 			}
828 		/* Extract information. */
829 		if (sscanf(line,
830 			   "PCI_SLOT_NAME="
831 			   "%" SCNx16 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
832 			   &pci_addr->domain,
833 			   &pci_addr->bus,
834 			   &pci_addr->devid,
835 			   &pci_addr->function) == 4) {
836 			ret = 0;
837 			break;
838 		}
839 	}
840 	fclose(file);
841 	return 0;
842 }
843 
/**
 * Link status handler.
 *
 * Drains and acknowledges all pending Verbs asynchronous events, then
 * refreshes the link status when a port event was seen. When the
 * refreshed status looks inconsistent, schedules an alarm to check
 * again later instead of notifying immediately.
 *
 * @param priv
 *   Pointer to private structure.
 * @param dev
 *   Pointer to the rte_eth_dev structure.
 *
 * @return
 *   Nonzero if the callback process can be called immediately.
 */
static int
priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev)
{
	struct ibv_async_event event;
	int port_change = 0;
	int ret = 0;

	/* Read all message and acknowledge them. */
	for (;;) {
		if (ibv_get_async_event(priv->ctx, &event))
			break;

		if (event.event_type == IBV_EVENT_PORT_ACTIVE ||
		    event.event_type == IBV_EVENT_PORT_ERR)
			port_change = 1;
		else
			DEBUG("event type %d on port %d not handled",
			      event.event_type, event.element.port_num);
		/* Events must be acknowledged before being reused. */
		ibv_ack_async_event(&event);
	}

	/* Act when either a port event occurred or an alarm fired, but
	 * not when an alarm is already pending for a seen event. */
	if (port_change ^ priv->pending_alarm) {
		struct rte_eth_link *link = &dev->data->dev_link;

		priv->pending_alarm = 0;
		mlx5_link_update_unlocked(dev, 0);
		if (((link->link_speed == 0) && link->link_status) ||
		    ((link->link_speed != 0) && !link->link_status)) {
			/* Inconsistent status, check again later. */
			priv->pending_alarm = 1;
			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
					  mlx5_dev_link_status_handler,
					  dev);
		} else
			ret = 1;
	}
	return ret;
}
893 
894 /**
895  * Handle delayed link status event.
896  *
897  * @param arg
898  *   Registered argument.
899  */
900 void
901 mlx5_dev_link_status_handler(void *arg)
902 {
903 	struct rte_eth_dev *dev = arg;
904 	struct priv *priv = dev->data->dev_private;
905 	int ret;
906 
907 	priv_lock(priv);
908 	assert(priv->pending_alarm == 1);
909 	ret = priv_dev_link_status_handler(priv, dev);
910 	priv_unlock(priv);
911 	if (ret)
912 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC);
913 }
914 
915 /**
916  * Handle interrupts from the NIC.
917  *
918  * @param[in] intr_handle
919  *   Interrupt handler.
920  * @param cb_arg
921  *   Callback argument.
922  */
923 void
924 mlx5_dev_interrupt_handler(struct rte_intr_handle *intr_handle, void *cb_arg)
925 {
926 	struct rte_eth_dev *dev = cb_arg;
927 	struct priv *priv = dev->data->dev_private;
928 	int ret;
929 
930 	(void)intr_handle;
931 	priv_lock(priv);
932 	ret = priv_dev_link_status_handler(priv, dev);
933 	priv_unlock(priv);
934 	if (ret)
935 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC);
936 }
937 
938 /**
939  * Uninstall interrupt handler.
940  *
941  * @param priv
942  *   Pointer to private structure.
943  * @param dev
944  *   Pointer to the rte_eth_dev structure.
945  */
946 void
947 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
948 {
949 	if (!dev->data->dev_conf.intr_conf.lsc)
950 		return;
951 	rte_intr_callback_unregister(&priv->intr_handle,
952 				     mlx5_dev_interrupt_handler,
953 				     dev);
954 	if (priv->pending_alarm)
955 		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev);
956 	priv->pending_alarm = 0;
957 	priv->intr_handle.fd = 0;
958 	priv->intr_handle.type = 0;
959 }
960 
961 /**
962  * Install interrupt handler.
963  *
964  * @param priv
965  *   Pointer to private structure.
966  * @param dev
967  *   Pointer to the rte_eth_dev structure.
968  */
969 void
970 priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
971 {
972 	int rc, flags;
973 
974 	if (!dev->data->dev_conf.intr_conf.lsc)
975 		return;
976 	assert(priv->ctx->async_fd > 0);
977 	flags = fcntl(priv->ctx->async_fd, F_GETFL);
978 	rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
979 	if (rc < 0) {
980 		INFO("failed to change file descriptor async event queue");
981 		dev->data->dev_conf.intr_conf.lsc = 0;
982 	} else {
983 		priv->intr_handle.fd = priv->ctx->async_fd;
984 		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
985 		rte_intr_callback_register(&priv->intr_handle,
986 					   mlx5_dev_interrupt_handler,
987 					   dev);
988 	}
989 }
990