xref: /dpdk/drivers/common/mlx5/linux/mlx5_nl.c (revision da7e701151ea8b742d4c38ace3e4fefd1b4507fc)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5 
6 #include <errno.h>
7 #include <linux/if_link.h>
8 #include <linux/rtnetlink.h>
9 #include <linux/genetlink.h>
10 #include <net/if.h>
11 #include <rdma/rdma_netlink.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <stdalign.h>
16 #include <string.h>
17 #include <sys/socket.h>
18 #include <unistd.h>
19 
20 #include <rte_errno.h>
21 
22 #include "mlx5_nl.h"
23 #include "../mlx5_common_log.h"
24 #include "mlx5_malloc.h"
25 #ifdef HAVE_DEVLINK
26 #include <linux/devlink.h>
27 #endif
28 
29 
30 /* Size of the buffer to receive kernel messages */
31 #define MLX5_NL_BUF_SIZE (32 * 1024)
32 /* Send buffer size for the Netlink socket */
33 #define MLX5_SEND_BUF_SIZE 32768
34 /* Receive buffer size for the Netlink socket */
35 #define MLX5_RECV_BUF_SIZE 32768
36 /* Maximal physical port name length. */
37 #define MLX5_PHYS_PORT_NAME_MAX 128
38 
39 /** Parameters of VLAN devices created by driver. */
40 #define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
41 /*
42  * Define NDA_RTA as defined in iproute2 sources.
43  *
44  * see in iproute2 sources file include/libnetlink.h
45  */
46 #ifndef MLX5_NDA_RTA
47 #define MLX5_NDA_RTA(r) \
48 	((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
49 #endif
50 /*
51  * Define NLMSG_TAIL as defined in iproute2 sources.
52  *
53  * see in iproute2 sources file include/libnetlink.h
54  */
55 #ifndef NLMSG_TAIL
56 #define NLMSG_TAIL(nmsg) \
57 	((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
58 #endif
59 /*
60  * The following definitions are normally found in rdma/rdma_netlink.h,
61  * however they are so recent that most systems do not expose them yet.
62  */
63 #ifndef HAVE_RDMA_NL_NLDEV
64 #define RDMA_NL_NLDEV 5
65 #endif
66 #ifndef HAVE_RDMA_NLDEV_CMD_GET
67 #define RDMA_NLDEV_CMD_GET 1
68 #endif
69 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
70 #define RDMA_NLDEV_CMD_PORT_GET 5
71 #endif
72 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
73 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
74 #endif
75 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
76 #define RDMA_NLDEV_ATTR_DEV_NAME 2
77 #endif
78 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
79 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
80 #endif
81 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_STATE
82 #define RDMA_NLDEV_ATTR_PORT_STATE 12
83 #endif
84 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
85 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
86 #endif
87 
88 /* These are normally found in linux/if_link.h. */
89 #ifndef HAVE_IFLA_NUM_VF
90 #define IFLA_NUM_VF 21
91 #endif
92 #ifndef HAVE_IFLA_EXT_MASK
93 #define IFLA_EXT_MASK 29
94 #endif
95 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
96 #define IFLA_PHYS_SWITCH_ID 36
97 #endif
98 #ifndef HAVE_IFLA_PHYS_PORT_NAME
99 #define IFLA_PHYS_PORT_NAME 38
100 #endif
101 
102 /*
 * Some Devlink definitions may be missing from old kernel headers;
 * provide fallback values for the ones used in this file.
105  */
106 #ifndef DEVLINK_GENL_NAME
107 #define DEVLINK_GENL_NAME "devlink"
108 #endif
109 #ifndef DEVLINK_GENL_VERSION
110 #define DEVLINK_GENL_VERSION 1
111 #endif
112 #ifndef DEVLINK_ATTR_BUS_NAME
113 #define DEVLINK_ATTR_BUS_NAME 1
114 #endif
115 #ifndef DEVLINK_ATTR_DEV_NAME
116 #define DEVLINK_ATTR_DEV_NAME 2
117 #endif
118 #ifndef DEVLINK_ATTR_PARAM
119 #define DEVLINK_ATTR_PARAM 80
120 #endif
121 #ifndef DEVLINK_ATTR_PARAM_NAME
122 #define DEVLINK_ATTR_PARAM_NAME 81
123 #endif
124 #ifndef DEVLINK_ATTR_PARAM_TYPE
125 #define DEVLINK_ATTR_PARAM_TYPE 83
126 #endif
127 #ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
128 #define DEVLINK_ATTR_PARAM_VALUES_LIST 84
129 #endif
130 #ifndef DEVLINK_ATTR_PARAM_VALUE
131 #define DEVLINK_ATTR_PARAM_VALUE 85
132 #endif
133 #ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
134 #define DEVLINK_ATTR_PARAM_VALUE_DATA 86
135 #endif
136 #ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
137 #define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
138 #endif
139 #ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
140 #define DEVLINK_PARAM_CMODE_DRIVERINIT 1
141 #endif
142 #ifndef DEVLINK_CMD_RELOAD
143 #define DEVLINK_CMD_RELOAD 37
144 #endif
145 #ifndef DEVLINK_CMD_PARAM_GET
146 #define DEVLINK_CMD_PARAM_GET 38
147 #endif
148 #ifndef DEVLINK_CMD_PARAM_SET
149 #define DEVLINK_CMD_PARAM_SET 39
150 #endif
151 #ifndef NLA_FLAG
152 #define NLA_FLAG 6
153 #endif
154 
155 /* Add/remove MAC address through Netlink */
156 struct mlx5_nl_mac_addr {
157 	struct rte_ether_addr (*mac)[];
158 	/**< MAC address handled by the device. */
159 	int mac_n; /**< Number of addresses in the array. */
160 };
161 
162 #define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
163 #define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
164 #define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
165 #define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
166 #define MLX5_NL_CMD_GET_PORT_STATE (1 << 4)
167 
/**
 * Query context shared with mlx5_nl_cmdget_cb().
 *
 * Only @p name is an input; every other field is filled by the callback
 * when a message carrying the matching IB device name is parsed.
 */
struct mlx5_nl_port_info {
	/** IB device name to look for (in). */
	const char *name;
	/** MLX5_NL_CMD_GET_* bits of the attributes that were found (out). */
	uint32_t flags;
	/** IB device index (out). */
	uint32_t ibindex;
	/** Network interface index (out). */
	uint32_t ifindex;
	/** Value of the IB port index attribute (out). */
	uint32_t portnum;
	/** IB device port state (out). */
	uint16_t state;
};
177 
/* Counter backing the Netlink sequence numbers issued by this file. */
uint32_t atomic_sn;

/*
 * Atomically advance atomic_sn (relaxed ordering) and yield the new value,
 * i.e. the value the counter holds after the increment.
 */
#define MLX5_NL_SN_GENERATE \
	(__atomic_fetch_add(&atomic_sn, 1, __ATOMIC_RELAXED) + 1)
182 
183 /**
184  * Opens a Netlink socket.
185  *
186  * @param protocol
187  *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
188  * @param groups
189  *   Groups to listen (e.g. RTMGRP_LINK), can be 0.
190  *
191  * @return
192  *   A file descriptor on success, a negative errno value otherwise and
193  *   rte_errno is set.
194  */
195 int
196 mlx5_nl_init(int protocol, int groups)
197 {
198 	int fd;
199 	int buf_size;
200 	socklen_t opt_size;
201 	struct sockaddr_nl local = {
202 		.nl_family = AF_NETLINK,
203 		.nl_groups = groups,
204 	};
205 	int ret;
206 
207 	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
208 	if (fd == -1) {
209 		rte_errno = errno;
210 		return -rte_errno;
211 	}
212 	opt_size = sizeof(buf_size);
213 	ret = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf_size, &opt_size);
214 	if (ret == -1) {
215 		rte_errno = errno;
216 		goto error;
217 	}
218 	DRV_LOG(DEBUG, "Netlink socket send buffer: %d", buf_size);
219 	if (buf_size < MLX5_SEND_BUF_SIZE) {
220 		ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
221 				 &buf_size, sizeof(buf_size));
222 		if (ret == -1) {
223 			rte_errno = errno;
224 			goto error;
225 		}
226 	}
227 	opt_size = sizeof(buf_size);
228 	ret = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &buf_size, &opt_size);
229 	if (ret == -1) {
230 		rte_errno = errno;
231 		goto error;
232 	}
233 	DRV_LOG(DEBUG, "Netlink socket recv buffer: %d", buf_size);
234 	if (buf_size < MLX5_RECV_BUF_SIZE) {
235 		ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
236 				 &buf_size, sizeof(buf_size));
237 		if (ret == -1) {
238 			rte_errno = errno;
239 			goto error;
240 		}
241 	}
242 	ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
243 	if (ret == -1) {
244 		rte_errno = errno;
245 		goto error;
246 	}
247 	return fd;
248 error:
249 	close(fd);
250 	return -rte_errno;
251 }
252 
253 /**
254  * Send a request message to the kernel on the Netlink socket.
255  *
256  * @param[in] nlsk_fd
257  *   Netlink socket file descriptor.
258  * @param[in] nh
259  *   The Netlink message send to the kernel.
260  * @param[in] ssn
261  *   Sequence number.
262  * @param[in] req
263  *   Pointer to the request structure.
264  * @param[in] len
265  *   Length of the request in bytes.
266  *
267  * @return
268  *   The number of sent bytes on success, a negative errno value otherwise and
269  *   rte_errno is set.
270  */
271 static int
272 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
273 		int len)
274 {
275 	struct sockaddr_nl sa = {
276 		.nl_family = AF_NETLINK,
277 	};
278 	struct iovec iov[2] = {
279 		{ .iov_base = nh, .iov_len = sizeof(*nh), },
280 		{ .iov_base = req, .iov_len = len, },
281 	};
282 	struct msghdr msg = {
283 		.msg_name = &sa,
284 		.msg_namelen = sizeof(sa),
285 		.msg_iov = iov,
286 		.msg_iovlen = 2,
287 	};
288 	int send_bytes;
289 
290 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
291 	nh->nlmsg_seq = sn;
292 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
293 	if (send_bytes < 0) {
294 		rte_errno = errno;
295 		return -rte_errno;
296 	}
297 	return send_bytes;
298 }
299 
300 /**
301  * Send a message to the kernel on the Netlink socket.
302  *
303  * @param[in] nlsk_fd
304  *   The Netlink socket file descriptor used for communication.
305  * @param[in] nh
306  *   The Netlink message send to the kernel.
307  * @param[in] sn
308  *   Sequence number.
309  *
310  * @return
311  *   The number of sent bytes on success, a negative errno value otherwise and
312  *   rte_errno is set.
313  */
314 static int
315 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
316 {
317 	struct sockaddr_nl sa = {
318 		.nl_family = AF_NETLINK,
319 	};
320 	struct iovec iov = {
321 		.iov_base = nh,
322 		.iov_len = nh->nlmsg_len,
323 	};
324 	struct msghdr msg = {
325 		.msg_name = &sa,
326 		.msg_namelen = sizeof(sa),
327 		.msg_iov = &iov,
328 		.msg_iovlen = 1,
329 	};
330 	int send_bytes;
331 
332 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
333 	nh->nlmsg_seq = sn;
334 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
335 	if (send_bytes < 0) {
336 		rte_errno = errno;
337 		return -rte_errno;
338 	}
339 	return send_bytes;
340 }
341 
342 /**
343  * Receive a message from the kernel on the Netlink socket, following
344  * mlx5_nl_send().
345  *
346  * @param[in] nlsk_fd
347  *   The Netlink socket file descriptor used for communication.
348  * @param[in] sn
349  *   Sequence number.
350  * @param[in] cb
351  *   The callback function to call for each Netlink message received.
352  * @param[in, out] arg
353  *   Custom arguments for the callback.
354  *
355  * @return
356  *   0 on success, a negative errno value otherwise and rte_errno is set.
357  */
358 static int
359 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
360 	     void *arg)
361 {
362 	struct sockaddr_nl sa;
363 	struct iovec iov;
364 	struct msghdr msg = {
365 		.msg_name = &sa,
366 		.msg_namelen = sizeof(sa),
367 		.msg_iov = &iov,
368 		/* One message at a time */
369 		.msg_iovlen = 1,
370 	};
371 	void *buf = NULL;
372 	int multipart = 0;
373 	int ret = 0;
374 
375 	do {
376 		struct nlmsghdr *nh;
377 		int recv_bytes;
378 
379 		do {
380 			/* Query length of incoming message. */
381 			iov.iov_base = NULL;
382 			iov.iov_len = 0;
383 			recv_bytes = recvmsg(nlsk_fd, &msg,
384 					     MSG_PEEK | MSG_TRUNC);
385 			if (recv_bytes < 0) {
386 				rte_errno = errno;
387 				ret = -rte_errno;
388 				goto exit;
389 			}
390 			if (recv_bytes == 0) {
391 				rte_errno = ENODATA;
392 				ret = -rte_errno;
393 				goto exit;
394 			}
395 			/* Allocate buffer to fetch the message. */
396 			if (recv_bytes < MLX5_RECV_BUF_SIZE)
397 				recv_bytes = MLX5_RECV_BUF_SIZE;
398 			mlx5_free(buf);
399 			buf = mlx5_malloc(0, recv_bytes, 0, SOCKET_ID_ANY);
400 			if (!buf) {
401 				rte_errno = ENOMEM;
402 				ret = -rte_errno;
403 				goto exit;
404 			}
405 			/* Fetch the message. */
406 			iov.iov_base = buf;
407 			iov.iov_len = recv_bytes;
408 			recv_bytes = recvmsg(nlsk_fd, &msg, 0);
409 			if (recv_bytes == -1) {
410 				rte_errno = errno;
411 				ret = -rte_errno;
412 				goto exit;
413 			}
414 			nh = (struct nlmsghdr *)buf;
415 		} while (nh->nlmsg_seq != sn);
416 		for (;
417 		     NLMSG_OK(nh, (unsigned int)recv_bytes);
418 		     nh = NLMSG_NEXT(nh, recv_bytes)) {
419 			if (nh->nlmsg_type == NLMSG_ERROR) {
420 				struct nlmsgerr *err_data = NLMSG_DATA(nh);
421 
422 				if (err_data->error < 0) {
423 					rte_errno = -err_data->error;
424 					ret = -rte_errno;
425 					goto exit;
426 				}
427 				/* Ack message. */
428 				ret = 0;
429 				goto exit;
430 			}
431 			/* Multi-part msgs and their trailing DONE message. */
432 			if (nh->nlmsg_flags & NLM_F_MULTI) {
433 				if (nh->nlmsg_type == NLMSG_DONE) {
434 					ret =  0;
435 					goto exit;
436 				}
437 				multipart = 1;
438 			}
439 			if (cb) {
440 				ret = cb(nh, arg);
441 				if (ret < 0)
442 					goto exit;
443 			}
444 		}
445 	} while (multipart);
446 exit:
447 	mlx5_free(buf);
448 	return ret;
449 }
450 
451 /**
452  * Parse Netlink message to retrieve the bridge MAC address.
453  *
454  * @param nh
455  *   Pointer to Netlink Message Header.
456  * @param arg
457  *   PMD data register with this callback.
458  *
459  * @return
460  *   0 on success, a negative errno value otherwise and rte_errno is set.
461  */
462 static int
463 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
464 {
465 	struct mlx5_nl_mac_addr *data = arg;
466 	struct ndmsg *r = NLMSG_DATA(nh);
467 	struct rtattr *attribute;
468 	int len;
469 
470 	len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
471 	for (attribute = MLX5_NDA_RTA(r);
472 	     RTA_OK(attribute, len);
473 	     attribute = RTA_NEXT(attribute, len)) {
474 		if (attribute->rta_type == NDA_LLADDR) {
475 			if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
476 				DRV_LOG(WARNING,
477 					"not enough room to finalize the"
478 					" request");
479 				rte_errno = ENOMEM;
480 				return -rte_errno;
481 			}
482 #ifdef RTE_LIBRTE_MLX5_DEBUG
483 			char m[RTE_ETHER_ADDR_FMT_SIZE];
484 
485 			rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE,
486 					      RTA_DATA(attribute));
487 			DRV_LOG(DEBUG, "bridge MAC address %s", m);
488 #endif
489 			memcpy(&(*data->mac)[data->mac_n++],
490 			       RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
491 		}
492 	}
493 	return 0;
494 }
495 
496 /**
497  * Get bridge MAC addresses.
498  *
499  * @param[in] nlsk_fd
500  *   Netlink socket file descriptor.
501  * @param[in] iface_idx
502  *   Net device interface index.
503  * @param mac[out]
504  *   Pointer to the array table of MAC addresses to fill.
505  *   Its size should be of MLX5_MAX_MAC_ADDRESSES.
506  * @param mac_n[out]
507  *   Number of entries filled in MAC array.
508  *
509  * @return
510  *   0 on success, a negative errno value otherwise and rte_errno is set.
511  */
512 static int
513 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
514 		      struct rte_ether_addr (*mac)[], int *mac_n)
515 {
516 	struct {
517 		struct nlmsghdr	hdr;
518 		struct ifinfomsg ifm;
519 	} req = {
520 		.hdr = {
521 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
522 			.nlmsg_type = RTM_GETNEIGH,
523 			.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
524 		},
525 		.ifm = {
526 			.ifi_family = PF_BRIDGE,
527 			.ifi_index = iface_idx,
528 		},
529 	};
530 	struct mlx5_nl_mac_addr data = {
531 		.mac = mac,
532 		.mac_n = 0,
533 	};
534 	uint32_t sn = MLX5_NL_SN_GENERATE;
535 	int ret;
536 
537 	if (nlsk_fd == -1)
538 		return 0;
539 	ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
540 			      sizeof(struct ifinfomsg));
541 	if (ret < 0)
542 		goto error;
543 	ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
544 	if (ret < 0)
545 		goto error;
546 	*mac_n = data.mac_n;
547 	return 0;
548 error:
549 	DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
550 		iface_idx, strerror(rte_errno));
551 	return -rte_errno;
552 }
553 
554 /**
555  * Modify the MAC address neighbour table with Netlink.
556  *
557  * @param[in] nlsk_fd
558  *   Netlink socket file descriptor.
559  * @param[in] iface_idx
560  *   Net device interface index.
561  * @param mac
562  *   MAC address to consider.
563  * @param add
564  *   1 to add the MAC address, 0 to remove the MAC address.
565  *
566  * @return
567  *   0 on success, a negative errno value otherwise and rte_errno is set.
568  */
569 static int
570 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
571 			struct rte_ether_addr *mac, int add)
572 {
573 	struct {
574 		struct nlmsghdr hdr;
575 		struct ndmsg ndm;
576 		struct rtattr rta;
577 		uint8_t buffer[RTE_ETHER_ADDR_LEN];
578 	} req = {
579 		.hdr = {
580 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
581 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
582 				NLM_F_EXCL | NLM_F_ACK,
583 			.nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
584 		},
585 		.ndm = {
586 			.ndm_family = PF_BRIDGE,
587 			.ndm_state = NUD_NOARP | NUD_PERMANENT,
588 			.ndm_ifindex = iface_idx,
589 			.ndm_flags = NTF_SELF,
590 		},
591 		.rta = {
592 			.rta_type = NDA_LLADDR,
593 			.rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
594 		},
595 	};
596 	uint32_t sn = MLX5_NL_SN_GENERATE;
597 	int ret;
598 
599 	if (nlsk_fd == -1)
600 		return 0;
601 	memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
602 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
603 		RTA_ALIGN(req.rta.rta_len);
604 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
605 	if (ret < 0)
606 		goto error;
607 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
608 	if (ret < 0)
609 		goto error;
610 	return 0;
611 error:
612 #ifdef RTE_LIBRTE_MLX5_DEBUG
613 	{
614 		char m[RTE_ETHER_ADDR_FMT_SIZE];
615 
616 		rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac);
617 		DRV_LOG(DEBUG,
618 			"Interface %u cannot %s MAC address %s %s",
619 			iface_idx,
620 			add ? "add" : "remove", m, strerror(rte_errno));
621 	}
622 #endif
623 	return -rte_errno;
624 }
625 
626 /**
627  * Modify the VF MAC address neighbour table with Netlink.
628  *
629  * @param[in] nlsk_fd
630  *   Netlink socket file descriptor.
631  * @param[in] iface_idx
632  *   Net device interface index.
633  * @param mac
634  *    MAC address to consider.
635  * @param vf_index
636  *    VF index.
637  *
638  * @return
639  *    0 on success, a negative errno value otherwise and rte_errno is set.
640  */
641 int
642 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
643 			   struct rte_ether_addr *mac, int vf_index)
644 {
645 	int ret;
646 	struct {
647 		struct nlmsghdr hdr;
648 		struct ifinfomsg ifm;
649 		struct rtattr vf_list_rta;
650 		struct rtattr vf_info_rta;
651 		struct rtattr vf_mac_rta;
652 		struct ifla_vf_mac ivm;
653 	} req = {
654 		.hdr = {
655 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
656 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
657 			.nlmsg_type = RTM_BASE,
658 		},
659 		.ifm = {
660 			.ifi_index = iface_idx,
661 		},
662 		.vf_list_rta = {
663 			.rta_type = IFLA_VFINFO_LIST,
664 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
665 		},
666 		.vf_info_rta = {
667 			.rta_type = IFLA_VF_INFO,
668 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
669 		},
670 		.vf_mac_rta = {
671 			.rta_type = IFLA_VF_MAC,
672 		},
673 	};
674 	struct ifla_vf_mac ivm = {
675 		.vf = vf_index,
676 	};
677 	uint32_t sn = MLX5_NL_SN_GENERATE;
678 
679 	memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
680 	memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
681 
682 	req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
683 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
684 		RTA_ALIGN(req.vf_list_rta.rta_len) +
685 		RTA_ALIGN(req.vf_info_rta.rta_len) +
686 		RTA_ALIGN(req.vf_mac_rta.rta_len);
687 	req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
688 					       &req.vf_list_rta);
689 	req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
690 					       &req.vf_info_rta);
691 
692 	if (nlsk_fd < 0)
693 		return -1;
694 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
695 	if (ret < 0)
696 		goto error;
697 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
698 	if (ret < 0)
699 		goto error;
700 	return 0;
701 error:
702 	DRV_LOG(ERR,
703 		"representor %u cannot set VF MAC address "
704 		RTE_ETHER_ADDR_PRT_FMT " : %s",
705 		vf_index,
706 		RTE_ETHER_ADDR_BYTES(mac),
707 		strerror(rte_errno));
708 	return -rte_errno;
709 }
710 
711 /**
712  * Add a MAC address.
713  *
714  * @param[in] nlsk_fd
715  *   Netlink socket file descriptor.
716  * @param[in] iface_idx
717  *   Net device interface index.
718  * @param mac_own
719  *   BITFIELD_DECLARE array to store the mac.
720  * @param mac
721  *   MAC address to register.
722  * @param index
723  *   MAC address index.
724  *
725  * @return
726  *   0 on success, a negative errno value otherwise and rte_errno is set.
727  */
728 int
729 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
730 		     uint64_t *mac_own, struct rte_ether_addr *mac,
731 		     uint32_t index)
732 {
733 	int ret;
734 
735 	ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
736 	if (!ret) {
737 		MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
738 		if (index >= MLX5_MAX_MAC_ADDRESSES)
739 			return -EINVAL;
740 
741 		BITFIELD_SET(mac_own, index);
742 	}
743 	if (ret == -EEXIST)
744 		return 0;
745 	return ret;
746 }
747 
748 /**
749  * Remove a MAC address.
750  *
751  * @param[in] nlsk_fd
752  *   Netlink socket file descriptor.
753  * @param[in] iface_idx
754  *   Net device interface index.
755  * @param mac_own
756  *   BITFIELD_DECLARE array to store the mac.
757  * @param mac
758  *   MAC address to remove.
759  * @param index
760  *   MAC address index.
761  *
762  * @return
763  *   0 on success, a negative errno value otherwise and rte_errno is set.
764  */
765 int
766 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
767 			struct rte_ether_addr *mac, uint32_t index)
768 {
769 	MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
770 	if (index >= MLX5_MAX_MAC_ADDRESSES)
771 		return -EINVAL;
772 
773 	BITFIELD_RESET(mac_own, index);
774 	return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
775 }
776 
777 /**
778  * Synchronize Netlink bridge table to the internal table.
779  *
780  * @param[in] nlsk_fd
781  *   Netlink socket file descriptor.
782  * @param[in] iface_idx
783  *   Net device interface index.
784  * @param mac_addrs
785  *   Mac addresses array to sync.
786  * @param n
787  *   @p mac_addrs array size.
788  */
789 void
790 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
791 		      struct rte_ether_addr *mac_addrs, int n)
792 {
793 	struct rte_ether_addr macs[n];
794 	int macs_n = 0;
795 	int i;
796 	int ret;
797 
798 	memset(macs, 0, n * sizeof(macs[0]));
799 	ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
800 	if (ret)
801 		return;
802 	for (i = 0; i != macs_n; ++i) {
803 		int j;
804 
805 		/* Verify the address is not in the array yet. */
806 		for (j = 0; j != n; ++j)
807 			if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
808 				break;
809 		if (j != n)
810 			continue;
811 		if (rte_is_multicast_ether_addr(&macs[i])) {
812 			/* Find the first entry available. */
813 			for (j = MLX5_MAX_UC_MAC_ADDRESSES; j != n; ++j) {
814 				if (rte_is_zero_ether_addr(&mac_addrs[j])) {
815 					mac_addrs[j] = macs[i];
816 					break;
817 				}
818 			}
819 		} else {
820 			/* Find the first entry available. */
821 			for (j = 0; j != MLX5_MAX_UC_MAC_ADDRESSES; ++j) {
822 				if (rte_is_zero_ether_addr(&mac_addrs[j])) {
823 					mac_addrs[j] = macs[i];
824 					break;
825 				}
826 			}
827 		}
828 	}
829 }
830 
831 /**
832  * Flush all added MAC addresses.
833  *
834  * @param[in] nlsk_fd
835  *   Netlink socket file descriptor.
836  * @param[in] iface_idx
837  *   Net device interface index.
838  * @param[in] mac_addrs
839  *   Mac addresses array to flush.
840  * @param n
841  *   @p mac_addrs array size.
842  * @param mac_own
843  *   BITFIELD_DECLARE array to store the mac.
844  */
845 void
846 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
847 		       struct rte_ether_addr *mac_addrs, int n,
848 		       uint64_t *mac_own)
849 {
850 	int i;
851 
852 	if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES)
853 		return;
854 
855 	for (i = n - 1; i >= 0; --i) {
856 		struct rte_ether_addr *m = &mac_addrs[i];
857 
858 		if (BITFIELD_ISSET(mac_own, i))
859 			mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
860 						i);
861 	}
862 }
863 
864 /**
865  * Enable promiscuous / all multicast mode through Netlink.
866  *
867  * @param[in] nlsk_fd
868  *   Netlink socket file descriptor.
869  * @param[in] iface_idx
870  *   Net device interface index.
871  * @param flags
872  *   IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
873  * @param enable
874  *   Nonzero to enable, disable otherwise.
875  *
876  * @return
877  *   0 on success, a negative errno value otherwise and rte_errno is set.
878  */
879 static int
880 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
881 		     int enable)
882 {
883 	struct {
884 		struct nlmsghdr hdr;
885 		struct ifinfomsg ifi;
886 	} req = {
887 		.hdr = {
888 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
889 			.nlmsg_type = RTM_NEWLINK,
890 			.nlmsg_flags = NLM_F_REQUEST,
891 		},
892 		.ifi = {
893 			.ifi_flags = enable ? flags : 0,
894 			.ifi_change = flags,
895 			.ifi_index = iface_idx,
896 		},
897 	};
898 	uint32_t sn = MLX5_NL_SN_GENERATE;
899 	int ret;
900 
901 	MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
902 	if (nlsk_fd < 0)
903 		return 0;
904 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
905 	if (ret < 0)
906 		return ret;
907 	return 0;
908 }
909 
910 /**
911  * Enable promiscuous mode through Netlink.
912  *
913  * @param[in] nlsk_fd
914  *   Netlink socket file descriptor.
915  * @param[in] iface_idx
916  *   Net device interface index.
917  * @param enable
918  *   Nonzero to enable, disable otherwise.
919  *
920  * @return
921  *   0 on success, a negative errno value otherwise and rte_errno is set.
922  */
923 int
924 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
925 {
926 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
927 
928 	if (ret)
929 		DRV_LOG(DEBUG,
930 			"Interface %u cannot %s promisc mode: Netlink error %s",
931 			iface_idx, enable ? "enable" : "disable",
932 			strerror(rte_errno));
933 	return ret;
934 }
935 
936 /**
937  * Enable all multicast mode through Netlink.
938  *
939  * @param[in] nlsk_fd
940  *   Netlink socket file descriptor.
941  * @param[in] iface_idx
942  *   Net device interface index.
943  * @param enable
944  *   Nonzero to enable, disable otherwise.
945  *
946  * @return
947  *   0 on success, a negative errno value otherwise and rte_errno is set.
948  */
949 int
950 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
951 {
952 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
953 				       enable);
954 
955 	if (ret)
956 		DRV_LOG(DEBUG,
957 			"Interface %u cannot %s allmulti : Netlink error %s",
958 			iface_idx, enable ? "enable" : "disable",
959 			strerror(rte_errno));
960 	return ret;
961 }
962 
963 /**
964  * Process network interface information from Netlink message.
965  *
966  * @param nh
967  *   Pointer to Netlink message header.
968  * @param arg
969  *   Opaque data pointer for this callback.
970  *
971  * @return
972  *   0 on success, a negative errno value otherwise and rte_errno is set.
973  */
974 static int
975 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
976 {
977 	struct mlx5_nl_port_info *data = arg;
978 	struct mlx5_nl_port_info local = {
979 		.flags = 0,
980 	};
981 	size_t off = NLMSG_HDRLEN;
982 
983 	if (nh->nlmsg_type !=
984 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
985 	    nh->nlmsg_type !=
986 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
987 		goto error;
988 	while (off < nh->nlmsg_len) {
989 		struct nlattr *na = (void *)((uintptr_t)nh + off);
990 		void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
991 
992 		if (na->nla_len > nh->nlmsg_len - off)
993 			goto error;
994 		switch (na->nla_type) {
995 		case RDMA_NLDEV_ATTR_DEV_INDEX:
996 			local.ibindex = *(uint32_t *)payload;
997 			local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
998 			break;
999 		case RDMA_NLDEV_ATTR_DEV_NAME:
1000 			if (!strcmp(payload, data->name))
1001 				local.flags |= MLX5_NL_CMD_GET_IB_NAME;
1002 			break;
1003 		case RDMA_NLDEV_ATTR_NDEV_INDEX:
1004 			local.ifindex = *(uint32_t *)payload;
1005 			local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
1006 			break;
1007 		case RDMA_NLDEV_ATTR_PORT_INDEX:
1008 			local.portnum = *(uint32_t *)payload;
1009 			local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
1010 			break;
1011 		case RDMA_NLDEV_ATTR_PORT_STATE:
1012 			local.state = *(uint8_t *)payload;
1013 			local.flags |= MLX5_NL_CMD_GET_PORT_STATE;
1014 			break;
1015 		default:
1016 			break;
1017 		}
1018 		off += NLA_ALIGN(na->nla_len);
1019 	}
1020 	/*
1021 	 * It is possible to have multiple messages for all
1022 	 * Infiniband devices in the system with appropriate name.
1023 	 * So we should gather parameters locally and copy to
1024 	 * query context only in case of coinciding device name.
1025 	 */
1026 	if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
1027 		data->flags = local.flags;
1028 		data->ibindex = local.ibindex;
1029 		data->ifindex = local.ifindex;
1030 		data->portnum = local.portnum;
1031 		data->state = local.state;
1032 	}
1033 	return 0;
1034 error:
1035 	rte_errno = EINVAL;
1036 	return -rte_errno;
1037 }
1038 
1039 /**
1040  * Get port info of network interface associated with some IB device.
1041  *
1042  * This is the only somewhat safe method to avoid resorting to heuristics
1043  * when faced with port representors. Unfortunately it requires at least
1044  * Linux 4.17.
1045  *
1046  * @param nl
1047  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1048  * @param[in] pindex
1049  *   IB device port index, starting from 1
1050  * @param[out] data
1051  *   Pointer to port info.
1052  * @return
1053  *   0 on success, negative on error and rte_errno is set.
1054  */
1055 static int
1056 mlx5_nl_port_info(int nl, uint32_t pindex, struct mlx5_nl_port_info *data)
1057 {
1058 	union {
1059 		struct nlmsghdr nh;
1060 		uint8_t buf[NLMSG_HDRLEN +
1061 			    NLA_HDRLEN + NLA_ALIGN(sizeof(data->ibindex)) +
1062 			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
1063 	} req = {
1064 		.nh = {
1065 			.nlmsg_len = NLMSG_LENGTH(0),
1066 			.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1067 						       RDMA_NLDEV_CMD_GET),
1068 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1069 		},
1070 	};
1071 	struct nlattr *na;
1072 	uint32_t sn = MLX5_NL_SN_GENERATE;
1073 	int ret;
1074 
1075 	ret = mlx5_nl_send(nl, &req.nh, sn);
1076 	if (ret < 0)
1077 		return ret;
1078 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1079 	if (ret < 0)
1080 		return ret;
1081 	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1082 	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX))
1083 		goto error;
1084 	data->flags = 0;
1085 	sn = MLX5_NL_SN_GENERATE;
1086 	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1087 					     RDMA_NLDEV_CMD_PORT_GET);
1088 	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1089 	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
1090 	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
1091 	na->nla_len = NLA_HDRLEN + sizeof(data->ibindex);
1092 	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
1093 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1094 	       &data->ibindex, sizeof(data->ibindex));
1095 	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
1096 	na->nla_len = NLA_HDRLEN + sizeof(pindex);
1097 	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
1098 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1099 	       &pindex, sizeof(pindex));
1100 	ret = mlx5_nl_send(nl, &req.nh, sn);
1101 	if (ret < 0)
1102 		return ret;
1103 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1104 	if (ret < 0)
1105 		return ret;
1106 	if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1107 	    !(data->flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1108 	    !(data->flags & MLX5_NL_CMD_GET_NET_INDEX) ||
1109 	    !data->ifindex)
1110 		goto error;
1111 	return 1;
1112 error:
1113 	rte_errno = ENODEV;
1114 	return -rte_errno;
1115 }
1116 
1117 /**
1118  * Get index of network interface associated with some IB device.
1119  *
1120  * This is the only somewhat safe method to avoid resorting to heuristics
1121  * when faced with port representors. Unfortunately it requires at least
1122  * Linux 4.17.
1123  *
1124  * @param nl
1125  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1126  * @param[in] name
1127  *   IB device name.
1128  * @param[in] pindex
1129  *   IB device port index, starting from 1
1130  * @return
1131  *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
1132  *   is set.
1133  */
1134 unsigned int
1135 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
1136 {
1137 	struct mlx5_nl_port_info data = {
1138 			.ifindex = 0,
1139 			.name = name,
1140 	};
1141 
1142 	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1143 		return 0;
1144 	return data.ifindex;
1145 }
1146 
1147 /**
1148  * Get IB device port state.
1149  *
1150  * This is the only somewhat safe method to get info for port number >= 255.
1151  * Unfortunately it requires at least Linux 4.17.
1152  *
1153  * @param nl
1154  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1155  * @param[in] name
1156  *   IB device name.
1157  * @param[in] pindex
1158  *   IB device port index, starting from 1
1159  * @return
1160  *   Port state (ibv_port_state) on success, negative on error
1161  *   and rte_errno is set.
1162  */
1163 int
1164 mlx5_nl_port_state(int nl, const char *name, uint32_t pindex)
1165 {
1166 	struct mlx5_nl_port_info data = {
1167 			.state = 0,
1168 			.name = name,
1169 	};
1170 
1171 	if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1172 		return -rte_errno;
1173 	if ((data.flags & MLX5_NL_CMD_GET_PORT_STATE) == 0) {
1174 		rte_errno = ENOTSUP;
1175 		return -rte_errno;
1176 	}
1177 	return (int)data.state;
1178 }
1179 
1180 /**
1181  * Get the number of physical ports of given IB device.
1182  *
1183  * @param nl
1184  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1185  * @param[in] name
1186  *   IB device name.
1187  *
1188  * @return
1189  *   A valid (nonzero) number of ports on success, 0 otherwise
1190  *   and rte_errno is set.
1191  */
1192 unsigned int
1193 mlx5_nl_portnum(int nl, const char *name)
1194 {
1195 	struct mlx5_nl_port_info data = {
1196 		.flags = 0,
1197 		.name = name,
1198 		.ifindex = 0,
1199 		.portnum = 0,
1200 	};
1201 	struct nlmsghdr req = {
1202 		.nlmsg_len = NLMSG_LENGTH(0),
1203 		.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1204 					       RDMA_NLDEV_CMD_GET),
1205 		.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1206 	};
1207 	uint32_t sn = MLX5_NL_SN_GENERATE;
1208 	int ret;
1209 
1210 	ret = mlx5_nl_send(nl, &req, sn);
1211 	if (ret < 0)
1212 		return 0;
1213 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1214 	if (ret < 0)
1215 		return 0;
1216 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1217 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1218 	    !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1219 		rte_errno = ENODEV;
1220 		return 0;
1221 	}
1222 	if (!data.portnum)
1223 		rte_errno = EINVAL;
1224 	return data.portnum;
1225 }
1226 
1227 /**
1228  * Analyze gathered port parameters via Netlink to recognize master
1229  * and representor devices for E-Switch configuration.
1230  *
1231  * @param[in] num_vf_set
1232  *   flag of presence of number of VFs port attribute.
1233  * @param[inout] switch_info
1234  *   Port information, including port name as a number and port name
1235  *   type if recognized
1236  *
1237  * @return
1238  *   master and representor flags are set in switch_info according to
1239  *   recognized parameters (if any).
1240  */
1241 static void
1242 mlx5_nl_check_switch_info(bool num_vf_set,
1243 			  struct mlx5_switch_info *switch_info)
1244 {
1245 	switch (switch_info->name_type) {
1246 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1247 		/*
1248 		 * Name is not recognized, assume the master,
1249 		 * check the number of VFs key presence.
1250 		 */
1251 		switch_info->master = num_vf_set;
1252 		break;
1253 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1254 		/*
1255 		 * Name is not set, this assumes the legacy naming
1256 		 * schema for master, just check if there is a
1257 		 * number of VFs key.
1258 		 */
1259 		switch_info->master = num_vf_set;
1260 		break;
1261 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1262 		/* New uplink naming schema recognized. */
1263 		switch_info->master = 1;
1264 		break;
1265 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1266 		/* Legacy representors naming schema. */
1267 		switch_info->representor = !num_vf_set;
1268 		break;
1269 	case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1270 		/* Fallthrough */
1271 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1272 		/* Fallthrough */
1273 	case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
1274 		/* New representors naming schema. */
1275 		switch_info->representor = 1;
1276 		break;
1277 	}
1278 }
1279 
1280 /**
1281  * Process switch information from Netlink message.
1282  *
1283  * @param nh
1284  *   Pointer to Netlink message header.
1285  * @param arg
1286  *   Opaque data pointer for this callback.
1287  *
1288  * @return
1289  *   0 on success, a negative errno value otherwise and rte_errno is set.
1290  */
1291 static int
1292 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1293 {
1294 	struct mlx5_switch_info info = {
1295 		.master = 0,
1296 		.representor = 0,
1297 		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1298 		.port_name = 0,
1299 		.switch_id = 0,
1300 	};
1301 	size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1302 	bool switch_id_set = false;
1303 	bool num_vf_set = false;
1304 	int len;
1305 
1306 	if (nh->nlmsg_type != RTM_NEWLINK)
1307 		goto error;
1308 	while (off < nh->nlmsg_len) {
1309 		struct rtattr *ra = (void *)((uintptr_t)nh + off);
1310 		void *payload = RTA_DATA(ra);
1311 		unsigned int i;
1312 
1313 		if (ra->rta_len > nh->nlmsg_len - off)
1314 			goto error;
1315 		switch (ra->rta_type) {
1316 		case IFLA_NUM_VF:
1317 			num_vf_set = true;
1318 			break;
1319 		case IFLA_PHYS_PORT_NAME:
1320 			len = RTA_PAYLOAD(ra);
1321 			/* Some kernels do not pad attributes with zero. */
1322 			if (len > 0 && len < MLX5_PHYS_PORT_NAME_MAX) {
1323 				char name[MLX5_PHYS_PORT_NAME_MAX];
1324 
1325 				/*
1326 				 * We can't just patch the message with padding
1327 				 * zero - it might corrupt the following items
1328 				 * in the message, we have to copy the string
1329 				 * by attribute length and pad the copied one.
1330 				 */
1331 				memcpy(name, payload, len);
1332 				name[len] = 0;
1333 				mlx5_translate_port_name(name, &info);
1334 			} else {
1335 				info.name_type =
1336 					MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
1337 			}
1338 			break;
1339 		case IFLA_PHYS_SWITCH_ID:
1340 			info.switch_id = 0;
1341 			for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1342 				info.switch_id <<= 8;
1343 				info.switch_id |= ((uint8_t *)payload)[i];
1344 			}
1345 			switch_id_set = true;
1346 			break;
1347 		}
1348 		off += RTA_ALIGN(ra->rta_len);
1349 	}
1350 	if (switch_id_set) {
1351 		/* We have some E-Switch configuration. */
1352 		mlx5_nl_check_switch_info(num_vf_set, &info);
1353 	}
1354 	MLX5_ASSERT(!(info.master && info.representor));
1355 	memcpy(arg, &info, sizeof(info));
1356 	return 0;
1357 error:
1358 	rte_errno = EINVAL;
1359 	return -rte_errno;
1360 }
1361 
1362 /**
1363  * Get switch information associated with network interface.
1364  *
1365  * @param nl
1366  *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1367  * @param ifindex
1368  *   Network interface index.
1369  * @param[out] info
1370  *   Switch information object, populated in case of success.
1371  *
1372  * @return
1373  *   0 on success, a negative errno value otherwise and rte_errno is set.
1374  */
1375 int
1376 mlx5_nl_switch_info(int nl, unsigned int ifindex,
1377 		    struct mlx5_switch_info *info)
1378 {
1379 	struct {
1380 		struct nlmsghdr nh;
1381 		struct ifinfomsg info;
1382 		struct rtattr rta;
1383 		uint32_t extmask;
1384 	} req = {
1385 		.nh = {
1386 			.nlmsg_len = NLMSG_LENGTH
1387 					(sizeof(req.info) +
1388 					 RTA_LENGTH(sizeof(uint32_t))),
1389 			.nlmsg_type = RTM_GETLINK,
1390 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1391 		},
1392 		.info = {
1393 			.ifi_family = AF_UNSPEC,
1394 			.ifi_index = ifindex,
1395 		},
1396 		.rta = {
1397 			.rta_type = IFLA_EXT_MASK,
1398 			.rta_len = RTA_LENGTH(sizeof(int32_t)),
1399 		},
1400 		.extmask = RTE_LE32(1),
1401 	};
1402 	uint32_t sn = MLX5_NL_SN_GENERATE;
1403 	int ret;
1404 
1405 	ret = mlx5_nl_send(nl, &req.nh, sn);
1406 	if (ret >= 0)
1407 		ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1408 	if (info->master && info->representor) {
1409 		DRV_LOG(ERR, "ifindex %u device is recognized as master"
1410 			     " and as representor", ifindex);
1411 		rte_errno = ENODEV;
1412 		ret = -rte_errno;
1413 	}
1414 	return ret;
1415 }
1416 
1417 /*
1418  * Delete VLAN network device by ifindex.
1419  *
1420  * @param[in] tcf
1421  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1422  * @param[in] ifindex
1423  *   Interface index of network device to delete.
1424  */
1425 void
1426 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1427 		      uint32_t ifindex)
1428 {
1429 	uint32_t sn = MLX5_NL_SN_GENERATE;
1430 	int ret;
1431 	struct {
1432 		struct nlmsghdr nh;
1433 		struct ifinfomsg info;
1434 	} req = {
1435 		.nh = {
1436 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1437 			.nlmsg_type = RTM_DELLINK,
1438 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1439 		},
1440 		.info = {
1441 			.ifi_family = AF_UNSPEC,
1442 			.ifi_index = ifindex,
1443 		},
1444 	};
1445 
1446 	if (ifindex) {
1447 		ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1448 		if (ret >= 0)
1449 			ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1450 		if (ret < 0)
1451 			DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1452 				" ifindex %u, %d", ifindex, ret);
1453 	}
1454 }
1455 
1456 /* Set of subroutines to build Netlink message. */
/*
 * Return the address right past the current (aligned) end of the message,
 * i.e. the place where the next attribute would be appended.
 */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	uint8_t *base = (uint8_t *)nlh;
	size_t used = NLMSG_ALIGN(nlh->nlmsg_len);

	return (struct nlattr *)(base + used);
}
1463 
/*
 * Append an attribute of the given type with alen bytes of payload to the
 * message and grow nlmsg_len accordingly. No bounds checking is done here:
 * the caller's buffer must be large enough.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *attr = nl_msg_tail(nlh);
	uint8_t *payload = (uint8_t *)attr + sizeof(*attr);

	attr->nla_len = NLMSG_ALIGN(sizeof(*attr)) + alen;
	attr->nla_type = type;
	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
	if (alen != 0)
		memcpy(payload, data, alen);
}
1476 
/*
 * Open a nested attribute: an empty attribute of the given type is appended
 * and its address is returned so that nl_attr_nest_end() can fix up its
 * length later.
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	struct nlattr *container = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return container;
}
1485 
/*
 * Close a nested attribute: its length becomes the distance from its header
 * to the current tail of the message.
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	uint8_t *end = (uint8_t *)nl_msg_tail(nlh);
	uint8_t *start = (uint8_t *)nest;

	nest->nla_len = end - start;
}
1491 
1492 /*
1493  * Create network VLAN device with specified VLAN tag.
1494  *
1495  * @param[in] tcf
1496  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1497  * @param[in] ifindex
1498  *   Base network interface index.
1499  * @param[in] tag
1500  *   VLAN tag for VLAN network device to create.
1501  */
1502 uint32_t
1503 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1504 			 uint32_t ifindex, uint16_t tag)
1505 {
1506 	struct nlmsghdr *nlh;
1507 	struct ifinfomsg *ifm;
1508 	char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1509 
1510 	__rte_cache_aligned
1511 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1512 		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1513 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1514 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1515 		    NLMSG_ALIGN(sizeof(name)) +
1516 		    NLMSG_ALIGN(sizeof("vlan")) +
1517 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1518 		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1519 	struct nlattr *na_info;
1520 	struct nlattr *na_vlan;
1521 	uint32_t sn = MLX5_NL_SN_GENERATE;
1522 	int ret;
1523 
1524 	memset(buf, 0, sizeof(buf));
1525 	nlh = (struct nlmsghdr *)buf;
1526 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1527 	nlh->nlmsg_type = RTM_NEWLINK;
1528 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1529 			   NLM_F_EXCL | NLM_F_ACK;
1530 	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1531 	nlh->nlmsg_len += sizeof(struct ifinfomsg);
1532 	ifm->ifi_family = AF_UNSPEC;
1533 	ifm->ifi_type = 0;
1534 	ifm->ifi_index = 0;
1535 	ifm->ifi_flags = IFF_UP;
1536 	ifm->ifi_change = 0xffffffff;
1537 	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1538 	ret = snprintf(name, sizeof(name), "%s.%u.%u",
1539 		       MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1540 	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1541 	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1542 	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1543 	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1544 	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1545 	nl_attr_nest_end(nlh, na_vlan);
1546 	nl_attr_nest_end(nlh, na_info);
1547 	MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len);
1548 	ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1549 	if (ret >= 0)
1550 		ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1551 	if (ret < 0) {
1552 		DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1553 			ret);
1554 	}
1555 	/* Try to get ifindex of created or pre-existing device. */
1556 	ret = if_nametoindex(name);
1557 	if (!ret) {
1558 		DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
1559 			errno);
1560 		return 0;
1561 	}
1562 	return ret;
1563 }
1564 
1565 /**
1566  * Parse Netlink message to retrieve the general family ID.
1567  *
1568  * @param nh
1569  *   Pointer to Netlink Message Header.
1570  * @param arg
1571  *   PMD data register with this callback.
1572  *
1573  * @return
1574  *   0 on success, a negative errno value otherwise and rte_errno is set.
1575  */
1576 static int
1577 mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
1578 {
1579 
1580 	struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1581 	struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1582 					NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1583 
1584 	for (; nla->nla_len && nla < tail;
1585 	     nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len))) {
1586 		if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
1587 			*(uint16_t *)arg = *(uint16_t *)(nla + 1);
1588 			return 0;
1589 		}
1590 	}
1591 	return -EINVAL;
1592 }
1593 
1594 #define MLX5_NL_MAX_ATTR_SIZE 100
1595 /**
1596  * Get generic netlink family ID.
1597  *
1598  * @param[in] nlsk_fd
1599  *   Netlink socket file descriptor.
1600  * @param[in] name
1601  *   The family name.
1602  *
1603  * @return
1604  *   ID >= 0 on success and @p enable is updated, a negative errno value
1605  *   otherwise and rte_errno is set.
1606  */
1607 static int
1608 mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
1609 {
1610 	struct nlmsghdr *nlh;
1611 	struct genlmsghdr *genl;
1612 	uint32_t sn = MLX5_NL_SN_GENERATE;
1613 	int name_size = strlen(name) + 1;
1614 	int ret;
1615 	uint16_t id = -1;
1616 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1617 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1618 		    NLMSG_ALIGN(sizeof(struct nlattr)) +
1619 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
1620 
1621 	memset(buf, 0, sizeof(buf));
1622 	nlh = (struct nlmsghdr *)buf;
1623 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1624 	nlh->nlmsg_type = GENL_ID_CTRL;
1625 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1626 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1627 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1628 	genl->cmd = CTRL_CMD_GETFAMILY;
1629 	genl->version = 1;
1630 	nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
1631 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1632 	if (ret >= 0)
1633 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
1634 	if (ret < 0) {
1635 		DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
1636 			ret);
1637 		return ret;
1638 	}
1639 	DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
1640 	return (int)id;
1641 }
1642 
1643 /**
1644  * Get Devlink family ID.
1645  *
1646  * @param[in] nlsk_fd
1647  *   Netlink socket file descriptor.
1648  *
1649  * @return
1650  *   ID >= 0 on success and @p enable is updated, a negative errno value
1651  *   otherwise and rte_errno is set.
1652  */
1653 
1654 int
1655 mlx5_nl_devlink_family_id_get(int nlsk_fd)
1656 {
1657 	return mlx5_nl_generic_family_id_get(nlsk_fd, DEVLINK_GENL_NAME);
1658 }
1659 
1660 /**
1661  * Parse Netlink message to retrieve the ROCE enable status.
1662  *
1663  * @param nh
1664  *   Pointer to Netlink Message Header.
1665  * @param arg
1666  *   PMD data register with this callback.
1667  *
1668  * @return
1669  *   0 on success, a negative errno value otherwise and rte_errno is set.
1670  */
1671 static int
1672 mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
1673 {
1674 
1675 	int ret = -EINVAL;
1676 	int *enable = arg;
1677 	struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1678 	struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1679 					NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1680 
1681 	while (nla->nla_len && nla < tail) {
1682 		switch (nla->nla_type) {
1683 		/* Expected nested attributes case. */
1684 		case DEVLINK_ATTR_PARAM:
1685 		case DEVLINK_ATTR_PARAM_VALUES_LIST:
1686 		case DEVLINK_ATTR_PARAM_VALUE:
1687 			ret = 0;
1688 			nla += 1;
1689 			break;
1690 		case DEVLINK_ATTR_PARAM_VALUE_DATA:
1691 			*enable = 1;
1692 			return 0;
1693 		default:
1694 			nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len));
1695 		}
1696 	}
1697 	*enable = 0;
1698 	return ret;
1699 }
1700 
1701 /**
1702  * Get ROCE enable status through Netlink.
1703  *
1704  * @param[in] nlsk_fd
1705  *   Netlink socket file descriptor.
1706  * @param[in] family_id
1707  *   the Devlink family ID.
1708  * @param pci_addr
1709  *   The device PCI address.
1710  * @param[out] enable
1711  *   Where to store the enable status.
1712  *
1713  * @return
1714  *   0 on success and @p enable is updated, a negative errno value otherwise
1715  *   and rte_errno is set.
1716  */
1717 int
1718 mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
1719 			int *enable)
1720 {
1721 	struct nlmsghdr *nlh;
1722 	struct genlmsghdr *genl;
1723 	uint32_t sn = MLX5_NL_SN_GENERATE;
1724 	int ret;
1725 	int cur_en = 0;
1726 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1727 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1728 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
1729 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
1730 
1731 	memset(buf, 0, sizeof(buf));
1732 	nlh = (struct nlmsghdr *)buf;
1733 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1734 	nlh->nlmsg_type = family_id;
1735 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1736 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1737 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1738 	genl->cmd = DEVLINK_CMD_PARAM_GET;
1739 	genl->version = DEVLINK_GENL_VERSION;
1740 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1741 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1742 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1743 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1744 	if (ret >= 0)
1745 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
1746 	if (ret < 0) {
1747 		DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
1748 			pci_addr, ret);
1749 		return ret;
1750 	}
1751 	*enable = cur_en;
1752 	DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
1753 		cur_en ? "en" : "dis", pci_addr);
1754 	return ret;
1755 }
1756 
1757 /**
1758  * Reload mlx5 device kernel driver through Netlink.
1759  *
1760  * @param[in] nlsk_fd
1761  *   Netlink socket file descriptor.
1762  * @param[in] family_id
1763  *   the Devlink family ID.
1764  * @param pci_addr
1765  *   The device PCI address.
1766  * @param[out] enable
1767  *   The enable status to set.
1768  *
1769  * @return
1770  *   0 on success, a negative errno value otherwise and rte_errno is set.
1771  */
1772 static int
1773 mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
1774 {
1775 	struct nlmsghdr *nlh;
1776 	struct genlmsghdr *genl;
1777 	uint32_t sn = MLX5_NL_SN_GENERATE;
1778 	int ret;
1779 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1780 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1781 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
1782 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
1783 
1784 	memset(buf, 0, sizeof(buf));
1785 	nlh = (struct nlmsghdr *)buf;
1786 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1787 	nlh->nlmsg_type = family_id;
1788 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1789 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1790 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1791 	genl->cmd = DEVLINK_CMD_RELOAD;
1792 	genl->version = DEVLINK_GENL_VERSION;
1793 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1794 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1795 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1796 	if (ret >= 0)
1797 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1798 	if (ret < 0) {
1799 		DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
1800 			pci_addr, ret);
1801 		return ret;
1802 	}
1803 	DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
1804 		pci_addr);
1805 	return 0;
1806 }
1807 
1808 /**
1809  * Set ROCE enable status through Netlink.
1810  *
1811  * @param[in] nlsk_fd
1812  *   Netlink socket file descriptor.
1813  * @param[in] family_id
1814  *   the Devlink family ID.
1815  * @param pci_addr
1816  *   The device PCI address.
1817  * @param[out] enable
1818  *   The enable status to set.
1819  *
1820  * @return
1821  *   0 on success, a negative errno value otherwise and rte_errno is set.
1822  */
1823 int
1824 mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
1825 			int enable)
1826 {
1827 	struct nlmsghdr *nlh;
1828 	struct genlmsghdr *genl;
1829 	uint32_t sn = MLX5_NL_SN_GENERATE;
1830 	int ret;
1831 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1832 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1833 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
1834 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
1835 	uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
1836 	uint8_t ptype = NLA_FLAG;
1837 ;
1838 
1839 	memset(buf, 0, sizeof(buf));
1840 	nlh = (struct nlmsghdr *)buf;
1841 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1842 	nlh->nlmsg_type = family_id;
1843 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1844 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1845 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1846 	genl->cmd = DEVLINK_CMD_PARAM_SET;
1847 	genl->version = DEVLINK_GENL_VERSION;
1848 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1849 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1850 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1851 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
1852 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
1853 	if (enable)
1854 		nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
1855 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1856 	if (ret >= 0)
1857 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1858 	if (ret < 0) {
1859 		DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
1860 			" %d.", enable ? "en" : "dis", pci_addr, ret);
1861 		return ret;
1862 	}
1863 	DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
1864 		pci_addr, enable ? "en" : "dis");
1865 	/* Now, need to reload the driver. */
1866 	return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
1867 }
1868 
/**
 * Try to parse a Netlink message as a link status update.
 *
 * Only RTM_NEWLINK, RTM_DELLINK, RTM_GETLINK and RTM_SETLINK messages that
 * are long enough to carry a complete ifinfomsg are accepted.
 *
 * @param hdr
 *  Netlink message header.
 * @param[out] ifindex
 *  Index of the updated interface. Left untouched on failure.
 *
 * @return
 *  0 on success, negative on failure.
 */
int
mlx5_nl_parse_link_status_update(struct nlmsghdr *hdr, uint32_t *ifindex)
{
	struct ifinfomsg *info;

	switch (hdr->nlmsg_type) {
	case RTM_NEWLINK:
	case RTM_DELLINK:
	case RTM_GETLINK:
	case RTM_SETLINK:
		/* A truncated message has no ifi_index to read. */
		if (hdr->nlmsg_len < NLMSG_LENGTH(sizeof(*info)))
			return -1;
		info = NLMSG_DATA(hdr);
		*ifindex = info->ifi_index;
		return 0;
	}
	return -1;
}
1896 
1897 /**
1898  * Read pending events from a Netlink socket.
1899  *
1900  * @param nlsk_fd
1901  *  Netlink socket.
1902  * @param cb
1903  *  Callback invoked for each of the events.
1904  * @param cb_arg
1905  *  User data for the callback.
1906  *
1907  * @return
1908  *  0 on success, including the case when there are no events.
1909  *  Negative on failure and rte_errno is set.
1910  */
1911 int
1912 mlx5_nl_read_events(int nlsk_fd, mlx5_nl_event_cb *cb, void *cb_arg)
1913 {
1914 	char buf[8192];
1915 	struct sockaddr_nl addr;
1916 	struct iovec iov = {
1917 		.iov_base = buf,
1918 		.iov_len = sizeof(buf),
1919 	};
1920 	struct msghdr msg = {
1921 		.msg_name = &addr,
1922 		.msg_namelen = sizeof(addr),
1923 		.msg_iov = &iov,
1924 		.msg_iovlen = 1,
1925 	};
1926 	struct nlmsghdr *hdr;
1927 	ssize_t size;
1928 
1929 	while (1) {
1930 		size = recvmsg(nlsk_fd, &msg, MSG_DONTWAIT);
1931 		if (size < 0) {
1932 			if (errno == EAGAIN)
1933 				return 0;
1934 			if (errno == EINTR)
1935 				continue;
1936 			DRV_LOG(DEBUG, "Failed to receive netlink message: %s",
1937 				strerror(errno));
1938 			rte_errno = errno;
1939 			return -rte_errno;
1940 		}
1941 		hdr = (struct nlmsghdr *)buf;
1942 		while (size >= (ssize_t)sizeof(*hdr)) {
1943 			ssize_t msg_len = hdr->nlmsg_len;
1944 			ssize_t data_len = msg_len - sizeof(*hdr);
1945 			ssize_t aligned_len;
1946 
1947 			if (data_len < 0) {
1948 				DRV_LOG(DEBUG, "Netlink message too short");
1949 				rte_errno = EINVAL;
1950 				return -rte_errno;
1951 			}
1952 			aligned_len = NLMSG_ALIGN(msg_len);
1953 			if (aligned_len > size) {
1954 				DRV_LOG(DEBUG, "Netlink message too long");
1955 				rte_errno = EINVAL;
1956 				return -rte_errno;
1957 			}
1958 			cb(hdr, cb_arg);
1959 			hdr = RTE_PTR_ADD(hdr, aligned_len);
1960 			size -= aligned_len;
1961 		}
1962 	}
1963 	return 0;
1964 }
1965 
1966 static int
1967 mlx5_nl_esw_multiport_cb(struct nlmsghdr *nh, void *arg)
1968 {
1969 
1970 	int ret = -EINVAL;
1971 	int *enable = arg;
1972 	struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1973 	struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1974 					NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1975 
1976 	while (nla->nla_len && nla < tail) {
1977 		switch (nla->nla_type) {
1978 		/* Expected nested attributes case. */
1979 		case DEVLINK_ATTR_PARAM:
1980 		case DEVLINK_ATTR_PARAM_VALUES_LIST:
1981 		case DEVLINK_ATTR_PARAM_VALUE:
1982 			ret = 0;
1983 			nla += 1;
1984 			break;
1985 		case DEVLINK_ATTR_PARAM_VALUE_DATA:
1986 			*enable = 1;
1987 			return 0;
1988 		default:
1989 			nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len));
1990 		}
1991 	}
1992 	*enable = 0;
1993 	return ret;
1994 }
1995 
1996 #define NL_ESW_MULTIPORT_PARAM "esw_multiport"
1997 
1998 int
1999 mlx5_nl_devlink_esw_multiport_get(int nlsk_fd, int family_id, const char *pci_addr, int *enable)
2000 {
2001 	struct nlmsghdr *nlh;
2002 	struct genlmsghdr *genl;
2003 	uint32_t sn = MLX5_NL_SN_GENERATE;
2004 	int ret;
2005 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
2006 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
2007 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
2008 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
2009 
2010 	memset(buf, 0, sizeof(buf));
2011 	nlh = (struct nlmsghdr *)buf;
2012 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
2013 	nlh->nlmsg_type = family_id;
2014 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
2015 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
2016 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
2017 	genl->cmd = DEVLINK_CMD_PARAM_GET;
2018 	genl->version = DEVLINK_GENL_VERSION;
2019 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
2020 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
2021 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME,
2022 		    NL_ESW_MULTIPORT_PARAM, sizeof(NL_ESW_MULTIPORT_PARAM));
2023 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
2024 	if (ret >= 0)
2025 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_esw_multiport_cb, enable);
2026 	if (ret < 0) {
2027 		DRV_LOG(DEBUG, "Failed to get Multiport E-Switch enable on device %s: %d.",
2028 			pci_addr, ret);
2029 		return ret;
2030 	}
2031 	DRV_LOG(DEBUG, "Multiport E-Switch is %sabled for device \"%s\".",
2032 		*enable ? "en" : "dis", pci_addr);
2033 	return ret;
2034 }
2035