xref: /dpdk/drivers/common/mlx5/linux/mlx5_nl.c (revision 68a03efeed657e6e05f281479b33b51102797e15)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5 
6 #include <errno.h>
7 #include <linux/if_link.h>
8 #include <linux/rtnetlink.h>
9 #include <linux/genetlink.h>
10 #include <net/if.h>
11 #include <rdma/rdma_netlink.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <stdalign.h>
16 #include <string.h>
17 #include <sys/socket.h>
18 #include <unistd.h>
19 
20 #include <rte_errno.h>
21 
22 #include "mlx5_nl.h"
23 #include "mlx5_common_utils.h"
24 #include "mlx5_malloc.h"
25 #ifdef HAVE_DEVLINK
26 #include <linux/devlink.h>
27 #endif
28 
29 
30 /* Size of the buffer to receive kernel messages */
31 #define MLX5_NL_BUF_SIZE (32 * 1024)
32 /* Send buffer size for the Netlink socket */
33 #define MLX5_SEND_BUF_SIZE 32768
34 /* Receive buffer size for the Netlink socket */
35 #define MLX5_RECV_BUF_SIZE 32768
36 
37 /** Parameters of VLAN devices created by driver. */
38 #define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
39 /*
40  * Define NDA_RTA as defined in iproute2 sources.
41  *
42  * see in iproute2 sources file include/libnetlink.h
43  */
44 #ifndef MLX5_NDA_RTA
45 #define MLX5_NDA_RTA(r) \
46 	((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
47 #endif
48 /*
49  * Define NLMSG_TAIL as defined in iproute2 sources.
50  *
51  * see in iproute2 sources file include/libnetlink.h
52  */
53 #ifndef NLMSG_TAIL
54 #define NLMSG_TAIL(nmsg) \
55 	((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
56 #endif
57 /*
58  * The following definitions are normally found in rdma/rdma_netlink.h,
59  * however they are so recent that most systems do not expose them yet.
60  */
61 #ifndef HAVE_RDMA_NL_NLDEV
62 #define RDMA_NL_NLDEV 5
63 #endif
64 #ifndef HAVE_RDMA_NLDEV_CMD_GET
65 #define RDMA_NLDEV_CMD_GET 1
66 #endif
67 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
68 #define RDMA_NLDEV_CMD_PORT_GET 5
69 #endif
70 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
71 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
72 #endif
73 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
74 #define RDMA_NLDEV_ATTR_DEV_NAME 2
75 #endif
76 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
77 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
78 #endif
79 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
80 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
81 #endif
82 
83 /* These are normally found in linux/if_link.h. */
84 #ifndef HAVE_IFLA_NUM_VF
85 #define IFLA_NUM_VF 21
86 #endif
87 #ifndef HAVE_IFLA_EXT_MASK
88 #define IFLA_EXT_MASK 29
89 #endif
90 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
91 #define IFLA_PHYS_SWITCH_ID 36
92 #endif
93 #ifndef HAVE_IFLA_PHYS_PORT_NAME
94 #define IFLA_PHYS_PORT_NAME 38
95 #endif
96 
97 /*
98  * Some Devlink defines may be missed in old kernel versions,
99  * adjust used defines.
100  */
101 #ifndef DEVLINK_GENL_NAME
102 #define DEVLINK_GENL_NAME "devlink"
103 #endif
104 #ifndef DEVLINK_GENL_VERSION
105 #define DEVLINK_GENL_VERSION 1
106 #endif
107 #ifndef DEVLINK_ATTR_BUS_NAME
108 #define DEVLINK_ATTR_BUS_NAME 1
109 #endif
110 #ifndef DEVLINK_ATTR_DEV_NAME
111 #define DEVLINK_ATTR_DEV_NAME 2
112 #endif
113 #ifndef DEVLINK_ATTR_PARAM
114 #define DEVLINK_ATTR_PARAM 80
115 #endif
116 #ifndef DEVLINK_ATTR_PARAM_NAME
117 #define DEVLINK_ATTR_PARAM_NAME 81
118 #endif
119 #ifndef DEVLINK_ATTR_PARAM_TYPE
120 #define DEVLINK_ATTR_PARAM_TYPE 83
121 #endif
122 #ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
123 #define DEVLINK_ATTR_PARAM_VALUES_LIST 84
124 #endif
125 #ifndef DEVLINK_ATTR_PARAM_VALUE
126 #define DEVLINK_ATTR_PARAM_VALUE 85
127 #endif
128 #ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
129 #define DEVLINK_ATTR_PARAM_VALUE_DATA 86
130 #endif
131 #ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
132 #define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
133 #endif
134 #ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
135 #define DEVLINK_PARAM_CMODE_DRIVERINIT 1
136 #endif
137 #ifndef DEVLINK_CMD_RELOAD
138 #define DEVLINK_CMD_RELOAD 37
139 #endif
140 #ifndef DEVLINK_CMD_PARAM_GET
141 #define DEVLINK_CMD_PARAM_GET 38
142 #endif
143 #ifndef DEVLINK_CMD_PARAM_SET
144 #define DEVLINK_CMD_PARAM_SET 39
145 #endif
146 #ifndef NLA_FLAG
147 #define NLA_FLAG 6
148 #endif
149 
150 /* Add/remove MAC address through Netlink */
151 struct mlx5_nl_mac_addr {
152 	struct rte_ether_addr (*mac)[];
153 	/**< MAC address handled by the device. */
154 	int mac_n; /**< Number of addresses in the array. */
155 };
156 
/* Flags recording which attributes mlx5_nl_cmdget_cb() found in a message. */
#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)

/** Query context shared with mlx5_nl_cmdget_cb(). */
struct mlx5_nl_ifindex_data {
	/** IB device name to look for (in). */
	const char *name;
	/** MLX5_NL_CMD_GET_* bits of the attributes found (out). */
	uint32_t flags;
	/** IB device index (out). */
	uint32_t ibindex;
	/** Network interface index (out). */
	uint32_t ifindex;
	/** IB device max port number (out). */
	uint32_t portnum;
};
170 
/* Counter backing the Netlink sequence numbers generated in this file. */
uint32_t atomic_sn;

/*
 * Generate Netlink sequence number: atomically increment the counter
 * (relaxed ordering) and yield the incremented value.
 */
#define MLX5_NL_SN_GENERATE \
	__atomic_add_fetch(&atomic_sn, 1, __ATOMIC_RELAXED)
175 
176 /**
177  * Opens a Netlink socket.
178  *
179  * @param protocol
180  *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
181  *
182  * @return
183  *   A file descriptor on success, a negative errno value otherwise and
184  *   rte_errno is set.
185  */
186 int
187 mlx5_nl_init(int protocol)
188 {
189 	int fd;
190 	int sndbuf_size = MLX5_SEND_BUF_SIZE;
191 	int rcvbuf_size = MLX5_RECV_BUF_SIZE;
192 	struct sockaddr_nl local = {
193 		.nl_family = AF_NETLINK,
194 	};
195 	int ret;
196 
197 	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
198 	if (fd == -1) {
199 		rte_errno = errno;
200 		return -rte_errno;
201 	}
202 	ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
203 	if (ret == -1) {
204 		rte_errno = errno;
205 		goto error;
206 	}
207 	ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
208 	if (ret == -1) {
209 		rte_errno = errno;
210 		goto error;
211 	}
212 	ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
213 	if (ret == -1) {
214 		rte_errno = errno;
215 		goto error;
216 	}
217 	return fd;
218 error:
219 	close(fd);
220 	return -rte_errno;
221 }
222 
223 /**
224  * Send a request message to the kernel on the Netlink socket.
225  *
226  * @param[in] nlsk_fd
227  *   Netlink socket file descriptor.
228  * @param[in] nh
229  *   The Netlink message send to the kernel.
230  * @param[in] ssn
231  *   Sequence number.
232  * @param[in] req
233  *   Pointer to the request structure.
234  * @param[in] len
235  *   Length of the request in bytes.
236  *
237  * @return
238  *   The number of sent bytes on success, a negative errno value otherwise and
239  *   rte_errno is set.
240  */
241 static int
242 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
243 		int len)
244 {
245 	struct sockaddr_nl sa = {
246 		.nl_family = AF_NETLINK,
247 	};
248 	struct iovec iov[2] = {
249 		{ .iov_base = nh, .iov_len = sizeof(*nh), },
250 		{ .iov_base = req, .iov_len = len, },
251 	};
252 	struct msghdr msg = {
253 		.msg_name = &sa,
254 		.msg_namelen = sizeof(sa),
255 		.msg_iov = iov,
256 		.msg_iovlen = 2,
257 	};
258 	int send_bytes;
259 
260 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
261 	nh->nlmsg_seq = sn;
262 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
263 	if (send_bytes < 0) {
264 		rte_errno = errno;
265 		return -rte_errno;
266 	}
267 	return send_bytes;
268 }
269 
270 /**
271  * Send a message to the kernel on the Netlink socket.
272  *
273  * @param[in] nlsk_fd
274  *   The Netlink socket file descriptor used for communication.
275  * @param[in] nh
276  *   The Netlink message send to the kernel.
277  * @param[in] sn
278  *   Sequence number.
279  *
280  * @return
281  *   The number of sent bytes on success, a negative errno value otherwise and
282  *   rte_errno is set.
283  */
284 static int
285 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
286 {
287 	struct sockaddr_nl sa = {
288 		.nl_family = AF_NETLINK,
289 	};
290 	struct iovec iov = {
291 		.iov_base = nh,
292 		.iov_len = nh->nlmsg_len,
293 	};
294 	struct msghdr msg = {
295 		.msg_name = &sa,
296 		.msg_namelen = sizeof(sa),
297 		.msg_iov = &iov,
298 		.msg_iovlen = 1,
299 	};
300 	int send_bytes;
301 
302 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
303 	nh->nlmsg_seq = sn;
304 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
305 	if (send_bytes < 0) {
306 		rte_errno = errno;
307 		return -rte_errno;
308 	}
309 	return send_bytes;
310 }
311 
312 /**
313  * Receive a message from the kernel on the Netlink socket, following
314  * mlx5_nl_send().
315  *
316  * @param[in] nlsk_fd
317  *   The Netlink socket file descriptor used for communication.
318  * @param[in] sn
319  *   Sequence number.
320  * @param[in] cb
321  *   The callback function to call for each Netlink message received.
322  * @param[in, out] arg
323  *   Custom arguments for the callback.
324  *
325  * @return
326  *   0 on success, a negative errno value otherwise and rte_errno is set.
327  */
328 static int
329 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
330 	     void *arg)
331 {
332 	struct sockaddr_nl sa;
333 	void *buf = mlx5_malloc(0, MLX5_RECV_BUF_SIZE, 0, SOCKET_ID_ANY);
334 	struct iovec iov = {
335 		.iov_base = buf,
336 		.iov_len = MLX5_RECV_BUF_SIZE,
337 	};
338 	struct msghdr msg = {
339 		.msg_name = &sa,
340 		.msg_namelen = sizeof(sa),
341 		.msg_iov = &iov,
342 		/* One message at a time */
343 		.msg_iovlen = 1,
344 	};
345 	int multipart = 0;
346 	int ret = 0;
347 
348 	if (!buf) {
349 		rte_errno = ENOMEM;
350 		return -rte_errno;
351 	}
352 	do {
353 		struct nlmsghdr *nh;
354 		int recv_bytes = 0;
355 
356 		do {
357 			recv_bytes = recvmsg(nlsk_fd, &msg, 0);
358 			if (recv_bytes == -1) {
359 				rte_errno = errno;
360 				ret = -rte_errno;
361 				goto exit;
362 			}
363 			nh = (struct nlmsghdr *)buf;
364 		} while (nh->nlmsg_seq != sn);
365 		for (;
366 		     NLMSG_OK(nh, (unsigned int)recv_bytes);
367 		     nh = NLMSG_NEXT(nh, recv_bytes)) {
368 			if (nh->nlmsg_type == NLMSG_ERROR) {
369 				struct nlmsgerr *err_data = NLMSG_DATA(nh);
370 
371 				if (err_data->error < 0) {
372 					rte_errno = -err_data->error;
373 					ret = -rte_errno;
374 					goto exit;
375 				}
376 				/* Ack message. */
377 				ret = 0;
378 				goto exit;
379 			}
380 			/* Multi-part msgs and their trailing DONE message. */
381 			if (nh->nlmsg_flags & NLM_F_MULTI) {
382 				if (nh->nlmsg_type == NLMSG_DONE) {
383 					ret =  0;
384 					goto exit;
385 				}
386 				multipart = 1;
387 			}
388 			if (cb) {
389 				ret = cb(nh, arg);
390 				if (ret < 0)
391 					goto exit;
392 			}
393 		}
394 	} while (multipart);
395 exit:
396 	mlx5_free(buf);
397 	return ret;
398 }
399 
400 /**
401  * Parse Netlink message to retrieve the bridge MAC address.
402  *
403  * @param nh
404  *   Pointer to Netlink Message Header.
405  * @param arg
406  *   PMD data register with this callback.
407  *
408  * @return
409  *   0 on success, a negative errno value otherwise and rte_errno is set.
410  */
411 static int
412 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
413 {
414 	struct mlx5_nl_mac_addr *data = arg;
415 	struct ndmsg *r = NLMSG_DATA(nh);
416 	struct rtattr *attribute;
417 	int len;
418 
419 	len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
420 	for (attribute = MLX5_NDA_RTA(r);
421 	     RTA_OK(attribute, len);
422 	     attribute = RTA_NEXT(attribute, len)) {
423 		if (attribute->rta_type == NDA_LLADDR) {
424 			if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
425 				DRV_LOG(WARNING,
426 					"not enough room to finalize the"
427 					" request");
428 				rte_errno = ENOMEM;
429 				return -rte_errno;
430 			}
431 #ifdef RTE_LIBRTE_MLX5_DEBUG
432 			char m[RTE_ETHER_ADDR_FMT_SIZE];
433 
434 			rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE,
435 					      RTA_DATA(attribute));
436 			DRV_LOG(DEBUG, "bridge MAC address %s", m);
437 #endif
438 			memcpy(&(*data->mac)[data->mac_n++],
439 			       RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
440 		}
441 	}
442 	return 0;
443 }
444 
445 /**
446  * Get bridge MAC addresses.
447  *
448  * @param[in] nlsk_fd
449  *   Netlink socket file descriptor.
450  * @param[in] iface_idx
451  *   Net device interface index.
452  * @param mac[out]
453  *   Pointer to the array table of MAC addresses to fill.
454  *   Its size should be of MLX5_MAX_MAC_ADDRESSES.
455  * @param mac_n[out]
456  *   Number of entries filled in MAC array.
457  *
458  * @return
459  *   0 on success, a negative errno value otherwise and rte_errno is set.
460  */
461 static int
462 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
463 		      struct rte_ether_addr (*mac)[], int *mac_n)
464 {
465 	struct {
466 		struct nlmsghdr	hdr;
467 		struct ifinfomsg ifm;
468 	} req = {
469 		.hdr = {
470 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
471 			.nlmsg_type = RTM_GETNEIGH,
472 			.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
473 		},
474 		.ifm = {
475 			.ifi_family = PF_BRIDGE,
476 			.ifi_index = iface_idx,
477 		},
478 	};
479 	struct mlx5_nl_mac_addr data = {
480 		.mac = mac,
481 		.mac_n = 0,
482 	};
483 	uint32_t sn = MLX5_NL_SN_GENERATE;
484 	int ret;
485 
486 	if (nlsk_fd == -1)
487 		return 0;
488 	ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
489 			      sizeof(struct ifinfomsg));
490 	if (ret < 0)
491 		goto error;
492 	ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
493 	if (ret < 0)
494 		goto error;
495 	*mac_n = data.mac_n;
496 	return 0;
497 error:
498 	DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
499 		iface_idx, strerror(rte_errno));
500 	return -rte_errno;
501 }
502 
503 /**
504  * Modify the MAC address neighbour table with Netlink.
505  *
506  * @param[in] nlsk_fd
507  *   Netlink socket file descriptor.
508  * @param[in] iface_idx
509  *   Net device interface index.
510  * @param mac
511  *   MAC address to consider.
512  * @param add
513  *   1 to add the MAC address, 0 to remove the MAC address.
514  *
515  * @return
516  *   0 on success, a negative errno value otherwise and rte_errno is set.
517  */
518 static int
519 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
520 			struct rte_ether_addr *mac, int add)
521 {
522 	struct {
523 		struct nlmsghdr hdr;
524 		struct ndmsg ndm;
525 		struct rtattr rta;
526 		uint8_t buffer[RTE_ETHER_ADDR_LEN];
527 	} req = {
528 		.hdr = {
529 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
530 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
531 				NLM_F_EXCL | NLM_F_ACK,
532 			.nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
533 		},
534 		.ndm = {
535 			.ndm_family = PF_BRIDGE,
536 			.ndm_state = NUD_NOARP | NUD_PERMANENT,
537 			.ndm_ifindex = iface_idx,
538 			.ndm_flags = NTF_SELF,
539 		},
540 		.rta = {
541 			.rta_type = NDA_LLADDR,
542 			.rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
543 		},
544 	};
545 	uint32_t sn = MLX5_NL_SN_GENERATE;
546 	int ret;
547 
548 	if (nlsk_fd == -1)
549 		return 0;
550 	memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
551 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
552 		RTA_ALIGN(req.rta.rta_len);
553 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
554 	if (ret < 0)
555 		goto error;
556 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
557 	if (ret < 0)
558 		goto error;
559 	return 0;
560 error:
561 #ifdef RTE_LIBRTE_MLX5_DEBUG
562 	{
563 		char m[RTE_ETHER_ADDR_FMT_SIZE];
564 
565 		rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac);
566 		DRV_LOG(DEBUG,
567 			"Interface %u cannot %s MAC address %s %s",
568 			iface_idx,
569 			add ? "add" : "remove", m, strerror(rte_errno));
570 	}
571 #endif
572 	return -rte_errno;
573 }
574 
575 /**
576  * Modify the VF MAC address neighbour table with Netlink.
577  *
578  * @param[in] nlsk_fd
579  *   Netlink socket file descriptor.
580  * @param[in] iface_idx
581  *   Net device interface index.
582  * @param mac
583  *    MAC address to consider.
584  * @param vf_index
585  *    VF index.
586  *
587  * @return
588  *    0 on success, a negative errno value otherwise and rte_errno is set.
589  */
590 int
591 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
592 			   struct rte_ether_addr *mac, int vf_index)
593 {
594 	int ret;
595 	struct {
596 		struct nlmsghdr hdr;
597 		struct ifinfomsg ifm;
598 		struct rtattr vf_list_rta;
599 		struct rtattr vf_info_rta;
600 		struct rtattr vf_mac_rta;
601 		struct ifla_vf_mac ivm;
602 	} req = {
603 		.hdr = {
604 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
605 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
606 			.nlmsg_type = RTM_BASE,
607 		},
608 		.ifm = {
609 			.ifi_index = iface_idx,
610 		},
611 		.vf_list_rta = {
612 			.rta_type = IFLA_VFINFO_LIST,
613 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
614 		},
615 		.vf_info_rta = {
616 			.rta_type = IFLA_VF_INFO,
617 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
618 		},
619 		.vf_mac_rta = {
620 			.rta_type = IFLA_VF_MAC,
621 		},
622 	};
623 	struct ifla_vf_mac ivm = {
624 		.vf = vf_index,
625 	};
626 	uint32_t sn = MLX5_NL_SN_GENERATE;
627 
628 	memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
629 	memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
630 
631 	req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
632 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
633 		RTA_ALIGN(req.vf_list_rta.rta_len) +
634 		RTA_ALIGN(req.vf_info_rta.rta_len) +
635 		RTA_ALIGN(req.vf_mac_rta.rta_len);
636 	req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
637 					       &req.vf_list_rta);
638 	req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
639 					       &req.vf_info_rta);
640 
641 	if (nlsk_fd < 0)
642 		return -1;
643 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
644 	if (ret < 0)
645 		goto error;
646 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
647 	if (ret < 0)
648 		goto error;
649 	return 0;
650 error:
651 	DRV_LOG(ERR,
652 		"representor %u cannot set VF MAC address "
653 		"%02X:%02X:%02X:%02X:%02X:%02X : %s",
654 		vf_index,
655 		mac->addr_bytes[0], mac->addr_bytes[1],
656 		mac->addr_bytes[2], mac->addr_bytes[3],
657 		mac->addr_bytes[4], mac->addr_bytes[5],
658 		strerror(rte_errno));
659 	return -rte_errno;
660 }
661 
662 /**
663  * Add a MAC address.
664  *
665  * @param[in] nlsk_fd
666  *   Netlink socket file descriptor.
667  * @param[in] iface_idx
668  *   Net device interface index.
669  * @param mac_own
670  *   BITFIELD_DECLARE array to store the mac.
671  * @param mac
672  *   MAC address to register.
673  * @param index
674  *   MAC address index.
675  *
676  * @return
677  *   0 on success, a negative errno value otherwise and rte_errno is set.
678  */
679 int
680 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
681 		     uint64_t *mac_own, struct rte_ether_addr *mac,
682 		     uint32_t index)
683 {
684 	int ret;
685 
686 	ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
687 	if (!ret) {
688 		MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
689 		if (index >= MLX5_MAX_MAC_ADDRESSES)
690 			return -EINVAL;
691 
692 		BITFIELD_SET(mac_own, index);
693 	}
694 	if (ret == -EEXIST)
695 		return 0;
696 	return ret;
697 }
698 
699 /**
700  * Remove a MAC address.
701  *
702  * @param[in] nlsk_fd
703  *   Netlink socket file descriptor.
704  * @param[in] iface_idx
705  *   Net device interface index.
706  * @param mac_own
707  *   BITFIELD_DECLARE array to store the mac.
708  * @param mac
709  *   MAC address to remove.
710  * @param index
711  *   MAC address index.
712  *
713  * @return
714  *   0 on success, a negative errno value otherwise and rte_errno is set.
715  */
716 int
717 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
718 			struct rte_ether_addr *mac, uint32_t index)
719 {
720 	MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
721 	if (index >= MLX5_MAX_MAC_ADDRESSES)
722 		return -EINVAL;
723 
724 	BITFIELD_RESET(mac_own, index);
725 	return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
726 }
727 
728 /**
729  * Synchronize Netlink bridge table to the internal table.
730  *
731  * @param[in] nlsk_fd
732  *   Netlink socket file descriptor.
733  * @param[in] iface_idx
734  *   Net device interface index.
735  * @param mac_addrs
736  *   Mac addresses array to sync.
737  * @param n
738  *   @p mac_addrs array size.
739  */
740 void
741 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
742 		      struct rte_ether_addr *mac_addrs, int n)
743 {
744 	struct rte_ether_addr macs[n];
745 	int macs_n = 0;
746 	int i;
747 	int ret;
748 
749 	memset(macs, 0, n * sizeof(macs[0]));
750 	ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
751 	if (ret)
752 		return;
753 	for (i = 0; i != macs_n; ++i) {
754 		int j;
755 
756 		/* Verify the address is not in the array yet. */
757 		for (j = 0; j != n; ++j)
758 			if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
759 				break;
760 		if (j != n)
761 			continue;
762 		if (rte_is_multicast_ether_addr(&macs[i])) {
763 			/* Find the first entry available. */
764 			for (j = MLX5_MAX_UC_MAC_ADDRESSES; j != n; ++j) {
765 				if (rte_is_zero_ether_addr(&mac_addrs[j])) {
766 					mac_addrs[j] = macs[i];
767 					break;
768 				}
769 			}
770 		} else {
771 			/* Find the first entry available. */
772 			for (j = 0; j != MLX5_MAX_UC_MAC_ADDRESSES; ++j) {
773 				if (rte_is_zero_ether_addr(&mac_addrs[j])) {
774 					mac_addrs[j] = macs[i];
775 					break;
776 				}
777 			}
778 		}
779 	}
780 }
781 
782 /**
783  * Flush all added MAC addresses.
784  *
785  * @param[in] nlsk_fd
786  *   Netlink socket file descriptor.
787  * @param[in] iface_idx
788  *   Net device interface index.
789  * @param[in] mac_addrs
790  *   Mac addresses array to flush.
791  * @param n
792  *   @p mac_addrs array size.
793  * @param mac_own
794  *   BITFIELD_DECLARE array to store the mac.
795  */
796 void
797 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
798 		       struct rte_ether_addr *mac_addrs, int n,
799 		       uint64_t *mac_own)
800 {
801 	int i;
802 
803 	if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES)
804 		return;
805 
806 	for (i = n - 1; i >= 0; --i) {
807 		struct rte_ether_addr *m = &mac_addrs[i];
808 
809 		if (BITFIELD_ISSET(mac_own, i))
810 			mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
811 						i);
812 	}
813 }
814 
815 /**
816  * Enable promiscuous / all multicast mode through Netlink.
817  *
818  * @param[in] nlsk_fd
819  *   Netlink socket file descriptor.
820  * @param[in] iface_idx
821  *   Net device interface index.
822  * @param flags
823  *   IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
824  * @param enable
825  *   Nonzero to enable, disable otherwise.
826  *
827  * @return
828  *   0 on success, a negative errno value otherwise and rte_errno is set.
829  */
830 static int
831 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
832 		     int enable)
833 {
834 	struct {
835 		struct nlmsghdr hdr;
836 		struct ifinfomsg ifi;
837 	} req = {
838 		.hdr = {
839 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
840 			.nlmsg_type = RTM_NEWLINK,
841 			.nlmsg_flags = NLM_F_REQUEST,
842 		},
843 		.ifi = {
844 			.ifi_flags = enable ? flags : 0,
845 			.ifi_change = flags,
846 			.ifi_index = iface_idx,
847 		},
848 	};
849 	uint32_t sn = MLX5_NL_SN_GENERATE;
850 	int ret;
851 
852 	MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
853 	if (nlsk_fd < 0)
854 		return 0;
855 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
856 	if (ret < 0)
857 		return ret;
858 	return 0;
859 }
860 
861 /**
862  * Enable promiscuous mode through Netlink.
863  *
864  * @param[in] nlsk_fd
865  *   Netlink socket file descriptor.
866  * @param[in] iface_idx
867  *   Net device interface index.
868  * @param enable
869  *   Nonzero to enable, disable otherwise.
870  *
871  * @return
872  *   0 on success, a negative errno value otherwise and rte_errno is set.
873  */
874 int
875 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
876 {
877 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
878 
879 	if (ret)
880 		DRV_LOG(DEBUG,
881 			"Interface %u cannot %s promisc mode: Netlink error %s",
882 			iface_idx, enable ? "enable" : "disable",
883 			strerror(rte_errno));
884 	return ret;
885 }
886 
887 /**
888  * Enable all multicast mode through Netlink.
889  *
890  * @param[in] nlsk_fd
891  *   Netlink socket file descriptor.
892  * @param[in] iface_idx
893  *   Net device interface index.
894  * @param enable
895  *   Nonzero to enable, disable otherwise.
896  *
897  * @return
898  *   0 on success, a negative errno value otherwise and rte_errno is set.
899  */
900 int
901 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
902 {
903 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
904 				       enable);
905 
906 	if (ret)
907 		DRV_LOG(DEBUG,
908 			"Interface %u cannot %s allmulti : Netlink error %s",
909 			iface_idx, enable ? "enable" : "disable",
910 			strerror(rte_errno));
911 	return ret;
912 }
913 
914 /**
915  * Process network interface information from Netlink message.
916  *
917  * @param nh
918  *   Pointer to Netlink message header.
919  * @param arg
920  *   Opaque data pointer for this callback.
921  *
922  * @return
923  *   0 on success, a negative errno value otherwise and rte_errno is set.
924  */
925 static int
926 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
927 {
928 	struct mlx5_nl_ifindex_data *data = arg;
929 	struct mlx5_nl_ifindex_data local = {
930 		.flags = 0,
931 	};
932 	size_t off = NLMSG_HDRLEN;
933 
934 	if (nh->nlmsg_type !=
935 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
936 	    nh->nlmsg_type !=
937 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
938 		goto error;
939 	while (off < nh->nlmsg_len) {
940 		struct nlattr *na = (void *)((uintptr_t)nh + off);
941 		void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
942 
943 		if (na->nla_len > nh->nlmsg_len - off)
944 			goto error;
945 		switch (na->nla_type) {
946 		case RDMA_NLDEV_ATTR_DEV_INDEX:
947 			local.ibindex = *(uint32_t *)payload;
948 			local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
949 			break;
950 		case RDMA_NLDEV_ATTR_DEV_NAME:
951 			if (!strcmp(payload, data->name))
952 				local.flags |= MLX5_NL_CMD_GET_IB_NAME;
953 			break;
954 		case RDMA_NLDEV_ATTR_NDEV_INDEX:
955 			local.ifindex = *(uint32_t *)payload;
956 			local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
957 			break;
958 		case RDMA_NLDEV_ATTR_PORT_INDEX:
959 			local.portnum = *(uint32_t *)payload;
960 			local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
961 			break;
962 		default:
963 			break;
964 		}
965 		off += NLA_ALIGN(na->nla_len);
966 	}
967 	/*
968 	 * It is possible to have multiple messages for all
969 	 * Infiniband devices in the system with appropriate name.
970 	 * So we should gather parameters locally and copy to
971 	 * query context only in case of coinciding device name.
972 	 */
973 	if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
974 		data->flags = local.flags;
975 		data->ibindex = local.ibindex;
976 		data->ifindex = local.ifindex;
977 		data->portnum = local.portnum;
978 	}
979 	return 0;
980 error:
981 	rte_errno = EINVAL;
982 	return -rte_errno;
983 }
984 
985 /**
986  * Get index of network interface associated with some IB device.
987  *
988  * This is the only somewhat safe method to avoid resorting to heuristics
989  * when faced with port representors. Unfortunately it requires at least
990  * Linux 4.17.
991  *
992  * @param nl
993  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
994  * @param[in] name
995  *   IB device name.
996  * @param[in] pindex
997  *   IB device port index, starting from 1
998  * @return
999  *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
1000  *   is set.
1001  */
1002 unsigned int
1003 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
1004 {
1005 	struct mlx5_nl_ifindex_data data = {
1006 		.name = name,
1007 		.flags = 0,
1008 		.ibindex = 0, /* Determined during first pass. */
1009 		.ifindex = 0, /* Determined during second pass. */
1010 	};
1011 	union {
1012 		struct nlmsghdr nh;
1013 		uint8_t buf[NLMSG_HDRLEN +
1014 			    NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
1015 			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
1016 	} req = {
1017 		.nh = {
1018 			.nlmsg_len = NLMSG_LENGTH(0),
1019 			.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1020 						       RDMA_NLDEV_CMD_GET),
1021 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1022 		},
1023 	};
1024 	struct nlattr *na;
1025 	uint32_t sn = MLX5_NL_SN_GENERATE;
1026 	int ret;
1027 
1028 	ret = mlx5_nl_send(nl, &req.nh, sn);
1029 	if (ret < 0)
1030 		return 0;
1031 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1032 	if (ret < 0)
1033 		return 0;
1034 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1035 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
1036 		goto error;
1037 	data.flags = 0;
1038 	sn = MLX5_NL_SN_GENERATE;
1039 	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1040 					     RDMA_NLDEV_CMD_PORT_GET);
1041 	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1042 	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
1043 	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
1044 	na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
1045 	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
1046 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1047 	       &data.ibindex, sizeof(data.ibindex));
1048 	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
1049 	na->nla_len = NLA_HDRLEN + sizeof(pindex);
1050 	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
1051 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1052 	       &pindex, sizeof(pindex));
1053 	ret = mlx5_nl_send(nl, &req.nh, sn);
1054 	if (ret < 0)
1055 		return 0;
1056 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1057 	if (ret < 0)
1058 		return 0;
1059 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1060 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1061 	    !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
1062 	    !data.ifindex)
1063 		goto error;
1064 	return data.ifindex;
1065 error:
1066 	rte_errno = ENODEV;
1067 	return 0;
1068 }
1069 
1070 /**
1071  * Get the number of physical ports of given IB device.
1072  *
1073  * @param nl
1074  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1075  * @param[in] name
1076  *   IB device name.
1077  *
1078  * @return
1079  *   A valid (nonzero) number of ports on success, 0 otherwise
1080  *   and rte_errno is set.
1081  */
1082 unsigned int
1083 mlx5_nl_portnum(int nl, const char *name)
1084 {
1085 	struct mlx5_nl_ifindex_data data = {
1086 		.flags = 0,
1087 		.name = name,
1088 		.ifindex = 0,
1089 		.portnum = 0,
1090 	};
1091 	struct nlmsghdr req = {
1092 		.nlmsg_len = NLMSG_LENGTH(0),
1093 		.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1094 					       RDMA_NLDEV_CMD_GET),
1095 		.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1096 	};
1097 	uint32_t sn = MLX5_NL_SN_GENERATE;
1098 	int ret;
1099 
1100 	ret = mlx5_nl_send(nl, &req, sn);
1101 	if (ret < 0)
1102 		return 0;
1103 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1104 	if (ret < 0)
1105 		return 0;
1106 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1107 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1108 	    !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1109 		rte_errno = ENODEV;
1110 		return 0;
1111 	}
1112 	if (!data.portnum)
1113 		rte_errno = EINVAL;
1114 	return data.portnum;
1115 }
1116 
1117 /**
1118  * Analyze gathered port parameters via Netlink to recognize master
1119  * and representor devices for E-Switch configuration.
1120  *
1121  * @param[in] num_vf_set
1122  *   flag of presence of number of VFs port attribute.
1123  * @param[inout] switch_info
1124  *   Port information, including port name as a number and port name
1125  *   type if recognized
1126  *
1127  * @return
1128  *   master and representor flags are set in switch_info according to
1129  *   recognized parameters (if any).
1130  */
1131 static void
1132 mlx5_nl_check_switch_info(bool num_vf_set,
1133 			  struct mlx5_switch_info *switch_info)
1134 {
1135 	switch (switch_info->name_type) {
1136 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1137 		/*
1138 		 * Name is not recognized, assume the master,
1139 		 * check the number of VFs key presence.
1140 		 */
1141 		switch_info->master = num_vf_set;
1142 		break;
1143 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1144 		/*
1145 		 * Name is not set, this assumes the legacy naming
1146 		 * schema for master, just check if there is a
1147 		 * number of VFs key.
1148 		 */
1149 		switch_info->master = num_vf_set;
1150 		break;
1151 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1152 		/* New uplink naming schema recognized. */
1153 		switch_info->master = 1;
1154 		break;
1155 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1156 		/* Legacy representors naming schema. */
1157 		switch_info->representor = !num_vf_set;
1158 		break;
1159 	case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1160 		/* Fallthrough */
1161 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1162 		/* Fallthrough */
1163 	case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
1164 		/* New representors naming schema. */
1165 		switch_info->representor = 1;
1166 		break;
1167 	}
1168 }
1169 
1170 /**
1171  * Process switch information from Netlink message.
1172  *
1173  * @param nh
1174  *   Pointer to Netlink message header.
1175  * @param arg
1176  *   Opaque data pointer for this callback.
1177  *
1178  * @return
1179  *   0 on success, a negative errno value otherwise and rte_errno is set.
1180  */
1181 static int
1182 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1183 {
1184 	struct mlx5_switch_info info = {
1185 		.master = 0,
1186 		.representor = 0,
1187 		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1188 		.port_name = 0,
1189 		.switch_id = 0,
1190 	};
1191 	size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1192 	bool switch_id_set = false;
1193 	bool num_vf_set = false;
1194 
1195 	if (nh->nlmsg_type != RTM_NEWLINK)
1196 		goto error;
1197 	while (off < nh->nlmsg_len) {
1198 		struct rtattr *ra = (void *)((uintptr_t)nh + off);
1199 		void *payload = RTA_DATA(ra);
1200 		unsigned int i;
1201 
1202 		if (ra->rta_len > nh->nlmsg_len - off)
1203 			goto error;
1204 		switch (ra->rta_type) {
1205 		case IFLA_NUM_VF:
1206 			num_vf_set = true;
1207 			break;
1208 		case IFLA_PHYS_PORT_NAME:
1209 			mlx5_translate_port_name((char *)payload, &info);
1210 			break;
1211 		case IFLA_PHYS_SWITCH_ID:
1212 			info.switch_id = 0;
1213 			for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1214 				info.switch_id <<= 8;
1215 				info.switch_id |= ((uint8_t *)payload)[i];
1216 			}
1217 			switch_id_set = true;
1218 			break;
1219 		}
1220 		off += RTA_ALIGN(ra->rta_len);
1221 	}
1222 	if (switch_id_set) {
1223 		/* We have some E-Switch configuration. */
1224 		mlx5_nl_check_switch_info(num_vf_set, &info);
1225 	}
1226 	MLX5_ASSERT(!(info.master && info.representor));
1227 	memcpy(arg, &info, sizeof(info));
1228 	return 0;
1229 error:
1230 	rte_errno = EINVAL;
1231 	return -rte_errno;
1232 }
1233 
1234 /**
1235  * Get switch information associated with network interface.
1236  *
1237  * @param nl
1238  *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1239  * @param ifindex
1240  *   Network interface index.
1241  * @param[out] info
1242  *   Switch information object, populated in case of success.
1243  *
1244  * @return
1245  *   0 on success, a negative errno value otherwise and rte_errno is set.
1246  */
1247 int
1248 mlx5_nl_switch_info(int nl, unsigned int ifindex,
1249 		    struct mlx5_switch_info *info)
1250 {
1251 	struct {
1252 		struct nlmsghdr nh;
1253 		struct ifinfomsg info;
1254 		struct rtattr rta;
1255 		uint32_t extmask;
1256 	} req = {
1257 		.nh = {
1258 			.nlmsg_len = NLMSG_LENGTH
1259 					(sizeof(req.info) +
1260 					 RTA_LENGTH(sizeof(uint32_t))),
1261 			.nlmsg_type = RTM_GETLINK,
1262 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1263 		},
1264 		.info = {
1265 			.ifi_family = AF_UNSPEC,
1266 			.ifi_index = ifindex,
1267 		},
1268 		.rta = {
1269 			.rta_type = IFLA_EXT_MASK,
1270 			.rta_len = RTA_LENGTH(sizeof(int32_t)),
1271 		},
1272 		.extmask = RTE_LE32(1),
1273 	};
1274 	uint32_t sn = MLX5_NL_SN_GENERATE;
1275 	int ret;
1276 
1277 	ret = mlx5_nl_send(nl, &req.nh, sn);
1278 	if (ret >= 0)
1279 		ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1280 	if (info->master && info->representor) {
1281 		DRV_LOG(ERR, "ifindex %u device is recognized as master"
1282 			     " and as representor", ifindex);
1283 		rte_errno = ENODEV;
1284 		ret = -rte_errno;
1285 	}
1286 	return ret;
1287 }
1288 
1289 /*
1290  * Delete VLAN network device by ifindex.
1291  *
1292  * @param[in] tcf
1293  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1294  * @param[in] ifindex
1295  *   Interface index of network device to delete.
1296  */
1297 void
1298 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1299 		      uint32_t ifindex)
1300 {
1301 	uint32_t sn = MLX5_NL_SN_GENERATE;
1302 	int ret;
1303 	struct {
1304 		struct nlmsghdr nh;
1305 		struct ifinfomsg info;
1306 	} req = {
1307 		.nh = {
1308 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1309 			.nlmsg_type = RTM_DELLINK,
1310 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1311 		},
1312 		.info = {
1313 			.ifi_family = AF_UNSPEC,
1314 			.ifi_index = ifindex,
1315 		},
1316 	};
1317 
1318 	if (ifindex) {
1319 		ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1320 		if (ret >= 0)
1321 			ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1322 		if (ret < 0)
1323 			DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1324 				" ifindex %u, %d", ifindex, ret);
1325 	}
1326 }
1327 
1328 /* Set of subroutines to build Netlink message. */
/*
 * Return the position right after the current content of a Netlink
 * message, i.e. the message start advanced by its length rounded up with
 * NLMSG_ALIGN(). This is where the next attribute is to be written.
 */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	uint8_t *start = (uint8_t *)nlh;
	uint8_t *end = start + NLMSG_ALIGN(nlh->nlmsg_len);

	return (struct nlattr *)end;
}
1335 
/*
 * Append one attribute to a Netlink message.
 *
 * The attribute header is written at the message tail, the message
 * length grows by the aligned attribute length, and @p alen bytes of
 * @p data are copied right behind the header (nothing is copied when
 * @p alen is zero, so @p data may then be NULL). The caller must provide
 * a buffer large enough: no bounds are checked here.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *attr = nl_msg_tail(nlh);
	uint8_t *payload = (uint8_t *)attr + sizeof(*attr);

	attr->nla_len = NLMSG_ALIGN(sizeof(*attr)) + alen;
	attr->nla_type = type;
	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
	if (alen == 0)
		return;
	memcpy(payload, data, alen);
}
1348 
/*
 * Open a nested attribute: an empty attribute of the given type is
 * appended and its address is returned, so that nl_attr_nest_end() can
 * later patch its length.
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	/* Remember where the container header is about to be placed. */
	struct nlattr *const container = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return container;
}
1357 
/*
 * Close a nested attribute: its length becomes the distance from its
 * header to the current message tail, covering everything appended
 * since nl_attr_nest_start().
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	const uint8_t *first = (const uint8_t *)nest;
	const uint8_t *last = (const uint8_t *)nl_msg_tail(nlh);

	nest->nla_len = last - first;
}
1363 
1364 /*
1365  * Create network VLAN device with specified VLAN tag.
1366  *
1367  * @param[in] tcf
1368  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1369  * @param[in] ifindex
1370  *   Base network interface index.
1371  * @param[in] tag
1372  *   VLAN tag for VLAN network device to create.
1373  */
1374 uint32_t
1375 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1376 			 uint32_t ifindex, uint16_t tag)
1377 {
1378 	struct nlmsghdr *nlh;
1379 	struct ifinfomsg *ifm;
1380 	char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1381 
1382 	__rte_cache_aligned
1383 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1384 		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1385 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1386 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1387 		    NLMSG_ALIGN(sizeof(name)) +
1388 		    NLMSG_ALIGN(sizeof("vlan")) +
1389 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1390 		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1391 	struct nlattr *na_info;
1392 	struct nlattr *na_vlan;
1393 	uint32_t sn = MLX5_NL_SN_GENERATE;
1394 	int ret;
1395 
1396 	memset(buf, 0, sizeof(buf));
1397 	nlh = (struct nlmsghdr *)buf;
1398 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1399 	nlh->nlmsg_type = RTM_NEWLINK;
1400 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1401 			   NLM_F_EXCL | NLM_F_ACK;
1402 	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1403 	nlh->nlmsg_len += sizeof(struct ifinfomsg);
1404 	ifm->ifi_family = AF_UNSPEC;
1405 	ifm->ifi_type = 0;
1406 	ifm->ifi_index = 0;
1407 	ifm->ifi_flags = IFF_UP;
1408 	ifm->ifi_change = 0xffffffff;
1409 	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1410 	ret = snprintf(name, sizeof(name), "%s.%u.%u",
1411 		       MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1412 	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1413 	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1414 	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1415 	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1416 	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1417 	nl_attr_nest_end(nlh, na_vlan);
1418 	nl_attr_nest_end(nlh, na_info);
1419 	MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len);
1420 	ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1421 	if (ret >= 0)
1422 		ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1423 	if (ret < 0) {
1424 		DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1425 			ret);
1426 	}
1427 	/* Try to get ifindex of created or pre-existing device. */
1428 	ret = if_nametoindex(name);
1429 	if (!ret) {
1430 		DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
1431 			errno);
1432 		return 0;
1433 	}
1434 	return ret;
1435 }
1436 
1437 /**
1438  * Parse Netlink message to retrieve the general family ID.
1439  *
1440  * @param nh
1441  *   Pointer to Netlink Message Header.
1442  * @param arg
1443  *   PMD data register with this callback.
1444  *
1445  * @return
1446  *   0 on success, a negative errno value otherwise and rte_errno is set.
1447  */
1448 static int
1449 mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
1450 {
1451 
1452 	struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1453 	struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1454 					NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1455 
1456 	for (; nla->nla_len && nla < tail;
1457 	     nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len))) {
1458 		if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
1459 			*(uint16_t *)arg = *(uint16_t *)(nla + 1);
1460 			return 0;
1461 		}
1462 	}
1463 	return -EINVAL;
1464 }
1465 
1466 #define MLX5_NL_MAX_ATTR_SIZE 100
1467 /**
1468  * Get generic netlink family ID.
1469  *
1470  * @param[in] nlsk_fd
1471  *   Netlink socket file descriptor.
1472  * @param[in] name
1473  *   The family name.
1474  *
1475  * @return
1476  *   ID >= 0 on success and @p enable is updated, a negative errno value
1477  *   otherwise and rte_errno is set.
1478  */
1479 static int
1480 mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
1481 {
1482 	struct nlmsghdr *nlh;
1483 	struct genlmsghdr *genl;
1484 	uint32_t sn = MLX5_NL_SN_GENERATE;
1485 	int name_size = strlen(name) + 1;
1486 	int ret;
1487 	uint16_t id = -1;
1488 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1489 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1490 		    NLMSG_ALIGN(sizeof(struct nlattr)) +
1491 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
1492 
1493 	memset(buf, 0, sizeof(buf));
1494 	nlh = (struct nlmsghdr *)buf;
1495 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1496 	nlh->nlmsg_type = GENL_ID_CTRL;
1497 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1498 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1499 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1500 	genl->cmd = CTRL_CMD_GETFAMILY;
1501 	genl->version = 1;
1502 	nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
1503 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1504 	if (ret >= 0)
1505 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
1506 	if (ret < 0) {
1507 		DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
1508 			ret);
1509 		return ret;
1510 	}
1511 	DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
1512 	return (int)id;
1513 }
1514 
1515 /**
1516  * Get Devlink family ID.
1517  *
1518  * @param[in] nlsk_fd
1519  *   Netlink socket file descriptor.
1520  *
1521  * @return
1522  *   ID >= 0 on success and @p enable is updated, a negative errno value
1523  *   otherwise and rte_errno is set.
1524  */
1525 
1526 int
1527 mlx5_nl_devlink_family_id_get(int nlsk_fd)
1528 {
1529 	return mlx5_nl_generic_family_id_get(nlsk_fd, DEVLINK_GENL_NAME);
1530 }
1531 
1532 /**
1533  * Parse Netlink message to retrieve the ROCE enable status.
1534  *
1535  * @param nh
1536  *   Pointer to Netlink Message Header.
1537  * @param arg
1538  *   PMD data register with this callback.
1539  *
1540  * @return
1541  *   0 on success, a negative errno value otherwise and rte_errno is set.
1542  */
1543 static int
1544 mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
1545 {
1546 
1547 	int ret = -EINVAL;
1548 	int *enable = arg;
1549 	struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1550 	struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1551 					NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1552 
1553 	while (nla->nla_len && nla < tail) {
1554 		switch (nla->nla_type) {
1555 		/* Expected nested attributes case. */
1556 		case DEVLINK_ATTR_PARAM:
1557 		case DEVLINK_ATTR_PARAM_VALUES_LIST:
1558 		case DEVLINK_ATTR_PARAM_VALUE:
1559 			ret = 0;
1560 			nla += 1;
1561 			break;
1562 		case DEVLINK_ATTR_PARAM_VALUE_DATA:
1563 			*enable = 1;
1564 			return 0;
1565 		default:
1566 			nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len));
1567 		}
1568 	}
1569 	*enable = 0;
1570 	return ret;
1571 }
1572 
1573 /**
1574  * Get ROCE enable status through Netlink.
1575  *
1576  * @param[in] nlsk_fd
1577  *   Netlink socket file descriptor.
1578  * @param[in] family_id
1579  *   the Devlink family ID.
1580  * @param pci_addr
1581  *   The device PCI address.
1582  * @param[out] enable
1583  *   Where to store the enable status.
1584  *
1585  * @return
1586  *   0 on success and @p enable is updated, a negative errno value otherwise
1587  *   and rte_errno is set.
1588  */
1589 int
1590 mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
1591 			int *enable)
1592 {
1593 	struct nlmsghdr *nlh;
1594 	struct genlmsghdr *genl;
1595 	uint32_t sn = MLX5_NL_SN_GENERATE;
1596 	int ret;
1597 	int cur_en = 0;
1598 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1599 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1600 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
1601 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
1602 
1603 	memset(buf, 0, sizeof(buf));
1604 	nlh = (struct nlmsghdr *)buf;
1605 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1606 	nlh->nlmsg_type = family_id;
1607 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1608 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1609 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1610 	genl->cmd = DEVLINK_CMD_PARAM_GET;
1611 	genl->version = DEVLINK_GENL_VERSION;
1612 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1613 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1614 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1615 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1616 	if (ret >= 0)
1617 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
1618 	if (ret < 0) {
1619 		DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
1620 			pci_addr, ret);
1621 		return ret;
1622 	}
1623 	*enable = cur_en;
1624 	DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
1625 		cur_en ? "en" : "dis", pci_addr);
1626 	return ret;
1627 }
1628 
1629 /**
1630  * Reload mlx5 device kernel driver through Netlink.
1631  *
1632  * @param[in] nlsk_fd
1633  *   Netlink socket file descriptor.
1634  * @param[in] family_id
1635  *   the Devlink family ID.
1636  * @param pci_addr
1637  *   The device PCI address.
1638  * @param[out] enable
1639  *   The enable status to set.
1640  *
1641  * @return
1642  *   0 on success, a negative errno value otherwise and rte_errno is set.
1643  */
1644 int
1645 mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
1646 {
1647 	struct nlmsghdr *nlh;
1648 	struct genlmsghdr *genl;
1649 	uint32_t sn = MLX5_NL_SN_GENERATE;
1650 	int ret;
1651 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1652 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1653 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
1654 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
1655 
1656 	memset(buf, 0, sizeof(buf));
1657 	nlh = (struct nlmsghdr *)buf;
1658 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1659 	nlh->nlmsg_type = family_id;
1660 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1661 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1662 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1663 	genl->cmd = DEVLINK_CMD_RELOAD;
1664 	genl->version = DEVLINK_GENL_VERSION;
1665 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1666 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1667 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1668 	if (ret >= 0)
1669 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1670 	if (ret < 0) {
1671 		DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
1672 			pci_addr, ret);
1673 		return ret;
1674 	}
1675 	DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
1676 		pci_addr);
1677 	return 0;
1678 }
1679 
1680 /**
1681  * Set ROCE enable status through Netlink.
1682  *
1683  * @param[in] nlsk_fd
1684  *   Netlink socket file descriptor.
1685  * @param[in] family_id
1686  *   the Devlink family ID.
1687  * @param pci_addr
1688  *   The device PCI address.
1689  * @param[out] enable
1690  *   The enable status to set.
1691  *
1692  * @return
1693  *   0 on success, a negative errno value otherwise and rte_errno is set.
1694  */
1695 int
1696 mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
1697 			int enable)
1698 {
1699 	struct nlmsghdr *nlh;
1700 	struct genlmsghdr *genl;
1701 	uint32_t sn = MLX5_NL_SN_GENERATE;
1702 	int ret;
1703 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1704 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1705 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
1706 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
1707 	uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
1708 	uint8_t ptype = NLA_FLAG;
1709 ;
1710 
1711 	memset(buf, 0, sizeof(buf));
1712 	nlh = (struct nlmsghdr *)buf;
1713 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1714 	nlh->nlmsg_type = family_id;
1715 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1716 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1717 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1718 	genl->cmd = DEVLINK_CMD_PARAM_SET;
1719 	genl->version = DEVLINK_GENL_VERSION;
1720 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1721 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1722 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1723 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
1724 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
1725 	if (enable)
1726 		nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
1727 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1728 	if (ret >= 0)
1729 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1730 	if (ret < 0) {
1731 		DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
1732 			" %d.", enable ? "en" : "dis", pci_addr, ret);
1733 		return ret;
1734 	}
1735 	DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
1736 		pci_addr, enable ? "en" : "dis");
1737 	/* Now, need to reload the driver. */
1738 	return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
1739 }
1740