xref: /dpdk/drivers/common/mlx5/linux/mlx5_nl.c (revision bbbe38a6d59ccdda25917712701e629d0b10af6f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5 
6 #include <errno.h>
7 #include <linux/if_link.h>
8 #include <linux/rtnetlink.h>
9 #include <linux/genetlink.h>
10 #include <net/if.h>
11 #include <rdma/rdma_netlink.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <stdalign.h>
16 #include <string.h>
17 #include <sys/socket.h>
18 #include <unistd.h>
19 
20 #include <rte_errno.h>
21 
22 #include "mlx5_nl.h"
23 #include "../mlx5_common_log.h"
24 #include "mlx5_malloc.h"
25 #ifdef HAVE_DEVLINK
26 #include <linux/devlink.h>
27 #endif
28 
29 
/*
 * Size in bytes of the buffer to receive kernel messages.
 * NOTE(review): not referenced in the visible part of this file;
 * mlx5_nl_recv() sizes its buffer with MLX5_RECV_BUF_SIZE instead.
 */
#define MLX5_NL_BUF_SIZE (32 * 1024)
/* Send buffer size in bytes requested via SO_SNDBUF in mlx5_nl_init(). */
#define MLX5_SEND_BUF_SIZE 32768
/*
 * Receive buffer size in bytes: requested via SO_RCVBUF in mlx5_nl_init()
 * and used as the size of the reply buffer in mlx5_nl_recv().
 */
#define MLX5_RECV_BUF_SIZE 32768
/* Maximal physical port name length. */
#define MLX5_PHYS_PORT_NAME_MAX 128

/**
 * Parameters of VLAN devices created by the driver
 * (presumably the interface name prefix - TODO confirm against users).
 */
#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
/*
 * Define NDA_RTA as defined in iproute2 sources
 * (see include/libnetlink.h there).
 *
 * Yields a pointer to the first route attribute that follows the
 * (aligned) struct ndmsg located at @p r.
 */
#ifndef MLX5_NDA_RTA
#define MLX5_NDA_RTA(r) \
	((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
#endif
/*
 * Define NLMSG_TAIL as defined in iproute2 sources
 * (see include/libnetlink.h there).
 *
 * Yields a pointer just past the current (aligned) length of the
 * Netlink message @p nmsg, i.e. where the next attribute would start.
 */
#ifndef NLMSG_TAIL
#define NLMSG_TAIL(nmsg) \
	((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
#endif
/*
 * The following definitions are normally found in rdma/rdma_netlink.h,
 * however they are so recent that most systems do not expose them yet.
 *
 * Each fallback is compiled only when the matching HAVE_* macro is not
 * defined (presumably by build-time header detection - not shown here).
 */
#ifndef HAVE_RDMA_NL_NLDEV
#define RDMA_NL_NLDEV 5
#endif
#ifndef HAVE_RDMA_NLDEV_CMD_GET
#define RDMA_NLDEV_CMD_GET 1
#endif
#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
#define RDMA_NLDEV_CMD_PORT_GET 5
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
#define RDMA_NLDEV_ATTR_DEV_INDEX 1
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
#define RDMA_NLDEV_ATTR_DEV_NAME 2
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
#define RDMA_NLDEV_ATTR_PORT_INDEX 3
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
#endif

/*
 * These are normally found in linux/if_link.h.
 * Same scheme as above: each value is a fallback used only when the
 * corresponding HAVE_* macro is not defined.
 */
#ifndef HAVE_IFLA_NUM_VF
#define IFLA_NUM_VF 21
#endif
#ifndef HAVE_IFLA_EXT_MASK
#define IFLA_EXT_MASK 29
#endif
#ifndef HAVE_IFLA_PHYS_SWITCH_ID
#define IFLA_PHYS_SWITCH_ID 36
#endif
#ifndef HAVE_IFLA_PHYS_PORT_NAME
#define IFLA_PHYS_PORT_NAME 38
#endif
98 
/*
 * Some Devlink definitions may be missing in old kernel headers;
 * provide fallbacks for the ones used by this file.
 *
 * Each fallback is guarded by #ifndef on the name itself, so a value
 * supplied by the included headers as a macro takes precedence.
 */
#ifndef DEVLINK_GENL_NAME
#define DEVLINK_GENL_NAME "devlink"
#endif
#ifndef DEVLINK_GENL_VERSION
#define DEVLINK_GENL_VERSION 1
#endif
#ifndef DEVLINK_ATTR_BUS_NAME
#define DEVLINK_ATTR_BUS_NAME 1
#endif
#ifndef DEVLINK_ATTR_DEV_NAME
#define DEVLINK_ATTR_DEV_NAME 2
#endif
#ifndef DEVLINK_ATTR_PARAM
#define DEVLINK_ATTR_PARAM 80
#endif
#ifndef DEVLINK_ATTR_PARAM_NAME
#define DEVLINK_ATTR_PARAM_NAME 81
#endif
#ifndef DEVLINK_ATTR_PARAM_TYPE
#define DEVLINK_ATTR_PARAM_TYPE 83
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
#define DEVLINK_ATTR_PARAM_VALUES_LIST 84
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE
#define DEVLINK_ATTR_PARAM_VALUE 85
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
#define DEVLINK_ATTR_PARAM_VALUE_DATA 86
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
#define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
#endif
#ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
#define DEVLINK_PARAM_CMODE_DRIVERINIT 1
#endif
#ifndef DEVLINK_CMD_RELOAD
#define DEVLINK_CMD_RELOAD 37
#endif
#ifndef DEVLINK_CMD_PARAM_GET
#define DEVLINK_CMD_PARAM_GET 38
#endif
#ifndef DEVLINK_CMD_PARAM_SET
#define DEVLINK_CMD_PARAM_SET 39
#endif
/* Netlink attribute type fallback (flag attribute, no payload expected). */
#ifndef NLA_FLAG
#define NLA_FLAG 6
#endif
151 
/*
 * Context used to collect MAC addresses through Netlink.
 * Filled by mlx5_nl_mac_addr_cb(), which appends one entry per
 * NDA_LLADDR attribute and refuses to go past MLX5_MAX_MAC_ADDRESSES.
 */
struct mlx5_nl_mac_addr {
	struct rte_ether_addr (*mac)[];
	/**< MAC address handled by the device. */
	int mac_n; /**< Number of addresses in the array. */
};

/* Bits of mlx5_nl_ifindex_data::flags, one per attribute seen in a reply. */
#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)

/**
 * Data structure used by mlx5_nl_cmdget_cb().
 *
 * The output fields are overwritten only by a message whose device name
 * attribute equals @p name.
 */
struct mlx5_nl_ifindex_data {
	const char *name; /**< IB device name (in). */
	uint32_t flags; /**< found attribute flags (out). */
	uint32_t ibindex; /**< IB device index (out). */
	uint32_t ifindex; /**< Network interface index (out). */
	uint32_t portnum; /**< IB device max port number (out). */
};
172 
/*
 * Counter backing Netlink sequence numbers; shared by every request
 * built in this file.
 * NOTE(review): has external linkage - confirm whether other translation
 * units rely on it before making it static.
 */
uint32_t atomic_sn;

/*
 * Generate Netlink sequence number: atomically increments atomic_sn
 * (relaxed ordering) and yields the new value.
 */
#define MLX5_NL_SN_GENERATE __atomic_add_fetch(&atomic_sn, 1, __ATOMIC_RELAXED)
177 
178 /**
179  * Opens a Netlink socket.
180  *
181  * @param protocol
182  *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
183  *
184  * @return
185  *   A file descriptor on success, a negative errno value otherwise and
186  *   rte_errno is set.
187  */
188 int
189 mlx5_nl_init(int protocol)
190 {
191 	int fd;
192 	int sndbuf_size = MLX5_SEND_BUF_SIZE;
193 	int rcvbuf_size = MLX5_RECV_BUF_SIZE;
194 	struct sockaddr_nl local = {
195 		.nl_family = AF_NETLINK,
196 	};
197 	int ret;
198 
199 	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
200 	if (fd == -1) {
201 		rte_errno = errno;
202 		return -rte_errno;
203 	}
204 	ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
205 	if (ret == -1) {
206 		rte_errno = errno;
207 		goto error;
208 	}
209 	ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
210 	if (ret == -1) {
211 		rte_errno = errno;
212 		goto error;
213 	}
214 	ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
215 	if (ret == -1) {
216 		rte_errno = errno;
217 		goto error;
218 	}
219 	return fd;
220 error:
221 	close(fd);
222 	return -rte_errno;
223 }
224 
225 /**
226  * Send a request message to the kernel on the Netlink socket.
227  *
228  * @param[in] nlsk_fd
229  *   Netlink socket file descriptor.
230  * @param[in] nh
231  *   The Netlink message send to the kernel.
232  * @param[in] ssn
233  *   Sequence number.
234  * @param[in] req
235  *   Pointer to the request structure.
236  * @param[in] len
237  *   Length of the request in bytes.
238  *
239  * @return
240  *   The number of sent bytes on success, a negative errno value otherwise and
241  *   rte_errno is set.
242  */
243 static int
244 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
245 		int len)
246 {
247 	struct sockaddr_nl sa = {
248 		.nl_family = AF_NETLINK,
249 	};
250 	struct iovec iov[2] = {
251 		{ .iov_base = nh, .iov_len = sizeof(*nh), },
252 		{ .iov_base = req, .iov_len = len, },
253 	};
254 	struct msghdr msg = {
255 		.msg_name = &sa,
256 		.msg_namelen = sizeof(sa),
257 		.msg_iov = iov,
258 		.msg_iovlen = 2,
259 	};
260 	int send_bytes;
261 
262 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
263 	nh->nlmsg_seq = sn;
264 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
265 	if (send_bytes < 0) {
266 		rte_errno = errno;
267 		return -rte_errno;
268 	}
269 	return send_bytes;
270 }
271 
272 /**
273  * Send a message to the kernel on the Netlink socket.
274  *
275  * @param[in] nlsk_fd
276  *   The Netlink socket file descriptor used for communication.
277  * @param[in] nh
278  *   The Netlink message send to the kernel.
279  * @param[in] sn
280  *   Sequence number.
281  *
282  * @return
283  *   The number of sent bytes on success, a negative errno value otherwise and
284  *   rte_errno is set.
285  */
286 static int
287 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
288 {
289 	struct sockaddr_nl sa = {
290 		.nl_family = AF_NETLINK,
291 	};
292 	struct iovec iov = {
293 		.iov_base = nh,
294 		.iov_len = nh->nlmsg_len,
295 	};
296 	struct msghdr msg = {
297 		.msg_name = &sa,
298 		.msg_namelen = sizeof(sa),
299 		.msg_iov = &iov,
300 		.msg_iovlen = 1,
301 	};
302 	int send_bytes;
303 
304 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
305 	nh->nlmsg_seq = sn;
306 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
307 	if (send_bytes < 0) {
308 		rte_errno = errno;
309 		return -rte_errno;
310 	}
311 	return send_bytes;
312 }
313 
314 /**
315  * Receive a message from the kernel on the Netlink socket, following
316  * mlx5_nl_send().
317  *
318  * @param[in] nlsk_fd
319  *   The Netlink socket file descriptor used for communication.
320  * @param[in] sn
321  *   Sequence number.
322  * @param[in] cb
323  *   The callback function to call for each Netlink message received.
324  * @param[in, out] arg
325  *   Custom arguments for the callback.
326  *
327  * @return
328  *   0 on success, a negative errno value otherwise and rte_errno is set.
329  */
330 static int
331 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
332 	     void *arg)
333 {
334 	struct sockaddr_nl sa;
335 	void *buf = mlx5_malloc(0, MLX5_RECV_BUF_SIZE, 0, SOCKET_ID_ANY);
336 	struct iovec iov = {
337 		.iov_base = buf,
338 		.iov_len = MLX5_RECV_BUF_SIZE,
339 	};
340 	struct msghdr msg = {
341 		.msg_name = &sa,
342 		.msg_namelen = sizeof(sa),
343 		.msg_iov = &iov,
344 		/* One message at a time */
345 		.msg_iovlen = 1,
346 	};
347 	int multipart = 0;
348 	int ret = 0;
349 
350 	if (!buf) {
351 		rte_errno = ENOMEM;
352 		return -rte_errno;
353 	}
354 	do {
355 		struct nlmsghdr *nh;
356 		int recv_bytes = 0;
357 
358 		do {
359 			recv_bytes = recvmsg(nlsk_fd, &msg, 0);
360 			if (recv_bytes == -1) {
361 				rte_errno = errno;
362 				ret = -rte_errno;
363 				goto exit;
364 			}
365 			nh = (struct nlmsghdr *)buf;
366 		} while (nh->nlmsg_seq != sn);
367 		for (;
368 		     NLMSG_OK(nh, (unsigned int)recv_bytes);
369 		     nh = NLMSG_NEXT(nh, recv_bytes)) {
370 			if (nh->nlmsg_type == NLMSG_ERROR) {
371 				struct nlmsgerr *err_data = NLMSG_DATA(nh);
372 
373 				if (err_data->error < 0) {
374 					rte_errno = -err_data->error;
375 					ret = -rte_errno;
376 					goto exit;
377 				}
378 				/* Ack message. */
379 				ret = 0;
380 				goto exit;
381 			}
382 			/* Multi-part msgs and their trailing DONE message. */
383 			if (nh->nlmsg_flags & NLM_F_MULTI) {
384 				if (nh->nlmsg_type == NLMSG_DONE) {
385 					ret =  0;
386 					goto exit;
387 				}
388 				multipart = 1;
389 			}
390 			if (cb) {
391 				ret = cb(nh, arg);
392 				if (ret < 0)
393 					goto exit;
394 			}
395 		}
396 	} while (multipart);
397 exit:
398 	mlx5_free(buf);
399 	return ret;
400 }
401 
402 /**
403  * Parse Netlink message to retrieve the bridge MAC address.
404  *
405  * @param nh
406  *   Pointer to Netlink Message Header.
407  * @param arg
408  *   PMD data register with this callback.
409  *
410  * @return
411  *   0 on success, a negative errno value otherwise and rte_errno is set.
412  */
413 static int
414 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
415 {
416 	struct mlx5_nl_mac_addr *data = arg;
417 	struct ndmsg *r = NLMSG_DATA(nh);
418 	struct rtattr *attribute;
419 	int len;
420 
421 	len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
422 	for (attribute = MLX5_NDA_RTA(r);
423 	     RTA_OK(attribute, len);
424 	     attribute = RTA_NEXT(attribute, len)) {
425 		if (attribute->rta_type == NDA_LLADDR) {
426 			if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
427 				DRV_LOG(WARNING,
428 					"not enough room to finalize the"
429 					" request");
430 				rte_errno = ENOMEM;
431 				return -rte_errno;
432 			}
433 #ifdef RTE_LIBRTE_MLX5_DEBUG
434 			char m[RTE_ETHER_ADDR_FMT_SIZE];
435 
436 			rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE,
437 					      RTA_DATA(attribute));
438 			DRV_LOG(DEBUG, "bridge MAC address %s", m);
439 #endif
440 			memcpy(&(*data->mac)[data->mac_n++],
441 			       RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
442 		}
443 	}
444 	return 0;
445 }
446 
447 /**
448  * Get bridge MAC addresses.
449  *
450  * @param[in] nlsk_fd
451  *   Netlink socket file descriptor.
452  * @param[in] iface_idx
453  *   Net device interface index.
454  * @param mac[out]
455  *   Pointer to the array table of MAC addresses to fill.
456  *   Its size should be of MLX5_MAX_MAC_ADDRESSES.
457  * @param mac_n[out]
458  *   Number of entries filled in MAC array.
459  *
460  * @return
461  *   0 on success, a negative errno value otherwise and rte_errno is set.
462  */
463 static int
464 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
465 		      struct rte_ether_addr (*mac)[], int *mac_n)
466 {
467 	struct {
468 		struct nlmsghdr	hdr;
469 		struct ifinfomsg ifm;
470 	} req = {
471 		.hdr = {
472 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
473 			.nlmsg_type = RTM_GETNEIGH,
474 			.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
475 		},
476 		.ifm = {
477 			.ifi_family = PF_BRIDGE,
478 			.ifi_index = iface_idx,
479 		},
480 	};
481 	struct mlx5_nl_mac_addr data = {
482 		.mac = mac,
483 		.mac_n = 0,
484 	};
485 	uint32_t sn = MLX5_NL_SN_GENERATE;
486 	int ret;
487 
488 	if (nlsk_fd == -1)
489 		return 0;
490 	ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
491 			      sizeof(struct ifinfomsg));
492 	if (ret < 0)
493 		goto error;
494 	ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
495 	if (ret < 0)
496 		goto error;
497 	*mac_n = data.mac_n;
498 	return 0;
499 error:
500 	DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
501 		iface_idx, strerror(rte_errno));
502 	return -rte_errno;
503 }
504 
505 /**
506  * Modify the MAC address neighbour table with Netlink.
507  *
508  * @param[in] nlsk_fd
509  *   Netlink socket file descriptor.
510  * @param[in] iface_idx
511  *   Net device interface index.
512  * @param mac
513  *   MAC address to consider.
514  * @param add
515  *   1 to add the MAC address, 0 to remove the MAC address.
516  *
517  * @return
518  *   0 on success, a negative errno value otherwise and rte_errno is set.
519  */
520 static int
521 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
522 			struct rte_ether_addr *mac, int add)
523 {
524 	struct {
525 		struct nlmsghdr hdr;
526 		struct ndmsg ndm;
527 		struct rtattr rta;
528 		uint8_t buffer[RTE_ETHER_ADDR_LEN];
529 	} req = {
530 		.hdr = {
531 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
532 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
533 				NLM_F_EXCL | NLM_F_ACK,
534 			.nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
535 		},
536 		.ndm = {
537 			.ndm_family = PF_BRIDGE,
538 			.ndm_state = NUD_NOARP | NUD_PERMANENT,
539 			.ndm_ifindex = iface_idx,
540 			.ndm_flags = NTF_SELF,
541 		},
542 		.rta = {
543 			.rta_type = NDA_LLADDR,
544 			.rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
545 		},
546 	};
547 	uint32_t sn = MLX5_NL_SN_GENERATE;
548 	int ret;
549 
550 	if (nlsk_fd == -1)
551 		return 0;
552 	memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
553 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
554 		RTA_ALIGN(req.rta.rta_len);
555 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
556 	if (ret < 0)
557 		goto error;
558 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
559 	if (ret < 0)
560 		goto error;
561 	return 0;
562 error:
563 #ifdef RTE_LIBRTE_MLX5_DEBUG
564 	{
565 		char m[RTE_ETHER_ADDR_FMT_SIZE];
566 
567 		rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac);
568 		DRV_LOG(DEBUG,
569 			"Interface %u cannot %s MAC address %s %s",
570 			iface_idx,
571 			add ? "add" : "remove", m, strerror(rte_errno));
572 	}
573 #endif
574 	return -rte_errno;
575 }
576 
577 /**
578  * Modify the VF MAC address neighbour table with Netlink.
579  *
580  * @param[in] nlsk_fd
581  *   Netlink socket file descriptor.
582  * @param[in] iface_idx
583  *   Net device interface index.
584  * @param mac
585  *    MAC address to consider.
586  * @param vf_index
587  *    VF index.
588  *
589  * @return
590  *    0 on success, a negative errno value otherwise and rte_errno is set.
591  */
592 int
593 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
594 			   struct rte_ether_addr *mac, int vf_index)
595 {
596 	int ret;
597 	struct {
598 		struct nlmsghdr hdr;
599 		struct ifinfomsg ifm;
600 		struct rtattr vf_list_rta;
601 		struct rtattr vf_info_rta;
602 		struct rtattr vf_mac_rta;
603 		struct ifla_vf_mac ivm;
604 	} req = {
605 		.hdr = {
606 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
607 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
608 			.nlmsg_type = RTM_BASE,
609 		},
610 		.ifm = {
611 			.ifi_index = iface_idx,
612 		},
613 		.vf_list_rta = {
614 			.rta_type = IFLA_VFINFO_LIST,
615 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
616 		},
617 		.vf_info_rta = {
618 			.rta_type = IFLA_VF_INFO,
619 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
620 		},
621 		.vf_mac_rta = {
622 			.rta_type = IFLA_VF_MAC,
623 		},
624 	};
625 	struct ifla_vf_mac ivm = {
626 		.vf = vf_index,
627 	};
628 	uint32_t sn = MLX5_NL_SN_GENERATE;
629 
630 	memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
631 	memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
632 
633 	req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
634 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
635 		RTA_ALIGN(req.vf_list_rta.rta_len) +
636 		RTA_ALIGN(req.vf_info_rta.rta_len) +
637 		RTA_ALIGN(req.vf_mac_rta.rta_len);
638 	req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
639 					       &req.vf_list_rta);
640 	req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
641 					       &req.vf_info_rta);
642 
643 	if (nlsk_fd < 0)
644 		return -1;
645 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
646 	if (ret < 0)
647 		goto error;
648 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
649 	if (ret < 0)
650 		goto error;
651 	return 0;
652 error:
653 	DRV_LOG(ERR,
654 		"representor %u cannot set VF MAC address "
655 		"%02X:%02X:%02X:%02X:%02X:%02X : %s",
656 		vf_index,
657 		mac->addr_bytes[0], mac->addr_bytes[1],
658 		mac->addr_bytes[2], mac->addr_bytes[3],
659 		mac->addr_bytes[4], mac->addr_bytes[5],
660 		strerror(rte_errno));
661 	return -rte_errno;
662 }
663 
664 /**
665  * Add a MAC address.
666  *
667  * @param[in] nlsk_fd
668  *   Netlink socket file descriptor.
669  * @param[in] iface_idx
670  *   Net device interface index.
671  * @param mac_own
672  *   BITFIELD_DECLARE array to store the mac.
673  * @param mac
674  *   MAC address to register.
675  * @param index
676  *   MAC address index.
677  *
678  * @return
679  *   0 on success, a negative errno value otherwise and rte_errno is set.
680  */
681 int
682 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
683 		     uint64_t *mac_own, struct rte_ether_addr *mac,
684 		     uint32_t index)
685 {
686 	int ret;
687 
688 	ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
689 	if (!ret) {
690 		MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
691 		if (index >= MLX5_MAX_MAC_ADDRESSES)
692 			return -EINVAL;
693 
694 		BITFIELD_SET(mac_own, index);
695 	}
696 	if (ret == -EEXIST)
697 		return 0;
698 	return ret;
699 }
700 
701 /**
702  * Remove a MAC address.
703  *
704  * @param[in] nlsk_fd
705  *   Netlink socket file descriptor.
706  * @param[in] iface_idx
707  *   Net device interface index.
708  * @param mac_own
709  *   BITFIELD_DECLARE array to store the mac.
710  * @param mac
711  *   MAC address to remove.
712  * @param index
713  *   MAC address index.
714  *
715  * @return
716  *   0 on success, a negative errno value otherwise and rte_errno is set.
717  */
718 int
719 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
720 			struct rte_ether_addr *mac, uint32_t index)
721 {
722 	MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
723 	if (index >= MLX5_MAX_MAC_ADDRESSES)
724 		return -EINVAL;
725 
726 	BITFIELD_RESET(mac_own, index);
727 	return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
728 }
729 
730 /**
731  * Synchronize Netlink bridge table to the internal table.
732  *
733  * @param[in] nlsk_fd
734  *   Netlink socket file descriptor.
735  * @param[in] iface_idx
736  *   Net device interface index.
737  * @param mac_addrs
738  *   Mac addresses array to sync.
739  * @param n
740  *   @p mac_addrs array size.
741  */
742 void
743 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
744 		      struct rte_ether_addr *mac_addrs, int n)
745 {
746 	struct rte_ether_addr macs[n];
747 	int macs_n = 0;
748 	int i;
749 	int ret;
750 
751 	memset(macs, 0, n * sizeof(macs[0]));
752 	ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
753 	if (ret)
754 		return;
755 	for (i = 0; i != macs_n; ++i) {
756 		int j;
757 
758 		/* Verify the address is not in the array yet. */
759 		for (j = 0; j != n; ++j)
760 			if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
761 				break;
762 		if (j != n)
763 			continue;
764 		if (rte_is_multicast_ether_addr(&macs[i])) {
765 			/* Find the first entry available. */
766 			for (j = MLX5_MAX_UC_MAC_ADDRESSES; j != n; ++j) {
767 				if (rte_is_zero_ether_addr(&mac_addrs[j])) {
768 					mac_addrs[j] = macs[i];
769 					break;
770 				}
771 			}
772 		} else {
773 			/* Find the first entry available. */
774 			for (j = 0; j != MLX5_MAX_UC_MAC_ADDRESSES; ++j) {
775 				if (rte_is_zero_ether_addr(&mac_addrs[j])) {
776 					mac_addrs[j] = macs[i];
777 					break;
778 				}
779 			}
780 		}
781 	}
782 }
783 
784 /**
785  * Flush all added MAC addresses.
786  *
787  * @param[in] nlsk_fd
788  *   Netlink socket file descriptor.
789  * @param[in] iface_idx
790  *   Net device interface index.
791  * @param[in] mac_addrs
792  *   Mac addresses array to flush.
793  * @param n
794  *   @p mac_addrs array size.
795  * @param mac_own
796  *   BITFIELD_DECLARE array to store the mac.
797  */
798 void
799 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
800 		       struct rte_ether_addr *mac_addrs, int n,
801 		       uint64_t *mac_own)
802 {
803 	int i;
804 
805 	if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES)
806 		return;
807 
808 	for (i = n - 1; i >= 0; --i) {
809 		struct rte_ether_addr *m = &mac_addrs[i];
810 
811 		if (BITFIELD_ISSET(mac_own, i))
812 			mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
813 						i);
814 	}
815 }
816 
817 /**
818  * Enable promiscuous / all multicast mode through Netlink.
819  *
820  * @param[in] nlsk_fd
821  *   Netlink socket file descriptor.
822  * @param[in] iface_idx
823  *   Net device interface index.
824  * @param flags
825  *   IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
826  * @param enable
827  *   Nonzero to enable, disable otherwise.
828  *
829  * @return
830  *   0 on success, a negative errno value otherwise and rte_errno is set.
831  */
832 static int
833 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
834 		     int enable)
835 {
836 	struct {
837 		struct nlmsghdr hdr;
838 		struct ifinfomsg ifi;
839 	} req = {
840 		.hdr = {
841 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
842 			.nlmsg_type = RTM_NEWLINK,
843 			.nlmsg_flags = NLM_F_REQUEST,
844 		},
845 		.ifi = {
846 			.ifi_flags = enable ? flags : 0,
847 			.ifi_change = flags,
848 			.ifi_index = iface_idx,
849 		},
850 	};
851 	uint32_t sn = MLX5_NL_SN_GENERATE;
852 	int ret;
853 
854 	MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
855 	if (nlsk_fd < 0)
856 		return 0;
857 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
858 	if (ret < 0)
859 		return ret;
860 	return 0;
861 }
862 
863 /**
864  * Enable promiscuous mode through Netlink.
865  *
866  * @param[in] nlsk_fd
867  *   Netlink socket file descriptor.
868  * @param[in] iface_idx
869  *   Net device interface index.
870  * @param enable
871  *   Nonzero to enable, disable otherwise.
872  *
873  * @return
874  *   0 on success, a negative errno value otherwise and rte_errno is set.
875  */
876 int
877 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
878 {
879 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
880 
881 	if (ret)
882 		DRV_LOG(DEBUG,
883 			"Interface %u cannot %s promisc mode: Netlink error %s",
884 			iface_idx, enable ? "enable" : "disable",
885 			strerror(rte_errno));
886 	return ret;
887 }
888 
889 /**
890  * Enable all multicast mode through Netlink.
891  *
892  * @param[in] nlsk_fd
893  *   Netlink socket file descriptor.
894  * @param[in] iface_idx
895  *   Net device interface index.
896  * @param enable
897  *   Nonzero to enable, disable otherwise.
898  *
899  * @return
900  *   0 on success, a negative errno value otherwise and rte_errno is set.
901  */
902 int
903 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
904 {
905 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
906 				       enable);
907 
908 	if (ret)
909 		DRV_LOG(DEBUG,
910 			"Interface %u cannot %s allmulti : Netlink error %s",
911 			iface_idx, enable ? "enable" : "disable",
912 			strerror(rte_errno));
913 	return ret;
914 }
915 
916 /**
917  * Process network interface information from Netlink message.
918  *
919  * @param nh
920  *   Pointer to Netlink message header.
921  * @param arg
922  *   Opaque data pointer for this callback.
923  *
924  * @return
925  *   0 on success, a negative errno value otherwise and rte_errno is set.
926  */
927 static int
928 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
929 {
930 	struct mlx5_nl_ifindex_data *data = arg;
931 	struct mlx5_nl_ifindex_data local = {
932 		.flags = 0,
933 	};
934 	size_t off = NLMSG_HDRLEN;
935 
936 	if (nh->nlmsg_type !=
937 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
938 	    nh->nlmsg_type !=
939 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
940 		goto error;
941 	while (off < nh->nlmsg_len) {
942 		struct nlattr *na = (void *)((uintptr_t)nh + off);
943 		void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
944 
945 		if (na->nla_len > nh->nlmsg_len - off)
946 			goto error;
947 		switch (na->nla_type) {
948 		case RDMA_NLDEV_ATTR_DEV_INDEX:
949 			local.ibindex = *(uint32_t *)payload;
950 			local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
951 			break;
952 		case RDMA_NLDEV_ATTR_DEV_NAME:
953 			if (!strcmp(payload, data->name))
954 				local.flags |= MLX5_NL_CMD_GET_IB_NAME;
955 			break;
956 		case RDMA_NLDEV_ATTR_NDEV_INDEX:
957 			local.ifindex = *(uint32_t *)payload;
958 			local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
959 			break;
960 		case RDMA_NLDEV_ATTR_PORT_INDEX:
961 			local.portnum = *(uint32_t *)payload;
962 			local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
963 			break;
964 		default:
965 			break;
966 		}
967 		off += NLA_ALIGN(na->nla_len);
968 	}
969 	/*
970 	 * It is possible to have multiple messages for all
971 	 * Infiniband devices in the system with appropriate name.
972 	 * So we should gather parameters locally and copy to
973 	 * query context only in case of coinciding device name.
974 	 */
975 	if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
976 		data->flags = local.flags;
977 		data->ibindex = local.ibindex;
978 		data->ifindex = local.ifindex;
979 		data->portnum = local.portnum;
980 	}
981 	return 0;
982 error:
983 	rte_errno = EINVAL;
984 	return -rte_errno;
985 }
986 
987 /**
988  * Get index of network interface associated with some IB device.
989  *
990  * This is the only somewhat safe method to avoid resorting to heuristics
991  * when faced with port representors. Unfortunately it requires at least
992  * Linux 4.17.
993  *
994  * @param nl
995  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
996  * @param[in] name
997  *   IB device name.
998  * @param[in] pindex
999  *   IB device port index, starting from 1
1000  * @return
1001  *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
1002  *   is set.
1003  */
1004 unsigned int
1005 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
1006 {
1007 	struct mlx5_nl_ifindex_data data = {
1008 		.name = name,
1009 		.flags = 0,
1010 		.ibindex = 0, /* Determined during first pass. */
1011 		.ifindex = 0, /* Determined during second pass. */
1012 	};
1013 	union {
1014 		struct nlmsghdr nh;
1015 		uint8_t buf[NLMSG_HDRLEN +
1016 			    NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
1017 			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
1018 	} req = {
1019 		.nh = {
1020 			.nlmsg_len = NLMSG_LENGTH(0),
1021 			.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1022 						       RDMA_NLDEV_CMD_GET),
1023 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1024 		},
1025 	};
1026 	struct nlattr *na;
1027 	uint32_t sn = MLX5_NL_SN_GENERATE;
1028 	int ret;
1029 
1030 	ret = mlx5_nl_send(nl, &req.nh, sn);
1031 	if (ret < 0)
1032 		return 0;
1033 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1034 	if (ret < 0)
1035 		return 0;
1036 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1037 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
1038 		goto error;
1039 	data.flags = 0;
1040 	sn = MLX5_NL_SN_GENERATE;
1041 	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1042 					     RDMA_NLDEV_CMD_PORT_GET);
1043 	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1044 	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
1045 	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
1046 	na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
1047 	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
1048 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1049 	       &data.ibindex, sizeof(data.ibindex));
1050 	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
1051 	na->nla_len = NLA_HDRLEN + sizeof(pindex);
1052 	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
1053 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1054 	       &pindex, sizeof(pindex));
1055 	ret = mlx5_nl_send(nl, &req.nh, sn);
1056 	if (ret < 0)
1057 		return 0;
1058 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1059 	if (ret < 0)
1060 		return 0;
1061 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1062 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1063 	    !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
1064 	    !data.ifindex)
1065 		goto error;
1066 	return data.ifindex;
1067 error:
1068 	rte_errno = ENODEV;
1069 	return 0;
1070 }
1071 
1072 /**
1073  * Get the number of physical ports of given IB device.
1074  *
1075  * @param nl
1076  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1077  * @param[in] name
1078  *   IB device name.
1079  *
1080  * @return
1081  *   A valid (nonzero) number of ports on success, 0 otherwise
1082  *   and rte_errno is set.
1083  */
1084 unsigned int
1085 mlx5_nl_portnum(int nl, const char *name)
1086 {
1087 	struct mlx5_nl_ifindex_data data = {
1088 		.flags = 0,
1089 		.name = name,
1090 		.ifindex = 0,
1091 		.portnum = 0,
1092 	};
1093 	struct nlmsghdr req = {
1094 		.nlmsg_len = NLMSG_LENGTH(0),
1095 		.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1096 					       RDMA_NLDEV_CMD_GET),
1097 		.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1098 	};
1099 	uint32_t sn = MLX5_NL_SN_GENERATE;
1100 	int ret;
1101 
1102 	ret = mlx5_nl_send(nl, &req, sn);
1103 	if (ret < 0)
1104 		return 0;
1105 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1106 	if (ret < 0)
1107 		return 0;
1108 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1109 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1110 	    !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1111 		rte_errno = ENODEV;
1112 		return 0;
1113 	}
1114 	if (!data.portnum)
1115 		rte_errno = EINVAL;
1116 	return data.portnum;
1117 }
1118 
1119 /**
1120  * Analyze gathered port parameters via Netlink to recognize master
1121  * and representor devices for E-Switch configuration.
1122  *
1123  * @param[in] num_vf_set
1124  *   flag of presence of number of VFs port attribute.
1125  * @param[inout] switch_info
1126  *   Port information, including port name as a number and port name
1127  *   type if recognized
1128  *
1129  * @return
1130  *   master and representor flags are set in switch_info according to
1131  *   recognized parameters (if any).
1132  */
1133 static void
1134 mlx5_nl_check_switch_info(bool num_vf_set,
1135 			  struct mlx5_switch_info *switch_info)
1136 {
1137 	switch (switch_info->name_type) {
1138 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1139 		/*
1140 		 * Name is not recognized, assume the master,
1141 		 * check the number of VFs key presence.
1142 		 */
1143 		switch_info->master = num_vf_set;
1144 		break;
1145 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1146 		/*
1147 		 * Name is not set, this assumes the legacy naming
1148 		 * schema for master, just check if there is a
1149 		 * number of VFs key.
1150 		 */
1151 		switch_info->master = num_vf_set;
1152 		break;
1153 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1154 		/* New uplink naming schema recognized. */
1155 		switch_info->master = 1;
1156 		break;
1157 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1158 		/* Legacy representors naming schema. */
1159 		switch_info->representor = !num_vf_set;
1160 		break;
1161 	case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1162 		/* Fallthrough */
1163 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1164 		/* Fallthrough */
1165 	case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
1166 		/* New representors naming schema. */
1167 		switch_info->representor = 1;
1168 		break;
1169 	}
1170 }
1171 
1172 /**
1173  * Process switch information from Netlink message.
1174  *
1175  * @param nh
1176  *   Pointer to Netlink message header.
1177  * @param arg
1178  *   Opaque data pointer for this callback.
1179  *
1180  * @return
1181  *   0 on success, a negative errno value otherwise and rte_errno is set.
1182  */
1183 static int
1184 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1185 {
1186 	struct mlx5_switch_info info = {
1187 		.master = 0,
1188 		.representor = 0,
1189 		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1190 		.port_name = 0,
1191 		.switch_id = 0,
1192 	};
1193 	size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1194 	bool switch_id_set = false;
1195 	bool num_vf_set = false;
1196 	int len;
1197 
1198 	if (nh->nlmsg_type != RTM_NEWLINK)
1199 		goto error;
1200 	while (off < nh->nlmsg_len) {
1201 		struct rtattr *ra = (void *)((uintptr_t)nh + off);
1202 		void *payload = RTA_DATA(ra);
1203 		unsigned int i;
1204 
1205 		if (ra->rta_len > nh->nlmsg_len - off)
1206 			goto error;
1207 		switch (ra->rta_type) {
1208 		case IFLA_NUM_VF:
1209 			num_vf_set = true;
1210 			break;
1211 		case IFLA_PHYS_PORT_NAME:
1212 			len = RTA_PAYLOAD(ra);
1213 			/* Some kernels do not pad attributes with zero. */
1214 			if (len > 0 && len < MLX5_PHYS_PORT_NAME_MAX) {
1215 				char name[MLX5_PHYS_PORT_NAME_MAX];
1216 
1217 				/*
1218 				 * We can't just patch the message with padding
1219 				 * zero - it might corrupt the following items
1220 				 * in the message, we have to copy the string
1221 				 * by attribute length and pad the copied one.
1222 				 */
1223 				memcpy(name, payload, len);
1224 				name[len] = 0;
1225 				mlx5_translate_port_name(name, &info);
1226 			} else {
1227 				info.name_type =
1228 					MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
1229 			}
1230 			break;
1231 		case IFLA_PHYS_SWITCH_ID:
1232 			info.switch_id = 0;
1233 			for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1234 				info.switch_id <<= 8;
1235 				info.switch_id |= ((uint8_t *)payload)[i];
1236 			}
1237 			switch_id_set = true;
1238 			break;
1239 		}
1240 		off += RTA_ALIGN(ra->rta_len);
1241 	}
1242 	if (switch_id_set) {
1243 		/* We have some E-Switch configuration. */
1244 		mlx5_nl_check_switch_info(num_vf_set, &info);
1245 	}
1246 	MLX5_ASSERT(!(info.master && info.representor));
1247 	memcpy(arg, &info, sizeof(info));
1248 	return 0;
1249 error:
1250 	rte_errno = EINVAL;
1251 	return -rte_errno;
1252 }
1253 
1254 /**
1255  * Get switch information associated with network interface.
1256  *
1257  * @param nl
1258  *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1259  * @param ifindex
1260  *   Network interface index.
1261  * @param[out] info
1262  *   Switch information object, populated in case of success.
1263  *
1264  * @return
1265  *   0 on success, a negative errno value otherwise and rte_errno is set.
1266  */
1267 int
1268 mlx5_nl_switch_info(int nl, unsigned int ifindex,
1269 		    struct mlx5_switch_info *info)
1270 {
1271 	struct {
1272 		struct nlmsghdr nh;
1273 		struct ifinfomsg info;
1274 		struct rtattr rta;
1275 		uint32_t extmask;
1276 	} req = {
1277 		.nh = {
1278 			.nlmsg_len = NLMSG_LENGTH
1279 					(sizeof(req.info) +
1280 					 RTA_LENGTH(sizeof(uint32_t))),
1281 			.nlmsg_type = RTM_GETLINK,
1282 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1283 		},
1284 		.info = {
1285 			.ifi_family = AF_UNSPEC,
1286 			.ifi_index = ifindex,
1287 		},
1288 		.rta = {
1289 			.rta_type = IFLA_EXT_MASK,
1290 			.rta_len = RTA_LENGTH(sizeof(int32_t)),
1291 		},
1292 		.extmask = RTE_LE32(1),
1293 	};
1294 	uint32_t sn = MLX5_NL_SN_GENERATE;
1295 	int ret;
1296 
1297 	ret = mlx5_nl_send(nl, &req.nh, sn);
1298 	if (ret >= 0)
1299 		ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1300 	if (info->master && info->representor) {
1301 		DRV_LOG(ERR, "ifindex %u device is recognized as master"
1302 			     " and as representor", ifindex);
1303 		rte_errno = ENODEV;
1304 		ret = -rte_errno;
1305 	}
1306 	return ret;
1307 }
1308 
1309 /*
1310  * Delete VLAN network device by ifindex.
1311  *
1312  * @param[in] tcf
1313  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1314  * @param[in] ifindex
1315  *   Interface index of network device to delete.
1316  */
1317 void
1318 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1319 		      uint32_t ifindex)
1320 {
1321 	uint32_t sn = MLX5_NL_SN_GENERATE;
1322 	int ret;
1323 	struct {
1324 		struct nlmsghdr nh;
1325 		struct ifinfomsg info;
1326 	} req = {
1327 		.nh = {
1328 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1329 			.nlmsg_type = RTM_DELLINK,
1330 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1331 		},
1332 		.info = {
1333 			.ifi_family = AF_UNSPEC,
1334 			.ifi_index = ifindex,
1335 		},
1336 	};
1337 
1338 	if (ifindex) {
1339 		ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1340 		if (ret >= 0)
1341 			ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1342 		if (ret < 0)
1343 			DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1344 				" ifindex %u, %d", ifindex, ret);
1345 	}
1346 }
1347 
/* Set of subroutines to build Netlink message. */

/*
 * Locate the first byte past the current message content, i.e. the place
 * where the next attribute is appended (message length rounded up to the
 * Netlink alignment).
 */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	uint8_t *start = (uint8_t *)nlh;
	size_t used = NLMSG_ALIGN(nlh->nlmsg_len);

	return (struct nlattr *)(start + used);
}
1355 
/*
 * Append an attribute of the given type with alen bytes of payload copied
 * from data to the message tail and grow the message length accordingly.
 * The caller is responsible for providing a large enough buffer.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *attr = nl_msg_tail(nlh);
	uint8_t *value = (uint8_t *)attr + sizeof(struct nlattr);

	attr->nla_type = type;
	attr->nla_len = NLMSG_ALIGN(sizeof(struct nlattr)) + alen;
	/* The message grows by the padded attribute size. */
	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
	if (alen != 0)
		memcpy(value, data, alen);
}
1368 
/*
 * Open a nested attribute: an empty attribute of the given type is appended
 * and its address is returned so that nl_attr_nest_end() can fix its length
 * once the nested content is written.
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	struct nlattr *container = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return container;
}
1377 
/*
 * Close a nested attribute: its length becomes the distance between the
 * nest header and the current message tail.
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	uint8_t *first = (uint8_t *)nest;
	uint8_t *last = (uint8_t *)nl_msg_tail(nlh);

	nest->nla_len = last - first;
}
1383 
1384 /*
1385  * Create network VLAN device with specified VLAN tag.
1386  *
1387  * @param[in] tcf
1388  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1389  * @param[in] ifindex
1390  *   Base network interface index.
1391  * @param[in] tag
1392  *   VLAN tag for VLAN network device to create.
1393  */
1394 uint32_t
1395 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1396 			 uint32_t ifindex, uint16_t tag)
1397 {
1398 	struct nlmsghdr *nlh;
1399 	struct ifinfomsg *ifm;
1400 	char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1401 
1402 	__rte_cache_aligned
1403 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1404 		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1405 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1406 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1407 		    NLMSG_ALIGN(sizeof(name)) +
1408 		    NLMSG_ALIGN(sizeof("vlan")) +
1409 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1410 		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1411 	struct nlattr *na_info;
1412 	struct nlattr *na_vlan;
1413 	uint32_t sn = MLX5_NL_SN_GENERATE;
1414 	int ret;
1415 
1416 	memset(buf, 0, sizeof(buf));
1417 	nlh = (struct nlmsghdr *)buf;
1418 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1419 	nlh->nlmsg_type = RTM_NEWLINK;
1420 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1421 			   NLM_F_EXCL | NLM_F_ACK;
1422 	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1423 	nlh->nlmsg_len += sizeof(struct ifinfomsg);
1424 	ifm->ifi_family = AF_UNSPEC;
1425 	ifm->ifi_type = 0;
1426 	ifm->ifi_index = 0;
1427 	ifm->ifi_flags = IFF_UP;
1428 	ifm->ifi_change = 0xffffffff;
1429 	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1430 	ret = snprintf(name, sizeof(name), "%s.%u.%u",
1431 		       MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1432 	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1433 	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1434 	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1435 	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1436 	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1437 	nl_attr_nest_end(nlh, na_vlan);
1438 	nl_attr_nest_end(nlh, na_info);
1439 	MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len);
1440 	ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1441 	if (ret >= 0)
1442 		ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1443 	if (ret < 0) {
1444 		DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1445 			ret);
1446 	}
1447 	/* Try to get ifindex of created or pre-existing device. */
1448 	ret = if_nametoindex(name);
1449 	if (!ret) {
1450 		DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
1451 			errno);
1452 		return 0;
1453 	}
1454 	return ret;
1455 }
1456 
/**
 * Parse Netlink message to retrieve the general family ID.
 *
 * The attributes following the generic Netlink header are scanned for
 * CTRL_ATTR_FAMILY_ID. Every attribute is checked to lie entirely inside
 * the message before it is looked at, a malformed attribute stops the scan.
 *
 * @param nh
 *   Pointer to Netlink Message Header.
 * @param arg
 *   Pointer to the uint16_t receiving the family ID, it is written only
 *   on success.
 *
 * @return
 *   0 on success, -EINVAL if no valid family ID attribute is found
 *   (rte_errno is not modified by this callback).
 */
static int
mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
{
	size_t off = NLMSG_ALIGN(sizeof(*nh)) +
		     NLMSG_ALIGN(sizeof(struct genlmsghdr));

	/* Proceed only while a whole attribute header is available. */
	while (off + sizeof(struct nlattr) <= nh->nlmsg_len) {
		const struct nlattr *nla =
			(const void *)((const uint8_t *)nh + off);
		size_t room = nh->nlmsg_len - off;

		/* Truncated or zero-sized attribute, nothing to trust. */
		if (nla->nla_len < sizeof(*nla) || nla->nla_len > room)
			break;
		if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
			/* The payload must hold the 16-bit identifier. */
			if (nla->nla_len < sizeof(*nla) + sizeof(uint16_t))
				break;
			memcpy(arg, nla + 1, sizeof(uint16_t));
			return 0;
		}
		off += NLMSG_ALIGN(nla->nla_len);
	}
	return -EINVAL;
}
1485 
1486 #define MLX5_NL_MAX_ATTR_SIZE 100
1487 /**
1488  * Get generic netlink family ID.
1489  *
1490  * @param[in] nlsk_fd
1491  *   Netlink socket file descriptor.
1492  * @param[in] name
1493  *   The family name.
1494  *
1495  * @return
1496  *   ID >= 0 on success and @p enable is updated, a negative errno value
1497  *   otherwise and rte_errno is set.
1498  */
1499 static int
1500 mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
1501 {
1502 	struct nlmsghdr *nlh;
1503 	struct genlmsghdr *genl;
1504 	uint32_t sn = MLX5_NL_SN_GENERATE;
1505 	int name_size = strlen(name) + 1;
1506 	int ret;
1507 	uint16_t id = -1;
1508 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1509 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1510 		    NLMSG_ALIGN(sizeof(struct nlattr)) +
1511 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
1512 
1513 	memset(buf, 0, sizeof(buf));
1514 	nlh = (struct nlmsghdr *)buf;
1515 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1516 	nlh->nlmsg_type = GENL_ID_CTRL;
1517 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1518 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1519 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1520 	genl->cmd = CTRL_CMD_GETFAMILY;
1521 	genl->version = 1;
1522 	nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
1523 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1524 	if (ret >= 0)
1525 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
1526 	if (ret < 0) {
1527 		DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
1528 			ret);
1529 		return ret;
1530 	}
1531 	DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
1532 	return (int)id;
1533 }
1534 
/**
 * Get Devlink family ID.
 *
 * Resolves the generic Netlink family named DEVLINK_GENL_NAME.
 *
 * @param[in] nlsk_fd
 *   Netlink socket file descriptor.
 *
 * @return
 *   ID >= 0 on success, a negative errno value otherwise
 *   and rte_errno is set.
 */

int
mlx5_nl_devlink_family_id_get(int nlsk_fd)
{
	const char *family = DEVLINK_GENL_NAME;

	return mlx5_nl_generic_family_id_get(nlsk_fd, family);
}
1551 
/**
 * Parse Netlink message to retrieve the ROCE enable status.
 *
 * The attributes following the generic Netlink header are scanned. The
 * expected nested attributes (parameter, values list, value) are entered,
 * any other attribute is skipped. The presence of a value data attribute
 * means "enabled". Every attribute is checked to lie inside the message
 * before it is looked at, a malformed attribute stops the scan.
 *
 * @param nh
 *   Pointer to Netlink Message Header.
 * @param arg
 *   Pointer to the int receiving the status: 1 if the value data
 *   attribute is found, 0 otherwise.
 *
 * @return
 *   0 if the value data attribute or at least one of the expected nested
 *   attributes is found, -EINVAL otherwise (rte_errno is not modified by
 *   this callback).
 */
static int
mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
{
	int ret = -EINVAL;
	int *enable = arg;
	size_t off = NLMSG_ALIGN(sizeof(*nh)) +
		     NLMSG_ALIGN(sizeof(struct genlmsghdr));

	/* Proceed only while a whole attribute header is available. */
	while (off + sizeof(struct nlattr) <= nh->nlmsg_len) {
		const struct nlattr *nla =
			(const void *)((const uint8_t *)nh + off);
		size_t room = nh->nlmsg_len - off;

		/* Truncated or zero-sized attribute, nothing to trust. */
		if (nla->nla_len < sizeof(*nla) || nla->nla_len > room)
			break;
		switch (nla->nla_type) {
		/* Expected nested attributes case. */
		case DEVLINK_ATTR_PARAM:
		case DEVLINK_ATTR_PARAM_VALUES_LIST:
		case DEVLINK_ATTR_PARAM_VALUE:
			ret = 0;
			/* Step over the header only to enter the nest. */
			off += sizeof(*nla);
			break;
		case DEVLINK_ATTR_PARAM_VALUE_DATA:
			*enable = 1;
			return 0;
		default:
			off += NLMSG_ALIGN(nla->nla_len);
		}
	}
	*enable = 0;
	return ret;
}
1592 
1593 /**
1594  * Get ROCE enable status through Netlink.
1595  *
1596  * @param[in] nlsk_fd
1597  *   Netlink socket file descriptor.
1598  * @param[in] family_id
1599  *   the Devlink family ID.
1600  * @param pci_addr
1601  *   The device PCI address.
1602  * @param[out] enable
1603  *   Where to store the enable status.
1604  *
1605  * @return
1606  *   0 on success and @p enable is updated, a negative errno value otherwise
1607  *   and rte_errno is set.
1608  */
1609 int
1610 mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
1611 			int *enable)
1612 {
1613 	struct nlmsghdr *nlh;
1614 	struct genlmsghdr *genl;
1615 	uint32_t sn = MLX5_NL_SN_GENERATE;
1616 	int ret;
1617 	int cur_en = 0;
1618 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1619 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1620 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
1621 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
1622 
1623 	memset(buf, 0, sizeof(buf));
1624 	nlh = (struct nlmsghdr *)buf;
1625 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1626 	nlh->nlmsg_type = family_id;
1627 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1628 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1629 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1630 	genl->cmd = DEVLINK_CMD_PARAM_GET;
1631 	genl->version = DEVLINK_GENL_VERSION;
1632 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1633 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1634 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1635 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1636 	if (ret >= 0)
1637 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
1638 	if (ret < 0) {
1639 		DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
1640 			pci_addr, ret);
1641 		return ret;
1642 	}
1643 	*enable = cur_en;
1644 	DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
1645 		cur_en ? "en" : "dis", pci_addr);
1646 	return ret;
1647 }
1648 
1649 /**
1650  * Reload mlx5 device kernel driver through Netlink.
1651  *
1652  * @param[in] nlsk_fd
1653  *   Netlink socket file descriptor.
1654  * @param[in] family_id
1655  *   the Devlink family ID.
1656  * @param pci_addr
1657  *   The device PCI address.
1658  * @param[out] enable
1659  *   The enable status to set.
1660  *
1661  * @return
1662  *   0 on success, a negative errno value otherwise and rte_errno is set.
1663  */
1664 int
1665 mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
1666 {
1667 	struct nlmsghdr *nlh;
1668 	struct genlmsghdr *genl;
1669 	uint32_t sn = MLX5_NL_SN_GENERATE;
1670 	int ret;
1671 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1672 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1673 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
1674 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
1675 
1676 	memset(buf, 0, sizeof(buf));
1677 	nlh = (struct nlmsghdr *)buf;
1678 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1679 	nlh->nlmsg_type = family_id;
1680 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1681 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1682 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1683 	genl->cmd = DEVLINK_CMD_RELOAD;
1684 	genl->version = DEVLINK_GENL_VERSION;
1685 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1686 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1687 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1688 	if (ret >= 0)
1689 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1690 	if (ret < 0) {
1691 		DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
1692 			pci_addr, ret);
1693 		return ret;
1694 	}
1695 	DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
1696 		pci_addr);
1697 	return 0;
1698 }
1699 
1700 /**
1701  * Set ROCE enable status through Netlink.
1702  *
1703  * @param[in] nlsk_fd
1704  *   Netlink socket file descriptor.
1705  * @param[in] family_id
1706  *   the Devlink family ID.
1707  * @param pci_addr
1708  *   The device PCI address.
1709  * @param[out] enable
1710  *   The enable status to set.
1711  *
1712  * @return
1713  *   0 on success, a negative errno value otherwise and rte_errno is set.
1714  */
1715 int
1716 mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
1717 			int enable)
1718 {
1719 	struct nlmsghdr *nlh;
1720 	struct genlmsghdr *genl;
1721 	uint32_t sn = MLX5_NL_SN_GENERATE;
1722 	int ret;
1723 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1724 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1725 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
1726 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
1727 	uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
1728 	uint8_t ptype = NLA_FLAG;
1729 ;
1730 
1731 	memset(buf, 0, sizeof(buf));
1732 	nlh = (struct nlmsghdr *)buf;
1733 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1734 	nlh->nlmsg_type = family_id;
1735 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1736 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1737 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1738 	genl->cmd = DEVLINK_CMD_PARAM_SET;
1739 	genl->version = DEVLINK_GENL_VERSION;
1740 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1741 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1742 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1743 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
1744 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
1745 	if (enable)
1746 		nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
1747 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1748 	if (ret >= 0)
1749 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1750 	if (ret < 0) {
1751 		DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
1752 			" %d.", enable ? "en" : "dis", pci_addr, ret);
1753 		return ret;
1754 	}
1755 	DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
1756 		pci_addr, enable ? "en" : "dis");
1757 	/* Now, need to reload the driver. */
1758 	return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
1759 }
1760