xref: /dpdk/drivers/common/mlx5/linux/mlx5_nl.c (revision bc700b6767278e49c4ea9c08bb43c0fd9ca3e70d)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5 
6 #include <errno.h>
7 #include <linux/if_link.h>
8 #include <linux/rtnetlink.h>
9 #include <linux/genetlink.h>
10 #include <net/if.h>
11 #include <rdma/rdma_netlink.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <stdalign.h>
16 #include <string.h>
17 #include <sys/socket.h>
18 #include <unistd.h>
19 
20 #include <rte_errno.h>
21 #include <rte_atomic.h>
22 
23 #include "mlx5_nl.h"
24 #include "mlx5_common_utils.h"
25 #include "mlx5_malloc.h"
26 #ifdef HAVE_DEVLINK
27 #include <linux/devlink.h>
28 #endif
29 
30 
31 /* Size of the buffer to receive kernel messages */
32 #define MLX5_NL_BUF_SIZE (32 * 1024)
33 /* Send buffer size for the Netlink socket */
34 #define MLX5_SEND_BUF_SIZE 32768
35 /* Receive buffer size for the Netlink socket */
36 #define MLX5_RECV_BUF_SIZE 32768
37 
38 /** Parameters of VLAN devices created by driver. */
39 #define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
40 /*
41  * Define NDA_RTA as defined in iproute2 sources.
42  *
43  * see in iproute2 sources file include/libnetlink.h
44  */
45 #ifndef MLX5_NDA_RTA
46 #define MLX5_NDA_RTA(r) \
47 	((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
48 #endif
49 /*
50  * Define NLMSG_TAIL as defined in iproute2 sources.
51  *
52  * see in iproute2 sources file include/libnetlink.h
53  */
54 #ifndef NLMSG_TAIL
55 #define NLMSG_TAIL(nmsg) \
56 	((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
57 #endif
58 /*
59  * The following definitions are normally found in rdma/rdma_netlink.h,
60  * however they are so recent that most systems do not expose them yet.
61  */
62 #ifndef HAVE_RDMA_NL_NLDEV
63 #define RDMA_NL_NLDEV 5
64 #endif
65 #ifndef HAVE_RDMA_NLDEV_CMD_GET
66 #define RDMA_NLDEV_CMD_GET 1
67 #endif
68 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
69 #define RDMA_NLDEV_CMD_PORT_GET 5
70 #endif
71 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
72 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
73 #endif
74 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
75 #define RDMA_NLDEV_ATTR_DEV_NAME 2
76 #endif
77 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
78 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
79 #endif
80 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
81 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
82 #endif
83 
84 /* These are normally found in linux/if_link.h. */
85 #ifndef HAVE_IFLA_NUM_VF
86 #define IFLA_NUM_VF 21
87 #endif
88 #ifndef HAVE_IFLA_EXT_MASK
89 #define IFLA_EXT_MASK 29
90 #endif
91 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
92 #define IFLA_PHYS_SWITCH_ID 36
93 #endif
94 #ifndef HAVE_IFLA_PHYS_PORT_NAME
95 #define IFLA_PHYS_PORT_NAME 38
96 #endif
97 
/*
 * Some Devlink definitions may be missing in old kernel versions;
 * provide fallbacks for the ones used here.
 */
102 #ifndef DEVLINK_GENL_NAME
103 #define DEVLINK_GENL_NAME "devlink"
104 #endif
105 #ifndef DEVLINK_GENL_VERSION
106 #define DEVLINK_GENL_VERSION 1
107 #endif
108 #ifndef DEVLINK_ATTR_BUS_NAME
109 #define DEVLINK_ATTR_BUS_NAME 1
110 #endif
111 #ifndef DEVLINK_ATTR_DEV_NAME
112 #define DEVLINK_ATTR_DEV_NAME 2
113 #endif
114 #ifndef DEVLINK_ATTR_PARAM
115 #define DEVLINK_ATTR_PARAM 80
116 #endif
117 #ifndef DEVLINK_ATTR_PARAM_NAME
118 #define DEVLINK_ATTR_PARAM_NAME 81
119 #endif
120 #ifndef DEVLINK_ATTR_PARAM_TYPE
121 #define DEVLINK_ATTR_PARAM_TYPE 83
122 #endif
123 #ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
124 #define DEVLINK_ATTR_PARAM_VALUES_LIST 84
125 #endif
126 #ifndef DEVLINK_ATTR_PARAM_VALUE
127 #define DEVLINK_ATTR_PARAM_VALUE 85
128 #endif
129 #ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
130 #define DEVLINK_ATTR_PARAM_VALUE_DATA 86
131 #endif
132 #ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
133 #define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
134 #endif
135 #ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
136 #define DEVLINK_PARAM_CMODE_DRIVERINIT 1
137 #endif
138 #ifndef DEVLINK_CMD_RELOAD
139 #define DEVLINK_CMD_RELOAD 37
140 #endif
141 #ifndef DEVLINK_CMD_PARAM_GET
142 #define DEVLINK_CMD_PARAM_GET 38
143 #endif
144 #ifndef DEVLINK_CMD_PARAM_SET
145 #define DEVLINK_CMD_PARAM_SET 39
146 #endif
147 #ifndef NLA_FLAG
148 #define NLA_FLAG 6
149 #endif
150 
151 /* Add/remove MAC address through Netlink */
152 struct mlx5_nl_mac_addr {
153 	struct rte_ether_addr (*mac)[];
154 	/**< MAC address handled by the device. */
155 	int mac_n; /**< Number of addresses in the array. */
156 };
157 
/* Bits reported in mlx5_nl_ifindex_data::flags, one per attribute seen. */
#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)

/** Query context filled by mlx5_nl_cmdget_cb(). */
struct mlx5_nl_ifindex_data {
	/** IB device name to look for (in). */
	const char *name;
	/** MLX5_NL_CMD_GET_* bits of the attributes found (out). */
	uint32_t flags;
	/** IB device index (out). */
	uint32_t ibindex;
	/** Network interface index (out). */
	uint32_t ifindex;
	/** IB device max port number (out). */
	uint32_t portnum;
};
171 
172 rte_atomic32_t atomic_sn = RTE_ATOMIC32_INIT(0);
173 
174 /* Generate Netlink sequence number. */
175 #define MLX5_NL_SN_GENERATE ((uint32_t)rte_atomic32_add_return(&atomic_sn, 1))
176 
177 /**
178  * Opens a Netlink socket.
179  *
180  * @param protocol
181  *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
182  *
183  * @return
184  *   A file descriptor on success, a negative errno value otherwise and
185  *   rte_errno is set.
186  */
187 int
188 mlx5_nl_init(int protocol)
189 {
190 	int fd;
191 	int sndbuf_size = MLX5_SEND_BUF_SIZE;
192 	int rcvbuf_size = MLX5_RECV_BUF_SIZE;
193 	struct sockaddr_nl local = {
194 		.nl_family = AF_NETLINK,
195 	};
196 	int ret;
197 
198 	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
199 	if (fd == -1) {
200 		rte_errno = errno;
201 		return -rte_errno;
202 	}
203 	ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
204 	if (ret == -1) {
205 		rte_errno = errno;
206 		goto error;
207 	}
208 	ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
209 	if (ret == -1) {
210 		rte_errno = errno;
211 		goto error;
212 	}
213 	ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
214 	if (ret == -1) {
215 		rte_errno = errno;
216 		goto error;
217 	}
218 	return fd;
219 error:
220 	close(fd);
221 	return -rte_errno;
222 }
223 
224 /**
225  * Send a request message to the kernel on the Netlink socket.
226  *
227  * @param[in] nlsk_fd
228  *   Netlink socket file descriptor.
229  * @param[in] nh
230  *   The Netlink message send to the kernel.
231  * @param[in] ssn
232  *   Sequence number.
233  * @param[in] req
234  *   Pointer to the request structure.
235  * @param[in] len
236  *   Length of the request in bytes.
237  *
238  * @return
239  *   The number of sent bytes on success, a negative errno value otherwise and
240  *   rte_errno is set.
241  */
242 static int
243 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
244 		int len)
245 {
246 	struct sockaddr_nl sa = {
247 		.nl_family = AF_NETLINK,
248 	};
249 	struct iovec iov[2] = {
250 		{ .iov_base = nh, .iov_len = sizeof(*nh), },
251 		{ .iov_base = req, .iov_len = len, },
252 	};
253 	struct msghdr msg = {
254 		.msg_name = &sa,
255 		.msg_namelen = sizeof(sa),
256 		.msg_iov = iov,
257 		.msg_iovlen = 2,
258 	};
259 	int send_bytes;
260 
261 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
262 	nh->nlmsg_seq = sn;
263 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
264 	if (send_bytes < 0) {
265 		rte_errno = errno;
266 		return -rte_errno;
267 	}
268 	return send_bytes;
269 }
270 
271 /**
272  * Send a message to the kernel on the Netlink socket.
273  *
274  * @param[in] nlsk_fd
275  *   The Netlink socket file descriptor used for communication.
276  * @param[in] nh
277  *   The Netlink message send to the kernel.
278  * @param[in] sn
279  *   Sequence number.
280  *
281  * @return
282  *   The number of sent bytes on success, a negative errno value otherwise and
283  *   rte_errno is set.
284  */
285 static int
286 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
287 {
288 	struct sockaddr_nl sa = {
289 		.nl_family = AF_NETLINK,
290 	};
291 	struct iovec iov = {
292 		.iov_base = nh,
293 		.iov_len = nh->nlmsg_len,
294 	};
295 	struct msghdr msg = {
296 		.msg_name = &sa,
297 		.msg_namelen = sizeof(sa),
298 		.msg_iov = &iov,
299 		.msg_iovlen = 1,
300 	};
301 	int send_bytes;
302 
303 	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
304 	nh->nlmsg_seq = sn;
305 	send_bytes = sendmsg(nlsk_fd, &msg, 0);
306 	if (send_bytes < 0) {
307 		rte_errno = errno;
308 		return -rte_errno;
309 	}
310 	return send_bytes;
311 }
312 
313 /**
314  * Receive a message from the kernel on the Netlink socket, following
315  * mlx5_nl_send().
316  *
317  * @param[in] nlsk_fd
318  *   The Netlink socket file descriptor used for communication.
319  * @param[in] sn
320  *   Sequence number.
321  * @param[in] cb
322  *   The callback function to call for each Netlink message received.
323  * @param[in, out] arg
324  *   Custom arguments for the callback.
325  *
326  * @return
327  *   0 on success, a negative errno value otherwise and rte_errno is set.
328  */
329 static int
330 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
331 	     void *arg)
332 {
333 	struct sockaddr_nl sa;
334 	void *buf = mlx5_malloc(0, MLX5_RECV_BUF_SIZE, 0, SOCKET_ID_ANY);
335 	struct iovec iov = {
336 		.iov_base = buf,
337 		.iov_len = MLX5_RECV_BUF_SIZE,
338 	};
339 	struct msghdr msg = {
340 		.msg_name = &sa,
341 		.msg_namelen = sizeof(sa),
342 		.msg_iov = &iov,
343 		/* One message at a time */
344 		.msg_iovlen = 1,
345 	};
346 	int multipart = 0;
347 	int ret = 0;
348 
349 	if (!buf) {
350 		rte_errno = ENOMEM;
351 		return -rte_errno;
352 	}
353 	do {
354 		struct nlmsghdr *nh;
355 		int recv_bytes = 0;
356 
357 		do {
358 			recv_bytes = recvmsg(nlsk_fd, &msg, 0);
359 			if (recv_bytes == -1) {
360 				rte_errno = errno;
361 				ret = -rte_errno;
362 				goto exit;
363 			}
364 			nh = (struct nlmsghdr *)buf;
365 		} while (nh->nlmsg_seq != sn);
366 		for (;
367 		     NLMSG_OK(nh, (unsigned int)recv_bytes);
368 		     nh = NLMSG_NEXT(nh, recv_bytes)) {
369 			if (nh->nlmsg_type == NLMSG_ERROR) {
370 				struct nlmsgerr *err_data = NLMSG_DATA(nh);
371 
372 				if (err_data->error < 0) {
373 					rte_errno = -err_data->error;
374 					ret = -rte_errno;
375 					goto exit;
376 				}
377 				/* Ack message. */
378 				ret = 0;
379 				goto exit;
380 			}
381 			/* Multi-part msgs and their trailing DONE message. */
382 			if (nh->nlmsg_flags & NLM_F_MULTI) {
383 				if (nh->nlmsg_type == NLMSG_DONE) {
384 					ret =  0;
385 					goto exit;
386 				}
387 				multipart = 1;
388 			}
389 			if (cb) {
390 				ret = cb(nh, arg);
391 				if (ret < 0)
392 					goto exit;
393 			}
394 		}
395 	} while (multipart);
396 exit:
397 	mlx5_free(buf);
398 	return ret;
399 }
400 
401 /**
402  * Parse Netlink message to retrieve the bridge MAC address.
403  *
404  * @param nh
405  *   Pointer to Netlink Message Header.
406  * @param arg
407  *   PMD data register with this callback.
408  *
409  * @return
410  *   0 on success, a negative errno value otherwise and rte_errno is set.
411  */
412 static int
413 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
414 {
415 	struct mlx5_nl_mac_addr *data = arg;
416 	struct ndmsg *r = NLMSG_DATA(nh);
417 	struct rtattr *attribute;
418 	int len;
419 
420 	len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
421 	for (attribute = MLX5_NDA_RTA(r);
422 	     RTA_OK(attribute, len);
423 	     attribute = RTA_NEXT(attribute, len)) {
424 		if (attribute->rta_type == NDA_LLADDR) {
425 			if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
426 				DRV_LOG(WARNING,
427 					"not enough room to finalize the"
428 					" request");
429 				rte_errno = ENOMEM;
430 				return -rte_errno;
431 			}
432 #ifdef RTE_LIBRTE_MLX5_DEBUG
433 			char m[RTE_ETHER_ADDR_FMT_SIZE];
434 
435 			rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE,
436 					      RTA_DATA(attribute));
437 			DRV_LOG(DEBUG, "bridge MAC address %s", m);
438 #endif
439 			memcpy(&(*data->mac)[data->mac_n++],
440 			       RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
441 		}
442 	}
443 	return 0;
444 }
445 
446 /**
447  * Get bridge MAC addresses.
448  *
449  * @param[in] nlsk_fd
450  *   Netlink socket file descriptor.
451  * @param[in] iface_idx
452  *   Net device interface index.
453  * @param mac[out]
454  *   Pointer to the array table of MAC addresses to fill.
455  *   Its size should be of MLX5_MAX_MAC_ADDRESSES.
456  * @param mac_n[out]
457  *   Number of entries filled in MAC array.
458  *
459  * @return
460  *   0 on success, a negative errno value otherwise and rte_errno is set.
461  */
462 static int
463 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
464 		      struct rte_ether_addr (*mac)[], int *mac_n)
465 {
466 	struct {
467 		struct nlmsghdr	hdr;
468 		struct ifinfomsg ifm;
469 	} req = {
470 		.hdr = {
471 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
472 			.nlmsg_type = RTM_GETNEIGH,
473 			.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
474 		},
475 		.ifm = {
476 			.ifi_family = PF_BRIDGE,
477 			.ifi_index = iface_idx,
478 		},
479 	};
480 	struct mlx5_nl_mac_addr data = {
481 		.mac = mac,
482 		.mac_n = 0,
483 	};
484 	uint32_t sn = MLX5_NL_SN_GENERATE;
485 	int ret;
486 
487 	if (nlsk_fd == -1)
488 		return 0;
489 	ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
490 			      sizeof(struct ifinfomsg));
491 	if (ret < 0)
492 		goto error;
493 	ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
494 	if (ret < 0)
495 		goto error;
496 	*mac_n = data.mac_n;
497 	return 0;
498 error:
499 	DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
500 		iface_idx, strerror(rte_errno));
501 	return -rte_errno;
502 }
503 
504 /**
505  * Modify the MAC address neighbour table with Netlink.
506  *
507  * @param[in] nlsk_fd
508  *   Netlink socket file descriptor.
509  * @param[in] iface_idx
510  *   Net device interface index.
511  * @param mac
512  *   MAC address to consider.
513  * @param add
514  *   1 to add the MAC address, 0 to remove the MAC address.
515  *
516  * @return
517  *   0 on success, a negative errno value otherwise and rte_errno is set.
518  */
519 static int
520 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
521 			struct rte_ether_addr *mac, int add)
522 {
523 	struct {
524 		struct nlmsghdr hdr;
525 		struct ndmsg ndm;
526 		struct rtattr rta;
527 		uint8_t buffer[RTE_ETHER_ADDR_LEN];
528 	} req = {
529 		.hdr = {
530 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
531 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
532 				NLM_F_EXCL | NLM_F_ACK,
533 			.nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
534 		},
535 		.ndm = {
536 			.ndm_family = PF_BRIDGE,
537 			.ndm_state = NUD_NOARP | NUD_PERMANENT,
538 			.ndm_ifindex = iface_idx,
539 			.ndm_flags = NTF_SELF,
540 		},
541 		.rta = {
542 			.rta_type = NDA_LLADDR,
543 			.rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
544 		},
545 	};
546 	uint32_t sn = MLX5_NL_SN_GENERATE;
547 	int ret;
548 
549 	if (nlsk_fd == -1)
550 		return 0;
551 	memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
552 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
553 		RTA_ALIGN(req.rta.rta_len);
554 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
555 	if (ret < 0)
556 		goto error;
557 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
558 	if (ret < 0)
559 		goto error;
560 	return 0;
561 error:
562 #ifdef RTE_LIBRTE_MLX5_DEBUG
563 	{
564 		char m[RTE_ETHER_ADDR_FMT_SIZE];
565 
566 		rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac);
567 		DRV_LOG(DEBUG,
568 			"Interface %u cannot %s MAC address %s %s",
569 			iface_idx,
570 			add ? "add" : "remove", m, strerror(rte_errno));
571 	}
572 #endif
573 	return -rte_errno;
574 }
575 
576 /**
577  * Modify the VF MAC address neighbour table with Netlink.
578  *
579  * @param[in] nlsk_fd
580  *   Netlink socket file descriptor.
581  * @param[in] iface_idx
582  *   Net device interface index.
583  * @param mac
584  *    MAC address to consider.
585  * @param vf_index
586  *    VF index.
587  *
588  * @return
589  *    0 on success, a negative errno value otherwise and rte_errno is set.
590  */
591 int
592 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
593 			   struct rte_ether_addr *mac, int vf_index)
594 {
595 	int ret;
596 	struct {
597 		struct nlmsghdr hdr;
598 		struct ifinfomsg ifm;
599 		struct rtattr vf_list_rta;
600 		struct rtattr vf_info_rta;
601 		struct rtattr vf_mac_rta;
602 		struct ifla_vf_mac ivm;
603 	} req = {
604 		.hdr = {
605 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
606 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
607 			.nlmsg_type = RTM_BASE,
608 		},
609 		.ifm = {
610 			.ifi_index = iface_idx,
611 		},
612 		.vf_list_rta = {
613 			.rta_type = IFLA_VFINFO_LIST,
614 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
615 		},
616 		.vf_info_rta = {
617 			.rta_type = IFLA_VF_INFO,
618 			.rta_len = RTA_ALIGN(RTA_LENGTH(0)),
619 		},
620 		.vf_mac_rta = {
621 			.rta_type = IFLA_VF_MAC,
622 		},
623 	};
624 	struct ifla_vf_mac ivm = {
625 		.vf = vf_index,
626 	};
627 	uint32_t sn = MLX5_NL_SN_GENERATE;
628 
629 	memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
630 	memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
631 
632 	req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
633 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
634 		RTA_ALIGN(req.vf_list_rta.rta_len) +
635 		RTA_ALIGN(req.vf_info_rta.rta_len) +
636 		RTA_ALIGN(req.vf_mac_rta.rta_len);
637 	req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
638 					       &req.vf_list_rta);
639 	req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
640 					       &req.vf_info_rta);
641 
642 	if (nlsk_fd < 0)
643 		return -1;
644 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
645 	if (ret < 0)
646 		goto error;
647 	ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
648 	if (ret < 0)
649 		goto error;
650 	return 0;
651 error:
652 	DRV_LOG(ERR,
653 		"representor %u cannot set VF MAC address "
654 		"%02X:%02X:%02X:%02X:%02X:%02X : %s",
655 		vf_index,
656 		mac->addr_bytes[0], mac->addr_bytes[1],
657 		mac->addr_bytes[2], mac->addr_bytes[3],
658 		mac->addr_bytes[4], mac->addr_bytes[5],
659 		strerror(rte_errno));
660 	return -rte_errno;
661 }
662 
663 /**
664  * Add a MAC address.
665  *
666  * @param[in] nlsk_fd
667  *   Netlink socket file descriptor.
668  * @param[in] iface_idx
669  *   Net device interface index.
670  * @param mac_own
671  *   BITFIELD_DECLARE array to store the mac.
672  * @param mac
673  *   MAC address to register.
674  * @param index
675  *   MAC address index.
676  *
677  * @return
678  *   0 on success, a negative errno value otherwise and rte_errno is set.
679  */
680 int
681 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
682 		     uint64_t *mac_own, struct rte_ether_addr *mac,
683 		     uint32_t index)
684 {
685 	int ret;
686 
687 	ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
688 	if (!ret) {
689 		MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
690 		if (index >= MLX5_MAX_MAC_ADDRESSES)
691 			return -EINVAL;
692 
693 		BITFIELD_SET(mac_own, index);
694 	}
695 	if (ret == -EEXIST)
696 		return 0;
697 	return ret;
698 }
699 
700 /**
701  * Remove a MAC address.
702  *
703  * @param[in] nlsk_fd
704  *   Netlink socket file descriptor.
705  * @param[in] iface_idx
706  *   Net device interface index.
707  * @param mac_own
708  *   BITFIELD_DECLARE array to store the mac.
709  * @param mac
710  *   MAC address to remove.
711  * @param index
712  *   MAC address index.
713  *
714  * @return
715  *   0 on success, a negative errno value otherwise and rte_errno is set.
716  */
717 int
718 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
719 			struct rte_ether_addr *mac, uint32_t index)
720 {
721 	MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
722 	if (index >= MLX5_MAX_MAC_ADDRESSES)
723 		return -EINVAL;
724 
725 	BITFIELD_RESET(mac_own, index);
726 	return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
727 }
728 
729 /**
730  * Synchronize Netlink bridge table to the internal table.
731  *
732  * @param[in] nlsk_fd
733  *   Netlink socket file descriptor.
734  * @param[in] iface_idx
735  *   Net device interface index.
736  * @param mac_addrs
737  *   Mac addresses array to sync.
738  * @param n
739  *   @p mac_addrs array size.
740  */
741 void
742 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
743 		      struct rte_ether_addr *mac_addrs, int n)
744 {
745 	struct rte_ether_addr macs[n];
746 	int macs_n = 0;
747 	int i;
748 	int ret;
749 
750 	ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
751 	if (ret)
752 		return;
753 	for (i = 0; i != macs_n; ++i) {
754 		int j;
755 
756 		/* Verify the address is not in the array yet. */
757 		for (j = 0; j != n; ++j)
758 			if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
759 				break;
760 		if (j != n)
761 			continue;
762 		/* Find the first entry available. */
763 		for (j = 0; j != n; ++j) {
764 			if (rte_is_zero_ether_addr(&mac_addrs[j])) {
765 				mac_addrs[j] = macs[i];
766 				break;
767 			}
768 		}
769 	}
770 }
771 
772 /**
773  * Flush all added MAC addresses.
774  *
775  * @param[in] nlsk_fd
776  *   Netlink socket file descriptor.
777  * @param[in] iface_idx
778  *   Net device interface index.
779  * @param[in] mac_addrs
780  *   Mac addresses array to flush.
781  * @param n
782  *   @p mac_addrs array size.
783  * @param mac_own
784  *   BITFIELD_DECLARE array to store the mac.
785  */
786 void
787 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
788 		       struct rte_ether_addr *mac_addrs, int n,
789 		       uint64_t *mac_own)
790 {
791 	int i;
792 
793 	if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES)
794 		return;
795 
796 	for (i = n - 1; i >= 0; --i) {
797 		struct rte_ether_addr *m = &mac_addrs[i];
798 
799 		if (BITFIELD_ISSET(mac_own, i))
800 			mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
801 						i);
802 	}
803 }
804 
805 /**
806  * Enable promiscuous / all multicast mode through Netlink.
807  *
808  * @param[in] nlsk_fd
809  *   Netlink socket file descriptor.
810  * @param[in] iface_idx
811  *   Net device interface index.
812  * @param flags
813  *   IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
814  * @param enable
815  *   Nonzero to enable, disable otherwise.
816  *
817  * @return
818  *   0 on success, a negative errno value otherwise and rte_errno is set.
819  */
820 static int
821 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
822 		     int enable)
823 {
824 	struct {
825 		struct nlmsghdr hdr;
826 		struct ifinfomsg ifi;
827 	} req = {
828 		.hdr = {
829 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
830 			.nlmsg_type = RTM_NEWLINK,
831 			.nlmsg_flags = NLM_F_REQUEST,
832 		},
833 		.ifi = {
834 			.ifi_flags = enable ? flags : 0,
835 			.ifi_change = flags,
836 			.ifi_index = iface_idx,
837 		},
838 	};
839 	uint32_t sn = MLX5_NL_SN_GENERATE;
840 	int ret;
841 
842 	MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
843 	if (nlsk_fd < 0)
844 		return 0;
845 	ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
846 	if (ret < 0)
847 		return ret;
848 	return 0;
849 }
850 
851 /**
852  * Enable promiscuous mode through Netlink.
853  *
854  * @param[in] nlsk_fd
855  *   Netlink socket file descriptor.
856  * @param[in] iface_idx
857  *   Net device interface index.
858  * @param enable
859  *   Nonzero to enable, disable otherwise.
860  *
861  * @return
862  *   0 on success, a negative errno value otherwise and rte_errno is set.
863  */
864 int
865 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
866 {
867 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
868 
869 	if (ret)
870 		DRV_LOG(DEBUG,
871 			"Interface %u cannot %s promisc mode: Netlink error %s",
872 			iface_idx, enable ? "enable" : "disable",
873 			strerror(rte_errno));
874 	return ret;
875 }
876 
877 /**
878  * Enable all multicast mode through Netlink.
879  *
880  * @param[in] nlsk_fd
881  *   Netlink socket file descriptor.
882  * @param[in] iface_idx
883  *   Net device interface index.
884  * @param enable
885  *   Nonzero to enable, disable otherwise.
886  *
887  * @return
888  *   0 on success, a negative errno value otherwise and rte_errno is set.
889  */
890 int
891 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
892 {
893 	int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
894 				       enable);
895 
896 	if (ret)
897 		DRV_LOG(DEBUG,
898 			"Interface %u cannot %s allmulti : Netlink error %s",
899 			iface_idx, enable ? "enable" : "disable",
900 			strerror(rte_errno));
901 	return ret;
902 }
903 
904 /**
905  * Process network interface information from Netlink message.
906  *
907  * @param nh
908  *   Pointer to Netlink message header.
909  * @param arg
910  *   Opaque data pointer for this callback.
911  *
912  * @return
913  *   0 on success, a negative errno value otherwise and rte_errno is set.
914  */
915 static int
916 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
917 {
918 	struct mlx5_nl_ifindex_data *data = arg;
919 	struct mlx5_nl_ifindex_data local = {
920 		.flags = 0,
921 	};
922 	size_t off = NLMSG_HDRLEN;
923 
924 	if (nh->nlmsg_type !=
925 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
926 	    nh->nlmsg_type !=
927 	    RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
928 		goto error;
929 	while (off < nh->nlmsg_len) {
930 		struct nlattr *na = (void *)((uintptr_t)nh + off);
931 		void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
932 
933 		if (na->nla_len > nh->nlmsg_len - off)
934 			goto error;
935 		switch (na->nla_type) {
936 		case RDMA_NLDEV_ATTR_DEV_INDEX:
937 			local.ibindex = *(uint32_t *)payload;
938 			local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
939 			break;
940 		case RDMA_NLDEV_ATTR_DEV_NAME:
941 			if (!strcmp(payload, data->name))
942 				local.flags |= MLX5_NL_CMD_GET_IB_NAME;
943 			break;
944 		case RDMA_NLDEV_ATTR_NDEV_INDEX:
945 			local.ifindex = *(uint32_t *)payload;
946 			local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
947 			break;
948 		case RDMA_NLDEV_ATTR_PORT_INDEX:
949 			local.portnum = *(uint32_t *)payload;
950 			local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
951 			break;
952 		default:
953 			break;
954 		}
955 		off += NLA_ALIGN(na->nla_len);
956 	}
957 	/*
958 	 * It is possible to have multiple messages for all
959 	 * Infiniband devices in the system with appropriate name.
960 	 * So we should gather parameters locally and copy to
961 	 * query context only in case of coinciding device name.
962 	 */
963 	if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
964 		data->flags = local.flags;
965 		data->ibindex = local.ibindex;
966 		data->ifindex = local.ifindex;
967 		data->portnum = local.portnum;
968 	}
969 	return 0;
970 error:
971 	rte_errno = EINVAL;
972 	return -rte_errno;
973 }
974 
975 /**
976  * Get index of network interface associated with some IB device.
977  *
978  * This is the only somewhat safe method to avoid resorting to heuristics
979  * when faced with port representors. Unfortunately it requires at least
980  * Linux 4.17.
981  *
982  * @param nl
983  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
984  * @param[in] name
985  *   IB device name.
986  * @param[in] pindex
987  *   IB device port index, starting from 1
988  * @return
989  *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
990  *   is set.
991  */
992 unsigned int
993 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
994 {
995 	struct mlx5_nl_ifindex_data data = {
996 		.name = name,
997 		.flags = 0,
998 		.ibindex = 0, /* Determined during first pass. */
999 		.ifindex = 0, /* Determined during second pass. */
1000 	};
1001 	union {
1002 		struct nlmsghdr nh;
1003 		uint8_t buf[NLMSG_HDRLEN +
1004 			    NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
1005 			    NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
1006 	} req = {
1007 		.nh = {
1008 			.nlmsg_len = NLMSG_LENGTH(0),
1009 			.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1010 						       RDMA_NLDEV_CMD_GET),
1011 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1012 		},
1013 	};
1014 	struct nlattr *na;
1015 	uint32_t sn = MLX5_NL_SN_GENERATE;
1016 	int ret;
1017 
1018 	ret = mlx5_nl_send(nl, &req.nh, sn);
1019 	if (ret < 0)
1020 		return 0;
1021 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1022 	if (ret < 0)
1023 		return 0;
1024 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1025 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
1026 		goto error;
1027 	data.flags = 0;
1028 	sn = MLX5_NL_SN_GENERATE;
1029 	req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1030 					     RDMA_NLDEV_CMD_PORT_GET);
1031 	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1032 	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
1033 	na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
1034 	na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
1035 	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
1036 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1037 	       &data.ibindex, sizeof(data.ibindex));
1038 	na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
1039 	na->nla_len = NLA_HDRLEN + sizeof(pindex);
1040 	na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
1041 	memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1042 	       &pindex, sizeof(pindex));
1043 	ret = mlx5_nl_send(nl, &req.nh, sn);
1044 	if (ret < 0)
1045 		return 0;
1046 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1047 	if (ret < 0)
1048 		return 0;
1049 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1050 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1051 	    !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
1052 	    !data.ifindex)
1053 		goto error;
1054 	return data.ifindex;
1055 error:
1056 	rte_errno = ENODEV;
1057 	return 0;
1058 }
1059 
1060 /**
1061  * Get the number of physical ports of given IB device.
1062  *
1063  * @param nl
1064  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1065  * @param[in] name
1066  *   IB device name.
1067  *
1068  * @return
1069  *   A valid (nonzero) number of ports on success, 0 otherwise
1070  *   and rte_errno is set.
1071  */
1072 unsigned int
1073 mlx5_nl_portnum(int nl, const char *name)
1074 {
1075 	struct mlx5_nl_ifindex_data data = {
1076 		.flags = 0,
1077 		.name = name,
1078 		.ifindex = 0,
1079 		.portnum = 0,
1080 	};
1081 	struct nlmsghdr req = {
1082 		.nlmsg_len = NLMSG_LENGTH(0),
1083 		.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1084 					       RDMA_NLDEV_CMD_GET),
1085 		.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1086 	};
1087 	uint32_t sn = MLX5_NL_SN_GENERATE;
1088 	int ret;
1089 
1090 	ret = mlx5_nl_send(nl, &req, sn);
1091 	if (ret < 0)
1092 		return 0;
1093 	ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1094 	if (ret < 0)
1095 		return 0;
1096 	if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1097 	    !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1098 	    !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1099 		rte_errno = ENODEV;
1100 		return 0;
1101 	}
1102 	if (!data.portnum)
1103 		rte_errno = EINVAL;
1104 	return data.portnum;
1105 }
1106 
1107 /**
1108  * Analyze gathered port parameters via Netlink to recognize master
1109  * and representor devices for E-Switch configuration.
1110  *
1111  * @param[in] num_vf_set
1112  *   flag of presence of number of VFs port attribute.
1113  * @param[inout] switch_info
1114  *   Port information, including port name as a number and port name
1115  *   type if recognized
1116  *
1117  * @return
1118  *   master and representor flags are set in switch_info according to
1119  *   recognized parameters (if any).
1120  */
1121 static void
1122 mlx5_nl_check_switch_info(bool num_vf_set,
1123 			  struct mlx5_switch_info *switch_info)
1124 {
1125 	switch (switch_info->name_type) {
1126 	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1127 		/*
1128 		 * Name is not recognized, assume the master,
1129 		 * check the number of VFs key presence.
1130 		 */
1131 		switch_info->master = num_vf_set;
1132 		break;
1133 	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1134 		/*
1135 		 * Name is not set, this assumes the legacy naming
1136 		 * schema for master, just check if there is a
1137 		 * number of VFs key.
1138 		 */
1139 		switch_info->master = num_vf_set;
1140 		break;
1141 	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1142 		/* New uplink naming schema recognized. */
1143 		switch_info->master = 1;
1144 		break;
1145 	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1146 		/* Legacy representors naming schema. */
1147 		switch_info->representor = !num_vf_set;
1148 		break;
1149 	case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1150 		/* Fallthrough */
1151 	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1152 		/* New representors naming schema. */
1153 		switch_info->representor = 1;
1154 		break;
1155 	}
1156 }
1157 
1158 /**
1159  * Process switch information from Netlink message.
1160  *
1161  * @param nh
1162  *   Pointer to Netlink message header.
1163  * @param arg
1164  *   Opaque data pointer for this callback.
1165  *
1166  * @return
1167  *   0 on success, a negative errno value otherwise and rte_errno is set.
1168  */
1169 static int
1170 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1171 {
1172 	struct mlx5_switch_info info = {
1173 		.master = 0,
1174 		.representor = 0,
1175 		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1176 		.port_name = 0,
1177 		.switch_id = 0,
1178 	};
1179 	size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1180 	bool switch_id_set = false;
1181 	bool num_vf_set = false;
1182 
1183 	if (nh->nlmsg_type != RTM_NEWLINK)
1184 		goto error;
1185 	while (off < nh->nlmsg_len) {
1186 		struct rtattr *ra = (void *)((uintptr_t)nh + off);
1187 		void *payload = RTA_DATA(ra);
1188 		unsigned int i;
1189 
1190 		if (ra->rta_len > nh->nlmsg_len - off)
1191 			goto error;
1192 		switch (ra->rta_type) {
1193 		case IFLA_NUM_VF:
1194 			num_vf_set = true;
1195 			break;
1196 		case IFLA_PHYS_PORT_NAME:
1197 			mlx5_translate_port_name((char *)payload, &info);
1198 			break;
1199 		case IFLA_PHYS_SWITCH_ID:
1200 			info.switch_id = 0;
1201 			for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1202 				info.switch_id <<= 8;
1203 				info.switch_id |= ((uint8_t *)payload)[i];
1204 			}
1205 			switch_id_set = true;
1206 			break;
1207 		}
1208 		off += RTA_ALIGN(ra->rta_len);
1209 	}
1210 	if (switch_id_set) {
1211 		/* We have some E-Switch configuration. */
1212 		mlx5_nl_check_switch_info(num_vf_set, &info);
1213 	}
1214 	MLX5_ASSERT(!(info.master && info.representor));
1215 	memcpy(arg, &info, sizeof(info));
1216 	return 0;
1217 error:
1218 	rte_errno = EINVAL;
1219 	return -rte_errno;
1220 }
1221 
1222 /**
1223  * Get switch information associated with network interface.
1224  *
1225  * @param nl
1226  *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1227  * @param ifindex
1228  *   Network interface index.
1229  * @param[out] info
1230  *   Switch information object, populated in case of success.
1231  *
1232  * @return
1233  *   0 on success, a negative errno value otherwise and rte_errno is set.
1234  */
1235 int
1236 mlx5_nl_switch_info(int nl, unsigned int ifindex,
1237 		    struct mlx5_switch_info *info)
1238 {
1239 	struct {
1240 		struct nlmsghdr nh;
1241 		struct ifinfomsg info;
1242 		struct rtattr rta;
1243 		uint32_t extmask;
1244 	} req = {
1245 		.nh = {
1246 			.nlmsg_len = NLMSG_LENGTH
1247 					(sizeof(req.info) +
1248 					 RTA_LENGTH(sizeof(uint32_t))),
1249 			.nlmsg_type = RTM_GETLINK,
1250 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1251 		},
1252 		.info = {
1253 			.ifi_family = AF_UNSPEC,
1254 			.ifi_index = ifindex,
1255 		},
1256 		.rta = {
1257 			.rta_type = IFLA_EXT_MASK,
1258 			.rta_len = RTA_LENGTH(sizeof(int32_t)),
1259 		},
1260 		.extmask = RTE_LE32(1),
1261 	};
1262 	uint32_t sn = MLX5_NL_SN_GENERATE;
1263 	int ret;
1264 
1265 	ret = mlx5_nl_send(nl, &req.nh, sn);
1266 	if (ret >= 0)
1267 		ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1268 	if (info->master && info->representor) {
1269 		DRV_LOG(ERR, "ifindex %u device is recognized as master"
1270 			     " and as representor", ifindex);
1271 		rte_errno = ENODEV;
1272 		ret = -rte_errno;
1273 	}
1274 	return ret;
1275 }
1276 
1277 /*
1278  * Delete VLAN network device by ifindex.
1279  *
1280  * @param[in] tcf
1281  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1282  * @param[in] ifindex
1283  *   Interface index of network device to delete.
1284  */
1285 void
1286 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1287 		      uint32_t ifindex)
1288 {
1289 	uint32_t sn = MLX5_NL_SN_GENERATE;
1290 	int ret;
1291 	struct {
1292 		struct nlmsghdr nh;
1293 		struct ifinfomsg info;
1294 	} req = {
1295 		.nh = {
1296 			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1297 			.nlmsg_type = RTM_DELLINK,
1298 			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1299 		},
1300 		.info = {
1301 			.ifi_family = AF_UNSPEC,
1302 			.ifi_index = ifindex,
1303 		},
1304 	};
1305 
1306 	if (ifindex) {
1307 		ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1308 		if (ret >= 0)
1309 			ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1310 		if (ret < 0)
1311 			DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1312 				" ifindex %u, %d", ifindex, ret);
1313 	}
1314 }
1315 
/* Set of subroutines to build Netlink message. */

/*
 * Return the address just past the current (aligned) end of the message,
 * i.e. where the next attribute is to be written.
 */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	uint8_t *base = (uint8_t *)nlh;

	return (struct nlattr *)(base + NLMSG_ALIGN(nlh->nlmsg_len));
}
1323 
/*
 * Append one attribute of the given type with alen payload bytes copied
 * from data, and grow the message length by the aligned attribute size.
 * No check is made that the underlying buffer is large enough.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *attr = nl_msg_tail(nlh);
	uint8_t *dst = (uint8_t *)attr + sizeof(struct nlattr);

	attr->nla_len = NLMSG_ALIGN(sizeof(struct nlattr)) + alen;
	attr->nla_type = type;
	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);

	/* Zero-length attributes may pass a NULL data pointer. */
	if (alen != 0)
		memcpy(dst, data, alen);
}
1336 
/*
 * Open a nested attribute: emit an empty attribute header of the given
 * type and return its address so nl_attr_nest_end() can fix up the length.
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	struct nlattr *start = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return start;
}
1345 
/*
 * Close a nested attribute: its length becomes the distance from the nest
 * header to the current end of the message.
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	uint8_t *end = (uint8_t *)nl_msg_tail(nlh);

	nest->nla_len = end - (uint8_t *)nest;
}
1351 
1352 /*
1353  * Create network VLAN device with specified VLAN tag.
1354  *
1355  * @param[in] tcf
1356  *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1357  * @param[in] ifindex
1358  *   Base network interface index.
1359  * @param[in] tag
1360  *   VLAN tag for VLAN network device to create.
1361  */
1362 uint32_t
1363 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1364 			 uint32_t ifindex, uint16_t tag)
1365 {
1366 	struct nlmsghdr *nlh;
1367 	struct ifinfomsg *ifm;
1368 	char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1369 
1370 	__rte_cache_aligned
1371 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1372 		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1373 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1374 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1375 		    NLMSG_ALIGN(sizeof(name)) +
1376 		    NLMSG_ALIGN(sizeof("vlan")) +
1377 		    NLMSG_ALIGN(sizeof(uint32_t)) +
1378 		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1379 	struct nlattr *na_info;
1380 	struct nlattr *na_vlan;
1381 	uint32_t sn = MLX5_NL_SN_GENERATE;
1382 	int ret;
1383 
1384 	memset(buf, 0, sizeof(buf));
1385 	nlh = (struct nlmsghdr *)buf;
1386 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1387 	nlh->nlmsg_type = RTM_NEWLINK;
1388 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1389 			   NLM_F_EXCL | NLM_F_ACK;
1390 	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1391 	nlh->nlmsg_len += sizeof(struct ifinfomsg);
1392 	ifm->ifi_family = AF_UNSPEC;
1393 	ifm->ifi_type = 0;
1394 	ifm->ifi_index = 0;
1395 	ifm->ifi_flags = IFF_UP;
1396 	ifm->ifi_change = 0xffffffff;
1397 	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1398 	ret = snprintf(name, sizeof(name), "%s.%u.%u",
1399 		       MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1400 	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1401 	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1402 	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1403 	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1404 	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1405 	nl_attr_nest_end(nlh, na_vlan);
1406 	nl_attr_nest_end(nlh, na_info);
1407 	MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len);
1408 	ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1409 	if (ret >= 0)
1410 		ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1411 	if (ret < 0) {
1412 		DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1413 			ret);
1414 	}
1415 	/* Try to get ifindex of created or pre-existing device. */
1416 	ret = if_nametoindex(name);
1417 	if (!ret) {
1418 		DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
1419 			errno);
1420 		return 0;
1421 	}
1422 	return ret;
1423 }
1424 
/**
 * Parse Netlink message to retrieve the general family ID.
 *
 * The attributes following the Netlink and generic Netlink headers are
 * scanned until a CTRL_ATTR_FAMILY_ID attribute is found. Scanning stops at
 * the end of the message or at a zero-length attribute.
 *
 * @param nh
 *   Pointer to Netlink Message Header.
 * @param arg
 *   PMD data register with this callback; must point to a uint16_t that
 *   receives the family ID.
 *
 * @return
 *   0 on success, -EINVAL if no usable family ID attribute is present
 *   (rte_errno is not modified by this callback).
 */
static int
mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
{
	uint8_t *tail = (uint8_t *)nh + nh->nlmsg_len;
	uint8_t *pos = (uint8_t *)nh + NLMSG_ALIGN(sizeof(*nh)) +
		       NLMSG_ALIGN(sizeof(struct genlmsghdr));

	/*
	 * Check that the attribute header lies inside the message BEFORE
	 * reading nla_len, otherwise the last iteration reads past the end.
	 */
	while (pos + sizeof(struct nlattr) <= tail) {
		struct nlattr *nla = (struct nlattr *)pos;

		if (!nla->nla_len)
			break;
		if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
			/* The 16-bit payload must be fully present. */
			if (nla->nla_len < sizeof(*nla) + sizeof(uint16_t) ||
			    pos + nla->nla_len > tail)
				return -EINVAL;
			*(uint16_t *)arg = *(uint16_t *)(nla + 1);
			return 0;
		}
		pos += NLMSG_ALIGN(nla->nla_len);
	}
	return -EINVAL;
}
1453 
1454 #define MLX5_NL_MAX_ATTR_SIZE 100
1455 /**
1456  * Get generic netlink family ID.
1457  *
1458  * @param[in] nlsk_fd
1459  *   Netlink socket file descriptor.
1460  * @param[in] name
1461  *   The family name.
1462  *
1463  * @return
1464  *   ID >= 0 on success and @p enable is updated, a negative errno value
1465  *   otherwise and rte_errno is set.
1466  */
1467 static int
1468 mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
1469 {
1470 	struct nlmsghdr *nlh;
1471 	struct genlmsghdr *genl;
1472 	uint32_t sn = MLX5_NL_SN_GENERATE;
1473 	int name_size = strlen(name) + 1;
1474 	int ret;
1475 	uint16_t id = -1;
1476 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1477 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1478 		    NLMSG_ALIGN(sizeof(struct nlattr)) +
1479 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
1480 
1481 	memset(buf, 0, sizeof(buf));
1482 	nlh = (struct nlmsghdr *)buf;
1483 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1484 	nlh->nlmsg_type = GENL_ID_CTRL;
1485 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1486 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1487 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1488 	genl->cmd = CTRL_CMD_GETFAMILY;
1489 	genl->version = 1;
1490 	nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
1491 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1492 	if (ret >= 0)
1493 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
1494 	if (ret < 0) {
1495 		DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
1496 			ret);
1497 		return ret;
1498 	}
1499 	DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
1500 	return (int)id;
1501 }
1502 
1503 /**
1504  * Get Devlink family ID.
1505  *
1506  * @param[in] nlsk_fd
1507  *   Netlink socket file descriptor.
1508  *
1509  * @return
1510  *   ID >= 0 on success and @p enable is updated, a negative errno value
1511  *   otherwise and rte_errno is set.
1512  */
1513 
1514 int
1515 mlx5_nl_devlink_family_id_get(int nlsk_fd)
1516 {
1517 	return mlx5_nl_generic_family_id_get(nlsk_fd, DEVLINK_GENL_NAME);
1518 }
1519 
/**
 * Parse Netlink message to retrieve the ROCE enable status.
 *
 * The expected nested containers (PARAM, PARAM_VALUES_LIST, PARAM_VALUE) are
 * entered by stepping over their header; any other attribute is skipped by
 * its aligned length. The presence of a PARAM_VALUE_DATA attribute is
 * reported as "enabled".
 *
 * @param nh
 *   Pointer to Netlink Message Header.
 * @param arg
 *   PMD data register with this callback; must point to an int that
 *   receives 1 when the value data attribute is found and 0 otherwise.
 *
 * @return
 *   0 if the value data attribute or at least one of the expected nested
 *   attributes was seen, -EINVAL otherwise (rte_errno is not modified by
 *   this callback).
 */
static int
mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
{
	int ret = -EINVAL;
	int *enable = arg;
	uint8_t *tail = (uint8_t *)nh + nh->nlmsg_len;
	uint8_t *pos = (uint8_t *)nh + NLMSG_ALIGN(sizeof(*nh)) +
		       NLMSG_ALIGN(sizeof(struct genlmsghdr));

	/*
	 * Check that the attribute header lies inside the message BEFORE
	 * reading nla_len, otherwise the last iteration reads past the end.
	 */
	while (pos + sizeof(struct nlattr) <= tail) {
		struct nlattr *nla = (struct nlattr *)pos;

		if (!nla->nla_len)
			break;
		switch (nla->nla_type) {
		/* Expected nested attributes case. */
		case DEVLINK_ATTR_PARAM:
		case DEVLINK_ATTR_PARAM_VALUES_LIST:
		case DEVLINK_ATTR_PARAM_VALUE:
			ret = 0;
			/* Descend: the first child follows the header. */
			pos += sizeof(struct nlattr);
			break;
		case DEVLINK_ATTR_PARAM_VALUE_DATA:
			*enable = 1;
			return 0;
		default:
			pos += NLMSG_ALIGN(nla->nla_len);
			break;
		}
	}
	*enable = 0;
	return ret;
}
1560 
1561 /**
1562  * Get ROCE enable status through Netlink.
1563  *
1564  * @param[in] nlsk_fd
1565  *   Netlink socket file descriptor.
1566  * @param[in] family_id
1567  *   the Devlink family ID.
1568  * @param pci_addr
1569  *   The device PCI address.
1570  * @param[out] enable
1571  *   Where to store the enable status.
1572  *
1573  * @return
1574  *   0 on success and @p enable is updated, a negative errno value otherwise
1575  *   and rte_errno is set.
1576  */
1577 int
1578 mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
1579 			int *enable)
1580 {
1581 	struct nlmsghdr *nlh;
1582 	struct genlmsghdr *genl;
1583 	uint32_t sn = MLX5_NL_SN_GENERATE;
1584 	int ret;
1585 	int cur_en = 0;
1586 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1587 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1588 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
1589 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
1590 
1591 	memset(buf, 0, sizeof(buf));
1592 	nlh = (struct nlmsghdr *)buf;
1593 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1594 	nlh->nlmsg_type = family_id;
1595 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1596 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1597 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1598 	genl->cmd = DEVLINK_CMD_PARAM_GET;
1599 	genl->version = DEVLINK_GENL_VERSION;
1600 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1601 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1602 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1603 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1604 	if (ret >= 0)
1605 		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
1606 	if (ret < 0) {
1607 		DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
1608 			pci_addr, ret);
1609 		return ret;
1610 	}
1611 	*enable = cur_en;
1612 	DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
1613 		cur_en ? "en" : "dis", pci_addr);
1614 	return ret;
1615 }
1616 
1617 /**
1618  * Reload mlx5 device kernel driver through Netlink.
1619  *
1620  * @param[in] nlsk_fd
1621  *   Netlink socket file descriptor.
1622  * @param[in] family_id
1623  *   the Devlink family ID.
1624  * @param pci_addr
1625  *   The device PCI address.
1626  * @param[out] enable
1627  *   The enable status to set.
1628  *
1629  * @return
1630  *   0 on success, a negative errno value otherwise and rte_errno is set.
1631  */
1632 int
1633 mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
1634 {
1635 	struct nlmsghdr *nlh;
1636 	struct genlmsghdr *genl;
1637 	uint32_t sn = MLX5_NL_SN_GENERATE;
1638 	int ret;
1639 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1640 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1641 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
1642 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
1643 
1644 	memset(buf, 0, sizeof(buf));
1645 	nlh = (struct nlmsghdr *)buf;
1646 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1647 	nlh->nlmsg_type = family_id;
1648 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1649 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1650 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1651 	genl->cmd = DEVLINK_CMD_RELOAD;
1652 	genl->version = DEVLINK_GENL_VERSION;
1653 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1654 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1655 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1656 	if (ret >= 0)
1657 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1658 	if (ret < 0) {
1659 		DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
1660 			pci_addr, ret);
1661 		return ret;
1662 	}
1663 	DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
1664 		pci_addr);
1665 	return 0;
1666 }
1667 
1668 /**
1669  * Set ROCE enable status through Netlink.
1670  *
1671  * @param[in] nlsk_fd
1672  *   Netlink socket file descriptor.
1673  * @param[in] family_id
1674  *   the Devlink family ID.
1675  * @param pci_addr
1676  *   The device PCI address.
1677  * @param[out] enable
1678  *   The enable status to set.
1679  *
1680  * @return
1681  *   0 on success, a negative errno value otherwise and rte_errno is set.
1682  */
1683 int
1684 mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
1685 			int enable)
1686 {
1687 	struct nlmsghdr *nlh;
1688 	struct genlmsghdr *genl;
1689 	uint32_t sn = MLX5_NL_SN_GENERATE;
1690 	int ret;
1691 	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1692 		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1693 		    NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
1694 		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
1695 	uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
1696 	uint8_t ptype = NLA_FLAG;
1697 ;
1698 
1699 	memset(buf, 0, sizeof(buf));
1700 	nlh = (struct nlmsghdr *)buf;
1701 	nlh->nlmsg_len = sizeof(struct nlmsghdr);
1702 	nlh->nlmsg_type = family_id;
1703 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1704 	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1705 	nlh->nlmsg_len += sizeof(struct genlmsghdr);
1706 	genl->cmd = DEVLINK_CMD_PARAM_SET;
1707 	genl->version = DEVLINK_GENL_VERSION;
1708 	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1709 	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1710 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1711 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
1712 	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
1713 	if (enable)
1714 		nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
1715 	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1716 	if (ret >= 0)
1717 		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1718 	if (ret < 0) {
1719 		DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
1720 			" %d.", enable ? "en" : "dis", pci_addr, ret);
1721 		return ret;
1722 	}
1723 	DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
1724 		pci_addr, enable ? "en" : "dis");
1725 	/* Now, need to reload the driver. */
1726 	return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
1727 }
1728